From c7eec7904e08029b3ab31d3e7b21afa1ea9ab7e6 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 4 May 2022 16:24:53 +0200 Subject: refactor(compute/service): Remove OpenTelemetry from "compute" modules This change removes the OpenTelemetry integration from the OpenDC Compute modules. Previously, we chose to integrate OpenTelemetry to provide a unified way to report metrics to the users. Although this worked as expected, the overhead of the OpenTelemetry when collecting metrics during simulation was considerable and lacked more optimization opportunities (other than providing a separate API implementation). Furthermore, since we were tied to OpenTelemetry's SDK implementation, we experienced issues with throttling and registering multiple instruments. We will instead use another approach, where we expose the core metrics in OpenDC via specialized interfaces (see the commits before) such that access is fast and can be done without having to interface with OpenTelemetry. In addition, we will provide an adapter to that is able to forward these metrics to OpenTelemetry implementations, so we can still integrate with the wider ecosystem. --- .../opendc-compute-service/build.gradle.kts | 2 - .../org/opendc/compute/service/ComputeService.kt | 6 +- .../service/driver/telemetry/GuestSystemStats.kt | 2 +- .../service/driver/telemetry/HostSystemStats.kt | 2 +- .../compute/service/internal/ComputeServiceImpl.kt | 88 +--------------------- .../compute/service/internal/InternalServer.kt | 18 ----- .../opendc/compute/service/ComputeServiceTest.kt | 3 +- 7 files changed, 6 insertions(+), 115 deletions(-) (limited to 'opendc-compute/opendc-compute-service') diff --git a/opendc-compute/opendc-compute-service/build.gradle.kts b/opendc-compute/opendc-compute-service/build.gradle.kts index b42c2919..fd15b6e7 100644 --- a/opendc-compute/opendc-compute-service/build.gradle.kts +++ b/opendc-compute/opendc-compute-service/build.gradle.kts @@ -29,10 +29,8 @@ plugins { dependencies { api(projects.opendcCompute.opendcComputeApi) - api(projects.opendcTelemetry.opendcTelemetryApi) implementation(projects.opendcCommon) implementation(libs.kotlin.logging) - implementation(libs.opentelemetry.semconv) testImplementation(projects.opendcSimulator.opendcSimulatorCore) testRuntimeOnly(libs.log4j.slf4j) diff --git a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/ComputeService.kt b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/ComputeService.kt index 3a6baaa1..c0b70268 100644 --- a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/ComputeService.kt +++ b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/ComputeService.kt @@ -22,8 +22,6 @@ package org.opendc.compute.service -import io.opentelemetry.api.metrics.Meter -import io.opentelemetry.api.metrics.MeterProvider import org.opendc.compute.api.ComputeClient import org.opendc.compute.api.Server import org.opendc.compute.service.driver.Host @@ -79,18 +77,16 @@ public interface ComputeService : AutoCloseable { * * @param context The [CoroutineContext] to use in the service. * @param clock The clock instance to use. - * @param meterProvider The [MeterProvider] for creating a [Meter] for the service. * @param scheduler The scheduler implementation to use. * @param schedulingQuantum The interval between scheduling cycles. */ public operator fun invoke( context: CoroutineContext, clock: Clock, - meterProvider: MeterProvider, scheduler: ComputeScheduler, schedulingQuantum: Duration = Duration.ofMinutes(5), ): ComputeService { - return ComputeServiceImpl(context, clock, meterProvider, scheduler, schedulingQuantum) + return ComputeServiceImpl(context, clock, scheduler, schedulingQuantum) } } } diff --git a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/GuestSystemStats.kt b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/GuestSystemStats.kt index b3958473..6fec5175 100644 --- a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/GuestSystemStats.kt +++ b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/GuestSystemStats.kt @@ -35,5 +35,5 @@ import java.time.Instant public data class GuestSystemStats( val uptime: Duration, val downtime: Duration, - val bootTime: Instant + val bootTime: Instant? ) diff --git a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/HostSystemStats.kt b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/HostSystemStats.kt index 1c07023f..9d34a5ce 100644 --- a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/HostSystemStats.kt +++ b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/driver/telemetry/HostSystemStats.kt @@ -41,7 +41,7 @@ import java.time.Instant public data class HostSystemStats( val uptime: Duration, val downtime: Duration, - val bootTime: Instant, + val bootTime: Instant?, val powerUsage: Double, val energyUsage: Double, val guestsTerminated: Int, diff --git a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/ComputeServiceImpl.kt b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/ComputeServiceImpl.kt index e8664e5c..21aaa19e 100644 --- a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/ComputeServiceImpl.kt +++ b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/ComputeServiceImpl.kt @@ -22,11 +22,6 @@ package org.opendc.compute.service.internal -import io.opentelemetry.api.common.AttributeKey -import io.opentelemetry.api.common.Attributes -import io.opentelemetry.api.metrics.Meter -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.api.metrics.ObservableLongMeasurement import kotlinx.coroutines.* import mu.KotlinLogging import org.opendc.common.util.Pacer @@ -49,14 +44,12 @@ import kotlin.math.max * * @param context The [CoroutineContext] to use in the service. * @param clock The clock instance to use. - * @param meterProvider The [MeterProvider] for creating a [Meter] for the service. * @param scheduler The scheduler implementation to use. * @param schedulingQuantum The interval between scheduling cycles. */ internal class ComputeServiceImpl( private val context: CoroutineContext, private val clock: Clock, - meterProvider: MeterProvider, private val scheduler: ComputeScheduler, schedulingQuantum: Duration ) : ComputeService, HostListener { @@ -70,11 +63,6 @@ internal class ComputeServiceImpl( */ private val logger = KotlinLogging.logger {} - /** - * The [Meter] to track metrics of the [ComputeService]. - */ - private val meter = meterProvider.get("org.opendc.compute.service") - /** * The [Random] instance used to generate unique identifiers for the objects. */ @@ -117,72 +105,20 @@ internal class ComputeServiceImpl( private var maxCores = 0 private var maxMemory = 0L - - /** - * The number of scheduling attempts. - */ - private val _schedulingAttempts = meter.counterBuilder("scheduler.attempts") - .setDescription("Number of scheduling attempts") - .setUnit("1") - .build() - private val _schedulingAttemptsSuccessAttr = Attributes.of(AttributeKey.stringKey("result"), "success") - private val _schedulingAttemptsFailureAttr = Attributes.of(AttributeKey.stringKey("result"), "failure") - private val _schedulingAttemptsErrorAttr = Attributes.of(AttributeKey.stringKey("result"), "error") private var _attemptsSuccess = 0L private var _attemptsFailure = 0L private var _attemptsError = 0L - - /** - * The response time of the service. - */ - private val _schedulingLatency = meter.histogramBuilder("scheduler.latency") - .setDescription("End to end latency for a server to be scheduled (in multiple attempts)") - .ofLongs() - .setUnit("ms") - .build() - - /** - * The number of servers that are pending. - */ - private val _servers = meter.upDownCounterBuilder("scheduler.servers") - .setDescription("Number of servers managed by the scheduler") - .setUnit("1") - .build() - private val _serversPendingAttr = Attributes.of(AttributeKey.stringKey("state"), "pending") - private val _serversActiveAttr = Attributes.of(AttributeKey.stringKey("state"), "active") private var _serversPending = 0 private var _serversActive = 0 /** * The [Pacer] to use for scheduling the scheduler cycles. */ - private val pacer = Pacer(scope.coroutineContext, clock, schedulingQuantum.toMillis(), ::doSchedule) + private val pacer = Pacer(scope.coroutineContext, clock, schedulingQuantum.toMillis()) { doSchedule() } override val hosts: Set get() = hostToView.keys - init { - val upState = Attributes.of(AttributeKey.stringKey("state"), "up") - val downState = Attributes.of(AttributeKey.stringKey("state"), "down") - - meter.upDownCounterBuilder("scheduler.hosts") - .setDescription("Number of hosts registered with the scheduler") - .setUnit("1") - .buildWithCallback { result -> - val total = hosts.size - val available = availableHosts.size.toLong() - - result.record(available, upState) - result.record(total - available, downState) - } - - meter.gaugeBuilder("system.time.provision") - .setDescription("The most recent timestamp where the server entered a provisioned state") - .setUnit("1") - .ofLongs() - .buildWithCallback(::collectProvisionTime) - } - override fun newClient(): ComputeClient { check(scope.isActive) { "Service is already closed" } return object : ComputeClient { @@ -355,7 +291,6 @@ internal class ComputeServiceImpl( server.launchedAt = Instant.ofEpochMilli(now) queue.add(request) _serversPending++ - _servers.add(1, _serversPendingAttr) requestSchedulingCycle() return request } @@ -387,14 +322,13 @@ internal class ComputeServiceImpl( /** * Run a single scheduling iteration. */ - private fun doSchedule(now: Long) { + private fun doSchedule() { while (queue.isNotEmpty()) { val request = queue.peek() if (request.isCancelled) { queue.poll() _serversPending-- - _servers.add(-1, _serversPendingAttr) continue } @@ -407,9 +341,7 @@ internal class ComputeServiceImpl( // Remove the incoming image queue.poll() _serversPending-- - _servers.add(-1, _serversPendingAttr) _attemptsFailure++ - _schedulingAttempts.add(1, _schedulingAttemptsFailureAttr) logger.warn { "Failed to spawn $server: does not fit [${clock.instant()}]" } @@ -425,8 +357,6 @@ internal class ComputeServiceImpl( // Remove request from queue queue.poll() _serversPending-- - _servers.add(-1, _serversPendingAttr) - _schedulingLatency.record(now - request.submitTime, server.attributes) logger.info { "Assigned server $server to host $host." } @@ -442,10 +372,8 @@ internal class ComputeServiceImpl( host.spawn(server) activeServers[server] = host - _servers.add(1, _serversActiveAttr) _serversActive++ _attemptsSuccess++ - _schedulingAttempts.add(1, _schedulingAttemptsSuccessAttr) } catch (e: Throwable) { logger.error(e) { "Failed to deploy VM" } @@ -454,7 +382,6 @@ internal class ComputeServiceImpl( hv.availableMemory += server.flavor.memorySize _attemptsError++ - _schedulingAttempts.add(1, _schedulingAttemptsErrorAttr) } } } @@ -511,7 +438,6 @@ internal class ComputeServiceImpl( if (activeServers.remove(server) != null) { _serversActive-- - _servers.add(-1, _serversActiveAttr) } val hv = hostToView[host] @@ -527,14 +453,4 @@ internal class ComputeServiceImpl( requestSchedulingCycle() } } - - /** - * Collect the timestamp when each server entered its provisioning state most recently. - */ - private fun collectProvisionTime(result: ObservableLongMeasurement) { - for ((_, server) in servers) { - val launchedAt = server.launchedAt ?: continue - result.record(launchedAt.toEpochMilli(), server.attributes) - } - } } diff --git a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/InternalServer.kt b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/InternalServer.kt index d2a2d896..f9da24d8 100644 --- a/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/InternalServer.kt +++ b/opendc-compute/opendc-compute-service/src/main/kotlin/org/opendc/compute/service/internal/InternalServer.kt @@ -22,9 +22,6 @@ package org.opendc.compute.service.internal -import io.opentelemetry.api.common.AttributeKey -import io.opentelemetry.api.common.Attributes -import io.opentelemetry.semconv.resource.attributes.ResourceAttributes import mu.KotlinLogging import org.opendc.compute.api.* import org.opendc.compute.service.driver.Host @@ -53,21 +50,6 @@ internal class InternalServer( */ private val watchers = mutableListOf() - /** - * The attributes of a server. - */ - @JvmField internal val attributes: Attributes = Attributes.builder() - .put(ResourceAttributes.HOST_NAME, name) - .put(ResourceAttributes.HOST_ID, uid.toString()) - .put(ResourceAttributes.HOST_TYPE, flavor.name) - .put(AttributeKey.longKey("host.num_cpus"), flavor.cpuCount.toLong()) - .put(AttributeKey.longKey("host.mem_capacity"), flavor.memorySize) - .put(AttributeKey.stringArrayKey("host.labels"), labels.map { (k, v) -> "$k:$v" }) - .put(ResourceAttributes.HOST_ARCH, ResourceAttributes.HostArchValues.AMD64) - .put(ResourceAttributes.HOST_IMAGE_NAME, image.name) - .put(ResourceAttributes.HOST_IMAGE_ID, image.uid.toString()) - .build() - /** * The [Host] that has been assigned to host the server. */ diff --git a/opendc-compute/opendc-compute-service/src/test/kotlin/org/opendc/compute/service/ComputeServiceTest.kt b/opendc-compute/opendc-compute-service/src/test/kotlin/org/opendc/compute/service/ComputeServiceTest.kt index eb106817..cc7be4a8 100644 --- a/opendc-compute/opendc-compute-service/src/test/kotlin/org/opendc/compute/service/ComputeServiceTest.kt +++ b/opendc-compute/opendc-compute-service/src/test/kotlin/org/opendc/compute/service/ComputeServiceTest.kt @@ -23,7 +23,6 @@ package org.opendc.compute.service import io.mockk.* -import io.opentelemetry.api.metrics.MeterProvider import kotlinx.coroutines.delay import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Assertions.assertNull @@ -59,7 +58,7 @@ internal class ComputeServiceTest { filters = listOf(ComputeFilter(), VCpuFilter(allocationRatio = 1.0), RamFilter(allocationRatio = 1.0)), weighers = listOf(RamWeigher()) ) - service = ComputeService(scope.coroutineContext, clock, MeterProvider.noop(), computeScheduler) + service = ComputeService(scope.coroutineContext, clock, computeScheduler) } @Test -- cgit v1.2.3