From c7eec7904e08029b3ab31d3e7b21afa1ea9ab7e6 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 4 May 2022 16:24:53 +0200 Subject: refactor(compute/service): Remove OpenTelemetry from "compute" modules This change removes the OpenTelemetry integration from the OpenDC Compute modules. Previously, we chose to integrate OpenTelemetry to provide a unified way to report metrics to the users. Although this worked as expected, the overhead of OpenTelemetry when collecting metrics during simulation was considerable, and it offered few further optimization opportunities (other than providing a separate API implementation). Furthermore, since we were tied to OpenTelemetry's SDK implementation, we experienced issues with throttling and registering multiple instruments. We will instead use another approach, where we expose the core metrics in OpenDC via specialized interfaces (see the preceding commits) such that access is fast and can be done without having to interface with OpenTelemetry. In addition, we will provide an adapter that is able to forward these metrics to OpenTelemetry implementations, so we can still integrate with the wider ecosystem. 
--- .../opendc-compute-simulator/build.gradle.kts | 3 - .../kotlin/org/opendc/compute/simulator/SimHost.kt | 171 +-------------------- .../org/opendc/compute/simulator/internal/Guest.kt | 144 +---------------- .../org/opendc/compute/simulator/SimHostTest.kt | 3 - 4 files changed, 10 insertions(+), 311 deletions(-) (limited to 'opendc-compute/opendc-compute-simulator') diff --git a/opendc-compute/opendc-compute-simulator/build.gradle.kts b/opendc-compute/opendc-compute-simulator/build.gradle.kts index e81d87ec..72962147 100644 --- a/opendc-compute/opendc-compute-simulator/build.gradle.kts +++ b/opendc-compute/opendc-compute-simulator/build.gradle.kts @@ -32,11 +32,8 @@ dependencies { api(projects.opendcSimulator.opendcSimulatorCompute) api(libs.commons.math3) implementation(projects.opendcCommon) - implementation(libs.opentelemetry.semconv) implementation(libs.kotlin.logging) testImplementation(projects.opendcSimulator.opendcSimulatorCore) - testImplementation(projects.opendcTelemetry.opendcTelemetrySdk) - testImplementation(projects.opendcTelemetry.opendcTelemetryCompute) testRuntimeOnly(libs.slf4j.simple) } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt index 323ae4fe..c28239b4 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt @@ -22,12 +22,6 @@ package org.opendc.compute.simulator -import io.opentelemetry.api.common.AttributeKey -import io.opentelemetry.api.common.Attributes -import io.opentelemetry.api.metrics.Meter -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.api.metrics.ObservableDoubleMeasurement -import io.opentelemetry.api.metrics.ObservableLongMeasurement import kotlinx.coroutines.* import 
org.opendc.compute.api.Flavor import org.opendc.compute.api.Server @@ -67,7 +61,6 @@ public class SimHost( override val meta: Map, context: CoroutineContext, engine: FlowEngine, - meterProvider: MeterProvider, hypervisorProvider: SimHypervisorProvider, scalingGovernor: ScalingGovernor = PerformanceScalingGovernor(), powerDriver: PowerDriver = SimplePowerDriver(ConstantPowerModel(0.0)), @@ -85,11 +78,6 @@ public class SimHost( */ private val clock = engine.clock - /** - * The [Meter] to track metrics of the simulated host. - */ - private val meter = meterProvider.get("org.opendc.compute.simulator") - /** * The event listeners registered with this host. */ @@ -142,48 +130,6 @@ public class SimHost( init { launch() - - meter.upDownCounterBuilder("system.guests") - .setDescription("Number of guests on this host") - .setUnit("1") - .buildWithCallback(::collectGuests) - meter.gaugeBuilder("system.cpu.limit") - .setDescription("Amount of CPU resources available to the host") - .buildWithCallback(::collectCpuLimit) - meter.gaugeBuilder("system.cpu.demand") - .setDescription("Amount of CPU resources the guests would use if there were no CPU contention or CPU limits") - .setUnit("MHz") - .buildWithCallback { result -> result.record(hypervisor.cpuDemand) } - meter.gaugeBuilder("system.cpu.usage") - .setDescription("Amount of CPU resources used by the host") - .setUnit("MHz") - .buildWithCallback { result -> result.record(hypervisor.cpuUsage) } - meter.gaugeBuilder("system.cpu.utilization") - .setDescription("Utilization of the CPU resources of the host") - .setUnit("%") - .buildWithCallback { result -> result.record(hypervisor.cpuUsage / _cpuLimit) } - meter.counterBuilder("system.cpu.time") - .setDescription("Amount of CPU time spent by the host") - .setUnit("s") - .buildWithCallback(::collectCpuTime) - meter.gaugeBuilder("system.power.usage") - .setDescription("Power usage of the host ") - .setUnit("W") - .buildWithCallback { result -> result.record(machine.powerUsage) } - 
meter.counterBuilder("system.power.total") - .setDescription("Amount of energy used by the CPU") - .setUnit("J") - .ofDoubles() - .buildWithCallback { result -> result.record(machine.energyUsage) } - meter.counterBuilder("system.time") - .setDescription("The uptime of the host") - .setUnit("s") - .buildWithCallback(::collectUptime) - meter.gaugeBuilder("system.time.boot") - .setDescription("The boot time of the host") - .setUnit("1") - .ofLongs() - .buildWithCallback(::collectBootTime) } override fun canFit(server: Server): Boolean { @@ -278,7 +224,7 @@ public class SimHost( return HostSystemStats( Duration.ofMillis(_uptime), Duration.ofMillis(_downtime), - Instant.ofEpochMilli(_bootTime), + _bootTime, machine.powerUsage, machine.energyUsage, terminated, @@ -358,7 +304,7 @@ public class SimHost( _ctx = machine.startWorkload(object : SimWorkload { override fun onStart(ctx: SimMachineContext) { try { - _bootTime = clock.millis() + _bootTime = clock.instant() _state = HostState.UP hypervisor.onStart(ctx) } catch (cause: Throwable) { @@ -422,80 +368,11 @@ public class SimHost( return MachineModel(processingUnits, memoryUnits) } - private val STATE_KEY = AttributeKey.stringKey("state") - - private val terminatedState = Attributes.of(STATE_KEY, "terminated") - private val runningState = Attributes.of(STATE_KEY, "running") - private val errorState = Attributes.of(STATE_KEY, "error") - private val invalidState = Attributes.of(STATE_KEY, "invalid") - - /** - * Helper function to collect the guest counts on this host. 
- */ - private fun collectGuests(result: ObservableLongMeasurement) { - var terminated = 0L - var running = 0L - var error = 0L - var invalid = 0L - - val guests = _guests.listIterator() - for (guest in guests) { - when (guest.state) { - ServerState.TERMINATED -> terminated++ - ServerState.RUNNING -> running++ - ServerState.ERROR -> error++ - ServerState.DELETED -> { - // Remove guests that have been deleted - this.guests.remove(guest.server) - guests.remove() - } - else -> invalid++ - } - } - - result.record(terminated, terminatedState) - result.record(running, runningState) - result.record(error, errorState) - result.record(invalid, invalidState) - } - - private val _cpuLimit = machine.model.cpus.sumOf { it.frequency } - - /** - * Helper function to collect the CPU limits of a machine. - */ - private fun collectCpuLimit(result: ObservableDoubleMeasurement) { - result.record(_cpuLimit) - - val guests = _guests - for (i in guests.indices) { - guests[i].collectCpuLimit(result) - } - } - - private val _activeState = Attributes.of(STATE_KEY, "active") - private val _stealState = Attributes.of(STATE_KEY, "steal") - private val _lostState = Attributes.of(STATE_KEY, "lost") - private val _idleState = Attributes.of(STATE_KEY, "idle") - - /** - * Helper function to track the CPU time of a machine. - */ - private fun collectCpuTime(result: ObservableLongMeasurement) { - val stats = getCpuStats() - - result.record(stats.activeTime, _activeState) - result.record(stats.idleTime, _idleState) - result.record(stats.stealTime, _stealState) - result.record(stats.lostTime, _lostState) - - val guests = _guests - for (i in guests.indices) { - guests[i].collectCpuTime(result) - } - } - private var _lastReport = clock.millis() + private var _uptime = 0L + private var _downtime = 0L + private var _bootTime: Instant? = null + private val _cpuLimit = machine.model.cpus.sumOf { it.frequency } /** * Helper function to track the uptime of a machine. 
@@ -517,40 +394,4 @@ public class SimHost( guests[i].updateUptime() } } - - private var _uptime = 0L - private var _downtime = 0L - private val _upState = Attributes.of(STATE_KEY, "up") - private val _downState = Attributes.of(STATE_KEY, "down") - - /** - * Helper function to track the uptime of a machine. - */ - private fun collectUptime(result: ObservableLongMeasurement) { - updateUptime() - - result.record(_uptime, _upState) - result.record(_downtime, _downState) - - val guests = _guests - for (i in guests.indices) { - guests[i].collectUptime(result) - } - } - - private var _bootTime = Long.MIN_VALUE - - /** - * Helper function to track the boot time of a machine. - */ - private fun collectBootTime(result: ObservableLongMeasurement) { - if (_bootTime != Long.MIN_VALUE) { - result.record(_bootTime) - } - - val guests = _guests - for (i in guests.indices) { - guests[i].collectBootTime(result) - } - } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt index 0d4c550d..ea3c6549 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt @@ -22,12 +22,6 @@ package org.opendc.compute.simulator.internal -import io.opentelemetry.api.common.AttributeKey -import io.opentelemetry.api.common.Attributes -import io.opentelemetry.api.common.AttributesBuilder -import io.opentelemetry.api.metrics.ObservableDoubleMeasurement -import io.opentelemetry.api.metrics.ObservableLongMeasurement -import io.opentelemetry.semconv.resource.attributes.ResourceAttributes import kotlinx.coroutines.* import mu.KotlinLogging import org.opendc.compute.api.Server @@ -76,11 +70,6 @@ internal class Guest( */ var state: ServerState = ServerState.TERMINATED - /** - * The 
attributes of the guest. - */ - val attributes: Attributes = GuestAttributes(this) - /** * Start the guest. */ @@ -158,7 +147,7 @@ internal class Guest( return GuestSystemStats( Duration.ofMillis(_uptime), Duration.ofMillis(_downtime), - Instant.ofEpochMilli(_bootTime) + _bootTime ) } @@ -235,7 +224,7 @@ internal class Guest( * This method is invoked when the guest was started on the host and has booted into a running state. */ private fun onStart() { - _bootTime = clock.millis() + _bootTime = clock.instant() state = ServerState.RUNNING listener.onStart(this) } @@ -250,18 +239,11 @@ internal class Guest( listener.onStop(this) } - private val STATE_KEY = AttributeKey.stringKey("state") - private var _uptime = 0L private var _downtime = 0L - private val _upState = attributes.toBuilder() - .put(STATE_KEY, "up") - .build() - private val _downState = attributes.toBuilder() - .put(STATE_KEY, "down") - .build() - private var _lastReport = clock.millis() + private var _bootTime: Instant? = null + private val _cpuLimit = machine.model.cpus.sumOf { it.frequency } /** * Helper function to track the uptime and downtime of the guest. @@ -277,122 +259,4 @@ internal class Guest( _downtime += duration } } - - /** - * Helper function to track the uptime of the guest. - */ - fun collectUptime(result: ObservableLongMeasurement) { - updateUptime() - - result.record(_uptime, _upState) - result.record(_downtime, _downState) - } - - private var _bootTime = Long.MIN_VALUE - - /** - * Helper function to track the boot time of the guest. 
- */ - fun collectBootTime(result: ObservableLongMeasurement) { - if (_bootTime != Long.MIN_VALUE) { - result.record(_bootTime, attributes) - } - } - - private val _activeState = attributes.toBuilder() - .put(STATE_KEY, "active") - .build() - private val _stealState = attributes.toBuilder() - .put(STATE_KEY, "steal") - .build() - private val _lostState = attributes.toBuilder() - .put(STATE_KEY, "lost") - .build() - private val _idleState = attributes.toBuilder() - .put(STATE_KEY, "idle") - .build() - - /** - * Helper function to track the CPU time of a machine. - */ - fun collectCpuTime(result: ObservableLongMeasurement) { - val counters = machine.counters - counters.flush() - - result.record(counters.cpuActiveTime / 1000, _activeState) - result.record(counters.cpuIdleTime / 1000, _idleState) - result.record(counters.cpuStealTime / 1000, _stealState) - result.record(counters.cpuLostTime / 1000, _lostState) - } - - private val _cpuLimit = machine.model.cpus.sumOf { it.frequency } - - /** - * Helper function to collect the CPU limits of a machine. - */ - fun collectCpuLimit(result: ObservableDoubleMeasurement) { - result.record(_cpuLimit, attributes) - } - - /** - * An optimized [Attributes] implementation. - */ - private class GuestAttributes(private val uid: String, private val attributes: Attributes) : Attributes by attributes { - /** - * Construct a [GuestAttributes] instance from a [Guest]. 
- */ - constructor(guest: Guest) : this( - guest.server.uid.toString(), - Attributes.builder() - .put(ResourceAttributes.HOST_NAME, guest.server.name) - .put(ResourceAttributes.HOST_ID, guest.server.uid.toString()) - .put(ResourceAttributes.HOST_TYPE, guest.server.flavor.name) - .put(AttributeKey.longKey("host.num_cpus"), guest.server.flavor.cpuCount.toLong()) - .put(AttributeKey.longKey("host.mem_capacity"), guest.server.flavor.memorySize) - .put(AttributeKey.stringArrayKey("host.labels"), guest.server.labels.map { (k, v) -> "$k:$v" }) - .put(ResourceAttributes.HOST_ARCH, ResourceAttributes.HostArchValues.AMD64) - .put(ResourceAttributes.HOST_IMAGE_NAME, guest.server.image.name) - .put(ResourceAttributes.HOST_IMAGE_ID, guest.server.image.uid.toString()) - .build() - ) - - override fun get(key: AttributeKey): T? { - // Optimize access to the HOST_ID key which is accessed quite often - if (key == ResourceAttributes.HOST_ID) { - @Suppress("UNCHECKED_CAST") - return uid as T? - } - return attributes.get(key) - } - - override fun toBuilder(): AttributesBuilder { - val delegate = attributes.toBuilder() - return object : AttributesBuilder { - - override fun putAll(attributes: Attributes): AttributesBuilder { - delegate.putAll(attributes) - return this - } - - override fun put(key: AttributeKey, value: Int): AttributesBuilder { - delegate.put(key, value) - return this - } - - override fun put(key: AttributeKey, value: T): AttributesBuilder { - delegate.put(key, value) - return this - } - - override fun build(): Attributes = GuestAttributes(uid, delegate.build()) - } - } - - override fun equals(other: Any?): Boolean = attributes == other - - // Cache hash code - private val _hash = attributes.hashCode() - - override fun hashCode(): Int = _hash - } } diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt index 
fd54ad1d..5ba4a667 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt @@ -22,7 +22,6 @@ package org.opendc.compute.simulator -import io.opentelemetry.api.metrics.MeterProvider import kotlinx.coroutines.* import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.BeforeEach @@ -75,7 +74,6 @@ internal class SimHostTest { meta = emptyMap(), coroutineContext, engine, - MeterProvider.noop(), SimFairShareHypervisorProvider() ) val vmImageA = MockImage( @@ -158,7 +156,6 @@ internal class SimHostTest { meta = emptyMap(), coroutineContext, engine, - MeterProvider.noop(), SimFairShareHypervisorProvider() ) val image = MockImage( -- cgit v1.2.3