From 67920d2e83658b92a39e25956999c2ed61738ade Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 3 May 2022 14:39:57 +0200 Subject: refactor(compute): Expose CPU and system stats via Host interface This change updates the `Host` interface to directly expose CPU and system stats to be used by components that interface with the `Host` interface. Previously, this would require the user to interact with the OpenTelemetry SDK. Although that is still possible for more advanced use cases, users can now call the new `getSystemStats()` and `getCpuStats()` methods to easily access common host and guest statistics. --- .../kotlin/org/opendc/compute/simulator/SimHost.kt | 90 +++++++++++++--- .../org/opendc/compute/simulator/internal/Guest.kt | 47 +++++++- .../org/opendc/compute/simulator/SimHostTest.kt | 120 +++++---------------- 3 files changed, 147 insertions(+), 110 deletions(-) (limited to 'opendc-compute/opendc-compute-simulator') diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt index 4eb6392e..323ae4fe 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt @@ -29,11 +29,14 @@ import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.api.metrics.ObservableDoubleMeasurement import io.opentelemetry.api.metrics.ObservableLongMeasurement import kotlinx.coroutines.* -import mu.KotlinLogging import org.opendc.compute.api.Flavor import org.opendc.compute.api.Server import org.opendc.compute.api.ServerState import org.opendc.compute.service.driver.* +import org.opendc.compute.service.driver.telemetry.GuestCpuStats +import org.opendc.compute.service.driver.telemetry.GuestSystemStats +import org.opendc.compute.service.driver.telemetry.HostCpuStats +import 
org.opendc.compute.service.driver.telemetry.HostSystemStats import org.opendc.compute.simulator.internal.Guest import org.opendc.compute.simulator.internal.GuestListener import org.opendc.simulator.compute.* @@ -49,6 +52,8 @@ import org.opendc.simulator.compute.power.PowerDriver import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.compute.workload.SimWorkload import org.opendc.simulator.flow.FlowEngine +import java.time.Duration +import java.time.Instant import java.util.* import kotlin.coroutines.CoroutineContext @@ -80,11 +85,6 @@ public class SimHost( */ private val clock = engine.clock - /** - * The logger instance of this server. - */ - private val logger = KotlinLogging.logger {} - /** * The [Meter] to track metrics of the simulated host. */ @@ -112,6 +112,9 @@ public class SimHost( private val guests = HashMap() private val _guests = mutableListOf() + override val instances: Set + get() = guests.keys + override val state: HostState get() = _state private var _state: HostState = HostState.DOWN @@ -249,6 +252,68 @@ public class SimHost( machine.cancel() } + override fun getSystemStats(): HostSystemStats { + updateUptime() + + var terminated = 0 + var running = 0 + var error = 0 + var invalid = 0 + + val guests = _guests.listIterator() + for (guest in guests) { + when (guest.state) { + ServerState.TERMINATED -> terminated++ + ServerState.RUNNING -> running++ + ServerState.ERROR -> error++ + ServerState.DELETED -> { + // Remove guests that have been deleted + this.guests.remove(guest.server) + guests.remove() + } + else -> invalid++ + } + } + + return HostSystemStats( + Duration.ofMillis(_uptime), + Duration.ofMillis(_downtime), + Instant.ofEpochMilli(_bootTime), + machine.powerUsage, + machine.energyUsage, + terminated, + running, + error, + invalid + ) + } + + override fun getSystemStats(server: Server): GuestSystemStats { + val guest = requireNotNull(guests[server]) { "Unknown server ${server.uid} at host $uid" } + return 
guest.getSystemStats() + } + + override fun getCpuStats(): HostCpuStats { + val counters = hypervisor.counters + counters.flush() + + return HostCpuStats( + counters.cpuActiveTime / 1000L, + counters.cpuIdleTime / 1000L, + counters.cpuStealTime / 1000L, + counters.cpuLostTime / 1000L, + hypervisor.cpuCapacity, + hypervisor.cpuDemand, + hypervisor.cpuUsage, + hypervisor.cpuUsage / _cpuLimit + ) + } + + override fun getCpuStats(server: Server): GuestCpuStats { + val guest = requireNotNull(guests[server]) { "Unknown server ${server.uid} at host $uid" } + return guest.getCpuStats() + } + override fun hashCode(): Int = uid.hashCode() override fun equals(other: Any?): Boolean { @@ -417,13 +482,12 @@ public class SimHost( * Helper function to track the CPU time of a machine. */ private fun collectCpuTime(result: ObservableLongMeasurement) { - val counters = hypervisor.counters - counters.flush() + val stats = getCpuStats() - result.record(counters.cpuActiveTime / 1000L, _activeState) - result.record(counters.cpuIdleTime / 1000L, _idleState) - result.record(counters.cpuStealTime / 1000L, _stealState) - result.record(counters.cpuLostTime / 1000L, _lostState) + result.record(stats.activeTime, _activeState) + result.record(stats.idleTime, _idleState) + result.record(stats.stealTime, _stealState) + result.record(stats.lostTime, _lostState) val guests = _guests for (i in guests.indices) { @@ -450,7 +514,7 @@ public class SimHost( val guests = _guests for (i in guests.indices) { - guests[i].updateUptime(duration) + guests[i].updateUptime() } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt index bb378ee3..0d4c550d 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt +++ 
b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt @@ -32,6 +32,8 @@ import kotlinx.coroutines.* import mu.KotlinLogging import org.opendc.compute.api.Server import org.opendc.compute.api.ServerState +import org.opendc.compute.service.driver.telemetry.GuestCpuStats +import org.opendc.compute.service.driver.telemetry.GuestSystemStats import org.opendc.compute.simulator.SimHost import org.opendc.compute.simulator.SimWorkloadMapper import org.opendc.simulator.compute.kernel.SimHypervisor @@ -39,6 +41,8 @@ import org.opendc.simulator.compute.kernel.SimVirtualMachine import org.opendc.simulator.compute.runWorkload import org.opendc.simulator.compute.workload.SimWorkload import java.time.Clock +import java.time.Duration +import java.time.Instant import kotlin.coroutines.CoroutineContext /** @@ -145,6 +149,37 @@ internal class Guest( doStart() } + /** + * Obtain the system statistics of this guest. + */ + fun getSystemStats(): GuestSystemStats { + updateUptime() + + return GuestSystemStats( + Duration.ofMillis(_uptime), + Duration.ofMillis(_downtime), + Instant.ofEpochMilli(_bootTime) + ) + } + + /** + * Obtain the CPU statistics of this guest. + */ + fun getCpuStats(): GuestCpuStats { + val counters = machine.counters + counters.flush() + + return GuestCpuStats( + counters.cpuActiveTime / 1000L, + counters.cpuIdleTime / 1000L, + counters.cpuStealTime / 1000L, + counters.cpuLostTime / 1000L, + machine.cpuCapacity, + machine.cpuUsage, + machine.cpuUsage / _cpuLimit + ) + } + /** * The [Job] representing the current active virtual machine instance or `null` if no virtual machine is active. */ @@ -209,6 +244,8 @@ internal class Guest( * This method is invoked when the guest stopped. 
*/ private fun onStop(target: ServerState) { + updateUptime() + state = target listener.onStop(this) } @@ -224,10 +261,16 @@ internal class Guest( .put(STATE_KEY, "down") .build() + private var _lastReport = clock.millis() + /** * Helper function to track the uptime and downtime of the guest. */ - fun updateUptime(duration: Long) { + fun updateUptime() { + val now = clock.millis() + val duration = now - _lastReport + _lastReport = now + if (state == ServerState.RUNNING) { _uptime += duration } else if (state == ServerState.ERROR) { @@ -239,6 +282,8 @@ internal class Guest( * Helper function to track the uptime of the guest. */ fun collectUptime(result: ObservableLongMeasurement) { + updateUptime() + result.record(_uptime, _upState) result.record(_downtime, _downState) } diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt index f0325023..67689dc8 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt @@ -22,8 +22,7 @@ package org.opendc.compute.simulator -import io.opentelemetry.sdk.metrics.SdkMeterProvider -import io.opentelemetry.sdk.resources.Resource +import io.opentelemetry.api.metrics.MeterProvider import kotlinx.coroutines.* import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.BeforeEach @@ -42,13 +41,6 @@ import org.opendc.simulator.compute.workload.SimTraceFragment import org.opendc.simulator.compute.workload.SimTraceWorkload import org.opendc.simulator.core.runBlockingSimulation import org.opendc.simulator.flow.FlowEngine -import org.opendc.telemetry.compute.ComputeMetricExporter -import org.opendc.telemetry.compute.HOST_ID -import org.opendc.telemetry.compute.table.HostTableReader -import 
org.opendc.telemetry.compute.table.ServerTableReader -import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader -import org.opendc.telemetry.sdk.toOtelClock -import java.time.Duration import java.util.* import kotlin.coroutines.resume @@ -73,45 +65,16 @@ internal class SimHostTest { */ @Test fun testOvercommitted() = runBlockingSimulation { - var idleTime = 0L - var activeTime = 0L - var stealTime = 0L - - val hostId = UUID.randomUUID() - val hostResource = Resource.builder() - .put(HOST_ID, hostId.toString()) - .build() - - // Setup metric reader val duration = 5 * 60L - val reader = CoroutineMetricReader( - this, - object : ComputeMetricExporter() { - override fun record(reader: HostTableReader) { - activeTime += reader.cpuActiveTime - idleTime += reader.cpuIdleTime - stealTime += reader.cpuStealTime - } - }, - exportInterval = Duration.ofSeconds(duration) - ) - - val meterProvider = SdkMeterProvider - .builder() - .setResource(hostResource) - .setClock(clock.toOtelClock()) - .registerMetricReader(reader) - .build() - val engine = FlowEngine(coroutineContext, clock) - val virtDriver = SimHost( - uid = hostId, + val host = SimHost( + uid = UUID.randomUUID(), name = "test", model = machineModel, meta = emptyMap(), coroutineContext, engine, - meterProvider, + MeterProvider.noop(), SimFairShareHypervisorProvider() ) val vmImageA = MockImage( @@ -150,11 +113,11 @@ internal class SimHostTest { val flavor = MockFlavor(2, 0) coroutineScope { - launch { virtDriver.spawn(MockServer(UUID.randomUUID(), "a", flavor, vmImageA)) } - launch { virtDriver.spawn(MockServer(UUID.randomUUID(), "b", flavor, vmImageB)) } + launch { host.spawn(MockServer(UUID.randomUUID(), "a", flavor, vmImageA)) } + launch { host.spawn(MockServer(UUID.randomUUID(), "b", flavor, vmImageB)) } suspendCancellableCoroutine { cont -> - virtDriver.addListener(object : HostListener { + host.addListener(object : HostListener { private var finished = 0 override fun onStateChanged(host: Host, server: 
Server, newState: ServerState) { @@ -168,13 +131,14 @@ internal class SimHostTest { // Ensure last cycle is collected delay(1000L * duration) - virtDriver.close() - meterProvider.close() + host.close() + + val cpuStats = host.getCpuStats() assertAll( - { assertEquals(658, activeTime, "Active time does not match") }, - { assertEquals(2341, idleTime, "Idle time does not match") }, - { assertEquals(637, stealTime, "Steal time does not match") }, + { assertEquals(658, cpuStats.activeTime, "Active time does not match") }, + { assertEquals(2341, cpuStats.idleTime, "Idle time does not match") }, + { assertEquals(637, cpuStats.stealTime, "Steal time does not match") }, { assertEquals(1500001, clock.millis()) } ) } @@ -184,54 +148,16 @@ internal class SimHostTest { */ @Test fun testFailure() = runBlockingSimulation { - var activeTime = 0L - var idleTime = 0L - var uptime = 0L - var downtime = 0L - var guestUptime = 0L - var guestDowntime = 0L - - val hostId = UUID.randomUUID() - val hostResource = Resource.builder() - .put(HOST_ID, hostId.toString()) - .build() - - // Setup metric reader val duration = 5 * 60L - val reader = CoroutineMetricReader( - this, - object : ComputeMetricExporter() { - override fun record(reader: HostTableReader) { - activeTime += reader.cpuActiveTime - idleTime += reader.cpuIdleTime - uptime += reader.uptime - downtime += reader.downtime - } - - override fun record(reader: ServerTableReader) { - guestUptime += reader.uptime - guestDowntime += reader.downtime - } - }, - exportInterval = Duration.ofSeconds(duration) - ) - - val meterProvider = SdkMeterProvider - .builder() - .setResource(hostResource) - .setClock(clock.toOtelClock()) - .registerMetricReader(reader) - .build() - val engine = FlowEngine(coroutineContext, clock) val host = SimHost( - uid = hostId, + uid = UUID.randomUUID(), name = "test", model = machineModel, meta = emptyMap(), coroutineContext, engine, - meterProvider, + MeterProvider.noop(), SimFairShareHypervisorProvider() ) val 
image = MockImage( @@ -275,15 +201,17 @@ internal class SimHostTest { // Ensure last cycle is collected delay(1000L * duration) - meterProvider.close() + val cpuStats = host.getCpuStats() + val sysStats = host.getSystemStats() + val guestSysStats = host.getSystemStats(server) assertAll( - { assertEquals(1775, idleTime, "Idle time does not match") }, - { assertEquals(624, activeTime, "Active time does not match") }, - { assertEquals(900001, uptime, "Uptime does not match") }, - { assertEquals(300000, downtime, "Downtime does not match") }, - { assertEquals(900000, guestUptime, "Guest uptime does not match") }, - { assertEquals(300000, guestDowntime, "Guest downtime does not match") }, + { assertEquals(1775, cpuStats.idleTime, "Idle time does not match") }, + { assertEquals(624, cpuStats.activeTime, "Active time does not match") }, + { assertEquals(900001, sysStats.uptime.toMillis(), "Uptime does not match") }, + { assertEquals(300000, sysStats.downtime.toMillis(), "Downtime does not match") }, + { assertEquals(900001, guestSysStats.uptime.toMillis(), "Guest uptime does not match") }, + { assertEquals(300000, guestSysStats.downtime.toMillis(), "Guest downtime does not match") }, ) } -- cgit v1.2.3 From 564911a2458b3c54834d5cbfed91f502e9856566 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 4 May 2022 14:43:17 +0200 Subject: refactor(compute): Directly expose scheduler stats to user This change updates the `ComputeService` interface to directly expose statistics about the scheduler to the user, such that they do not necessarily have to interact with OpenTelemetry to obtain these values. 
--- .../src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt | 3 +++ 1 file changed, 3 insertions(+) (limited to 'opendc-compute/opendc-compute-simulator') diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt index 67689dc8..fd54ad1d 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt @@ -41,6 +41,7 @@ import org.opendc.simulator.compute.workload.SimTraceFragment import org.opendc.simulator.compute.workload.SimTraceWorkload import org.opendc.simulator.core.runBlockingSimulation import org.opendc.simulator.flow.FlowEngine +import java.time.Instant import java.util.* import kotlin.coroutines.resume @@ -260,6 +261,8 @@ internal class SimHostTest { override val state: ServerState = ServerState.TERMINATED + override val launchedAt: Instant? = null + override suspend fun start() {} override suspend fun stop() {} -- cgit v1.2.3