diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-10-01 22:04:35 +0200 |
|---|---|---|
| committer | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-10-03 17:17:41 +0200 |
| commit | 081221684fb826ab5a00c1d8cc5a9886b9e2203c (patch) | |
| tree | 7f2202429256b4cb96812f96b682a021f8236180 /opendc-compute | |
| parent | 6e424e9b44687d01e618e7bc38afc427610cd845 (diff) | |
feat(simulator): Expose CPU time counters directly on hypervisor
This change adds a new interface to the SimHypervisor interface that
exposes the CPU time counters directly. These are derived from the flow
counters and will be used by SimHost to expose them via telemetry.
Diffstat (limited to 'opendc-compute')
3 files changed, 105 insertions, 162 deletions
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt index f499927d..4b96872b 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt @@ -50,7 +50,6 @@ import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.flow.FlowEngine import java.util.* import kotlin.coroutines.CoroutineContext -import kotlin.math.roundToLong /** * A [Host] that is simulates virtual machines on a physical machine using [SimHypervisor]. @@ -63,7 +62,7 @@ public class SimHost( context: CoroutineContext, engine: FlowEngine, meterProvider: MeterProvider, - hypervisor: SimHypervisorProvider, + hypervisorProvider: SimHypervisorProvider, scalingGovernor: ScalingGovernor = PerformanceScalingGovernor(), powerDriver: PowerDriver = SimplePowerDriver(ConstantPowerModel(0.0)), private val mapper: SimWorkloadMapper = SimMetaWorkloadMapper(), @@ -103,29 +102,8 @@ public class SimHost( /** * The hypervisor to run multiple workloads. */ - private val hypervisor: SimHypervisor = hypervisor.create( - engine, - scalingGovernor = scalingGovernor, - interferenceDomain = interferenceDomain, - listener = object : SimHypervisor.Listener { - override fun onSliceFinish( - hypervisor: SimHypervisor, - totalWork: Double, - grantedWork: Double, - overcommittedWork: Double, - interferedWork: Double, - cpuUsage: Double, - cpuDemand: Double - ) { - _cpuDemand = cpuDemand - _cpuUsage = cpuUsage - - collectTime() - } - } - ) - private var _cpuUsage = 0.0 - private var _cpuDemand = 0.0 + private val hypervisor: SimHypervisor = hypervisorProvider + .create(engine, scalingGovernor = scalingGovernor, interferenceDomain = interferenceDomain) /** * The virtual machines running on the hypervisor. @@ -157,22 +135,13 @@ public class SimHost( } } + /** + * The [Job] that represents the machine running the hypervisor. + */ + private var _job: Job? = null + init { - // Launch hypervisor onto machine - scope.launch { - try { - _bootTime = clock.millis() - _state = HostState.UP - machine.run(this@SimHost.hypervisor, emptyMap()) - } catch (_: CancellationException) { - // Ignored - } catch (cause: Throwable) { - logger.error(cause) { "Host failed" } - throw cause - } finally { - _state = HostState.DOWN - } - } + launch() meter.upDownCounterBuilder("system.guests") .setDescription("Number of guests on this host") @@ -184,15 +153,15 @@ public class SimHost( meter.gaugeBuilder("system.cpu.demand") .setDescription("Amount of CPU resources the guests would use if there were no CPU contention or CPU limits") .setUnit("MHz") - .buildWithCallback { result -> result.observe(_cpuDemand) } + .buildWithCallback { result -> result.observe(hypervisor.cpuDemand) } meter.gaugeBuilder("system.cpu.usage") .setDescription("Amount of CPU resources used by the host") .setUnit("MHz") - .buildWithCallback { result -> result.observe(_cpuUsage) } + .buildWithCallback { result -> result.observe(hypervisor.cpuUsage) } meter.gaugeBuilder("system.cpu.utilization") .setDescription("Utilization of the CPU resources of the host") .setUnit("%") - .buildWithCallback { result -> result.observe(_cpuUsage / _cpuLimit) } + .buildWithCallback { result -> result.observe(hypervisor.cpuUsage / _cpuLimit) } meter.counterBuilder("system.cpu.time") .setDescription("Amount of CPU time spent by the host") .setUnit("s") @@ -200,16 +169,16 @@ public class SimHost( meter.gaugeBuilder("system.power.usage") .setDescription("Power usage of the host ") .setUnit("W") - .buildWithCallback { result -> result.observe(machine.powerDraw) } + .buildWithCallback { result -> result.observe(machine.powerUsage) } meter.counterBuilder("system.power.total") .setDescription("Amount of energy used by the CPU") .setUnit("J") .ofDoubles() - .buildWithCallback(::collectPowerTotal) + .buildWithCallback { result -> result.observe(machine.energyUsage) } meter.counterBuilder("system.time") .setDescription("The uptime of the host") .setUnit("s") - .buildWithCallback(::collectTime) + .buildWithCallback(::collectUptime) meter.gaugeBuilder("system.time.boot") .setDescription("The boot time of the host") .setUnit("1") @@ -290,9 +259,12 @@ public class SimHost( } public suspend fun recover() { - collectTime() - _state = HostState.UP - _bootTime = clock.millis() + updateUptime() + + launch() + + // Wait for the hypervisor to launch before recovering the guests + yield() for (guest in guests.values) { guest.recover() @@ -300,14 +272,42 @@ public class SimHost( } /** + * Launch the hypervisor. + */ + private fun launch() { + check(_job == null) { "Concurrent hypervisor running" } + + // Launch hypervisor onto machine + _job = scope.launch { + try { + _bootTime = clock.millis() + _state = HostState.UP + machine.run(hypervisor, emptyMap()) + } catch (_: CancellationException) { + // Ignored + } catch (cause: Throwable) { + logger.error(cause) { "Host failed" } + throw cause + } finally { + _state = HostState.DOWN + } + } + } + + /** * Reset the machine. */ private fun reset() { - collectTime() + updateUptime() + + // Stop the hypervisor + val job = _job + if (job != null) { + job.cancel() + _job = null + } _state = HostState.DOWN - _cpuUsage = 0.0 - _cpuDemand = 0.0 } /** @@ -385,85 +385,46 @@ public class SimHost( } } - private var _lastCpuTimeCallback = clock.millis() - - /** - * Helper function to track the CPU time of a machine. - */ - private fun collectCpuTime(result: ObservableLongMeasurement) { - val now = clock.millis() - val duration = now - _lastCpuTimeCallback - - try { - collectCpuTime(duration, result) - } finally { - _lastCpuTimeCallback = now - } - } - private val _activeState = Attributes.of(STATE_KEY, "active") private val _stealState = Attributes.of(STATE_KEY, "steal") private val _lostState = Attributes.of(STATE_KEY, "lost") private val _idleState = Attributes.of(STATE_KEY, "idle") - private var _totalTime = 0.0 /** * Helper function to track the CPU time of a machine. */ - private fun collectCpuTime(duration: Long, result: ObservableLongMeasurement) { - val coreCount = this.model.cpuCount - val d = coreCount / _cpuLimit - + private fun collectCpuTime(result: ObservableLongMeasurement) { val counters = hypervisor.counters - val grantedWork = counters.actual - val overcommittedWork = counters.overcommit - val interferedWork = counters.interference - _totalTime += (duration / 1000.0) * coreCount - val activeTime = (grantedWork * d).roundToLong() - val idleTime = (_totalTime - grantedWork * d).roundToLong() - val stealTime = (overcommittedWork * d).roundToLong() - val lostTime = (interferedWork * d).roundToLong() - - result.observe(activeTime, _activeState) - result.observe(idleTime, _idleState) - result.observe(stealTime, _stealState) - result.observe(lostTime, _lostState) + result.observe(counters.cpuActiveTime / 1000L, _activeState) + result.observe(counters.cpuIdleTime / 1000L, _idleState) + result.observe(counters.cpuStealTime / 1000L, _stealState) + result.observe(counters.cpuLostTime / 1000L, _lostState) for (guest in guests.values) { - guest.collectCpuTime(duration, result) + guest.collectCpuTime(result) } } - private var _lastPowerCallback = clock.millis() - private var _totalPower = 0.0 - - /** - * Helper function to collect the total power usage of the machine. - */ - private fun collectPowerTotal(result: ObservableDoubleMeasurement) { - val now = clock.millis() - val duration = now - _lastPowerCallback - - _totalPower += duration / 1000.0 * machine.powerDraw - result.observe(_totalPower) - - _lastPowerCallback = now - } - private var _lastReport = clock.millis() /** * Helper function to track the uptime of a machine. */ - private fun collectTime(result: ObservableLongMeasurement? = null) { + private fun updateUptime() { val now = clock.millis() val duration = now - _lastReport + _lastReport = now - try { - collectTime(duration, result) - } finally { - _lastReport = now + if (_state == HostState.UP) { + _uptime += duration + } else if (_state == HostState.DOWN && scope.isActive) { + // Only increment downtime if the machine is in a failure state + _downtime += duration + } + + for (guest in guests.values) { + guest.updateUptime(duration) } } @@ -475,19 +436,14 @@ public class SimHost( /** * Helper function to track the uptime of a machine. */ - private fun collectTime(duration: Long, result: ObservableLongMeasurement? = null) { - if (state == HostState.UP) { - _uptime += duration - } else if (state == HostState.DOWN && scope.isActive) { - // Only increment downtime if the machine is in a failure state - _downtime += duration - } + private fun collectUptime(result: ObservableLongMeasurement) { + updateUptime() - result?.observe(_uptime, _upState) - result?.observe(_downtime, _downState) + result.observe(_uptime, _upState) + result.observe(_downtime, _downState) for (guest in guests.values) { - guest.collectUptime(duration, result) + guest.collectUptime(result) } } @@ -496,9 +452,9 @@ public class SimHost( /** * Helper function to track the boot time of a machine. */ - private fun collectBootTime(result: ObservableLongMeasurement? = null) { + private fun collectBootTime(result: ObservableLongMeasurement) { if (_bootTime != Long.MIN_VALUE) { - result?.observe(_bootTime) + result.observe(_bootTime) } for (guest in guests.values) { diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt index eda76ba0..3ac165c8 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt @@ -33,12 +33,10 @@ import org.opendc.compute.api.Server import org.opendc.compute.api.ServerState import org.opendc.compute.simulator.SimHost import org.opendc.compute.simulator.SimWorkloadMapper -import org.opendc.simulator.compute.SimAbstractMachine -import org.opendc.simulator.compute.SimMachine +import org.opendc.simulator.compute.kernel.SimVirtualMachine import org.opendc.simulator.compute.workload.SimWorkload import java.time.Clock import kotlin.coroutines.CoroutineContext -import kotlin.math.roundToLong /** * A virtual machine instance that is managed by a [SimHost]. @@ -50,7 +48,7 @@ internal class Guest( private val mapper: SimWorkloadMapper, private val listener: GuestListener, val server: Server, - val machine: SimMachine + val machine: SimVirtualMachine ) { /** * The [CoroutineScope] of the guest. @@ -236,17 +234,22 @@ internal class Guest( .build() /** - * Helper function to track the uptime of the guest. + * Helper function to track the uptime and downtime of the guest. */ - fun collectUptime(duration: Long, result: ObservableLongMeasurement? = null) { + fun updateUptime(duration: Long) { if (state == ServerState.RUNNING) { _uptime += duration } else if (state == ServerState.ERROR) { _downtime += duration } + } - result?.observe(_uptime, _upState) - result?.observe(_downtime, _downState) + /** + * Helper function to track the uptime of the guest. + */ + fun collectUptime(result: ObservableLongMeasurement) { + result.observe(_uptime, _upState) + result.observe(_downtime, _downState) } private var _bootTime = Long.MIN_VALUE @@ -254,9 +257,9 @@ internal class Guest( /** * Helper function to track the boot time of the guest. */ - fun collectBootTime(result: ObservableLongMeasurement? = null) { + fun collectBootTime(result: ObservableLongMeasurement) { if (_bootTime != Long.MIN_VALUE) { - result?.observe(_bootTime) + result.observe(_bootTime) } } @@ -276,33 +279,17 @@ internal class Guest( .putAll(attributes) .put(STATE_KEY, "idle") .build() - private var _totalTime = 0.0 /** * Helper function to track the CPU time of a machine. */ - fun collectCpuTime(duration: Long, result: ObservableLongMeasurement) { - val coreCount = server.flavor.cpuCount - val d = coreCount / _cpuLimit - - var grantedWork = 0.0 - var overcommittedWork = 0.0 - - for (cpu in (machine as SimAbstractMachine).cpus) { - val counters = cpu.counters - grantedWork += counters.actual - overcommittedWork += counters.overcommit - } - - _totalTime += (duration / 1000.0) * coreCount - val activeTime = (grantedWork * d).roundToLong() - val idleTime = (_totalTime - grantedWork * d).roundToLong() - val stealTime = (overcommittedWork * d).roundToLong() + fun collectCpuTime(result: ObservableLongMeasurement) { + val counters = machine.counters - result.observe(activeTime, _activeState) - result.observe(idleTime, _idleState) - result.observe(stealTime, _stealState) - result.observe(0, _lostState) + result.observe(counters.cpuActiveTime / 1000, _activeState) + result.observe(counters.cpuIdleTime / 1000, _idleState) + result.observe(counters.cpuStealTime / 1000, _stealState) + result.observe(counters.cpuLostTime / 1000, _lostState) } private val _cpuLimit = machine.model.cpus.sumOf { it.frequency } diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt index d2293be7..26089b6d 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt @@ -170,9 +170,9 @@ internal class SimHostTest { reader.close() assertAll( - { assertEquals(659, activeTime, "Active time does not match") }, - { assertEquals(2342, idleTime, "Idle time does not match") }, - { assertEquals(638, stealTime, "Steal time does not match") }, + { assertEquals(658, activeTime, "Active time does not match") }, + { assertEquals(1741, idleTime, "Idle time does not match") }, + { assertEquals(637, stealTime, "Steal time does not match") }, { assertEquals(1500001, clock.millis()) } ) } @@ -253,7 +253,7 @@ internal class SimHostTest { host.spawn(server) delay(5000L) host.fail() - delay(5000L) + delay(duration * 1000) host.recover() suspendCancellableCoroutine<Unit> { cont -> @@ -274,12 +274,12 @@ internal class SimHostTest { reader.close() assertAll( - { assertEquals(2661, idleTime, "Idle time does not match") }, - { assertEquals(339, activeTime, "Active time does not match") }, - { assertEquals(1195001, uptime, "Uptime does not match") }, - { assertEquals(5000, downtime, "Downtime does not match") }, - { assertEquals(1195000, guestUptime, "Guest uptime does not match") }, - { assertEquals(5000, guestDowntime, "Guest downtime does not match") }, + { assertEquals(1175, idleTime, "Idle time does not match") }, + { assertEquals(624, activeTime, "Active time does not match") }, + { assertEquals(900001, uptime, "Uptime does not match") }, + { assertEquals(300000, downtime, "Downtime does not match") }, + { assertEquals(900000, guestUptime, "Guest uptime does not match") }, + { assertEquals(300000, guestDowntime, "Guest downtime does not match") }, ) } |
