summaryrefslogtreecommitdiff
path: root/opendc-compute/opendc-compute-simulator
diff options
context:
space:
mode:
authorFabian Mastenbroek <mail.fabianm@gmail.com>2022-05-06 17:47:44 +0200
committerFabian Mastenbroek <mail.fabianm@gmail.com>2022-05-06 17:47:44 +0200
commita9657e4fa3b15e2c1c11884b5a250b0861bcc21d (patch)
tree6b25de3d7a1def150ab4977a45723c52167e7211 /opendc-compute/opendc-compute-simulator
parent48da4538707cd074969287724ca6f02823f2ff5a (diff)
parent8e3905273c7a3f2df4df5d5840e4088d99b0dffb (diff)
merge: Expose metrics directly to user (#80)
This pull request adds the ability to access the metrics of resources modeled by the OpenDC Compute, Workflow, FaaS, and TensorFlow services directly from their corresponding interfaces. Previously, users would have to interact with OpenTelemetry to obtain these values, which is complex and provides significant overhead. With this pull request, users can access the metrics of all cloud resources modeled by OpenDC via methods such as `getSchedulerStats()`, etc. ** Breaking Changes ** - `ComputeService.hostCount` removed in favour of `ComputeService.hosts.size`
Diffstat (limited to 'opendc-compute/opendc-compute-simulator')
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt90
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt47
-rw-r--r--opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt123
3 files changed, 150 insertions, 110 deletions
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
index 4eb6392e..323ae4fe 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
@@ -29,11 +29,14 @@ import io.opentelemetry.api.metrics.MeterProvider
import io.opentelemetry.api.metrics.ObservableDoubleMeasurement
import io.opentelemetry.api.metrics.ObservableLongMeasurement
import kotlinx.coroutines.*
-import mu.KotlinLogging
import org.opendc.compute.api.Flavor
import org.opendc.compute.api.Server
import org.opendc.compute.api.ServerState
import org.opendc.compute.service.driver.*
+import org.opendc.compute.service.driver.telemetry.GuestCpuStats
+import org.opendc.compute.service.driver.telemetry.GuestSystemStats
+import org.opendc.compute.service.driver.telemetry.HostCpuStats
+import org.opendc.compute.service.driver.telemetry.HostSystemStats
import org.opendc.compute.simulator.internal.Guest
import org.opendc.compute.simulator.internal.GuestListener
import org.opendc.simulator.compute.*
@@ -49,6 +52,8 @@ import org.opendc.simulator.compute.power.PowerDriver
import org.opendc.simulator.compute.power.SimplePowerDriver
import org.opendc.simulator.compute.workload.SimWorkload
import org.opendc.simulator.flow.FlowEngine
+import java.time.Duration
+import java.time.Instant
import java.util.*
import kotlin.coroutines.CoroutineContext
@@ -81,11 +86,6 @@ public class SimHost(
private val clock = engine.clock
/**
- * The logger instance of this server.
- */
- private val logger = KotlinLogging.logger {}
-
- /**
* The [Meter] to track metrics of the simulated host.
*/
private val meter = meterProvider.get("org.opendc.compute.simulator")
@@ -112,6 +112,9 @@ public class SimHost(
private val guests = HashMap<Server, Guest>()
private val _guests = mutableListOf<Guest>()
+ override val instances: Set<Server>
+ get() = guests.keys
+
override val state: HostState
get() = _state
private var _state: HostState = HostState.DOWN
@@ -249,6 +252,68 @@ public class SimHost(
machine.cancel()
}
+ override fun getSystemStats(): HostSystemStats {
+ updateUptime()
+
+ var terminated = 0
+ var running = 0
+ var error = 0
+ var invalid = 0
+
+ val guests = _guests.listIterator()
+ for (guest in guests) {
+ when (guest.state) {
+ ServerState.TERMINATED -> terminated++
+ ServerState.RUNNING -> running++
+ ServerState.ERROR -> error++
+ ServerState.DELETED -> {
+ // Remove guests that have been deleted
+ this.guests.remove(guest.server)
+ guests.remove()
+ }
+ else -> invalid++
+ }
+ }
+
+ return HostSystemStats(
+ Duration.ofMillis(_uptime),
+ Duration.ofMillis(_downtime),
+ Instant.ofEpochMilli(_bootTime),
+ machine.powerUsage,
+ machine.energyUsage,
+ terminated,
+ running,
+ error,
+ invalid
+ )
+ }
+
+ override fun getSystemStats(server: Server): GuestSystemStats {
+ val guest = requireNotNull(guests[server]) { "Unknown server ${server.uid} at host $uid" }
+ return guest.getSystemStats()
+ }
+
+ override fun getCpuStats(): HostCpuStats {
+ val counters = hypervisor.counters
+ counters.flush()
+
+ return HostCpuStats(
+ counters.cpuActiveTime / 1000L,
+ counters.cpuIdleTime / 1000L,
+ counters.cpuStealTime / 1000L,
+ counters.cpuLostTime / 1000L,
+ hypervisor.cpuCapacity,
+ hypervisor.cpuDemand,
+ hypervisor.cpuUsage,
+ hypervisor.cpuUsage / _cpuLimit
+ )
+ }
+
+ override fun getCpuStats(server: Server): GuestCpuStats {
+ val guest = requireNotNull(guests[server]) { "Unknown server ${server.uid} at host $uid" }
+ return guest.getCpuStats()
+ }
+
override fun hashCode(): Int = uid.hashCode()
override fun equals(other: Any?): Boolean {
@@ -417,13 +482,12 @@ public class SimHost(
* Helper function to track the CPU time of a machine.
*/
private fun collectCpuTime(result: ObservableLongMeasurement) {
- val counters = hypervisor.counters
- counters.flush()
+ val stats = getCpuStats()
- result.record(counters.cpuActiveTime / 1000L, _activeState)
- result.record(counters.cpuIdleTime / 1000L, _idleState)
- result.record(counters.cpuStealTime / 1000L, _stealState)
- result.record(counters.cpuLostTime / 1000L, _lostState)
+ result.record(stats.activeTime, _activeState)
+ result.record(stats.idleTime, _idleState)
+ result.record(stats.stealTime, _stealState)
+ result.record(stats.lostTime, _lostState)
val guests = _guests
for (i in guests.indices) {
@@ -450,7 +514,7 @@ public class SimHost(
val guests = _guests
for (i in guests.indices) {
- guests[i].updateUptime(duration)
+ guests[i].updateUptime()
}
}
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt
index bb378ee3..0d4c550d 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt
@@ -32,6 +32,8 @@ import kotlinx.coroutines.*
import mu.KotlinLogging
import org.opendc.compute.api.Server
import org.opendc.compute.api.ServerState
+import org.opendc.compute.service.driver.telemetry.GuestCpuStats
+import org.opendc.compute.service.driver.telemetry.GuestSystemStats
import org.opendc.compute.simulator.SimHost
import org.opendc.compute.simulator.SimWorkloadMapper
import org.opendc.simulator.compute.kernel.SimHypervisor
@@ -39,6 +41,8 @@ import org.opendc.simulator.compute.kernel.SimVirtualMachine
import org.opendc.simulator.compute.runWorkload
import org.opendc.simulator.compute.workload.SimWorkload
import java.time.Clock
+import java.time.Duration
+import java.time.Instant
import kotlin.coroutines.CoroutineContext
/**
@@ -146,6 +150,37 @@ internal class Guest(
}
/**
+ * Obtain the system statistics of this guest.
+ */
+ fun getSystemStats(): GuestSystemStats {
+ updateUptime()
+
+ return GuestSystemStats(
+ Duration.ofMillis(_uptime),
+ Duration.ofMillis(_downtime),
+ Instant.ofEpochMilli(_bootTime)
+ )
+ }
+
+ /**
+ * Obtain the CPU statistics of this guest.
+ */
+ fun getCpuStats(): GuestCpuStats {
+ val counters = machine.counters
+ counters.flush()
+
+ return GuestCpuStats(
+ counters.cpuActiveTime / 1000L,
+ counters.cpuIdleTime / 1000L,
+ counters.cpuStealTime / 1000L,
+ counters.cpuLostTime / 1000L,
+ machine.cpuCapacity,
+ machine.cpuUsage,
+ machine.cpuUsage / _cpuLimit
+ )
+ }
+
+ /**
* The [Job] representing the current active virtual machine instance or `null` if no virtual machine is active.
*/
private var job: Job? = null
@@ -209,6 +244,8 @@ internal class Guest(
* This method is invoked when the guest stopped.
*/
private fun onStop(target: ServerState) {
+ updateUptime()
+
state = target
listener.onStop(this)
}
@@ -224,10 +261,16 @@ internal class Guest(
.put(STATE_KEY, "down")
.build()
+ private var _lastReport = clock.millis()
+
/**
* Helper function to track the uptime and downtime of the guest.
*/
- fun updateUptime(duration: Long) {
+ fun updateUptime() {
+ val now = clock.millis()
+ val duration = now - _lastReport
+ _lastReport = now
+
if (state == ServerState.RUNNING) {
_uptime += duration
} else if (state == ServerState.ERROR) {
@@ -239,6 +282,8 @@ internal class Guest(
* Helper function to track the uptime of the guest.
*/
fun collectUptime(result: ObservableLongMeasurement) {
+ updateUptime()
+
result.record(_uptime, _upState)
result.record(_downtime, _downState)
}
diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
index f0325023..fd54ad1d 100644
--- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
@@ -22,8 +22,7 @@
package org.opendc.compute.simulator
-import io.opentelemetry.sdk.metrics.SdkMeterProvider
-import io.opentelemetry.sdk.resources.Resource
+import io.opentelemetry.api.metrics.MeterProvider
import kotlinx.coroutines.*
import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.BeforeEach
@@ -42,13 +41,7 @@ import org.opendc.simulator.compute.workload.SimTraceFragment
import org.opendc.simulator.compute.workload.SimTraceWorkload
import org.opendc.simulator.core.runBlockingSimulation
import org.opendc.simulator.flow.FlowEngine
-import org.opendc.telemetry.compute.ComputeMetricExporter
-import org.opendc.telemetry.compute.HOST_ID
-import org.opendc.telemetry.compute.table.HostTableReader
-import org.opendc.telemetry.compute.table.ServerTableReader
-import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader
-import org.opendc.telemetry.sdk.toOtelClock
-import java.time.Duration
+import java.time.Instant
import java.util.*
import kotlin.coroutines.resume
@@ -73,45 +66,16 @@ internal class SimHostTest {
*/
@Test
fun testOvercommitted() = runBlockingSimulation {
- var idleTime = 0L
- var activeTime = 0L
- var stealTime = 0L
-
- val hostId = UUID.randomUUID()
- val hostResource = Resource.builder()
- .put(HOST_ID, hostId.toString())
- .build()
-
- // Setup metric reader
val duration = 5 * 60L
- val reader = CoroutineMetricReader(
- this,
- object : ComputeMetricExporter() {
- override fun record(reader: HostTableReader) {
- activeTime += reader.cpuActiveTime
- idleTime += reader.cpuIdleTime
- stealTime += reader.cpuStealTime
- }
- },
- exportInterval = Duration.ofSeconds(duration)
- )
-
- val meterProvider = SdkMeterProvider
- .builder()
- .setResource(hostResource)
- .setClock(clock.toOtelClock())
- .registerMetricReader(reader)
- .build()
-
val engine = FlowEngine(coroutineContext, clock)
- val virtDriver = SimHost(
- uid = hostId,
+ val host = SimHost(
+ uid = UUID.randomUUID(),
name = "test",
model = machineModel,
meta = emptyMap(),
coroutineContext,
engine,
- meterProvider,
+ MeterProvider.noop(),
SimFairShareHypervisorProvider()
)
val vmImageA = MockImage(
@@ -150,11 +114,11 @@ internal class SimHostTest {
val flavor = MockFlavor(2, 0)
coroutineScope {
- launch { virtDriver.spawn(MockServer(UUID.randomUUID(), "a", flavor, vmImageA)) }
- launch { virtDriver.spawn(MockServer(UUID.randomUUID(), "b", flavor, vmImageB)) }
+ launch { host.spawn(MockServer(UUID.randomUUID(), "a", flavor, vmImageA)) }
+ launch { host.spawn(MockServer(UUID.randomUUID(), "b", flavor, vmImageB)) }
suspendCancellableCoroutine<Unit> { cont ->
- virtDriver.addListener(object : HostListener {
+ host.addListener(object : HostListener {
private var finished = 0
override fun onStateChanged(host: Host, server: Server, newState: ServerState) {
@@ -168,13 +132,14 @@ internal class SimHostTest {
// Ensure last cycle is collected
delay(1000L * duration)
- virtDriver.close()
- meterProvider.close()
+ host.close()
+
+ val cpuStats = host.getCpuStats()
assertAll(
- { assertEquals(658, activeTime, "Active time does not match") },
- { assertEquals(2341, idleTime, "Idle time does not match") },
- { assertEquals(637, stealTime, "Steal time does not match") },
+ { assertEquals(658, cpuStats.activeTime, "Active time does not match") },
+ { assertEquals(2341, cpuStats.idleTime, "Idle time does not match") },
+ { assertEquals(637, cpuStats.stealTime, "Steal time does not match") },
{ assertEquals(1500001, clock.millis()) }
)
}
@@ -184,54 +149,16 @@ internal class SimHostTest {
*/
@Test
fun testFailure() = runBlockingSimulation {
- var activeTime = 0L
- var idleTime = 0L
- var uptime = 0L
- var downtime = 0L
- var guestUptime = 0L
- var guestDowntime = 0L
-
- val hostId = UUID.randomUUID()
- val hostResource = Resource.builder()
- .put(HOST_ID, hostId.toString())
- .build()
-
- // Setup metric reader
val duration = 5 * 60L
- val reader = CoroutineMetricReader(
- this,
- object : ComputeMetricExporter() {
- override fun record(reader: HostTableReader) {
- activeTime += reader.cpuActiveTime
- idleTime += reader.cpuIdleTime
- uptime += reader.uptime
- downtime += reader.downtime
- }
-
- override fun record(reader: ServerTableReader) {
- guestUptime += reader.uptime
- guestDowntime += reader.downtime
- }
- },
- exportInterval = Duration.ofSeconds(duration)
- )
-
- val meterProvider = SdkMeterProvider
- .builder()
- .setResource(hostResource)
- .setClock(clock.toOtelClock())
- .registerMetricReader(reader)
- .build()
-
val engine = FlowEngine(coroutineContext, clock)
val host = SimHost(
- uid = hostId,
+ uid = UUID.randomUUID(),
name = "test",
model = machineModel,
meta = emptyMap(),
coroutineContext,
engine,
- meterProvider,
+ MeterProvider.noop(),
SimFairShareHypervisorProvider()
)
val image = MockImage(
@@ -275,15 +202,17 @@ internal class SimHostTest {
// Ensure last cycle is collected
delay(1000L * duration)
- meterProvider.close()
+ val cpuStats = host.getCpuStats()
+ val sysStats = host.getSystemStats()
+ val guestSysStats = host.getSystemStats(server)
assertAll(
- { assertEquals(1775, idleTime, "Idle time does not match") },
- { assertEquals(624, activeTime, "Active time does not match") },
- { assertEquals(900001, uptime, "Uptime does not match") },
- { assertEquals(300000, downtime, "Downtime does not match") },
- { assertEquals(900000, guestUptime, "Guest uptime does not match") },
- { assertEquals(300000, guestDowntime, "Guest downtime does not match") },
+ { assertEquals(1775, cpuStats.idleTime, "Idle time does not match") },
+ { assertEquals(624, cpuStats.activeTime, "Active time does not match") },
+ { assertEquals(900001, sysStats.uptime.toMillis(), "Uptime does not match") },
+ { assertEquals(300000, sysStats.downtime.toMillis(), "Downtime does not match") },
+ { assertEquals(900001, guestSysStats.uptime.toMillis(), "Guest uptime does not match") },
+ { assertEquals(300000, guestSysStats.downtime.toMillis(), "Guest downtime does not match") },
)
}
@@ -332,6 +261,8 @@ internal class SimHostTest {
override val state: ServerState = ServerState.TERMINATED
+ override val launchedAt: Instant? = null
+
override suspend fun start() {}
override suspend fun stop() {}