summary | refs | log | tree | commit | diff
path: root/opendc-web/opendc-web-runner/src/main
diff options
context:
space:
mode:
author: Fabian Mastenbroek <mail.fabianm@gmail.com> 2021-09-07 17:30:46 +0200
committer: Fabian Mastenbroek <mail.fabianm@gmail.com> 2021-09-17 16:52:29 +0200
commit: 0d8bccc68705d036fbf60f312d9c34ca4392c6b2 (patch)
tree: faa50b8bf29976531e2ba757269ceb746195737d /opendc-web/opendc-web-runner/src/main
parent: 8d899e29dbd757f6df320212d6e0d77ce8216ab9 (diff)
refactor(telemetry): Standardize SimHost metrics
This change standardizes the metrics emitted by SimHost instances and their guests based on the OpenTelemetry semantic conventions. We now also report CPU time instead of CPU work, as CPU time is the more commonly used metric.
Diffstat (limited to 'opendc-web/opendc-web-runner/src/main')
-rw-r--r-- opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt | 12
-rw-r--r-- opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt | 8
-rw-r--r-- opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt | 41
3 files changed, 26 insertions, 35 deletions
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
index 960d5ebd..483558e1 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
@@ -26,8 +26,6 @@ import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.*
import com.github.ajalt.clikt.parameters.types.file
import com.github.ajalt.clikt.parameters.types.long
-import io.opentelemetry.api.metrics.MeterProvider
-import io.opentelemetry.sdk.metrics.SdkMeterProvider
import kotlinx.coroutines.*
import mu.KotlinLogging
import org.opendc.experiments.capelin.*
@@ -49,7 +47,6 @@ import org.opendc.simulator.core.runBlockingSimulation
import org.opendc.telemetry.compute.ComputeMetricExporter
import org.opendc.telemetry.compute.collectServiceMetrics
import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader
-import org.opendc.telemetry.sdk.toOtelClock
import org.opendc.web.client.ApiClient
import org.opendc.web.client.AuthConfiguration
import org.opendc.web.client.model.Scenario
@@ -187,11 +184,6 @@ class RunnerCli : CliktCommand(name = "runner") {
val seeder = Random(repeat.toLong())
- val meterProvider: MeterProvider = SdkMeterProvider
- .builder()
- .setClock(clock.toOtelClock())
- .build()
-
val operational = scenario.operationalPhenomena
val computeScheduler = createComputeScheduler(operational.schedulerName, seeder)
@@ -215,7 +207,7 @@ class RunnerCli : CliktCommand(name = "runner") {
interferenceModel.takeIf { operational.performanceInterferenceEnabled }
)
- val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor))
+ val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1))
try {
simulator.run(trace)
@@ -224,7 +216,7 @@ class RunnerCli : CliktCommand(name = "runner") {
metricReader.close()
}
- val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0])
+ val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0])
logger.debug {
"Scheduler " +
"Success=${serviceMetrics.attemptsSuccess} " +
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
index e0e3488f..a0c281e8 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
@@ -65,10 +65,10 @@ public class ScenarioManager(private val client: ApiClient) {
client.updateJob(
id, SimulationState.FINISHED,
mapOf(
- "total_requested_burst" to results.map { it.totalWork },
- "total_granted_burst" to results.map { it.totalGrantedWork },
- "total_overcommitted_burst" to results.map { it.totalOvercommittedWork },
- "total_interfered_burst" to results.map { it.totalInterferedWork },
+ "total_requested_burst" to results.map { it.totalActiveTime + it.totalIdleTime },
+ "total_granted_burst" to results.map { it.totalActiveTime },
+ "total_overcommitted_burst" to results.map { it.totalStealTime },
+ "total_interfered_burst" to results.map { it.totalLostTime },
"mean_cpu_usage" to results.map { it.meanCpuUsage },
"mean_cpu_demand" to results.map { it.meanCpuDemand },
"mean_num_deployed_images" to results.map { it.meanNumDeployedImages },
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
index 5f2c474b..bb412738 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
@@ -33,24 +33,23 @@ import kotlin.math.roundToLong
*/
class WebComputeMonitor : ComputeMonitor {
override fun record(data: HostData) {
- val duration = data.uptime
val slices = data.downtime / SLICE_LENGTH
hostAggregateMetrics = AggregateHostMetrics(
- hostAggregateMetrics.totalWork + data.totalWork,
- hostAggregateMetrics.totalGrantedWork + data.grantedWork,
- hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork,
- hostAggregateMetrics.totalInterferedWork + data.overcommittedWork,
- hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600,
+ hostAggregateMetrics.totalActiveTime + data.cpuActiveTime,
+ hostAggregateMetrics.totalIdleTime + data.cpuIdleTime,
+ hostAggregateMetrics.totalStealTime + data.cpuStealTime,
+ hostAggregateMetrics.totalLostTime + data.cpuLostTime,
+ hostAggregateMetrics.totalPowerDraw + data.powerTotal,
hostAggregateMetrics.totalFailureSlices + slices,
- hostAggregateMetrics.totalFailureVmSlices + data.instanceCount * slices
+ hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices
)
hostMetrics.compute(data.host.id) { _, prev ->
HostMetrics(
data.cpuUsage + (prev?.cpuUsage ?: 0.0),
data.cpuDemand + (prev?.cpuDemand ?: 0.0),
- data.instanceCount + (prev?.instanceCount ?: 0),
+ data.guestsRunning + (prev?.instanceCount ?: 0),
1 + (prev?.count ?: 0)
)
}
@@ -58,13 +57,13 @@ class WebComputeMonitor : ComputeMonitor {
private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics()
private val hostMetrics: MutableMap<String, HostMetrics> = mutableMapOf()
- private val SLICE_LENGTH: Long = 5 * 60 * 1000
+ private val SLICE_LENGTH: Long = 5 * 60
data class AggregateHostMetrics(
- val totalWork: Double = 0.0,
- val totalGrantedWork: Double = 0.0,
- val totalOvercommittedWork: Double = 0.0,
- val totalInterferedWork: Double = 0.0,
+ val totalActiveTime: Long = 0L,
+ val totalIdleTime: Long = 0L,
+ val totalStealTime: Long = 0L,
+ val totalLostTime: Long = 0L,
val totalPowerDraw: Double = 0.0,
val totalFailureSlices: Double = 0.0,
val totalFailureVmSlices: Double = 0.0,
@@ -99,10 +98,10 @@ class WebComputeMonitor : ComputeMonitor {
fun getResult(): Result {
return Result(
- hostAggregateMetrics.totalWork,
- hostAggregateMetrics.totalGrantedWork,
- hostAggregateMetrics.totalOvercommittedWork,
- hostAggregateMetrics.totalInterferedWork,
+ hostAggregateMetrics.totalActiveTime,
+ hostAggregateMetrics.totalIdleTime,
+ hostAggregateMetrics.totalStealTime,
+ hostAggregateMetrics.totalLostTime,
hostMetrics.map { it.value.cpuUsage / it.value.count }.average(),
hostMetrics.map { it.value.cpuDemand / it.value.count }.average(),
hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(),
@@ -118,10 +117,10 @@ class WebComputeMonitor : ComputeMonitor {
}
data class Result(
- val totalWork: Double,
- val totalGrantedWork: Double,
- val totalOvercommittedWork: Double,
- val totalInterferedWork: Double,
+ val totalActiveTime: Long,
+ val totalIdleTime: Long,
+ val totalStealTime: Long,
+ val totalLostTime: Long,
val meanCpuUsage: Double,
val meanCpuDemand: Double,
val meanNumDeployedImages: Double,