diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-17 17:12:20 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-09-17 17:12:20 +0200 |
| commit | c1b9719aad10566c9d17f9eb757236c58a602b89 (patch) | |
| tree | 2755ea2d44256116e6dc08a57a64b37a36331249 /opendc-web | |
| parent | 2cd3bd18e548a72d64afe0e7f59487f4747d722f (diff) | |
| parent | e2537c59bef0645b948e92553cc5a42a8c0f7256 (diff) | |
merge: Standardize simulator metrics
This pull request standardizes the metrics emitted by the simulator based on OpenTelemetry conventions.
From now on, all metrics exposed by the simulator are exported through OpenTelemetry
following the recommended practices for naming, collection, etc.
**Implementation Notes**
- Improve ParquetDataWriter implementation
- Simplify CoroutineMetricReader
- Create separate MeterProvider per service/host
- Standardize compute scheduler metrics
- Standardize SimHost metrics
- Use logical types for Parquet output columns
**External Dependencies**
- Update to OpenTelemetry 1.6.0
**Breaking API Changes**
- Instead of supplying a `Meter` instances, key classes are now responsible for constructing
a `Meter` instance from the supplied `MeterProvider`.
- Export format has been changed to suit the outputted metrics
- Energy experiments shell has been removed
Diffstat (limited to 'opendc-web')
3 files changed, 77 insertions, 101 deletions
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index b565e90d..483558e1 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -26,12 +26,8 @@ import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.* import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.long -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider -import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* import mu.KotlinLogging -import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.env.MachineDef @@ -39,6 +35,8 @@ import org.opendc.experiments.capelin.model.Workload import org.opendc.experiments.capelin.trace.ParquetTraceReader import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader import org.opendc.experiments.capelin.trace.RawParquetTraceReader +import org.opendc.experiments.capelin.util.ComputeServiceSimulator +import org.opendc.experiments.capelin.util.createComputeScheduler import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel import org.opendc.simulator.compute.model.MemoryUnit @@ -46,18 +44,17 @@ import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.core.runBlockingSimulation +import org.opendc.telemetry.compute.ComputeMetricExporter import org.opendc.telemetry.compute.collectServiceMetrics -import org.opendc.telemetry.compute.withMonitor -import org.opendc.telemetry.sdk.toOtelClock +import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import org.opendc.web.client.ApiClient import org.opendc.web.client.AuthConfiguration import org.opendc.web.client.model.Scenario import org.opendc.web.client.model.Topology import java.io.File import java.net.URI +import java.time.Duration import java.util.* -import kotlin.random.Random -import kotlin.random.asJavaRandom import org.opendc.web.client.model.Portfolio as ClientPortfolio private val logger = KotlinLogging.logger {} @@ -158,7 +155,7 @@ class RunnerCli : CliktCommand(name = "runner") { val results = (0 until targets.repeatsPerScenario).map { repeat -> logger.info { "Starting repeat $repeat" } withTimeout(runTimeout * 1000) { - val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong()).asJavaRandom()) } + val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong())) } runRepeat(scenario, repeat, environment, traceReader, interferenceModel) } } @@ -182,63 +179,51 @@ class RunnerCli : CliktCommand(name = "runner") { try { runBlockingSimulation { - val seed = repeat val workloadName = scenario.trace.traceId val workloadFraction = scenario.trace.loadSamplingFraction - val seeder = Random(seed) - - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val metricProducer = meterProvider as MetricProducer + val seeder = Random(repeat.toLong()) val operational = scenario.operationalPhenomena - val allocationPolicy = createComputeScheduler(operational.schedulerName, seeder) + val computeScheduler = createComputeScheduler(operational.schedulerName, seeder) val trace = ParquetTraceReader( listOf(traceReader), Workload(workloadName, workloadFraction), - seed + repeat ) - val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0 - - withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler -> - val faultInjector = if (failureFrequency > 0) { - logger.debug { "ENABLING failures" } - createFaultInjector( - coroutineContext, - clock, - scheduler.hosts.map { it as SimHost }.toSet(), - seeder.nextInt(), - failureFrequency, - ) - } else { + val failureModel = + if (operational.failuresEnabled) + grid5000(Duration.ofDays(7), repeat) + else null - } - withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { - faultInjector?.start() + val simulator = ComputeServiceSimulator( + coroutineContext, + clock, + computeScheduler, + environment.read(), + failureModel, + interferenceModel.takeIf { operational.performanceInterferenceEnabled } + ) - processTrace( - clock, - trace, - scheduler, - monitor - ) + val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1)) - faultInjector?.close() - } + try { + simulator.run(trace) + } finally { + simulator.close() + metricReader.close() } - val monitorResults = collectServiceMetrics(clock.millis(), metricProducer) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) logger.debug { - "Finish " + - "SUBMIT=${monitorResults.instanceCount} " + - "FAIL=${monitorResults.failedInstanceCount} " + - "QUEUE=${monitorResults.queuedInstanceCount} " + - "RUNNING=${monitorResults.runningInstanceCount}" + "Scheduler " + + "Success=${serviceMetrics.attemptsSuccess} " + + "Failure=${serviceMetrics.attemptsFailure} " + + "Error=${serviceMetrics.attemptsError} " + + "Pending=${serviceMetrics.serversPending} " + + "Active=${serviceMetrics.serversActive}" } } } catch (cause: Throwable) { diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt index e0e3488f..a0c281e8 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt @@ -65,10 +65,10 @@ public class ScenarioManager(private val client: ApiClient) { client.updateJob( id, SimulationState.FINISHED, mapOf( - "total_requested_burst" to results.map { it.totalWork }, - "total_granted_burst" to results.map { it.totalGrantedWork }, - "total_overcommitted_burst" to results.map { it.totalOvercommittedWork }, - "total_interfered_burst" to results.map { it.totalInterferedWork }, + "total_requested_burst" to results.map { it.totalActiveTime + it.totalIdleTime }, + "total_granted_burst" to results.map { it.totalActiveTime }, + "total_overcommitted_burst" to results.map { it.totalStealTime }, + "total_interfered_burst" to results.map { it.totalLostTime }, "mean_cpu_usage" to results.map { it.meanCpuUsage }, "mean_cpu_demand" to results.map { it.meanCpuDemand }, "mean_num_deployed_images" to results.map { it.meanNumDeployedImages }, diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt index c8e58dde..bb412738 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt @@ -22,60 +22,51 @@ package org.opendc.web.runner -import mu.KotlinLogging -import org.opendc.compute.service.driver.Host -import org.opendc.compute.service.driver.HostState import org.opendc.telemetry.compute.ComputeMonitor import org.opendc.telemetry.compute.table.HostData import org.opendc.telemetry.compute.table.ServiceData import kotlin.math.max +import kotlin.math.roundToLong /** * A [ComputeMonitor] that tracks the aggregate metrics for each repeat. */ -public class WebComputeMonitor : ComputeMonitor { - private val logger = KotlinLogging.logger {} - - override fun onStateChange(time: Long, host: Host, newState: HostState) { - logger.debug { "Host ${host.uid} changed state $newState [$time]" } - } - +class WebComputeMonitor : ComputeMonitor { override fun record(data: HostData) { - val duration = 5 * 60 * 1000L - val slices = duration / SLICE_LENGTH + val slices = data.downtime / SLICE_LENGTH hostAggregateMetrics = AggregateHostMetrics( - hostAggregateMetrics.totalWork + data.totalWork, - hostAggregateMetrics.totalGrantedWork + data.grantedWork, - hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork, - hostAggregateMetrics.totalInterferedWork + data.overcommittedWork, - hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600, - hostAggregateMetrics.totalFailureSlices + if (data.host.state != HostState.UP) slices else 0, - hostAggregateMetrics.totalFailureVmSlices + if (data.host.state != HostState.UP) data.instanceCount * slices else 0 + hostAggregateMetrics.totalActiveTime + data.cpuActiveTime, + hostAggregateMetrics.totalIdleTime + data.cpuIdleTime, + hostAggregateMetrics.totalStealTime + data.cpuStealTime, + hostAggregateMetrics.totalLostTime + data.cpuLostTime, + hostAggregateMetrics.totalPowerDraw + data.powerTotal, + hostAggregateMetrics.totalFailureSlices + slices, + hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices ) - hostMetrics.compute(data.host) { _, prev -> + hostMetrics.compute(data.host.id) { _, prev -> HostMetrics( - (data.cpuUsage.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0), - (data.cpuDemand.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0), - data.instanceCount + (prev?.instanceCount ?: 0), + data.cpuUsage + (prev?.cpuUsage ?: 0.0), + data.cpuDemand + (prev?.cpuDemand ?: 0.0), + data.guestsRunning + (prev?.instanceCount ?: 0), 1 + (prev?.count ?: 0) ) } } private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() - private val hostMetrics: MutableMap<Host, HostMetrics> = mutableMapOf() - private val SLICE_LENGTH: Long = 5 * 60 * 1000 + private val hostMetrics: MutableMap<String, HostMetrics> = mutableMapOf() + private val SLICE_LENGTH: Long = 5 * 60 data class AggregateHostMetrics( - val totalWork: Double = 0.0, - val totalGrantedWork: Double = 0.0, - val totalOvercommittedWork: Double = 0.0, - val totalInterferedWork: Double = 0.0, + val totalActiveTime: Long = 0L, + val totalIdleTime: Long = 0L, + val totalStealTime: Long = 0L, + val totalLostTime: Long = 0L, val totalPowerDraw: Double = 0.0, - val totalFailureSlices: Long = 0, - val totalFailureVmSlices: Long = 0, + val totalFailureSlices: Double = 0.0, + val totalFailureVmSlices: Double = 0.0, ) data class HostMetrics( @@ -89,15 +80,15 @@ public class WebComputeMonitor : ComputeMonitor { override fun record(data: ServiceData) { serviceMetrics = AggregateServiceMetrics( - max(data.instanceCount, serviceMetrics.vmTotalCount), - max(data.queuedInstanceCount, serviceMetrics.vmWaitingCount), - max(data.runningInstanceCount, serviceMetrics.vmActiveCount), - max(data.finishedInstanceCount, serviceMetrics.vmInactiveCount), - max(data.failedInstanceCount, serviceMetrics.vmFailedCount), + max(data.attemptsSuccess, serviceMetrics.vmTotalCount), + max(data.serversPending, serviceMetrics.vmWaitingCount), + max(data.serversActive, serviceMetrics.vmActiveCount), + max(0, serviceMetrics.vmInactiveCount), + max(data.attemptsFailure, serviceMetrics.vmFailedCount), ) } - public data class AggregateServiceMetrics( + data class AggregateServiceMetrics( val vmTotalCount: Int = 0, val vmWaitingCount: Int = 0, val vmActiveCount: Int = 0, @@ -105,19 +96,19 @@ public class WebComputeMonitor : ComputeMonitor { val vmFailedCount: Int = 0 ) - public fun getResult(): Result { + fun getResult(): Result { return Result( - hostAggregateMetrics.totalWork, - hostAggregateMetrics.totalGrantedWork, - hostAggregateMetrics.totalOvercommittedWork, - hostAggregateMetrics.totalInterferedWork, + hostAggregateMetrics.totalActiveTime, + hostAggregateMetrics.totalIdleTime, + hostAggregateMetrics.totalStealTime, + hostAggregateMetrics.totalLostTime, hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, hostAggregateMetrics.totalPowerDraw, - hostAggregateMetrics.totalFailureSlices, - hostAggregateMetrics.totalFailureVmSlices, + hostAggregateMetrics.totalFailureSlices.roundToLong(), + hostAggregateMetrics.totalFailureVmSlices.roundToLong(), serviceMetrics.vmTotalCount, serviceMetrics.vmWaitingCount, serviceMetrics.vmInactiveCount, @@ -126,10 +117,10 @@ public class WebComputeMonitor : ComputeMonitor { } data class Result( - val totalWork: Double, - val totalGrantedWork: Double, - val totalOvercommittedWork: Double, - val totalInterferedWork: Double, + val totalActiveTime: Long, + val totalIdleTime: Long, + val totalStealTime: Long, + val totalLostTime: Long, val meanCpuUsage: Double, val meanCpuDemand: Double, val meanNumDeployedImages: Double, |
