summaryrefslogtreecommitdiff
path: root/opendc-web
diff options
context:
space:
mode:
Diffstat (limited to 'opendc-web')
-rw-r--r--opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt83
-rw-r--r--opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt8
-rw-r--r--opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt87
3 files changed, 77 insertions, 101 deletions
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
index b565e90d..483558e1 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt
@@ -26,12 +26,8 @@ import com.github.ajalt.clikt.core.CliktCommand
import com.github.ajalt.clikt.parameters.options.*
import com.github.ajalt.clikt.parameters.types.file
import com.github.ajalt.clikt.parameters.types.long
-import io.opentelemetry.api.metrics.MeterProvider
-import io.opentelemetry.sdk.metrics.SdkMeterProvider
-import io.opentelemetry.sdk.metrics.export.MetricProducer
import kotlinx.coroutines.*
import mu.KotlinLogging
-import org.opendc.compute.simulator.SimHost
import org.opendc.experiments.capelin.*
import org.opendc.experiments.capelin.env.EnvironmentReader
import org.opendc.experiments.capelin.env.MachineDef
@@ -39,6 +35,8 @@ import org.opendc.experiments.capelin.model.Workload
import org.opendc.experiments.capelin.trace.ParquetTraceReader
import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader
import org.opendc.experiments.capelin.trace.RawParquetTraceReader
+import org.opendc.experiments.capelin.util.ComputeServiceSimulator
+import org.opendc.experiments.capelin.util.createComputeScheduler
import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel
import org.opendc.simulator.compute.model.MachineModel
import org.opendc.simulator.compute.model.MemoryUnit
@@ -46,18 +44,17 @@ import org.opendc.simulator.compute.model.ProcessingNode
import org.opendc.simulator.compute.model.ProcessingUnit
import org.opendc.simulator.compute.power.LinearPowerModel
import org.opendc.simulator.core.runBlockingSimulation
+import org.opendc.telemetry.compute.ComputeMetricExporter
import org.opendc.telemetry.compute.collectServiceMetrics
-import org.opendc.telemetry.compute.withMonitor
-import org.opendc.telemetry.sdk.toOtelClock
+import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader
import org.opendc.web.client.ApiClient
import org.opendc.web.client.AuthConfiguration
import org.opendc.web.client.model.Scenario
import org.opendc.web.client.model.Topology
import java.io.File
import java.net.URI
+import java.time.Duration
import java.util.*
-import kotlin.random.Random
-import kotlin.random.asJavaRandom
import org.opendc.web.client.model.Portfolio as ClientPortfolio
private val logger = KotlinLogging.logger {}
@@ -158,7 +155,7 @@ class RunnerCli : CliktCommand(name = "runner") {
val results = (0 until targets.repeatsPerScenario).map { repeat ->
logger.info { "Starting repeat $repeat" }
withTimeout(runTimeout * 1000) {
- val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong()).asJavaRandom()) }
+ val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong())) }
runRepeat(scenario, repeat, environment, traceReader, interferenceModel)
}
}
@@ -182,63 +179,51 @@ class RunnerCli : CliktCommand(name = "runner") {
try {
runBlockingSimulation {
- val seed = repeat
val workloadName = scenario.trace.traceId
val workloadFraction = scenario.trace.loadSamplingFraction
- val seeder = Random(seed)
-
- val meterProvider: MeterProvider = SdkMeterProvider
- .builder()
- .setClock(clock.toOtelClock())
- .build()
- val metricProducer = meterProvider as MetricProducer
+ val seeder = Random(repeat.toLong())
val operational = scenario.operationalPhenomena
- val allocationPolicy = createComputeScheduler(operational.schedulerName, seeder)
+ val computeScheduler = createComputeScheduler(operational.schedulerName, seeder)
val trace = ParquetTraceReader(
listOf(traceReader),
Workload(workloadName, workloadFraction),
- seed
+ repeat
)
- val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0
-
- withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler ->
- val faultInjector = if (failureFrequency > 0) {
- logger.debug { "ENABLING failures" }
- createFaultInjector(
- coroutineContext,
- clock,
- scheduler.hosts.map { it as SimHost }.toSet(),
- seeder.nextInt(),
- failureFrequency,
- )
- } else {
+ val failureModel =
+ if (operational.failuresEnabled)
+ grid5000(Duration.ofDays(7), repeat)
+ else
null
- }
- withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) {
- faultInjector?.start()
+ val simulator = ComputeServiceSimulator(
+ coroutineContext,
+ clock,
+ computeScheduler,
+ environment.read(),
+ failureModel,
+ interferenceModel.takeIf { operational.performanceInterferenceEnabled }
+ )
- processTrace(
- clock,
- trace,
- scheduler,
- monitor
- )
+ val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1))
- faultInjector?.close()
- }
+ try {
+ simulator.run(trace)
+ } finally {
+ simulator.close()
+ metricReader.close()
}
- val monitorResults = collectServiceMetrics(clock.millis(), metricProducer)
+ val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0])
logger.debug {
- "Finish " +
- "SUBMIT=${monitorResults.instanceCount} " +
- "FAIL=${monitorResults.failedInstanceCount} " +
- "QUEUE=${monitorResults.queuedInstanceCount} " +
- "RUNNING=${monitorResults.runningInstanceCount}"
+ "Scheduler " +
+ "Success=${serviceMetrics.attemptsSuccess} " +
+ "Failure=${serviceMetrics.attemptsFailure} " +
+ "Error=${serviceMetrics.attemptsError} " +
+ "Pending=${serviceMetrics.serversPending} " +
+ "Active=${serviceMetrics.serversActive}"
}
}
} catch (cause: Throwable) {
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
index e0e3488f..a0c281e8 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt
@@ -65,10 +65,10 @@ public class ScenarioManager(private val client: ApiClient) {
client.updateJob(
id, SimulationState.FINISHED,
mapOf(
- "total_requested_burst" to results.map { it.totalWork },
- "total_granted_burst" to results.map { it.totalGrantedWork },
- "total_overcommitted_burst" to results.map { it.totalOvercommittedWork },
- "total_interfered_burst" to results.map { it.totalInterferedWork },
+ "total_requested_burst" to results.map { it.totalActiveTime + it.totalIdleTime },
+ "total_granted_burst" to results.map { it.totalActiveTime },
+ "total_overcommitted_burst" to results.map { it.totalStealTime },
+ "total_interfered_burst" to results.map { it.totalLostTime },
"mean_cpu_usage" to results.map { it.meanCpuUsage },
"mean_cpu_demand" to results.map { it.meanCpuDemand },
"mean_num_deployed_images" to results.map { it.meanNumDeployedImages },
diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
index c8e58dde..bb412738 100644
--- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
+++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt
@@ -22,60 +22,51 @@
package org.opendc.web.runner
-import mu.KotlinLogging
-import org.opendc.compute.service.driver.Host
-import org.opendc.compute.service.driver.HostState
import org.opendc.telemetry.compute.ComputeMonitor
import org.opendc.telemetry.compute.table.HostData
import org.opendc.telemetry.compute.table.ServiceData
import kotlin.math.max
+import kotlin.math.roundToLong
/**
* A [ComputeMonitor] that tracks the aggregate metrics for each repeat.
*/
-public class WebComputeMonitor : ComputeMonitor {
- private val logger = KotlinLogging.logger {}
-
- override fun onStateChange(time: Long, host: Host, newState: HostState) {
- logger.debug { "Host ${host.uid} changed state $newState [$time]" }
- }
-
+class WebComputeMonitor : ComputeMonitor {
override fun record(data: HostData) {
- val duration = 5 * 60 * 1000L
- val slices = duration / SLICE_LENGTH
+ val slices = data.downtime / SLICE_LENGTH
hostAggregateMetrics = AggregateHostMetrics(
- hostAggregateMetrics.totalWork + data.totalWork,
- hostAggregateMetrics.totalGrantedWork + data.grantedWork,
- hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork,
- hostAggregateMetrics.totalInterferedWork + data.overcommittedWork,
- hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600,
- hostAggregateMetrics.totalFailureSlices + if (data.host.state != HostState.UP) slices else 0,
- hostAggregateMetrics.totalFailureVmSlices + if (data.host.state != HostState.UP) data.instanceCount * slices else 0
+ hostAggregateMetrics.totalActiveTime + data.cpuActiveTime,
+ hostAggregateMetrics.totalIdleTime + data.cpuIdleTime,
+ hostAggregateMetrics.totalStealTime + data.cpuStealTime,
+ hostAggregateMetrics.totalLostTime + data.cpuLostTime,
+ hostAggregateMetrics.totalPowerDraw + data.powerTotal,
+ hostAggregateMetrics.totalFailureSlices + slices,
+ hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices
)
- hostMetrics.compute(data.host) { _, prev ->
+ hostMetrics.compute(data.host.id) { _, prev ->
HostMetrics(
- (data.cpuUsage.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0),
- (data.cpuDemand.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0),
- data.instanceCount + (prev?.instanceCount ?: 0),
+ data.cpuUsage + (prev?.cpuUsage ?: 0.0),
+ data.cpuDemand + (prev?.cpuDemand ?: 0.0),
+ data.guestsRunning + (prev?.instanceCount ?: 0),
1 + (prev?.count ?: 0)
)
}
}
private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics()
- private val hostMetrics: MutableMap<Host, HostMetrics> = mutableMapOf()
- private val SLICE_LENGTH: Long = 5 * 60 * 1000
+ private val hostMetrics: MutableMap<String, HostMetrics> = mutableMapOf()
+ private val SLICE_LENGTH: Long = 5 * 60
data class AggregateHostMetrics(
- val totalWork: Double = 0.0,
- val totalGrantedWork: Double = 0.0,
- val totalOvercommittedWork: Double = 0.0,
- val totalInterferedWork: Double = 0.0,
+ val totalActiveTime: Long = 0L,
+ val totalIdleTime: Long = 0L,
+ val totalStealTime: Long = 0L,
+ val totalLostTime: Long = 0L,
val totalPowerDraw: Double = 0.0,
- val totalFailureSlices: Long = 0,
- val totalFailureVmSlices: Long = 0,
+ val totalFailureSlices: Double = 0.0,
+ val totalFailureVmSlices: Double = 0.0,
)
data class HostMetrics(
@@ -89,15 +80,15 @@ public class WebComputeMonitor : ComputeMonitor {
override fun record(data: ServiceData) {
serviceMetrics = AggregateServiceMetrics(
- max(data.instanceCount, serviceMetrics.vmTotalCount),
- max(data.queuedInstanceCount, serviceMetrics.vmWaitingCount),
- max(data.runningInstanceCount, serviceMetrics.vmActiveCount),
- max(data.finishedInstanceCount, serviceMetrics.vmInactiveCount),
- max(data.failedInstanceCount, serviceMetrics.vmFailedCount),
+ max(data.attemptsSuccess, serviceMetrics.vmTotalCount),
+ max(data.serversPending, serviceMetrics.vmWaitingCount),
+ max(data.serversActive, serviceMetrics.vmActiveCount),
+ max(0, serviceMetrics.vmInactiveCount),
+ max(data.attemptsFailure, serviceMetrics.vmFailedCount),
)
}
- public data class AggregateServiceMetrics(
+ data class AggregateServiceMetrics(
val vmTotalCount: Int = 0,
val vmWaitingCount: Int = 0,
val vmActiveCount: Int = 0,
@@ -105,19 +96,19 @@ public class WebComputeMonitor : ComputeMonitor {
val vmFailedCount: Int = 0
)
- public fun getResult(): Result {
+ fun getResult(): Result {
return Result(
- hostAggregateMetrics.totalWork,
- hostAggregateMetrics.totalGrantedWork,
- hostAggregateMetrics.totalOvercommittedWork,
- hostAggregateMetrics.totalInterferedWork,
+ hostAggregateMetrics.totalActiveTime,
+ hostAggregateMetrics.totalIdleTime,
+ hostAggregateMetrics.totalStealTime,
+ hostAggregateMetrics.totalLostTime,
hostMetrics.map { it.value.cpuUsage / it.value.count }.average(),
hostMetrics.map { it.value.cpuDemand / it.value.count }.average(),
hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(),
hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0,
hostAggregateMetrics.totalPowerDraw,
- hostAggregateMetrics.totalFailureSlices,
- hostAggregateMetrics.totalFailureVmSlices,
+ hostAggregateMetrics.totalFailureSlices.roundToLong(),
+ hostAggregateMetrics.totalFailureVmSlices.roundToLong(),
serviceMetrics.vmTotalCount,
serviceMetrics.vmWaitingCount,
serviceMetrics.vmInactiveCount,
@@ -126,10 +117,10 @@ public class WebComputeMonitor : ComputeMonitor {
}
data class Result(
- val totalWork: Double,
- val totalGrantedWork: Double,
- val totalOvercommittedWork: Double,
- val totalInterferedWork: Double,
+ val totalActiveTime: Long,
+ val totalIdleTime: Long,
+ val totalStealTime: Long,
+ val totalLostTime: Long,
val meanCpuUsage: Double,
val meanCpuDemand: Double,
val meanNumDeployedImages: Double,