From 6752b6d50faab447b3edc13bddf14f53401392f1 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 2 Jul 2021 17:52:12 +0200 Subject: runner: Use public API for scheduling simulation jobs This change updates the web runner to not require direct database access for scheduling simulation jobs. Instead, the runner polls the public REST API for available jobs and reports its results through there. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 404 +++++++++++++++++++++ .../org/opendc/web/runner/ScenarioManager.kt | 86 +++++ .../org/opendc/web/runner/WebExperimentMonitor.kt | 189 ++++++++++ 3 files changed, 679 insertions(+) create mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt create mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt create mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt new file mode 100644 index 00000000..5b5ef802 --- /dev/null +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.web.runner + +import com.github.ajalt.clikt.core.CliktCommand +import com.github.ajalt.clikt.parameters.options.* +import com.github.ajalt.clikt.parameters.types.file +import com.github.ajalt.clikt.parameters.types.long +import io.opentelemetry.api.metrics.MeterProvider +import io.opentelemetry.sdk.metrics.SdkMeterProvider +import io.opentelemetry.sdk.metrics.export.MetricProducer +import kotlinx.coroutines.* +import kotlinx.coroutines.channels.Channel +import mu.KotlinLogging +import org.opendc.compute.service.scheduler.FilterScheduler +import org.opendc.compute.service.scheduler.filters.ComputeCapabilitiesFilter +import org.opendc.compute.service.scheduler.filters.ComputeFilter +import org.opendc.compute.service.scheduler.weights.* +import org.opendc.experiments.capelin.* +import org.opendc.experiments.capelin.model.Workload +import org.opendc.experiments.capelin.trace.ParquetTraceReader +import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader +import org.opendc.experiments.capelin.trace.RawParquetTraceReader +import org.opendc.format.environment.EnvironmentReader +import org.opendc.format.environment.MachineDef +import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel +import org.opendc.simulator.compute.model.MachineModel +import org.opendc.simulator.compute.model.MemoryUnit +import org.opendc.simulator.compute.model.ProcessingNode +import org.opendc.simulator.compute.model.ProcessingUnit +import org.opendc.simulator.compute.power.LinearPowerModel +import org.opendc.simulator.core.runBlockingSimulation +import org.opendc.telemetry.sdk.toOtelClock +import org.opendc.web.client.ApiClient +import org.opendc.web.client.AuthConfiguration +import org.opendc.web.client.model.Scenario +import org.opendc.web.client.model.Topology +import java.io.File +import java.net.URI +import java.util.* +import kotlin.random.Random +import kotlin.random.asJavaRandom +import org.opendc.web.client.model.Portfolio as ClientPortfolio + +private val logger = KotlinLogging.logger {} + +/** + * Represents the CLI command for starting the OpenDC web runner. + */ +class RunnerCli : CliktCommand(name = "runner") { + /** + * The URL to the OpenDC API. + */ + private val apiUrl by option( + "--api-url", + help = "url to the OpenDC API", + envvar = "OPENDC_API_URL" + ) + .convert { URI(it) } + .default(URI("https://api.opendc.org/v2")) + + /** + * The auth domain to use. + */ + private val authDomain by option( + "--auth-domain", + help = "auth domain of the OpenDC API", + envvar = "AUTH0_DOMAIN" + ) + .required() + + /** + * The auth client ID to use. + */ + private val authClientId by option( + "--auth-id", + help = "auth client id of the OpenDC API", + envvar = "AUTH0_CLIENT_ID" + ) + .required() + + /** + * The auth client secret to use. + */ + private val authClientSecret by option( + "--auth-secret", + help = "auth client secret of the OpenDC API", + envvar = "AUTH0_CLIENT_SECRET" + ) + .required() + + /** + * The path to the traces directory. + */ + private val tracePath by option( + "--traces", + help = "path to the directory containing the traces", + envvar = "OPENDC_TRACES" + ) + .file(canBeFile = false) + .defaultLazy { File("traces/") } + + /** + * The maximum duration of a single experiment run. + */ + private val runTimeout by option( + "--run-timeout", + help = "maximum duration of experiment in seconds", + envvar = "OPENDC_RUN_TIMEOUT" + ) + .long() + .default(60L * 3) // Experiment may run for a maximum of three minutes + + /** + * Run a single scenario. + */ + private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, environment: EnvironmentReader): List { + val id = scenario.id + + logger.info { "Constructing performance interference model" } + + val traceDir = File( + tracePath, + scenario.trace.traceId + ) + val traceReader = RawParquetTraceReader(traceDir) + val interferenceGroups = let { + val path = File(traceDir, "performance-interference-model.json") + val operational = scenario.operationalPhenomena + val enabled = operational.performanceInterferenceEnabled + + if (!enabled || !path.exists()) { + return@let null + } + + PerformanceInterferenceReader(path.inputStream()).use { reader -> reader.read() } + } + + val targets = portfolio.targets + val results = (0 until targets.repeatsPerScenario).map { repeat -> + logger.info { "Starting repeat $repeat" } + withTimeout(runTimeout * 1000) { + val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong()).asJavaRandom()) } + runRepeat(scenario, repeat, environment, traceReader, interferenceModel) + } + } + + logger.info { "Finished simulation for scenario $id" } + + return results + } + + /** + * Run a single repeat. + */ + private suspend fun runRepeat( + scenario: Scenario, + repeat: Int, + environment: EnvironmentReader, + traceReader: RawParquetTraceReader, + interferenceModel: VmInterferenceModel? + ): WebExperimentMonitor.Result { + val monitor = WebExperimentMonitor() + + try { + runBlockingSimulation { + val seed = repeat + val workloadName = scenario.trace.traceId + val workloadFraction = scenario.trace.loadSamplingFraction + + val seeder = Random(seed) + + val chan = Channel(Channel.CONFLATED) + + val meterProvider: MeterProvider = SdkMeterProvider + .builder() + .setClock(clock.toOtelClock()) + .build() + val metricProducer = meterProvider as MetricProducer + + val operational = scenario.operationalPhenomena + val allocationPolicy = + when (val policyName = operational.schedulerName) { + "mem" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(MemoryWeigher() to -1.0) + ) + "mem-inv" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(MemoryWeigher() to 1.0) + ) + "core-mem" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(CoreMemoryWeigher() to -1.0) + ) + "core-mem-inv" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(CoreMemoryWeigher() to 1.0) + ) + "active-servers" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(ProvisionedCoresWeigher() to -1.0) + ) + "active-servers-inv" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(InstanceCountWeigher() to 1.0) + ) + "provisioned-cores" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(ProvisionedCoresWeigher() to -1.0) + ) + "provisioned-cores-inv" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(ProvisionedCoresWeigher() to 1.0) + ) + "random" -> FilterScheduler( + filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), + weighers = listOf(RandomWeigher(java.util.Random(seeder.nextLong())) to 1.0) + ) + else -> throw IllegalArgumentException("Unknown policy $policyName") + } + + val trace = ParquetTraceReader( + listOf(traceReader), + Workload(workloadName, workloadFraction), + seed + ) + val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0 + + withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler -> + val failureDomain = if (failureFrequency > 0) { + logger.debug { "ENABLING failures" } + createFailureDomain( + this, + clock, + seeder.nextInt(), + failureFrequency, + scheduler, + chan + ) + } else { + null + } + + withMonitor(monitor, clock, meterProvider as MetricProducer, scheduler) { + processTrace( + clock, + trace, + scheduler, + chan, + monitor + ) + } + + failureDomain?.cancel() + } + + val monitorResults = collectMetrics(metricProducer) + logger.debug { "Finish SUBMIT=${monitorResults.submittedVms} FAIL=${monitorResults.unscheduledVms} QUEUE=${monitorResults.queuedVms} RUNNING=${monitorResults.runningVms}" } + } + } catch (cause: Throwable) { + logger.warn(cause) { "Experiment failed" } + } + + return monitor.getResult() + } + + private val POLL_INTERVAL = 30000L // ms = 30 s + private val HEARTBEAT_INTERVAL = 60000L // ms = 1 min + + override fun run(): Unit = runBlocking(Dispatchers.Default) { + logger.info { "Starting OpenDC web runner" } + + val client = ApiClient(baseUrl = apiUrl, AuthConfiguration(authDomain, authClientId, authClientSecret)) + val manager = ScenarioManager(client) + + logger.info { "Watching for queued scenarios" } + + while (true) { + val scenario = manager.findNext() + + if (scenario == null) { + delay(POLL_INTERVAL) + continue + } + + val id = scenario.id + + logger.info { "Found queued scenario $id: attempting to claim" } + + if (!manager.claim(id)) { + logger.info { "Failed to claim scenario" } + continue + } + + coroutineScope { + // Launch heartbeat process + val heartbeat = launch { + while (true) { + manager.heartbeat(id) + delay(HEARTBEAT_INTERVAL) + } + } + + try { + val scenarioModel = client.getScenario(id)!! + val portfolio = client.getPortfolio(scenarioModel.portfolioId)!! + val environment = convert(client.getTopology(scenarioModel.topology.topologyId)!!) + val results = runScenario(portfolio, scenarioModel, environment) + + logger.info { "Writing results to database" } + + manager.finish(id, results) + + logger.info { "Successfully finished scenario $id" } + } catch (e: Exception) { + logger.error(e) { "Scenario failed to finish" } + manager.fail(id) + } finally { + heartbeat.cancel() + } + } + } + } + + /** + * Convert the specified [topology] into an [EnvironmentReader] understood by Capelin. + */ + private fun convert(topology: Topology): EnvironmentReader { + val nodes = mutableListOf() + val random = Random(0) + + val machines = topology.rooms.asSequence() + .flatMap { room -> + room.tiles.flatMap { tile -> + tile.rack?.machines?.map { machine -> tile.rack to machine } ?: emptyList() + } + } + for ((rack, machine) in machines) { + val clusterId = rack.id + val position = machine.position + + val processors = machine.cpus.flatMap { cpu -> + val cores = cpu.numberOfCores + val speed = cpu.clockRateMhz + // TODO Remove hard coding of vendor + val node = ProcessingNode("Intel", "amd64", cpu.name, cores) + List(cores) { coreId -> + ProcessingUnit(node, coreId, speed) + } + } + val memoryUnits = machine.memory.map { memory -> + MemoryUnit( + "Samsung", + memory.name, + memory.speedMbPerS, + memory.sizeMb.toLong() + ) + } + + val energyConsumptionW = machine.cpus.sumOf { it.energyConsumptionW } + + nodes.add( + MachineDef( + UUID(random.nextLong(), random.nextLong()), + "node-$clusterId-$position", + mapOf("cluster" to clusterId), + MachineModel(processors, memoryUnits), + LinearPowerModel(2 * energyConsumptionW, energyConsumptionW * 0.5) + ) + ) + } + + return object : EnvironmentReader { + override fun read(): List = nodes + override fun close() {} + } + } +} + +/** + * Main entry point of the runner. + */ +fun main(args: Array): Unit = RunnerCli().main(args) diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt new file mode 100644 index 00000000..4044cec9 --- /dev/null +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.web.runner + +import org.opendc.web.client.ApiClient +import org.opendc.web.client.model.Job +import org.opendc.web.client.model.SimulationState + +/** + * Manages the queue of scenarios that need to be processed. + */ +public class ScenarioManager(private val client: ApiClient) { + /** + * Find the next job that the simulator needs to process. + */ + public suspend fun findNext(): Job? { + return client.getJobs().firstOrNull() + } + + /** + * Claim the simulation job with the specified id. + */ + public suspend fun claim(id: String): Boolean { + return client.updateJob(id, SimulationState.CLAIMED) + } + + /** + * Update the heartbeat of the specified scenario. + */ + public suspend fun heartbeat(id: String) { + client.updateJob(id, SimulationState.RUNNING) + } + + /** + * Mark the scenario as failed. + */ + public suspend fun fail(id: String) { + client.updateJob(id, SimulationState.FAILED) + } + + /** + * Persist the specified results. + */ + public suspend fun finish(id: String, results: List) { + client.updateJob( + id, SimulationState.FINISHED, + mapOf( + "total_requested_burst" to results.map { it.totalRequestedBurst }, + "total_granted_burst" to results.map { it.totalGrantedBurst }, + "total_overcommitted_burst" to results.map { it.totalOvercommittedBurst }, + "total_interfered_burst" to results.map { it.totalInterferedBurst }, + "mean_cpu_usage" to results.map { it.meanCpuUsage }, + "mean_cpu_demand" to results.map { it.meanCpuDemand }, + "mean_num_deployed_images" to results.map { it.meanNumDeployedImages }, + "max_num_deployed_images" to results.map { it.maxNumDeployedImages }, + "total_power_draw" to results.map { it.totalPowerDraw }, + "total_failure_slices" to results.map { it.totalFailureSlices }, + "total_failure_vm_slices" to results.map { it.totalFailureVmSlices }, + "total_vms_submitted" to results.map { it.totalVmsSubmitted }, + "total_vms_queued" to results.map { it.totalVmsQueued }, + "total_vms_finished" to results.map { it.totalVmsFinished }, + "total_vms_failed" to results.map { it.totalVmsFailed } + ) + ) + } +} diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt new file mode 100644 index 00000000..d4445810 --- /dev/null +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.web.runner + +import mu.KotlinLogging +import org.opendc.compute.api.Server +import org.opendc.compute.api.ServerState +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostState +import org.opendc.experiments.capelin.monitor.ExperimentMonitor +import org.opendc.experiments.capelin.telemetry.HostEvent +import kotlin.math.max + +/** + * An [ExperimentMonitor] that tracks the aggregate metrics for each repeat. + */ +public class WebExperimentMonitor : ExperimentMonitor { + private val logger = KotlinLogging.logger {} + + override fun reportVmStateChange(time: Long, server: Server, newState: ServerState) {} + + override fun reportHostStateChange(time: Long, host: Host, newState: HostState) { + logger.debug { "Host ${host.uid} changed state $newState [$time]" } + } + + override fun reportHostSlice( + time: Long, + requestedBurst: Long, + grantedBurst: Long, + overcommissionedBurst: Long, + interferedBurst: Long, + cpuUsage: Double, + cpuDemand: Double, + powerDraw: Double, + numberOfDeployedImages: Int, + host: Host, + ) { + processHostEvent( + HostEvent( + time, + 5 * 60 * 1000L, + host, + numberOfDeployedImages, + requestedBurst, + grantedBurst, + overcommissionedBurst, + interferedBurst, + cpuUsage, + cpuDemand, + powerDraw, + host.model.cpuCount + ) + ) + } + + private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() + private val hostMetrics: MutableMap = mutableMapOf() + + private fun processHostEvent(event: HostEvent) { + val slices = event.duration / SLICE_LENGTH + + hostAggregateMetrics = AggregateHostMetrics( + hostAggregateMetrics.totalRequestedBurst + event.requestedBurst, + hostAggregateMetrics.totalGrantedBurst + event.grantedBurst, + hostAggregateMetrics.totalOvercommittedBurst + event.overcommissionedBurst, + hostAggregateMetrics.totalInterferedBurst + event.interferedBurst, + hostAggregateMetrics.totalPowerDraw + (event.duration * event.powerDraw) / 3600, + hostAggregateMetrics.totalFailureSlices + if (event.host.state != HostState.UP) slices else 0, + hostAggregateMetrics.totalFailureVmSlices + if (event.host.state != HostState.UP) event.vmCount * slices else 0 + ) + + hostMetrics.compute(event.host) { _, prev -> + HostMetrics( + (event.cpuUsage.takeIf { event.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0), + (event.cpuDemand.takeIf { event.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0), + event.vmCount + (prev?.vmCount ?: 0), + 1 + (prev?.count ?: 0) + ) + } + } + + private val SLICE_LENGTH: Long = 5 * 60 * 1000 + + public data class AggregateHostMetrics( + val totalRequestedBurst: Long = 0, + val totalGrantedBurst: Long = 0, + val totalOvercommittedBurst: Long = 0, + val totalInterferedBurst: Long = 0, + val totalPowerDraw: Double = 0.0, + val totalFailureSlices: Long = 0, + val totalFailureVmSlices: Long = 0, + ) + + public data class HostMetrics( + val cpuUsage: Double, + val cpuDemand: Double, + val vmCount: Long, + val count: Long + ) + + private var provisionerMetrics: AggregateProvisionerMetrics = AggregateProvisionerMetrics() + + override fun reportProvisionerMetrics( + time: Long, + totalHostCount: Int, + availableHostCount: Int, + totalVmCount: Int, + activeVmCount: Int, + inactiveVmCount: Int, + waitingVmCount: Int, + failedVmCount: Int + ) { + provisionerMetrics = AggregateProvisionerMetrics( + max(totalVmCount, provisionerMetrics.vmTotalCount), + max(waitingVmCount, provisionerMetrics.vmWaitingCount), + max(activeVmCount, provisionerMetrics.vmActiveCount), + max(inactiveVmCount, provisionerMetrics.vmInactiveCount), + max(failedVmCount, provisionerMetrics.vmFailedCount), + ) + } + + public data class AggregateProvisionerMetrics( + val vmTotalCount: Int = 0, + val vmWaitingCount: Int = 0, + val vmActiveCount: Int = 0, + val vmInactiveCount: Int = 0, + val vmFailedCount: Int = 0 + ) + + override fun close() {} + + public fun getResult(): Result { + return Result( + hostAggregateMetrics.totalRequestedBurst, + hostAggregateMetrics.totalGrantedBurst, + hostAggregateMetrics.totalOvercommittedBurst, + hostAggregateMetrics.totalInterferedBurst, + hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), + hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), + hostMetrics.map { it.value.vmCount.toDouble() / it.value.count }.average(), + hostMetrics.map { it.value.vmCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, + hostAggregateMetrics.totalPowerDraw, + hostAggregateMetrics.totalFailureSlices, + hostAggregateMetrics.totalFailureVmSlices, + provisionerMetrics.vmTotalCount, + provisionerMetrics.vmWaitingCount, + provisionerMetrics.vmInactiveCount, + provisionerMetrics.vmFailedCount, + ) + } + + public data class Result( + public val totalRequestedBurst: Long, + public val totalGrantedBurst: Long, + public val totalOvercommittedBurst: Long, + public val totalInterferedBurst: Long, + public val meanCpuUsage: Double, + public val meanCpuDemand: Double, + public val meanNumDeployedImages: Double, + public val maxNumDeployedImages: Double, + public val totalPowerDraw: Double, + public val totalFailureSlices: Long, + public val totalFailureVmSlices: Long, + public val totalVmsSubmitted: Int, + public val totalVmsQueued: Int, + public val totalVmsFinished: Int, + public val totalVmsFailed: Int + ) +} -- cgit v1.2.3 From b8f64c1d3df2c990df8941cd036222fab2def9fa Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Sun, 22 Aug 2021 13:23:53 +0200 Subject: refactor(compute): Update FilterScheduler to follow OpenStack's Nova This change updates the FilterScheduler implementation to follow more closely the scheduler implementation in OpenStack's Nova. We now normalize the weights, support many of the filters and weights in OpenStack and support overcommitting resources. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 44 +--------------------- 1 file changed, 1 insertion(+), 43 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 5b5ef802..c5f5cd03 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -32,9 +32,6 @@ import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel import mu.KotlinLogging -import org.opendc.compute.service.scheduler.FilterScheduler -import org.opendc.compute.service.scheduler.filters.ComputeCapabilitiesFilter -import org.opendc.compute.service.scheduler.filters.ComputeFilter import org.opendc.compute.service.scheduler.weights.* import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.model.Workload @@ -199,46 +196,7 @@ class RunnerCli : CliktCommand(name = "runner") { val metricProducer = meterProvider as MetricProducer val operational = scenario.operationalPhenomena - val allocationPolicy = - when (val policyName = operational.schedulerName) { - "mem" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(MemoryWeigher() to -1.0) - ) - "mem-inv" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(MemoryWeigher() to 1.0) - ) - "core-mem" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(CoreMemoryWeigher() to -1.0) - ) - "core-mem-inv" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(CoreMemoryWeigher() to 1.0) - ) - "active-servers" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(ProvisionedCoresWeigher() to -1.0) - ) - "active-servers-inv" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(InstanceCountWeigher() to 1.0) - ) - "provisioned-cores" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(ProvisionedCoresWeigher() to -1.0) - ) - "provisioned-cores-inv" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(ProvisionedCoresWeigher() to 1.0) - ) - "random" -> FilterScheduler( - filters = listOf(ComputeFilter(), ComputeCapabilitiesFilter()), - weighers = listOf(RandomWeigher(java.util.Random(seeder.nextLong())) to 1.0) - ) - else -> throw IllegalArgumentException("Unknown policy $policyName") - } + val allocationPolicy = createComputeScheduler(operational.schedulerName, seeder) val trace = ParquetTraceReader( listOf(traceReader), -- cgit v1.2.3 From f111081627280d4e7e1d7147c56cdce708e32433 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 25 Aug 2021 14:06:39 +0200 Subject: build: Upgrade to OpenTelemetry 1.5 This change upgrades the OpenTelemetry dependency to version 1.5, which contains various breaking changes in the metrics API. --- .../src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt index d4445810..140f067a 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt @@ -45,10 +45,10 @@ public class WebExperimentMonitor : ExperimentMonitor { override fun reportHostSlice( time: Long, - requestedBurst: Long, - grantedBurst: Long, - overcommissionedBurst: Long, - interferedBurst: Long, + requestedBurst: Double, + grantedBurst: Double, + overcommissionedBurst: Double, + interferedBurst: Double, cpuUsage: Double, cpuDemand: Double, powerDraw: Double, -- cgit v1.2.3 From bb6066e1cecc55a50ac29da200bf3beba1ddd80b Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 25 Aug 2021 18:16:20 +0200 Subject: fix(capelin): Eliminate unnecessary double to long conversions This change eliminates the unnecessary conversions from double to long in the Capelin metric processing code. --- .../org/opendc/web/runner/WebExperimentMonitor.kt | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt index 140f067a..82e2a334 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt @@ -43,16 +43,16 @@ public class WebExperimentMonitor : ExperimentMonitor { logger.debug { "Host ${host.uid} changed state $newState [$time]" } } - override fun reportHostSlice( + override fun reportHostData( time: Long, - requestedBurst: Double, - grantedBurst: Double, - overcommissionedBurst: Double, - interferedBurst: Double, + totalWork: Double, + grantedWork: Double, + overcommittedWork: Double, + interferedWork: Double, cpuUsage: Double, cpuDemand: Double, powerDraw: Double, - numberOfDeployedImages: Int, + instanceCount: Int, host: Host, ) { processHostEvent( @@ -60,11 +60,11 @@ public class WebExperimentMonitor : ExperimentMonitor { time, 5 * 60 * 1000L, host, - numberOfDeployedImages, - requestedBurst, - grantedBurst, - overcommissionedBurst, - interferedBurst, + instanceCount, + totalWork.toLong(), + grantedWork.toLong(), + overcommittedWork.toLong(), + interferedWork.toLong(), cpuUsage, cpuDemand, powerDraw, @@ -120,7 +120,7 @@ public class WebExperimentMonitor : ExperimentMonitor { private var provisionerMetrics: AggregateProvisionerMetrics = AggregateProvisionerMetrics() - override fun reportProvisionerMetrics( + override fun reportServiceData( time: Long, totalHostCount: Int, availableHostCount: Int, -- cgit v1.2.3 From 9fcce6ade8714f7f0a9073fe5b7ddd3f0b35c375 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 31 Aug 2021 15:44:47 +0200 Subject: refactor(format): Remove environment reader from format library This change removes the environment reader from the format library since they are highly specific for the particular experiment. In the future, we hope to have a single format to setup the entire datacenter (perhaps similar to the format used by the web runner). --- .../opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index c5f5cd03..53d50357 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -34,12 +34,12 @@ import kotlinx.coroutines.channels.Channel import mu.KotlinLogging import org.opendc.compute.service.scheduler.weights.* import org.opendc.experiments.capelin.* +import org.opendc.experiments.capelin.env.EnvironmentReader +import org.opendc.experiments.capelin.env.MachineDef import org.opendc.experiments.capelin.model.Workload import org.opendc.experiments.capelin.trace.ParquetTraceReader import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader import org.opendc.experiments.capelin.trace.RawParquetTraceReader -import org.opendc.format.environment.EnvironmentReader -import org.opendc.format.environment.MachineDef import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel import org.opendc.simulator.compute.model.MemoryUnit -- cgit v1.2.3 From befec2f1ddf3a6e6d15d9d1b9fd1ecbbc4f38960 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Thu, 26 Aug 2021 10:34:18 +0200 Subject: feat(capelin): Report up/downtime metrics in experiment monitor --- .../src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt index 82e2a334..281c8dbb 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt @@ -53,6 +53,8 @@ public class WebExperimentMonitor : ExperimentMonitor { cpuDemand: Double, powerDraw: Double, instanceCount: Int, + uptime: Long, + downtime: Long, host: Host, ) { processHostEvent( -- cgit v1.2.3 From aaedd4f3eed83d0c3ebc829fec08a1749a2bfba4 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 27 Aug 2021 16:41:55 +0200 Subject: refactor(capelin): Move metric collection outside Capelin code This change moves the metric collection outside the Capelin codebase in a separate module so other modules can also benefit from the compute metric collection code. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 20 ++- .../org/opendc/web/runner/ScenarioManager.kt | 10 +- .../org/opendc/web/runner/WebComputeMonitor.kt | 145 ++++++++++++++++ .../org/opendc/web/runner/WebExperimentMonitor.kt | 191 --------------------- 4 files changed, 164 insertions(+), 202 deletions(-) create mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt delete mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 53d50357..65527141 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -47,6 +47,8 @@ import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.core.runBlockingSimulation +import org.opendc.telemetry.compute.collectServiceMetrics +import org.opendc.telemetry.compute.withMonitor import org.opendc.telemetry.sdk.toOtelClock import org.opendc.web.client.ApiClient import org.opendc.web.client.AuthConfiguration @@ -131,7 +133,7 @@ class RunnerCli : CliktCommand(name = "runner") { /** * Run a single scenario. */ - private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, environment: EnvironmentReader): List { + private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, environment: EnvironmentReader): List { val id = scenario.id logger.info { "Constructing performance interference model" } @@ -176,8 +178,8 @@ class RunnerCli : CliktCommand(name = "runner") { environment: EnvironmentReader, traceReader: RawParquetTraceReader, interferenceModel: VmInterferenceModel? - ): WebExperimentMonitor.Result { - val monitor = WebExperimentMonitor() + ): WebComputeMonitor.Result { + val monitor = WebComputeMonitor() try { runBlockingSimulation { @@ -220,7 +222,7 @@ class RunnerCli : CliktCommand(name = "runner") { null } - withMonitor(monitor, clock, meterProvider as MetricProducer, scheduler) { + withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { processTrace( clock, trace, @@ -233,8 +235,14 @@ class RunnerCli : CliktCommand(name = "runner") { failureDomain?.cancel() } - val monitorResults = collectMetrics(metricProducer) - logger.debug { "Finish SUBMIT=${monitorResults.submittedVms} FAIL=${monitorResults.unscheduledVms} QUEUE=${monitorResults.queuedVms} RUNNING=${monitorResults.runningVms}" } + val monitorResults = collectServiceMetrics(clock.millis(), metricProducer) + logger.debug { + "Finish " + + "SUBMIT=${monitorResults.instanceCount} " + + "FAIL=${monitorResults.failedInstanceCount} " + + "QUEUE=${monitorResults.queuedInstanceCount} " + + "RUNNING=${monitorResults.runningInstanceCount}" + } } } catch (cause: Throwable) { logger.warn(cause) { "Experiment failed" } diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt index 4044cec9..e0e3488f 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt @@ -61,14 +61,14 @@ public class ScenarioManager(private val client: ApiClient) { /** * Persist the specified results. */ - public suspend fun finish(id: String, results: List) { + public suspend fun finish(id: String, results: List) { client.updateJob( id, SimulationState.FINISHED, mapOf( - "total_requested_burst" to results.map { it.totalRequestedBurst }, - "total_granted_burst" to results.map { it.totalGrantedBurst }, - "total_overcommitted_burst" to results.map { it.totalOvercommittedBurst }, - "total_interfered_burst" to results.map { it.totalInterferedBurst }, + "total_requested_burst" to results.map { it.totalWork }, + "total_granted_burst" to results.map { it.totalGrantedWork }, + "total_overcommitted_burst" to results.map { it.totalOvercommittedWork }, + "total_interfered_burst" to results.map { it.totalInterferedWork }, "mean_cpu_usage" to results.map { it.meanCpuUsage }, "mean_cpu_demand" to results.map { it.meanCpuDemand }, "mean_num_deployed_images" to results.map { it.meanNumDeployedImages }, diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt new file mode 100644 index 00000000..c8e58dde --- /dev/null +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.web.runner + +import mu.KotlinLogging +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostState +import org.opendc.telemetry.compute.ComputeMonitor +import org.opendc.telemetry.compute.table.HostData +import org.opendc.telemetry.compute.table.ServiceData +import kotlin.math.max + +/** + * A [ComputeMonitor] that tracks the aggregate metrics for each repeat. + */ +public class WebComputeMonitor : ComputeMonitor { + private val logger = KotlinLogging.logger {} + + override fun onStateChange(time: Long, host: Host, newState: HostState) { + logger.debug { "Host ${host.uid} changed state $newState [$time]" } + } + + override fun record(data: HostData) { + val duration = 5 * 60 * 1000L + val slices = duration / SLICE_LENGTH + + hostAggregateMetrics = AggregateHostMetrics( + hostAggregateMetrics.totalWork + data.totalWork, + hostAggregateMetrics.totalGrantedWork + data.grantedWork, + hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork, + hostAggregateMetrics.totalInterferedWork + data.overcommittedWork, + hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600, + hostAggregateMetrics.totalFailureSlices + if (data.host.state != HostState.UP) slices else 0, + hostAggregateMetrics.totalFailureVmSlices + if (data.host.state != HostState.UP) data.instanceCount * slices else 0 + ) + + hostMetrics.compute(data.host) { _, prev -> + HostMetrics( + (data.cpuUsage.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0), + (data.cpuDemand.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0), + data.instanceCount + (prev?.instanceCount ?: 0), + 1 + (prev?.count ?: 0) + ) + } + } + + private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() + private val hostMetrics: MutableMap = mutableMapOf() + private val SLICE_LENGTH: Long = 5 * 60 * 1000 + + data class AggregateHostMetrics( + val totalWork: Double = 0.0, + val totalGrantedWork: Double = 0.0, + val totalOvercommittedWork: Double = 0.0, + val totalInterferedWork: Double = 0.0, + val totalPowerDraw: Double = 0.0, + val totalFailureSlices: Long = 0, + val totalFailureVmSlices: Long = 0, + ) + + data class HostMetrics( + val cpuUsage: Double, + val cpuDemand: Double, + val instanceCount: Long, + val count: Long + ) + + private var serviceMetrics: AggregateServiceMetrics = AggregateServiceMetrics() + + override fun record(data: ServiceData) { + serviceMetrics = AggregateServiceMetrics( + max(data.instanceCount, serviceMetrics.vmTotalCount), + max(data.queuedInstanceCount, serviceMetrics.vmWaitingCount), + max(data.runningInstanceCount, serviceMetrics.vmActiveCount), + max(data.finishedInstanceCount, serviceMetrics.vmInactiveCount), + max(data.failedInstanceCount, serviceMetrics.vmFailedCount), + ) + } + + public data class AggregateServiceMetrics( + val vmTotalCount: Int = 0, + val vmWaitingCount: Int = 0, + val vmActiveCount: Int = 0, + val vmInactiveCount: Int = 0, + val vmFailedCount: Int = 0 + ) + + public fun getResult(): Result { + return Result( + hostAggregateMetrics.totalWork, + hostAggregateMetrics.totalGrantedWork, + hostAggregateMetrics.totalOvercommittedWork, + hostAggregateMetrics.totalInterferedWork, + hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), + hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), + hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), + hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, + hostAggregateMetrics.totalPowerDraw, + hostAggregateMetrics.totalFailureSlices, + hostAggregateMetrics.totalFailureVmSlices, + serviceMetrics.vmTotalCount, + serviceMetrics.vmWaitingCount, + serviceMetrics.vmInactiveCount, + serviceMetrics.vmFailedCount, + ) + } + + data class Result( + val totalWork: Double, + val totalGrantedWork: Double, + val totalOvercommittedWork: Double, + val totalInterferedWork: Double, + val meanCpuUsage: Double, + val meanCpuDemand: Double, + val meanNumDeployedImages: Double, + val maxNumDeployedImages: Double, + val totalPowerDraw: Double, + val totalFailureSlices: Long, + val totalFailureVmSlices: Long, + val totalVmsSubmitted: Int, + val totalVmsQueued: Int, + val totalVmsFinished: Int, + val totalVmsFailed: Int + ) +} diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt deleted file mode 100644 index 281c8dbb..00000000 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebExperimentMonitor.kt +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2021 AtLarge Research - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -package org.opendc.web.runner - -import mu.KotlinLogging -import org.opendc.compute.api.Server -import org.opendc.compute.api.ServerState -import org.opendc.compute.service.driver.Host -import org.opendc.compute.service.driver.HostState -import org.opendc.experiments.capelin.monitor.ExperimentMonitor -import org.opendc.experiments.capelin.telemetry.HostEvent -import kotlin.math.max - -/** - * An [ExperimentMonitor] that tracks the aggregate metrics for each repeat. - */ -public class WebExperimentMonitor : ExperimentMonitor { - private val logger = KotlinLogging.logger {} - - override fun reportVmStateChange(time: Long, server: Server, newState: ServerState) {} - - override fun reportHostStateChange(time: Long, host: Host, newState: HostState) { - logger.debug { "Host ${host.uid} changed state $newState [$time]" } - } - - override fun reportHostData( - time: Long, - totalWork: Double, - grantedWork: Double, - overcommittedWork: Double, - interferedWork: Double, - cpuUsage: Double, - cpuDemand: Double, - powerDraw: Double, - instanceCount: Int, - uptime: Long, - downtime: Long, - host: Host, - ) { - processHostEvent( - HostEvent( - time, - 5 * 60 * 1000L, - host, - instanceCount, - totalWork.toLong(), - grantedWork.toLong(), - overcommittedWork.toLong(), - interferedWork.toLong(), - cpuUsage, - cpuDemand, - powerDraw, - host.model.cpuCount - ) - ) - } - - private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() - private val hostMetrics: MutableMap = mutableMapOf() - - private fun processHostEvent(event: HostEvent) { - val slices = event.duration / SLICE_LENGTH - - hostAggregateMetrics = AggregateHostMetrics( - hostAggregateMetrics.totalRequestedBurst + event.requestedBurst, - hostAggregateMetrics.totalGrantedBurst + event.grantedBurst, - hostAggregateMetrics.totalOvercommittedBurst + event.overcommissionedBurst, - hostAggregateMetrics.totalInterferedBurst + event.interferedBurst, - hostAggregateMetrics.totalPowerDraw + (event.duration * event.powerDraw) / 3600, - hostAggregateMetrics.totalFailureSlices + if (event.host.state != HostState.UP) slices else 0, - hostAggregateMetrics.totalFailureVmSlices + if (event.host.state != HostState.UP) event.vmCount * slices else 0 - ) - - hostMetrics.compute(event.host) { _, prev -> - HostMetrics( - (event.cpuUsage.takeIf { event.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0), - (event.cpuDemand.takeIf { event.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0), - event.vmCount + (prev?.vmCount ?: 0), - 1 + (prev?.count ?: 0) - ) - } - } - - private val SLICE_LENGTH: Long = 5 * 60 * 1000 - - public data class AggregateHostMetrics( - val totalRequestedBurst: Long = 0, - val totalGrantedBurst: Long = 0, - val totalOvercommittedBurst: Long = 0, - val totalInterferedBurst: Long = 0, - val totalPowerDraw: Double = 0.0, - val totalFailureSlices: Long = 0, - val totalFailureVmSlices: Long = 0, - ) - - public data class HostMetrics( - val cpuUsage: Double, - val cpuDemand: Double, - val vmCount: Long, - val count: Long - ) - - private var provisionerMetrics: AggregateProvisionerMetrics = AggregateProvisionerMetrics() - - override fun reportServiceData( - time: Long, - totalHostCount: Int, - availableHostCount: Int, - totalVmCount: Int, - activeVmCount: Int, - inactiveVmCount: Int, - waitingVmCount: Int, - failedVmCount: Int - ) { - provisionerMetrics = AggregateProvisionerMetrics( - max(totalVmCount, provisionerMetrics.vmTotalCount), - max(waitingVmCount, provisionerMetrics.vmWaitingCount), - max(activeVmCount, provisionerMetrics.vmActiveCount), - max(inactiveVmCount, provisionerMetrics.vmInactiveCount), - max(failedVmCount, provisionerMetrics.vmFailedCount), - ) - } - - public data class AggregateProvisionerMetrics( - val vmTotalCount: Int = 0, - val vmWaitingCount: Int = 0, - val vmActiveCount: Int = 0, - val vmInactiveCount: Int = 0, - val vmFailedCount: Int = 0 - ) - - override fun close() {} - - public fun getResult(): Result { - return Result( - hostAggregateMetrics.totalRequestedBurst, - hostAggregateMetrics.totalGrantedBurst, - hostAggregateMetrics.totalOvercommittedBurst, - hostAggregateMetrics.totalInterferedBurst, - hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), - hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), - hostMetrics.map { it.value.vmCount.toDouble() / it.value.count }.average(), - hostMetrics.map { it.value.vmCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, - hostAggregateMetrics.totalPowerDraw, - hostAggregateMetrics.totalFailureSlices, - hostAggregateMetrics.totalFailureVmSlices, - provisionerMetrics.vmTotalCount, - provisionerMetrics.vmWaitingCount, - provisionerMetrics.vmInactiveCount, - provisionerMetrics.vmFailedCount, - ) - } - - public data class Result( - public val totalRequestedBurst: Long, - public val totalGrantedBurst: Long, - public val totalOvercommittedBurst: Long, - public val totalInterferedBurst: Long, - public val meanCpuUsage: Double, - public val meanCpuDemand: Double, - public val meanNumDeployedImages: Double, - public val maxNumDeployedImages: Double, - public val totalPowerDraw: Double, - public val totalFailureSlices: Long, - public val totalFailureVmSlices: Long, - public val totalVmsSubmitted: Int, - public val totalVmsQueued: Int, - public val totalVmsFinished: Int, - public val totalVmsFailed: Int - ) -} -- cgit v1.2.3 From 18ff316a6b6ab984ebf8283ea48ed98ec69d8295 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Thu, 2 Sep 2021 13:20:05 +0200 Subject: refactor(capelin): Restructure input reading classes --- .../opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 65527141..5d481270 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -32,7 +32,6 @@ import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* import kotlinx.coroutines.channels.Channel import mu.KotlinLogging -import org.opendc.compute.service.scheduler.weights.* import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.env.MachineDef @@ -152,7 +151,7 @@ class RunnerCli : CliktCommand(name = "runner") { return@let null } - PerformanceInterferenceReader(path.inputStream()).use { reader -> reader.read() } + PerformanceInterferenceReader().read(path.inputStream()) } val targets = portfolio.targets -- cgit v1.2.3 From d24cc0cc9c4fe2145f0337d65e9a75f631365973 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 10 Sep 2021 10:59:44 +0200 Subject: refactor(compute): Integrate fault injection into compute simulator This change moves the fault injection logic directly into the opendc-compute-simulator module, so that it can operate at a higher abstraction. In the future, we might again split the module if we can re-use some of its logic. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 5d481270..b565e90d 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -30,8 +30,8 @@ import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.SdkMeterProvider import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel import mu.KotlinLogging +import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.env.MachineDef @@ -188,8 +188,6 @@ class RunnerCli : CliktCommand(name = "runner") { val seeder = Random(seed) - val chan = Channel(Channel.CONFLATED) - val meterProvider: MeterProvider = SdkMeterProvider .builder() .setClock(clock.toOtelClock()) @@ -207,31 +205,31 @@ class RunnerCli : CliktCommand(name = "runner") { val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0 withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler -> - val failureDomain = if (failureFrequency > 0) { + val faultInjector = if (failureFrequency > 0) { logger.debug { "ENABLING failures" } - createFailureDomain( - this, + createFaultInjector( + coroutineContext, clock, + scheduler.hosts.map { it as SimHost }.toSet(), seeder.nextInt(), failureFrequency, - scheduler, - chan ) } else { null } withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { + faultInjector?.start() + processTrace( clock, trace, scheduler, - chan, monitor ) - } - failureDomain?.cancel() + faultInjector?.close() + } } val monitorResults = collectServiceMetrics(clock.millis(), metricProducer) -- cgit v1.2.3 From 3ca64e0110adab65526a0ccfd5b252e9f047ab10 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 14 Sep 2021 14:41:05 +0200 Subject: refactor(telemetry): Create separate MeterProvider per service/host This change refactors the telemetry implementation by creating a separate MeterProvider per service or host. This means we have to keep track of multiple metric producers, but that we can attach resource information to each of the MeterProviders like we would in a real world scenario. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 72 ++++++++++------------ .../org/opendc/web/runner/WebComputeMonitor.kt | 40 +++++------- 2 files changed, 48 insertions(+), 64 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index b565e90d..b9d5a3f5 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -28,10 +28,8 @@ import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.long import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.SdkMeterProvider -import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* import mu.KotlinLogging -import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.env.MachineDef @@ -39,6 +37,8 @@ import org.opendc.experiments.capelin.model.Workload import org.opendc.experiments.capelin.trace.ParquetTraceReader import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader import org.opendc.experiments.capelin.trace.RawParquetTraceReader +import org.opendc.experiments.capelin.util.ComputeServiceSimulator +import org.opendc.experiments.capelin.util.createComputeScheduler import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel import org.opendc.simulator.compute.model.MemoryUnit @@ -46,8 +46,9 @@ import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.core.runBlockingSimulation +import org.opendc.telemetry.compute.ComputeMetricExporter import org.opendc.telemetry.compute.collectServiceMetrics -import org.opendc.telemetry.compute.withMonitor +import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import org.opendc.telemetry.sdk.toOtelClock import org.opendc.web.client.ApiClient import org.opendc.web.client.AuthConfiguration @@ -55,9 +56,8 @@ import org.opendc.web.client.model.Scenario import org.opendc.web.client.model.Topology import java.io.File import java.net.URI +import java.time.Duration import java.util.* -import kotlin.random.Random -import kotlin.random.asJavaRandom import org.opendc.web.client.model.Portfolio as ClientPortfolio private val logger = KotlinLogging.logger {} @@ -158,7 +158,7 @@ class RunnerCli : CliktCommand(name = "runner") { val results = (0 until targets.repeatsPerScenario).map { repeat -> logger.info { "Starting repeat $repeat" } withTimeout(runTimeout * 1000) { - val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong()).asJavaRandom()) } + val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong())) } runRepeat(scenario, repeat, environment, traceReader, interferenceModel) } } @@ -182,63 +182,55 @@ class RunnerCli : CliktCommand(name = "runner") { try { runBlockingSimulation { - val seed = repeat val workloadName = scenario.trace.traceId val workloadFraction = scenario.trace.loadSamplingFraction - val seeder = Random(seed) + val seeder = Random(repeat.toLong()) val meterProvider: MeterProvider = SdkMeterProvider .builder() .setClock(clock.toOtelClock()) .build() - val metricProducer = meterProvider as MetricProducer val operational = scenario.operationalPhenomena - val allocationPolicy = createComputeScheduler(operational.schedulerName, seeder) + val computeScheduler = createComputeScheduler(operational.schedulerName, seeder) val trace = ParquetTraceReader( listOf(traceReader), Workload(workloadName, workloadFraction), - seed + repeat ) - val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0 - - withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler -> - val faultInjector = if (failureFrequency > 0) { - logger.debug { "ENABLING failures" } - createFaultInjector( - coroutineContext, - clock, - scheduler.hosts.map { it as SimHost }.toSet(), - seeder.nextInt(), - failureFrequency, - ) - } else { + val failureModel = + if (operational.failuresEnabled) + grid5000(Duration.ofDays(7), repeat) + else null - } - withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { - faultInjector?.start() + val simulator = ComputeServiceSimulator( + coroutineContext, + clock, + computeScheduler, + environment.read(), + failureModel, + interferenceModel.takeIf { operational.performanceInterferenceEnabled } + ) - processTrace( - clock, - trace, - scheduler, - monitor - ) + val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor)) - faultInjector?.close() - } + try { + simulator.run(trace) + } finally { + simulator.close() + metricReader.close() } - val monitorResults = collectServiceMetrics(clock.millis(), metricProducer) + val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) logger.debug { "Finish " + - "SUBMIT=${monitorResults.instanceCount} " + - "FAIL=${monitorResults.failedInstanceCount} " + - "QUEUE=${monitorResults.queuedInstanceCount} " + - "RUNNING=${monitorResults.runningInstanceCount}" + "SUBMIT=${serviceMetrics.instanceCount} " + + "FAIL=${serviceMetrics.failedInstanceCount} " + + "QUEUE=${serviceMetrics.queuedInstanceCount} " + + "RUNNING=${serviceMetrics.runningInstanceCount}" } } } catch (cause: Throwable) { diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt index c8e58dde..4b813310 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt @@ -22,27 +22,19 @@ package org.opendc.web.runner -import mu.KotlinLogging -import org.opendc.compute.service.driver.Host -import org.opendc.compute.service.driver.HostState import org.opendc.telemetry.compute.ComputeMonitor import org.opendc.telemetry.compute.table.HostData import org.opendc.telemetry.compute.table.ServiceData import kotlin.math.max +import kotlin.math.roundToLong /** * A [ComputeMonitor] that tracks the aggregate metrics for each repeat. */ -public class WebComputeMonitor : ComputeMonitor { - private val logger = KotlinLogging.logger {} - - override fun onStateChange(time: Long, host: Host, newState: HostState) { - logger.debug { "Host ${host.uid} changed state $newState [$time]" } - } - +class WebComputeMonitor : ComputeMonitor { override fun record(data: HostData) { - val duration = 5 * 60 * 1000L - val slices = duration / SLICE_LENGTH + val duration = data.uptime + val slices = data.downtime / SLICE_LENGTH hostAggregateMetrics = AggregateHostMetrics( hostAggregateMetrics.totalWork + data.totalWork, @@ -50,14 +42,14 @@ public class WebComputeMonitor : ComputeMonitor { hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork, hostAggregateMetrics.totalInterferedWork + data.overcommittedWork, hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600, - hostAggregateMetrics.totalFailureSlices + if (data.host.state != HostState.UP) slices else 0, - hostAggregateMetrics.totalFailureVmSlices + if (data.host.state != HostState.UP) data.instanceCount * slices else 0 + hostAggregateMetrics.totalFailureSlices + slices, + hostAggregateMetrics.totalFailureVmSlices + data.instanceCount * slices ) - hostMetrics.compute(data.host) { _, prev -> + hostMetrics.compute(data.host.id) { _, prev -> HostMetrics( - (data.cpuUsage.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuUsage ?: 0.0), - (data.cpuDemand.takeIf { data.host.state == HostState.UP } ?: 0.0) + (prev?.cpuDemand ?: 0.0), + data.cpuUsage + (prev?.cpuUsage ?: 0.0), + data.cpuDemand + (prev?.cpuDemand ?: 0.0), data.instanceCount + (prev?.instanceCount ?: 0), 1 + (prev?.count ?: 0) ) @@ -65,7 +57,7 @@ public class WebComputeMonitor : ComputeMonitor { } private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() - private val hostMetrics: MutableMap = mutableMapOf() + private val hostMetrics: MutableMap = mutableMapOf() private val SLICE_LENGTH: Long = 5 * 60 * 1000 data class AggregateHostMetrics( @@ -74,8 +66,8 @@ public class WebComputeMonitor : ComputeMonitor { val totalOvercommittedWork: Double = 0.0, val totalInterferedWork: Double = 0.0, val totalPowerDraw: Double = 0.0, - val totalFailureSlices: Long = 0, - val totalFailureVmSlices: Long = 0, + val totalFailureSlices: Double = 0.0, + val totalFailureVmSlices: Double = 0.0, ) data class HostMetrics( @@ -97,7 +89,7 @@ public class WebComputeMonitor : ComputeMonitor { ) } - public data class AggregateServiceMetrics( + data class AggregateServiceMetrics( val vmTotalCount: Int = 0, val vmWaitingCount: Int = 0, val vmActiveCount: Int = 0, @@ -105,7 +97,7 @@ public class WebComputeMonitor : ComputeMonitor { val vmFailedCount: Int = 0 ) - public fun getResult(): Result { + fun getResult(): Result { return Result( hostAggregateMetrics.totalWork, hostAggregateMetrics.totalGrantedWork, @@ -116,8 +108,8 @@ public class WebComputeMonitor : ComputeMonitor { hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, hostAggregateMetrics.totalPowerDraw, - hostAggregateMetrics.totalFailureSlices, - hostAggregateMetrics.totalFailureVmSlices, + hostAggregateMetrics.totalFailureSlices.roundToLong(), + hostAggregateMetrics.totalFailureVmSlices.roundToLong(), serviceMetrics.vmTotalCount, serviceMetrics.vmWaitingCount, serviceMetrics.vmInactiveCount, -- cgit v1.2.3 From 8d899e29dbd757f6df320212d6e0d77ce8216ab9 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 14 Sep 2021 15:38:38 +0200 Subject: refactor(telemetry): Standardize compute scheduler metrics This change updates the OpenDC compute service implementation with multiple meters that follow the OpenTelemetry conventions. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 11 ++++++----- .../main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt | 10 +++++----- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index b9d5a3f5..960d5ebd 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -226,11 +226,12 @@ class RunnerCli : CliktCommand(name = "runner") { val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) logger.debug { - "Finish " + - "SUBMIT=${serviceMetrics.instanceCount} " + - "FAIL=${serviceMetrics.failedInstanceCount} " + - "QUEUE=${serviceMetrics.queuedInstanceCount} " + - "RUNNING=${serviceMetrics.runningInstanceCount}" + "Scheduler " + + "Success=${serviceMetrics.attemptsSuccess} " + + "Failure=${serviceMetrics.attemptsFailure} " + + "Error=${serviceMetrics.attemptsError} " + + "Pending=${serviceMetrics.serversPending} " + + "Active=${serviceMetrics.serversActive}" } } } catch (cause: Throwable) { diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt index 4b813310..5f2c474b 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt @@ -81,11 +81,11 @@ class WebComputeMonitor : ComputeMonitor { override fun record(data: ServiceData) { serviceMetrics = AggregateServiceMetrics( - max(data.instanceCount, serviceMetrics.vmTotalCount), - max(data.queuedInstanceCount, serviceMetrics.vmWaitingCount), - max(data.runningInstanceCount, serviceMetrics.vmActiveCount), - max(data.finishedInstanceCount, serviceMetrics.vmInactiveCount), - max(data.failedInstanceCount, serviceMetrics.vmFailedCount), + max(data.attemptsSuccess, serviceMetrics.vmTotalCount), + max(data.serversPending, serviceMetrics.vmWaitingCount), + max(data.serversActive, serviceMetrics.vmActiveCount), + max(0, serviceMetrics.vmInactiveCount), + max(data.attemptsFailure, serviceMetrics.vmFailedCount), ) } -- cgit v1.2.3 From 0d8bccc68705d036fbf60f312d9c34ca4392c6b2 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 7 Sep 2021 17:30:46 +0200 Subject: refactor(telemetry): Standardize SimHost metrics This change standardizes the metrics emitted by SimHost instances and their guests based on the OpenTelemetry semantic conventions. We now also report CPU time as opposed to CPU work as this metric is more commonly used. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 12 ++----- .../org/opendc/web/runner/ScenarioManager.kt | 8 ++--- .../org/opendc/web/runner/WebComputeMonitor.kt | 41 +++++++++++----------- 3 files changed, 26 insertions(+), 35 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 960d5ebd..483558e1 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -26,8 +26,6 @@ import com.github.ajalt.clikt.core.CliktCommand import com.github.ajalt.clikt.parameters.options.* import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.long -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider import kotlinx.coroutines.* import mu.KotlinLogging import org.opendc.experiments.capelin.* @@ -49,7 +47,6 @@ import org.opendc.simulator.core.runBlockingSimulation import org.opendc.telemetry.compute.ComputeMetricExporter import org.opendc.telemetry.compute.collectServiceMetrics import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader -import org.opendc.telemetry.sdk.toOtelClock import org.opendc.web.client.ApiClient import org.opendc.web.client.AuthConfiguration import org.opendc.web.client.model.Scenario @@ -187,11 +184,6 @@ class RunnerCli : CliktCommand(name = "runner") { val seeder = Random(repeat.toLong()) - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val operational = scenario.operationalPhenomena val computeScheduler = createComputeScheduler(operational.schedulerName, seeder) @@ -215,7 +207,7 @@ class RunnerCli : CliktCommand(name = "runner") { interferenceModel.takeIf { operational.performanceInterferenceEnabled } ) - val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor)) + val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1)) try { simulator.run(trace) @@ -224,7 +216,7 @@ class RunnerCli : CliktCommand(name = "runner") { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) logger.debug { "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt index e0e3488f..a0c281e8 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt @@ -65,10 +65,10 @@ public class ScenarioManager(private val client: ApiClient) { client.updateJob( id, SimulationState.FINISHED, mapOf( - "total_requested_burst" to results.map { it.totalWork }, - "total_granted_burst" to results.map { it.totalGrantedWork }, - "total_overcommitted_burst" to results.map { it.totalOvercommittedWork }, - "total_interfered_burst" to results.map { it.totalInterferedWork }, + "total_requested_burst" to results.map { it.totalActiveTime + it.totalIdleTime }, + "total_granted_burst" to results.map { it.totalActiveTime }, + "total_overcommitted_burst" to results.map { it.totalStealTime }, + "total_interfered_burst" to results.map { it.totalLostTime }, "mean_cpu_usage" to results.map { it.meanCpuUsage }, "mean_cpu_demand" to results.map { it.meanCpuDemand }, "mean_num_deployed_images" to results.map { it.meanNumDeployedImages }, diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt index 5f2c474b..bb412738 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt @@ -33,24 +33,23 @@ import kotlin.math.roundToLong */ class WebComputeMonitor : ComputeMonitor { override fun record(data: HostData) { - val duration = data.uptime val slices = data.downtime / SLICE_LENGTH hostAggregateMetrics = AggregateHostMetrics( - hostAggregateMetrics.totalWork + data.totalWork, - hostAggregateMetrics.totalGrantedWork + data.grantedWork, - hostAggregateMetrics.totalOvercommittedWork + data.overcommittedWork, - hostAggregateMetrics.totalInterferedWork + data.overcommittedWork, - hostAggregateMetrics.totalPowerDraw + (duration * data.powerDraw) / 3600, + hostAggregateMetrics.totalActiveTime + data.cpuActiveTime, + hostAggregateMetrics.totalIdleTime + data.cpuIdleTime, + hostAggregateMetrics.totalStealTime + data.cpuStealTime, + hostAggregateMetrics.totalLostTime + data.cpuLostTime, + hostAggregateMetrics.totalPowerDraw + data.powerTotal, hostAggregateMetrics.totalFailureSlices + slices, - hostAggregateMetrics.totalFailureVmSlices + data.instanceCount * slices + hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices ) hostMetrics.compute(data.host.id) { _, prev -> HostMetrics( data.cpuUsage + (prev?.cpuUsage ?: 0.0), data.cpuDemand + (prev?.cpuDemand ?: 0.0), - data.instanceCount + (prev?.instanceCount ?: 0), + data.guestsRunning + (prev?.instanceCount ?: 0), 1 + (prev?.count ?: 0) ) } @@ -58,13 +57,13 @@ class WebComputeMonitor : ComputeMonitor { private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() private val hostMetrics: MutableMap = mutableMapOf() - private val SLICE_LENGTH: Long = 5 * 60 * 1000 + private val SLICE_LENGTH: Long = 5 * 60 data class AggregateHostMetrics( - val totalWork: Double = 0.0, - val totalGrantedWork: Double = 0.0, - val totalOvercommittedWork: Double = 0.0, - val totalInterferedWork: Double = 0.0, + val totalActiveTime: Long = 0L, + val totalIdleTime: Long = 0L, + val totalStealTime: Long = 0L, + val totalLostTime: Long = 0L, val totalPowerDraw: Double = 0.0, val totalFailureSlices: Double = 0.0, val totalFailureVmSlices: Double = 0.0, @@ -99,10 +98,10 @@ class WebComputeMonitor : ComputeMonitor { fun getResult(): Result { return Result( - hostAggregateMetrics.totalWork, - hostAggregateMetrics.totalGrantedWork, - hostAggregateMetrics.totalOvercommittedWork, - hostAggregateMetrics.totalInterferedWork, + hostAggregateMetrics.totalActiveTime, + hostAggregateMetrics.totalIdleTime, + hostAggregateMetrics.totalStealTime, + hostAggregateMetrics.totalLostTime, hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), @@ -118,10 +117,10 @@ class WebComputeMonitor : ComputeMonitor { } data class Result( - val totalWork: Double, - val totalGrantedWork: Double, - val totalOvercommittedWork: Double, - val totalInterferedWork: Double, + val totalActiveTime: Long, + val totalIdleTime: Long, + val totalStealTime: Long, + val totalLostTime: Long, val meanCpuUsage: Double, val meanCpuDemand: Double, val meanNumDeployedImages: Double, -- cgit v1.2.3 From 859ce303f0b9110c7110b918e5957c2156fa8b26 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 17 Sep 2021 17:48:02 +0200 Subject: refactor(capelin): Extract common code out of Capelin experiments This change creates a new module for doing simulations with virtual machine workloads. We have found that a lot of code in the Capelin experiments code is being re-used by non-experiment modules. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 483558e1..497a7281 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -28,14 +28,14 @@ import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.long import kotlinx.coroutines.* import mu.KotlinLogging -import org.opendc.experiments.capelin.* +import org.opendc.compute.workload.ComputeWorkloadRunner +import org.opendc.compute.workload.env.MachineDef +import org.opendc.compute.workload.grid5000 +import org.opendc.compute.workload.trace.RawParquetTraceReader +import org.opendc.compute.workload.util.PerformanceInterferenceReader import org.opendc.experiments.capelin.env.EnvironmentReader -import org.opendc.experiments.capelin.env.MachineDef import org.opendc.experiments.capelin.model.Workload import org.opendc.experiments.capelin.trace.ParquetTraceReader -import org.opendc.experiments.capelin.trace.PerformanceInterferenceReader -import org.opendc.experiments.capelin.trace.RawParquetTraceReader -import org.opendc.experiments.capelin.util.ComputeServiceSimulator import org.opendc.experiments.capelin.util.createComputeScheduler import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel @@ -198,7 +198,7 @@ class RunnerCli : CliktCommand(name = "runner") { else null - val simulator = ComputeServiceSimulator( + val simulator = ComputeWorkloadRunner( coroutineContext, clock, computeScheduler, -- cgit v1.2.3 From b0ece0533825f5cd7983752330847071f4e438c4 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 15 Sep 2021 23:06:08 +0200 Subject: refactor(capelin): Support flexible topology creation This change adds support for creating flexible topologies by creating a TopologyFactory interface that is responsible for configuring the hosts of a compute service. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 116 +++++++++++---------- 1 file changed, 63 insertions(+), 53 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 497a7281..48183d71 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -29,11 +29,12 @@ import com.github.ajalt.clikt.parameters.types.long import kotlinx.coroutines.* import mu.KotlinLogging import org.opendc.compute.workload.ComputeWorkloadRunner -import org.opendc.compute.workload.env.MachineDef import org.opendc.compute.workload.grid5000 +import org.opendc.compute.workload.topology.HostSpec +import org.opendc.compute.workload.topology.Topology +import org.opendc.compute.workload.topology.apply import org.opendc.compute.workload.trace.RawParquetTraceReader import org.opendc.compute.workload.util.PerformanceInterferenceReader -import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.model.Workload import org.opendc.experiments.capelin.trace.ParquetTraceReader import org.opendc.experiments.capelin.util.createComputeScheduler @@ -43,6 +44,7 @@ import org.opendc.simulator.compute.model.MemoryUnit import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.compute.power.LinearPowerModel +import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.core.runBlockingSimulation import org.opendc.telemetry.compute.ComputeMetricExporter import org.opendc.telemetry.compute.collectServiceMetrics @@ -50,12 +52,12 @@ import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import org.opendc.web.client.ApiClient import org.opendc.web.client.AuthConfiguration import org.opendc.web.client.model.Scenario -import org.opendc.web.client.model.Topology import java.io.File import java.net.URI import java.time.Duration import java.util.* import org.opendc.web.client.model.Portfolio as ClientPortfolio +import org.opendc.web.client.model.Topology as ClientTopology private val logger = KotlinLogging.logger {} @@ -129,7 +131,7 @@ class RunnerCli : CliktCommand(name = "runner") { /** * Run a single scenario. */ - private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, environment: EnvironmentReader): List { + private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, topology: Topology): List { val id = scenario.id logger.info { "Constructing performance interference model" } @@ -156,7 +158,7 @@ class RunnerCli : CliktCommand(name = "runner") { logger.info { "Starting repeat $repeat" } withTimeout(runTimeout * 1000) { val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong())) } - runRepeat(scenario, repeat, environment, traceReader, interferenceModel) + runRepeat(scenario, repeat, topology, traceReader, interferenceModel) } } @@ -171,7 +173,7 @@ class RunnerCli : CliktCommand(name = "runner") { private suspend fun runRepeat( scenario: Scenario, repeat: Int, - environment: EnvironmentReader, + topology: Topology, traceReader: RawParquetTraceReader, interferenceModel: VmInterferenceModel? ): WebComputeMonitor.Result { @@ -202,7 +204,6 @@ class RunnerCli : CliktCommand(name = "runner") { coroutineContext, clock, computeScheduler, - environment.read(), failureModel, interferenceModel.takeIf { operational.performanceInterferenceEnabled } ) @@ -210,6 +211,9 @@ class RunnerCli : CliktCommand(name = "runner") { val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1)) try { + // Instantiate the topology onto the simulator + simulator.apply(topology) + // Run workload trace simulator.run(trace) } finally { simulator.close() @@ -292,56 +296,62 @@ class RunnerCli : CliktCommand(name = "runner") { } /** - * Convert the specified [topology] into an [EnvironmentReader] understood by Capelin. + * Convert the specified [topology] into an [Topology] understood by OpenDC. */ - private fun convert(topology: Topology): EnvironmentReader { - val nodes = mutableListOf() - val random = Random(0) - - val machines = topology.rooms.asSequence() - .flatMap { room -> - room.tiles.flatMap { tile -> - tile.rack?.machines?.map { machine -> tile.rack to machine } ?: emptyList() - } - } - for ((rack, machine) in machines) { - val clusterId = rack.id - val position = machine.position - - val processors = machine.cpus.flatMap { cpu -> - val cores = cpu.numberOfCores - val speed = cpu.clockRateMhz - // TODO Remove hard coding of vendor - val node = ProcessingNode("Intel", "amd64", cpu.name, cores) - List(cores) { coreId -> - ProcessingUnit(node, coreId, speed) - } - } - val memoryUnits = machine.memory.map { memory -> - MemoryUnit( - "Samsung", - memory.name, - memory.speedMbPerS, - memory.sizeMb.toLong() - ) - } + private fun convert(topology: ClientTopology): Topology { + return object : Topology { + + override fun resolve(): List { + val res = mutableListOf() + val random = Random(0) + + val machines = topology.rooms.asSequence() + .flatMap { room -> + room.tiles.flatMap { tile -> + tile.rack?.machines?.map { machine -> tile.rack to machine } ?: emptyList() + } + } + for ((rack, machine) in machines) { + val clusterId = rack.id + val position = machine.position + + val processors = machine.cpus.flatMap { cpu -> + val cores = cpu.numberOfCores + val speed = cpu.clockRateMhz + // TODO Remove hard coding of vendor + val node = ProcessingNode("Intel", "amd64", cpu.name, cores) + List(cores) { coreId -> + ProcessingUnit(node, coreId, speed) + } + } + val memoryUnits = machine.memory.map { memory -> + MemoryUnit( + "Samsung", + memory.name, + memory.speedMbPerS, + memory.sizeMb.toLong() + ) + } - val energyConsumptionW = machine.cpus.sumOf { it.energyConsumptionW } + val energyConsumptionW = machine.cpus.sumOf { it.energyConsumptionW } + val powerModel = LinearPowerModel(2 * energyConsumptionW, energyConsumptionW * 0.5) + val powerDriver = SimplePowerDriver(powerModel) - nodes.add( - MachineDef( - UUID(random.nextLong(), random.nextLong()), - "node-$clusterId-$position", - mapOf("cluster" to clusterId), - MachineModel(processors, memoryUnits), - LinearPowerModel(2 * energyConsumptionW, energyConsumptionW * 0.5) - ) - ) - } + val spec = HostSpec( + UUID(random.nextLong(), random.nextLong()), + "node-$clusterId-$position", + mapOf("cluster" to clusterId), + MachineModel(processors, memoryUnits), + powerDriver + ) + + res += spec + } + + return res + } - return object : EnvironmentReader { - override fun read(): List = nodes - override fun close() {} + override fun toString(): String = "WebRunnerTopologyFactory" } } } -- cgit v1.2.3 From b14df2a0924774c5aed15cedeb1027abf8ee5361 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Thu, 16 Sep 2021 16:52:00 +0200 Subject: refactor(capelin): Make workload sampling model extensible This change updates the workload sampling implementation to be more flexible in the way the workload is constructed. Users can now sample multiple workloads at the same time using multiple samplers and use them as a single workload to simulate. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 27 +++++++--------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 48183d71..1b518fee 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -28,15 +28,12 @@ import com.github.ajalt.clikt.parameters.types.file import com.github.ajalt.clikt.parameters.types.long import kotlinx.coroutines.* import mu.KotlinLogging -import org.opendc.compute.workload.ComputeWorkloadRunner -import org.opendc.compute.workload.grid5000 +import org.opendc.compute.workload.* import org.opendc.compute.workload.topology.HostSpec import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply -import org.opendc.compute.workload.trace.RawParquetTraceReader import org.opendc.compute.workload.util.PerformanceInterferenceReader import org.opendc.experiments.capelin.model.Workload -import org.opendc.experiments.capelin.trace.ParquetTraceReader import org.opendc.experiments.capelin.util.createComputeScheduler import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel @@ -136,13 +133,9 @@ class RunnerCli : CliktCommand(name = "runner") { logger.info { "Constructing performance interference model" } - val traceDir = File( - tracePath, - scenario.trace.traceId - ) - val traceReader = RawParquetTraceReader(traceDir) + val workloadLoader = ComputeWorkloadLoader(tracePath) val interferenceGroups = let { - val path = File(traceDir, "performance-interference-model.json") + val path = tracePath.resolve(scenario.trace.traceId).resolve("performance-interference-model.json") val operational = scenario.operationalPhenomena val enabled = operational.performanceInterferenceEnabled @@ -158,7 +151,7 @@ class RunnerCli : CliktCommand(name = "runner") { logger.info { "Starting repeat $repeat" } withTimeout(runTimeout * 1000) { val interferenceModel = interferenceGroups?.let { VmInterferenceModel(it, Random(repeat.toLong())) } - runRepeat(scenario, repeat, topology, traceReader, interferenceModel) + runRepeat(scenario, repeat, topology, workloadLoader, interferenceModel) } } @@ -174,7 +167,7 @@ class RunnerCli : CliktCommand(name = "runner") { scenario: Scenario, repeat: Int, topology: Topology, - traceReader: RawParquetTraceReader, + workloadLoader: ComputeWorkloadLoader, interferenceModel: VmInterferenceModel? ): WebComputeMonitor.Result { val monitor = WebComputeMonitor() @@ -188,15 +181,11 @@ class RunnerCli : CliktCommand(name = "runner") { val operational = scenario.operationalPhenomena val computeScheduler = createComputeScheduler(operational.schedulerName, seeder) + val workload = Workload(workloadName, trace(workloadName).sampleByLoad(workloadFraction)) - val trace = ParquetTraceReader( - listOf(traceReader), - Workload(workloadName, workloadFraction), - repeat - ) val failureModel = if (operational.failuresEnabled) - grid5000(Duration.ofDays(7), repeat) + grid5000(Duration.ofDays(7)) else null @@ -214,7 +203,7 @@ class RunnerCli : CliktCommand(name = "runner") { // Instantiate the topology onto the simulator simulator.apply(topology) // Run workload trace - simulator.run(trace) + simulator.run(workload.source.resolve(workloadLoader, seeder), seeder.nextLong()) } finally { simulator.close() metricReader.close() -- cgit v1.2.3 From 68ef3700ed2f69bcf0118bb69eda71e6b1f4d54f Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 21 Sep 2021 11:34:34 +0200 Subject: feat(trace): Add support for writing traces This change adds a new API for writing traces in a trace format. Currently, writing is only supported by the OpenDC VM format, but over time the other formats will also have support for writing added. --- .../opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 1b518fee..40a7ea62 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -33,8 +33,6 @@ import org.opendc.compute.workload.topology.HostSpec import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply import org.opendc.compute.workload.util.PerformanceInterferenceReader -import org.opendc.experiments.capelin.model.Workload -import org.opendc.experiments.capelin.util.createComputeScheduler import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.model.MachineModel import org.opendc.simulator.compute.model.MemoryUnit @@ -181,7 +179,7 @@ class RunnerCli : CliktCommand(name = "runner") { val operational = scenario.operationalPhenomena val computeScheduler = createComputeScheduler(operational.schedulerName, seeder) - val workload = Workload(workloadName, trace(workloadName).sampleByLoad(workloadFraction)) + val workload = trace(workloadName).sampleByLoad(workloadFraction) val failureModel = if (operational.failuresEnabled) @@ -203,7 +201,7 @@ class RunnerCli : CliktCommand(name = "runner") { // Instantiate the topology onto the simulator simulator.apply(topology) // Run workload trace - simulator.run(workload.source.resolve(workloadLoader, seeder), seeder.nextLong()) + simulator.run(workload.resolve(workloadLoader, seeder), seeder.nextLong()) } finally { simulator.close() metricReader.close() -- cgit v1.2.3 From 30cd010f1f98262aa7f264bb3c3eb6028b8495c5 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 22 Sep 2021 12:43:01 +0200 Subject: refactor(telemetry): Do not require clock for ComputeMetricExporter This change drops the requirement for a clock parameter when constructing a ComputeMetricExporter, since it will now derive the timestamp from the recorded metrics. --- .../src/main/kotlin/org/opendc/web/runner/Main.kt | 13 +- .../org/opendc/web/runner/ScenarioManager.kt | 2 +- .../opendc/web/runner/WebComputeMetricExporter.kt | 137 +++++++++++++++++++++ .../org/opendc/web/runner/WebComputeMonitor.kt | 136 -------------------- 4 files changed, 144 insertions(+), 144 deletions(-) create mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMetricExporter.kt delete mode 100644 opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 40a7ea62..96b300d7 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -41,7 +41,6 @@ import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.compute.ComputeMetricExporter import org.opendc.telemetry.compute.collectServiceMetrics import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import org.opendc.web.client.ApiClient @@ -126,7 +125,7 @@ class RunnerCli : CliktCommand(name = "runner") { /** * Run a single scenario. */ - private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, topology: Topology): List { + private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, topology: Topology): List { val id = scenario.id logger.info { "Constructing performance interference model" } @@ -167,8 +166,8 @@ class RunnerCli : CliktCommand(name = "runner") { topology: Topology, workloadLoader: ComputeWorkloadLoader, interferenceModel: VmInterferenceModel? - ): WebComputeMonitor.Result { - val monitor = WebComputeMonitor() + ): WebComputeMetricExporter.Result { + val exporter = WebComputeMetricExporter() try { runBlockingSimulation { @@ -195,7 +194,7 @@ class RunnerCli : CliktCommand(name = "runner") { interferenceModel.takeIf { operational.performanceInterferenceEnabled } ) - val metricReader = CoroutineMetricReader(this, simulator.producers, ComputeMetricExporter(clock, monitor), exportInterval = Duration.ofHours(1)) + val metricReader = CoroutineMetricReader(this, simulator.producers, exporter, exportInterval = Duration.ofHours(1)) try { // Instantiate the topology onto the simulator @@ -207,7 +206,7 @@ class RunnerCli : CliktCommand(name = "runner") { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(simulator.producers[0]) logger.debug { "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + @@ -221,7 +220,7 @@ class RunnerCli : CliktCommand(name = "runner") { logger.warn(cause) { "Experiment failed" } } - return monitor.getResult() + return exporter.getResult() } private val POLL_INTERVAL = 30000L // ms = 30 s diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt index a0c281e8..1ee835a6 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/ScenarioManager.kt @@ -61,7 +61,7 @@ public class ScenarioManager(private val client: ApiClient) { /** * Persist the specified results. */ - public suspend fun finish(id: String, results: List) { + public suspend fun finish(id: String, results: List) { client.updateJob( id, SimulationState.FINISHED, mapOf( diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMetricExporter.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMetricExporter.kt new file mode 100644 index 00000000..7913660d --- /dev/null +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMetricExporter.kt @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.web.runner + +import org.opendc.telemetry.compute.ComputeMetricExporter +import org.opendc.telemetry.compute.ComputeMonitor +import org.opendc.telemetry.compute.table.HostData +import org.opendc.telemetry.compute.table.ServiceData +import kotlin.math.max +import kotlin.math.roundToLong + +/** + * A [ComputeMonitor] that tracks the aggregate metrics for each repeat. + */ +class WebComputeMetricExporter : ComputeMetricExporter() { + override fun record(data: HostData) { + val slices = data.downtime / SLICE_LENGTH + + hostAggregateMetrics = AggregateHostMetrics( + hostAggregateMetrics.totalActiveTime + data.cpuActiveTime, + hostAggregateMetrics.totalIdleTime + data.cpuIdleTime, + hostAggregateMetrics.totalStealTime + data.cpuStealTime, + hostAggregateMetrics.totalLostTime + data.cpuLostTime, + hostAggregateMetrics.totalPowerDraw + data.powerTotal, + hostAggregateMetrics.totalFailureSlices + slices, + hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices + ) + + hostMetrics.compute(data.host.id) { _, prev -> + HostMetrics( + data.cpuUsage + (prev?.cpuUsage ?: 0.0), + data.cpuDemand + (prev?.cpuDemand ?: 0.0), + data.guestsRunning + (prev?.instanceCount ?: 0), + 1 + (prev?.count ?: 0) + ) + } + } + + private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() + private val hostMetrics: MutableMap = mutableMapOf() + private val SLICE_LENGTH: Long = 5 * 60L + + data class AggregateHostMetrics( + val totalActiveTime: Long = 0L, + val totalIdleTime: Long = 0L, + val totalStealTime: Long = 0L, + val totalLostTime: Long = 0L, + val totalPowerDraw: Double = 0.0, + val totalFailureSlices: Double = 0.0, + val totalFailureVmSlices: Double = 0.0, + ) + + data class HostMetrics( + val cpuUsage: Double, + val cpuDemand: Double, + val instanceCount: Long, + val count: Long + ) + + private var serviceMetrics: AggregateServiceMetrics = AggregateServiceMetrics() + + override fun record(data: ServiceData) { + serviceMetrics = AggregateServiceMetrics( + max(data.attemptsSuccess, serviceMetrics.vmTotalCount), + max(data.serversPending, serviceMetrics.vmWaitingCount), + max(data.serversActive, serviceMetrics.vmActiveCount), + max(0, serviceMetrics.vmInactiveCount), + max(data.attemptsFailure, serviceMetrics.vmFailedCount), + ) + } + + data class AggregateServiceMetrics( + val vmTotalCount: Int = 0, + val vmWaitingCount: Int = 0, + val vmActiveCount: Int = 0, + val vmInactiveCount: Int = 0, + val vmFailedCount: Int = 0 + ) + + fun getResult(): Result { + return Result( + hostAggregateMetrics.totalActiveTime, + hostAggregateMetrics.totalIdleTime, + hostAggregateMetrics.totalStealTime, + hostAggregateMetrics.totalLostTime, + hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), + hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), + hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), + hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, + hostAggregateMetrics.totalPowerDraw, + hostAggregateMetrics.totalFailureSlices.roundToLong(), + hostAggregateMetrics.totalFailureVmSlices.roundToLong(), + serviceMetrics.vmTotalCount, + serviceMetrics.vmWaitingCount, + serviceMetrics.vmInactiveCount, + serviceMetrics.vmFailedCount, + ) + } + + data class Result( + val totalActiveTime: Long, + val totalIdleTime: Long, + val totalStealTime: Long, + val totalLostTime: Long, + val meanCpuUsage: Double, + val meanCpuDemand: Double, + val meanNumDeployedImages: Double, + val maxNumDeployedImages: Double, + val totalPowerDraw: Double, + val totalFailureSlices: Long, + val totalFailureVmSlices: Long, + val totalVmsSubmitted: Int, + val totalVmsQueued: Int, + val totalVmsFinished: Int, + val totalVmsFailed: Int + ) +} diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt deleted file mode 100644 index bb412738..00000000 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/WebComputeMonitor.kt +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2021 AtLarge Research - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -package org.opendc.web.runner - -import org.opendc.telemetry.compute.ComputeMonitor -import org.opendc.telemetry.compute.table.HostData -import org.opendc.telemetry.compute.table.ServiceData -import kotlin.math.max -import kotlin.math.roundToLong - -/** - * A [ComputeMonitor] that tracks the aggregate metrics for each repeat. - */ -class WebComputeMonitor : ComputeMonitor { - override fun record(data: HostData) { - val slices = data.downtime / SLICE_LENGTH - - hostAggregateMetrics = AggregateHostMetrics( - hostAggregateMetrics.totalActiveTime + data.cpuActiveTime, - hostAggregateMetrics.totalIdleTime + data.cpuIdleTime, - hostAggregateMetrics.totalStealTime + data.cpuStealTime, - hostAggregateMetrics.totalLostTime + data.cpuLostTime, - hostAggregateMetrics.totalPowerDraw + data.powerTotal, - hostAggregateMetrics.totalFailureSlices + slices, - hostAggregateMetrics.totalFailureVmSlices + data.guestsRunning * slices - ) - - hostMetrics.compute(data.host.id) { _, prev -> - HostMetrics( - data.cpuUsage + (prev?.cpuUsage ?: 0.0), - data.cpuDemand + (prev?.cpuDemand ?: 0.0), - data.guestsRunning + (prev?.instanceCount ?: 0), - 1 + (prev?.count ?: 0) - ) - } - } - - private var hostAggregateMetrics: AggregateHostMetrics = AggregateHostMetrics() - private val hostMetrics: MutableMap = mutableMapOf() - private val SLICE_LENGTH: Long = 5 * 60 - - data class AggregateHostMetrics( - val totalActiveTime: Long = 0L, - val totalIdleTime: Long = 0L, - val totalStealTime: Long = 0L, - val totalLostTime: Long = 0L, - val totalPowerDraw: Double = 0.0, - val totalFailureSlices: Double = 0.0, - val totalFailureVmSlices: Double = 0.0, - ) - - data class HostMetrics( - val cpuUsage: Double, - val cpuDemand: Double, - val instanceCount: Long, - val count: Long - ) - - private var serviceMetrics: AggregateServiceMetrics = AggregateServiceMetrics() - - override fun record(data: ServiceData) { - serviceMetrics = AggregateServiceMetrics( - max(data.attemptsSuccess, serviceMetrics.vmTotalCount), - max(data.serversPending, serviceMetrics.vmWaitingCount), - max(data.serversActive, serviceMetrics.vmActiveCount), - max(0, serviceMetrics.vmInactiveCount), - max(data.attemptsFailure, serviceMetrics.vmFailedCount), - ) - } - - data class AggregateServiceMetrics( - val vmTotalCount: Int = 0, - val vmWaitingCount: Int = 0, - val vmActiveCount: Int = 0, - val vmInactiveCount: Int = 0, - val vmFailedCount: Int = 0 - ) - - fun getResult(): Result { - return Result( - hostAggregateMetrics.totalActiveTime, - hostAggregateMetrics.totalIdleTime, - hostAggregateMetrics.totalStealTime, - hostAggregateMetrics.totalLostTime, - hostMetrics.map { it.value.cpuUsage / it.value.count }.average(), - hostMetrics.map { it.value.cpuDemand / it.value.count }.average(), - hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.average(), - hostMetrics.map { it.value.instanceCount.toDouble() / it.value.count }.maxOrNull() ?: 0.0, - hostAggregateMetrics.totalPowerDraw, - hostAggregateMetrics.totalFailureSlices.roundToLong(), - hostAggregateMetrics.totalFailureVmSlices.roundToLong(), - serviceMetrics.vmTotalCount, - serviceMetrics.vmWaitingCount, - serviceMetrics.vmInactiveCount, - serviceMetrics.vmFailedCount, - ) - } - - data class Result( - val totalActiveTime: Long, - val totalIdleTime: Long, - val totalStealTime: Long, - val totalLostTime: Long, - val meanCpuUsage: Double, - val meanCpuDemand: Double, - val meanNumDeployedImages: Double, - val maxNumDeployedImages: Double, - val totalPowerDraw: Double, - val totalFailureSlices: Long, - val totalFailureVmSlices: Long, - val totalVmsSubmitted: Int, - val totalVmsQueued: Int, - val totalVmsFinished: Int, - val totalVmsFailed: Int - ) -} -- cgit v1.2.3 From 4cc1d40d421c8736f8b21b360b61d6b065158b7a Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 29 Sep 2021 23:56:16 +0200 Subject: refactor(simulator): Migrate to flow-based simulation This change renames the `opendc-simulator-resources` module into the `opendc-simulator-flow` module to indicate that the core simulation model of OpenDC is based around modelling and simulating flows. Previously, the distinction between resource consumer and provider, and input and output caused some confusion. By switching to a flow-based model, this distinction is now clear (as in, the water flows from source to consumer/sink). --- .../opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 96b300d7..59308e11 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -123,7 +123,7 @@ class RunnerCli : CliktCommand(name = "runner") { .default(60L * 3) // Experiment may run for a maximum of three minutes /** - * Run a single scenario. + * Converge a single scenario. */ private suspend fun runScenario(portfolio: ClientPortfolio, scenario: Scenario, topology: Topology): List { val id = scenario.id @@ -158,7 +158,7 @@ class RunnerCli : CliktCommand(name = "runner") { } /** - * Run a single repeat. + * Converge a single repeat. */ private suspend fun runRepeat( scenario: Scenario, @@ -199,7 +199,7 @@ class RunnerCli : CliktCommand(name = "runner") { try { // Instantiate the topology onto the simulator simulator.apply(topology) - // Run workload trace + // Converge workload trace simulator.run(workload.resolve(workloadLoader, seeder), seeder.nextLong()) } finally { simulator.close() -- cgit v1.2.3