From d24cc0cc9c4fe2145f0337d65e9a75f631365973 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 10 Sep 2021 10:59:44 +0200 Subject: refactor(compute): Integrate fault injection into compute simulator This change moves the fault injection logic directly into the opendc-compute-simulator module, so that it can operate at a higher abstraction. In the future, we might again split the module if we can re-use some of its logic. --- .../experiments/capelin/ExperimentHelpers.kt | 61 ++++++---------------- .../org/opendc/experiments/capelin/Portfolio.kt | 17 +++--- 2 files changed, 22 insertions(+), 56 deletions(-) (limited to 'opendc-experiments/opendc-experiments-capelin/src/main') diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt index 512b754d..8227bca9 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt @@ -25,7 +25,6 @@ package org.opendc.experiments.capelin import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.SdkMeterProvider import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel import org.apache.commons.math3.distribution.LogNormalDistribution import org.apache.commons.math3.random.Well19937c import org.opendc.compute.api.* @@ -41,6 +40,9 @@ import org.opendc.compute.service.scheduler.weights.InstanceCountWeigher import org.opendc.compute.service.scheduler.weights.RamWeigher import org.opendc.compute.service.scheduler.weights.VCpuWeigher import org.opendc.compute.simulator.SimHost +import org.opendc.compute.simulator.failure.HostFaultInjector +import org.opendc.compute.simulator.failure.StartStopHostFault +import 
org.opendc.compute.simulator.failure.StochasticVictimSelector import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.trace.TraceReader import org.opendc.simulator.compute.kernel.SimFairShareHypervisorProvider @@ -48,67 +50,36 @@ import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.compute.workload.SimTraceWorkload import org.opendc.simulator.compute.workload.SimWorkload -import org.opendc.simulator.failures.CorrelatedFaultInjector -import org.opendc.simulator.failures.FaultInjector import org.opendc.simulator.resources.SimResourceInterpreter import org.opendc.telemetry.compute.ComputeMonitor import org.opendc.telemetry.sdk.toOtelClock import java.time.Clock +import kotlin.coroutines.CoroutineContext import kotlin.math.ln import kotlin.math.max import kotlin.random.Random -/** - * Construct the failure domain for the experiments. - */ -fun createFailureDomain( - coroutineScope: CoroutineScope, - clock: Clock, - seed: Int, - failureInterval: Double, - service: ComputeService, - chan: Channel<Unit> -): CoroutineScope { - val job = coroutineScope.launch { - chan.receive() - val random = Random(seed) - val injectors = mutableMapOf<String, FaultInjector>() - for (host in service.hosts) { - val cluster = host.meta["cluster"] as String - val injector = - injectors.getOrPut(cluster) { - createFaultInjector( - this, - clock, - random, - failureInterval - ) - } - injector.enqueue(host as SimHost) - } - } - return CoroutineScope(coroutineScope.coroutineContext + job) -} - /** * Obtain the [FaultInjector] to use for the experiments. */ fun createFaultInjector( - coroutineScope: CoroutineScope, + context: CoroutineContext, clock: Clock, - random: Random, + hosts: Set<SimHost>, + seed: Int, failureInterval: Double -): FaultInjector { - val rng = Well19937c(random.nextLong()) +): HostFaultInjector { + val rng = Well19937c(seed) // Parameters from A.
Iosup, A Framework for the Study of Grid Inter-Operation Mechanisms, 2009 // GRID'5000 - return CorrelatedFaultInjector( - coroutineScope, + return HostFaultInjector( + context, clock, + hosts, iat = LogNormalDistribution(rng, ln(failureInterval), 1.03), - size = LogNormalDistribution(rng, 1.88, 1.25), - duration = LogNormalDistribution(rng, 8.89, 2.71) + selector = StochasticVictimSelector(LogNormalDistribution(rng, 1.88, 1.25), Random(seed)), + fault = StartStopHostFault(LogNormalDistribution(rng, 8.89, 2.71)) ) } @@ -164,7 +135,6 @@ suspend fun processTrace( clock: Clock, reader: TraceReader, scheduler: ComputeService, - chan: Channel<Unit>, monitor: ComputeMonitor? = null, ) { val client = scheduler.newClient() @@ -193,10 +163,9 @@ suspend fun processTrace( delay(max(0, (entry.start - offset) - clock.millis())) launch { - chan.send(Unit) - val workloadOffset = -offset + 300001 val workload = SimTraceWorkload((entry.meta["workload"] as SimTraceWorkload).trace, workloadOffset) + val server = client.newServer( entry.name, image, diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt index 4db04591..82794471 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt @@ -25,9 +25,8 @@ package org.opendc.experiments.capelin import com.typesafe.config.ConfigFactory import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.ExperimentalCoroutinesApi -import kotlinx.coroutines.cancel -import kotlinx.coroutines.channels.Channel import mu.KotlinLogging +import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.env.ClusterEnvironmentReader import
org.opendc.experiments.capelin.export.parquet.ParquetExportMonitor import org.opendc.experiments.capelin.model.CompositeWorkload @@ -103,7 +102,6 @@ abstract class Portfolio(name: String) : Experiment(name) { val seeder = Random(repeat.toLong()) val environment = ClusterEnvironmentReader(File(config.getString("env-path"), "${topology.name}.txt")) - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = createComputeScheduler(allocationPolicy, seeder.asKotlinRandom(), vmPlacements) val meterProvider = createMeterProvider(clock) @@ -137,31 +135,30 @@ abstract class Portfolio(name: String) : Experiment(name) { ) withComputeService(clock, meterProvider, environment, allocationPolicy, performanceInterferenceModel) { scheduler -> - val failureDomain = if (operationalPhenomena.failureFrequency > 0) { + val faultInjector = if (operationalPhenomena.failureFrequency > 0) { logger.debug("ENABLING failures") - createFailureDomain( - this, + createFaultInjector( + coroutineContext, clock, + scheduler.hosts.map { it as SimHost }.toSet(), seeder.nextInt(), operationalPhenomena.failureFrequency, - scheduler, - chan ) } else { null } withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { + faultInjector?.start() processTrace( clock, trace, scheduler, - chan, monitor ) } - failureDomain?.cancel() + faultInjector?.close() monitor.close() } -- cgit v1.2.3