diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-10 10:59:44 +0200 |
|---|---|---|
| committer | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-10 16:18:00 +0200 |
| commit | d24cc0cc9c4fe2145f0337d65e9a75f631365973 (patch) | |
| tree | fef06ec8c23654b49eda2b15a3febf0a57a78aa5 | |
| parent | 04945381d01d8c6e59befe6843f2c6f6da5e91bf (diff) | |
refactor(compute): Integrate fault injection into compute simulator
This change moves the fault injection logic directly into the
opendc-compute-simulator module, so that it can operate at a higher
abstraction. In the future, we might again split the module if we can
re-use some of its logic.
19 files changed, 316 insertions, 356 deletions
diff --git a/opendc-compute/opendc-compute-simulator/build.gradle.kts b/opendc-compute/opendc-compute-simulator/build.gradle.kts index c5a9e668..cad051e6 100644 --- a/opendc-compute/opendc-compute-simulator/build.gradle.kts +++ b/opendc-compute/opendc-compute-simulator/build.gradle.kts @@ -33,7 +33,7 @@ dependencies { api(platform(projects.opendcPlatform)) api(projects.opendcCompute.opendcComputeService) api(projects.opendcSimulator.opendcSimulatorCompute) - api(projects.opendcSimulator.opendcSimulatorFailures) + api(libs.commons.math3) implementation(projects.opendcUtils) implementation(libs.opentelemetry.semconv) implementation(libs.kotlin.logging) diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt index 213d20ee..a1cc3390 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt @@ -43,7 +43,6 @@ import org.opendc.simulator.compute.model.MemoryUnit import org.opendc.simulator.compute.power.ConstantPowerModel import org.opendc.simulator.compute.power.PowerDriver import org.opendc.simulator.compute.power.SimplePowerDriver -import org.opendc.simulator.failures.FailureDomain import org.opendc.simulator.resources.SimResourceInterpreter import java.util.* import kotlin.coroutines.CoroutineContext @@ -66,11 +65,11 @@ public class SimHost( powerDriver: PowerDriver = SimplePowerDriver(ConstantPowerModel(0.0)), private val mapper: SimWorkloadMapper = SimMetaWorkloadMapper(), interferenceDomain: VmInterferenceDomain? = null -) : Host, FailureDomain, AutoCloseable { +) : Host, AutoCloseable { /** * The [CoroutineScope] of the host bounded by the lifecycle of the host. */ - override val scope: CoroutineScope = CoroutineScope(context + Job()) + private val scope: CoroutineScope = CoroutineScope(context + Job()) /** * The clock instance used by the host. @@ -347,6 +346,22 @@ public class SimHost( override fun toString(): String = "SimHost[uid=$uid,name=$name,model=$model]" + public suspend fun fail() { + reportTime() + _state = HostState.DOWN + for (guest in guests.values) { + guest.fail() + } + } + + public suspend fun recover() { + reportTime() + _state = HostState.UP + for (guest in guests.values) { + guest.start() + } + } + /** * Convert flavor to machine model. */ @@ -369,22 +384,6 @@ public class SimHost( listeners.forEach { it.onStateChanged(this, vm.server, vm.state) } } - override suspend fun fail() { - reportTime() - _state = HostState.DOWN - for (guest in guests.values) { - guest.fail() - } - } - - override suspend fun recover() { - reportTime() - _state = HostState.UP - for (guest in guests.values) { - guest.start() - } - } - /** * A virtual machine instance that the driver manages. */ diff --git a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/FaultInjector.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/HostFault.kt index a866260c..258ccc89 100644 --- a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/FaultInjector.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/HostFault.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 AtLarge Research + * Copyright (c) 2021 AtLarge Research * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,17 @@ * SOFTWARE. */ -package org.opendc.simulator.failures +package org.opendc.compute.simulator.failure + +import org.opendc.compute.simulator.SimHost +import java.time.Clock /** - * An interface for stochastically injecting faults into a running system. + * Interface responsible for applying the fault to a host. */ -public interface FaultInjector { +public interface HostFault { /** - * Enqueue the specified [FailureDomain] into the queue as candidate for failure injection in the future. + * Apply the fault to the specified [victims]. */ - public fun enqueue(domain: FailureDomain) + public suspend fun apply(clock: Clock, victims: List<SimHost>) } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/HostFaultInjector.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/HostFaultInjector.kt new file mode 100644 index 00000000..5eff439f --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/HostFaultInjector.kt @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2020 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.failure + +import org.apache.commons.math3.distribution.RealDistribution +import org.opendc.compute.simulator.SimHost +import org.opendc.compute.simulator.internal.HostFaultInjectorImpl +import java.time.Clock +import kotlin.coroutines.CoroutineContext + +/** + * An interface for stochastically injecting faults into a set of hosts. + */ +public interface HostFaultInjector : AutoCloseable { + /** + * Start fault injection. + */ + public fun start() + + /** + * Stop fault injection into the system. + */ + public override fun close() + + public companion object { + /** + * Construct a new [HostFaultInjector]. + * + * @param context The scope to run the fault injector in. + * @param clock The [Clock] to keep track of simulation time. + * @param hosts The hosts to inject the faults into. + * @param iat The inter-arrival time distribution of the failures (in hours). + * @param selector The [VictimSelector] to select the host victims. + * @param fault The type of [HostFault] to inject. + */ + public operator fun invoke( + context: CoroutineContext, + clock: Clock, + hosts: Set<SimHost>, + iat: RealDistribution, + selector: VictimSelector, + fault: HostFault + ): HostFaultInjector = HostFaultInjectorImpl(context, clock, hosts, iat, selector, fault) + } +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/StartStopHostFault.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/StartStopHostFault.kt new file mode 100644 index 00000000..fc7cebfc --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/StartStopHostFault.kt @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.failure + +import kotlinx.coroutines.delay +import org.apache.commons.math3.distribution.RealDistribution +import org.opendc.compute.simulator.SimHost +import java.time.Clock +import kotlin.math.roundToLong + +/** + * A type of [HostFault] where the hosts are stopped and recover after some random amount of time. + */ +public class StartStopHostFault(private val duration: RealDistribution) : HostFault { + override suspend fun apply(clock: Clock, victims: List<SimHost>) { + for (host in victims) { + host.fail() + } + + val df = (duration.sample() * 1000).roundToLong() // seconds to milliseconds + + // Handle long overflow + if (clock.millis() + df <= 0) { + return + } + + delay(df) + + for (host in victims) { + host.recover() + } + } + + override fun toString(): String = "StartStopHostFault[$duration]" +} diff --git a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/FailureDomain.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/StochasticVictimSelector.kt index dc3006e8..87903623 100644 --- a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/FailureDomain.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/StochasticVictimSelector.kt @@ -1,7 +1,5 @@ /* - * MIT License - * - * Copyright (c) 2020 atlarge-research + * Copyright (c) 2021 AtLarge Research * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,26 +20,25 @@ * SOFTWARE. */ -package org.opendc.simulator.failures +package org.opendc.compute.simulator.failure -import kotlinx.coroutines.CoroutineScope +import org.apache.commons.math3.distribution.RealDistribution +import org.opendc.compute.simulator.SimHost +import kotlin.math.roundToInt +import kotlin.random.Random /** - * A logical or physical component in a computing environment which may fail. + * A [VictimSelector] that stochastically selects a set of hosts to be failed. */ -public interface FailureDomain { - /** - * The lifecycle of the failure domain to which a [FaultInjector] will attach. - */ - public val scope: CoroutineScope +public class StochasticVictimSelector( + private val size: RealDistribution, + private val random: Random = Random(0) +) : VictimSelector { - /** - * Fail the domain externally. - */ - public suspend fun fail() + override fun select(hosts: Set<SimHost>): List<SimHost> { + val n = size.sample().roundToInt() + return hosts.shuffled(random).take(n) + } - /** - * Resume the failure domain. - */ - public suspend fun recover() + override fun toString(): String = "StochasticVictimSelector[$size]" } diff --git a/opendc-simulator/opendc-simulator-failures/build.gradle.kts b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/VictimSelector.kt index edd48b40..b5610284 100644 --- a/opendc-simulator/opendc-simulator-failures/build.gradle.kts +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/failure/VictimSelector.kt @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 AtLarge Research + * Copyright (c) 2021 AtLarge Research * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -20,14 +20,16 @@ * SOFTWARE. */ -description = "Failure models for OpenDC" +package org.opendc.compute.simulator.failure -plugins { - `kotlin-library-conventions` -} +import org.opendc.compute.simulator.SimHost -dependencies { - api(platform(projects.opendcPlatform)) - api(libs.kotlinx.coroutines) - api(libs.commons.math3) +/** + * Interface responsible for selecting the victim(s) for fault injection. + */ +public interface VictimSelector { + /** + * Select the hosts from [hosts] where a fault will be injected. + */ + public fun select(hosts: Set<SimHost>): List<SimHost> } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/HostFaultInjectorImpl.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/HostFaultInjectorImpl.kt new file mode 100644 index 00000000..6919b7fd --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/HostFaultInjectorImpl.kt @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.internal + +import kotlinx.coroutines.* +import org.apache.commons.math3.distribution.RealDistribution +import org.opendc.compute.simulator.SimHost +import org.opendc.compute.simulator.failure.HostFault +import org.opendc.compute.simulator.failure.HostFaultInjector +import org.opendc.compute.simulator.failure.VictimSelector +import java.time.Clock +import kotlin.coroutines.CoroutineContext +import kotlin.math.roundToLong + +/** + * Internal implementation of the [HostFaultInjector] interface. + * + * @param context The scope to run the fault injector in. + * @param clock The [Clock] to keep track of simulation time. + * @param hosts The set of hosts to inject faults into. + * @param iat The inter-arrival time distribution of the failures (in hours). + * @param selector The [VictimSelector] to select the host victims. + * @param fault The type of [HostFault] to inject. + */ +internal class HostFaultInjectorImpl( + private val context: CoroutineContext, + private val clock: Clock, + private val hosts: Set<SimHost>, + private val iat: RealDistribution, + private val selector: VictimSelector, + private val fault: HostFault +) : HostFaultInjector { + /** + * The scope in which the injector runs. + */ + private val scope = CoroutineScope(context + Job()) + + /** + * The [Job] that awaits the nearest fault in the system. + */ + private var job: Job? = null + + /** + * Start the fault injection into the system. + */ + override fun start() { + if (job != null) { + return + } + + job = scope.launch { + runInjector() + job = null + } + } + + /** + * Run the injection process. + */ + private suspend fun runInjector() { + while (true) { + // Make sure to convert delay from hours to milliseconds + val d = (iat.sample() * 3.6e6).roundToLong() + + // Handle long overflow + if (clock.millis() + d <= 0) { + return + } + + delay(d) + + val victims = selector.select(hosts) + fault.apply(clock, victims) + } + } + + /** + * Stop the fault injector. + */ + public override fun close() { + scope.cancel() + } +} diff --git a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts index b2330af0..036d0638 100644 --- a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts @@ -35,7 +35,6 @@ dependencies { implementation(projects.opendcTrace.opendcTraceBitbrains) implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) - implementation(projects.opendcSimulator.opendcSimulatorFailures) implementation(projects.opendcCompute.opendcComputeSimulator) implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(projects.opendcTelemetry.opendcTelemetryCompute) diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt index 512b754d..8227bca9 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/ExperimentHelpers.kt @@ -25,7 +25,6 @@ package org.opendc.experiments.capelin import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.SdkMeterProvider import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel import org.apache.commons.math3.distribution.LogNormalDistribution import org.apache.commons.math3.random.Well19937c import org.opendc.compute.api.* @@ -41,6 +40,9 @@ import org.opendc.compute.service.scheduler.weights.InstanceCountWeigher import org.opendc.compute.service.scheduler.weights.RamWeigher import org.opendc.compute.service.scheduler.weights.VCpuWeigher import org.opendc.compute.simulator.SimHost +import org.opendc.compute.simulator.failure.HostFaultInjector +import org.opendc.compute.simulator.failure.StartStopHostFault +import org.opendc.compute.simulator.failure.StochasticVictimSelector import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.trace.TraceReader import org.opendc.simulator.compute.kernel.SimFairShareHypervisorProvider @@ -48,67 +50,36 @@ import org.opendc.simulator.compute.kernel.interference.VmInterferenceModel import org.opendc.simulator.compute.power.SimplePowerDriver import org.opendc.simulator.compute.workload.SimTraceWorkload import org.opendc.simulator.compute.workload.SimWorkload -import org.opendc.simulator.failures.CorrelatedFaultInjector -import org.opendc.simulator.failures.FaultInjector import org.opendc.simulator.resources.SimResourceInterpreter import org.opendc.telemetry.compute.ComputeMonitor import org.opendc.telemetry.sdk.toOtelClock import java.time.Clock +import kotlin.coroutines.CoroutineContext import kotlin.math.ln import kotlin.math.max import kotlin.random.Random /** - * Construct the failure domain for the experiments. - */ -fun createFailureDomain( - coroutineScope: CoroutineScope, - clock: Clock, - seed: Int, - failureInterval: Double, - service: ComputeService, - chan: Channel<Unit> -): CoroutineScope { - val job = coroutineScope.launch { - chan.receive() - val random = Random(seed) - val injectors = mutableMapOf<String, FaultInjector>() - for (host in service.hosts) { - val cluster = host.meta["cluster"] as String - val injector = - injectors.getOrPut(cluster) { - createFaultInjector( - this, - clock, - random, - failureInterval - ) - } - injector.enqueue(host as SimHost) - } - } - return CoroutineScope(coroutineScope.coroutineContext + job) -} - -/** * Obtain the [FaultInjector] to use for the experiments. */ fun createFaultInjector( - coroutineScope: CoroutineScope, + context: CoroutineContext, clock: Clock, - random: Random, + hosts: Set<SimHost>, + seed: Int, failureInterval: Double -): FaultInjector { - val rng = Well19937c(random.nextLong()) +): HostFaultInjector { + val rng = Well19937c(seed) // Parameters from A. Iosup, A Framework for the Study of Grid Inter-Operation Mechanisms, 2009 // GRID'5000 - return CorrelatedFaultInjector( - coroutineScope, + return HostFaultInjector( + context, clock, + hosts, iat = LogNormalDistribution(rng, ln(failureInterval), 1.03), - size = LogNormalDistribution(rng, 1.88, 1.25), - duration = LogNormalDistribution(rng, 8.89, 2.71) + selector = StochasticVictimSelector(LogNormalDistribution(rng, 1.88, 1.25), Random(seed)), + fault = StartStopHostFault(LogNormalDistribution(rng, 8.89, 2.71)) ) } @@ -164,7 +135,6 @@ suspend fun processTrace( clock: Clock, reader: TraceReader<SimWorkload>, scheduler: ComputeService, - chan: Channel<Unit>, monitor: ComputeMonitor? = null, ) { val client = scheduler.newClient() @@ -193,10 +163,9 @@ suspend fun processTrace( delay(max(0, (entry.start - offset) - clock.millis())) launch { - chan.send(Unit) - val workloadOffset = -offset + 300001 val workload = SimTraceWorkload((entry.meta["workload"] as SimTraceWorkload).trace, workloadOffset) + val server = client.newServer( entry.name, image, diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt index 4db04591..82794471 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt @@ -25,9 +25,8 @@ package org.opendc.experiments.capelin import com.typesafe.config.ConfigFactory import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.ExperimentalCoroutinesApi -import kotlinx.coroutines.cancel -import kotlinx.coroutines.channels.Channel import mu.KotlinLogging +import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.env.ClusterEnvironmentReader import org.opendc.experiments.capelin.export.parquet.ParquetExportMonitor import org.opendc.experiments.capelin.model.CompositeWorkload @@ -103,7 +102,6 @@ abstract class Portfolio(name: String) : Experiment(name) { val seeder = Random(repeat.toLong()) val environment = ClusterEnvironmentReader(File(config.getString("env-path"), "${topology.name}.txt")) - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = createComputeScheduler(allocationPolicy, seeder.asKotlinRandom(), vmPlacements) val meterProvider = createMeterProvider(clock) @@ -137,31 +135,30 @@ abstract class Portfolio(name: String) : Experiment(name) { ) withComputeService(clock, meterProvider, environment, allocationPolicy, performanceInterferenceModel) { scheduler -> - val failureDomain = if (operationalPhenomena.failureFrequency > 0) { + val faultInjector = if (operationalPhenomena.failureFrequency > 0) { logger.debug("ENABLING failures") - createFailureDomain( - this, + createFaultInjector( + coroutineContext, clock, + scheduler.hosts.map { it as SimHost }.toSet(), seeder.nextInt(), operationalPhenomena.failureFrequency, - scheduler, - chan ) } else { null } withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { + faultInjector?.start() processTrace( clock, trace, scheduler, - chan, monitor ) } - failureDomain?.cancel() + faultInjector?.close() monitor.close() } diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt index ab33bc25..44cf92a8 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt @@ -23,8 +23,6 @@ package org.opendc.experiments.capelin import io.opentelemetry.sdk.metrics.export.MetricProducer -import kotlinx.coroutines.cancel -import kotlinx.coroutines.channels.Channel import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test @@ -34,6 +32,7 @@ import org.opendc.compute.service.scheduler.filters.ComputeFilter import org.opendc.compute.service.scheduler.filters.RamFilter import org.opendc.compute.service.scheduler.filters.VCpuFilter import org.opendc.compute.service.scheduler.weights.CoreRamWeigher +import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.env.ClusterEnvironmentReader import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.model.Workload @@ -73,9 +72,6 @@ class CapelinIntegrationTest { */ @Test fun testLarge() = runBlockingSimulation { - val failures = false - val seed = 0 - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = FilterScheduler( filters = listOf(ComputeFilter(), VCpuFilter(16.0), RamFilter(1.0)), weighers = listOf(CoreRamWeigher(multiplier = 1.0)) @@ -85,31 +81,14 @@ class CapelinIntegrationTest { val meterProvider = createMeterProvider(clock) withComputeService(clock, meterProvider, environmentReader, allocationPolicy) { scheduler -> - val failureDomain = if (failures) { - println("ENABLING failures") - createFailureDomain( - this, - clock, - seed, - 24.0 * 7, - scheduler, - chan - ) - } else { - null - } - withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { processTrace( clock, traceReader, scheduler, - chan, monitor ) } - - failureDomain?.cancel() } val serviceMetrics = collectServiceMetrics(clock.millis(), meterProvider as MetricProducer) @@ -141,7 +120,6 @@ class CapelinIntegrationTest { @Test fun testSmall() = runBlockingSimulation { val seed = 1 - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = FilterScheduler( filters = listOf(ComputeFilter(), VCpuFilter(16.0), RamFilter(1.0)), weighers = listOf(CoreRamWeigher(multiplier = 1.0)) @@ -157,7 +135,6 @@ class CapelinIntegrationTest { clock, traceReader, scheduler, - chan, monitor ) } @@ -187,7 +164,6 @@ class CapelinIntegrationTest { @Test fun testInterference() = runBlockingSimulation { val seed = 1 - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = FilterScheduler( filters = listOf(ComputeFilter(), VCpuFilter(16.0), RamFilter(1.0)), weighers = listOf(CoreRamWeigher(multiplier = 1.0)) @@ -209,7 +185,6 @@ class CapelinIntegrationTest { clock, traceReader, scheduler, - chan, monitor ) } @@ -239,7 +214,6 @@ class CapelinIntegrationTest { @Test fun testFailures() = runBlockingSimulation { val seed = 1 - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = FilterScheduler( filters = listOf(ComputeFilter(), VCpuFilter(16.0), RamFilter(1.0)), weighers = listOf(CoreRamWeigher(multiplier = 1.0)) @@ -250,27 +224,26 @@ class CapelinIntegrationTest { val meterProvider = createMeterProvider(clock) withComputeService(clock, meterProvider, environmentReader, allocationPolicy) { scheduler -> - val failureDomain = - createFailureDomain( - this, + val faultInjector = + createFaultInjector( + coroutineContext, clock, + scheduler.hosts.map { it as SimHost }.toSet(), seed, 24.0 * 7, - scheduler, - chan ) withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { + faultInjector.start() processTrace( clock, traceReader, scheduler, - chan, monitor ) } - failureDomain.cancel() + faultInjector.close() } val serviceMetrics = collectServiceMetrics(clock.millis(), meterProvider as MetricProducer) diff --git a/opendc-experiments/opendc-experiments-energy21/build.gradle.kts b/opendc-experiments/opendc-experiments-energy21/build.gradle.kts index 40ac2967..cc58e5f1 100644 --- a/opendc-experiments/opendc-experiments-energy21/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-energy21/build.gradle.kts @@ -33,7 +33,6 @@ dependencies { api(projects.opendcHarness.opendcHarnessApi) implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) - implementation(projects.opendcSimulator.opendcSimulatorFailures) implementation(projects.opendcCompute.opendcComputeSimulator) implementation(projects.opendcExperiments.opendcExperimentsCapelin) implementation(projects.opendcTelemetry.opendcTelemetrySdk) diff --git a/opendc-experiments/opendc-experiments-energy21/src/main/kotlin/org/opendc/experiments/energy21/EnergyExperiment.kt b/opendc-experiments/opendc-experiments-energy21/src/main/kotlin/org/opendc/experiments/energy21/EnergyExperiment.kt index 02aaab3c..d9194969 100644 --- a/opendc-experiments/opendc-experiments-energy21/src/main/kotlin/org/opendc/experiments/energy21/EnergyExperiment.kt +++ b/opendc-experiments/opendc-experiments-energy21/src/main/kotlin/org/opendc/experiments/energy21/EnergyExperiment.kt @@ -26,7 +26,6 @@ import com.typesafe.config.ConfigFactory import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.CoroutineScope -import kotlinx.coroutines.channels.Channel import kotlinx.coroutines.coroutineScope import mu.KotlinLogging import org.opendc.compute.service.ComputeService @@ -81,7 +80,6 @@ public class EnergyExperiment : Experiment("Energy Modeling 2021") { private val powerModel by anyOf(PowerModelType.LINEAR, PowerModelType.CUBIC, PowerModelType.INTERPOLATION) override fun doRun(repeat: Int): Unit = runBlockingSimulation { - val chan = Channel<Unit>(Channel.CONFLATED) val allocationPolicy = FilterScheduler( filters = listOf(ComputeFilter(), VCpuFilter(1.0), RamFilter(1.0)), weighers = listOf(), @@ -98,7 +96,6 @@ public class EnergyExperiment : Experiment("Energy Modeling 2021") { clock, trace, scheduler, - chan, monitor ) } diff --git a/opendc-experiments/opendc-experiments-radice/build.gradle.kts b/opendc-experiments/opendc-experiments-radice/build.gradle.kts index c1515165..0c716183 100644 --- a/opendc-experiments/opendc-experiments-radice/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-radice/build.gradle.kts @@ -34,7 +34,6 @@ dependencies { implementation(projects.opendcFormat) implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) - implementation(projects.opendcSimulator.opendcSimulatorFailures) implementation(projects.opendcCompute.opendcComputeSimulator) implementation(projects.opendcTelemetry.opendcTelemetrySdk) diff --git a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/CorrelatedFaultInjector.kt b/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/CorrelatedFaultInjector.kt deleted file mode 100644 index c3b85666..00000000 --- a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/CorrelatedFaultInjector.kt +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Copyright (c) 2020 AtLarge Research - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -package org.opendc.simulator.failures - -import kotlinx.coroutines.* -import org.apache.commons.math3.distribution.RealDistribution -import java.time.Clock -import java.util.* -import kotlin.math.roundToInt -import kotlin.math.roundToLong - -/** - * A [FaultInjector] that injects fault in the system which are correlated to each other. Failures do not occur in - * isolation, but will trigger other faults. - * - * @param scope The scope to run the fault injector in. - * @param clock The [Clock] to keep track of simulation time. - * @param iat The inter-arrival time distribution of the failures (in hours). - * @param size The failure group size distribution. - * @param duration The failure duration (in seconds). - */ -public class CorrelatedFaultInjector( - private val scope: CoroutineScope, - private val clock: Clock, - private val iat: RealDistribution, - private val size: RealDistribution, - private val duration: RealDistribution, - private val random: Random = Random(0) -) : FaultInjector { - /** - * The active failure domains that have been registered. - */ - private val active = mutableSetOf<FailureDomain>() - - /** - * The [Job] that awaits the nearest fault in the system. - */ - private var job: Job? = null - - /** - * Enqueue the specified [FailureDomain] to fail some time in the future. - */ - override fun enqueue(domain: FailureDomain) { - active += domain - - // Clean up the domain if it finishes - domain.scope.coroutineContext[Job]!!.invokeOnCompletion { - this@CorrelatedFaultInjector.scope.launch { - active -= domain - - if (active.isEmpty()) { - job?.cancel() - job = null - } - } - } - - if (job != null) { - return - } - - job = this.scope.launch { - while (active.isNotEmpty()) { - ensureActive() - - // Make sure to convert delay from hours to milliseconds - val d = (iat.sample() * 3.6e6).roundToLong() - - // Handle long overflow - if (clock.millis() + d <= 0) { - return@launch - } - - delay(d) - - val n = size.sample().roundToInt() - val targets = active.shuffled(random).take(n) - - for (failureDomain in targets) { - active -= failureDomain - failureDomain.fail() - } - - val df = (duration.sample() * 1000).roundToLong() // seconds to milliseconds - - // Handle long overflow - if (clock.millis() + df <= 0) { - return@launch - } - - delay(df) - - for (failureDomain in targets) { - failureDomain.recover() - - // Re-enqueue machine to be failed - enqueue(failureDomain) - } - } - - job = null - } - } -} diff --git a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/UncorrelatedFaultInjector.kt b/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/UncorrelatedFaultInjector.kt deleted file mode 100644 index 8f99f758..00000000 --- a/opendc-simulator/opendc-simulator-failures/src/main/kotlin/org/opendc/simulator/failures/UncorrelatedFaultInjector.kt +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2020 AtLarge Research - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -package org.opendc.simulator.failures - -import kotlinx.coroutines.delay -import kotlinx.coroutines.launch -import org.apache.commons.math3.distribution.RealDistribution -import java.time.Clock -import kotlin.math.roundToLong - -/** - * A [FaultInjector] that injects uncorrelated faults into the system, meaning that failures of the subsystems are - * independent. - * - * @param clock The [Clock] to keep track of simulation time. - * @param iat The failure inter-arrival time distribution (in hours) - * @param duration The failure duration distribution (in seconds). - */ -public class UncorrelatedFaultInjector( - private val clock: Clock, - private val iat: RealDistribution, - private val duration: RealDistribution, -) : FaultInjector { - /** - * Enqueue the specified [FailureDomain] to fail some time in the future. - */ - override fun enqueue(domain: FailureDomain) { - domain.scope.launch { - val d = (iat.sample() * 3.6e6).roundToLong() // Make sure to convert delay to milliseconds - - // Handle long overflow - if (clock.millis() + d <= 0) { - return@launch - } - - delay(d) - domain.fail() - - val df = (duration.sample() * 1000).roundToLong() // seconds to milliseconds - - // Handle long overflow - if (clock.millis() + df <= 0) { - return@launch - } - - delay(df) - domain.recover() - } - } -} diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt index 5d481270..b565e90d 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/Main.kt @@ -30,8 +30,8 @@ import io.opentelemetry.api.metrics.MeterProvider import io.opentelemetry.sdk.metrics.SdkMeterProvider import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel import mu.KotlinLogging +import org.opendc.compute.simulator.SimHost import org.opendc.experiments.capelin.* import org.opendc.experiments.capelin.env.EnvironmentReader import org.opendc.experiments.capelin.env.MachineDef @@ -188,8 +188,6 @@ class RunnerCli : CliktCommand(name = "runner") { val seeder = Random(seed) - val chan = Channel<Unit>(Channel.CONFLATED) - val meterProvider: MeterProvider = SdkMeterProvider .builder() .setClock(clock.toOtelClock()) @@ -207,31 +205,31 @@ class RunnerCli : CliktCommand(name = "runner") { val failureFrequency = if (operational.failuresEnabled) 24.0 * 7 else 0.0 withComputeService(clock, meterProvider, environment, allocationPolicy, interferenceModel) { scheduler -> - val failureDomain = if (failureFrequency > 0) { + val faultInjector = if (failureFrequency > 0) { logger.debug { "ENABLING failures" } - createFailureDomain( - this, + createFaultInjector( + coroutineContext, clock, + scheduler.hosts.map { it as SimHost }.toSet(), seeder.nextInt(), failureFrequency, - scheduler, - chan ) } else { null } withMonitor(scheduler, clock, meterProvider as MetricProducer, monitor) { + faultInjector?.start() + processTrace( clock, trace, scheduler, - chan, monitor ) - } - failureDomain?.cancel() + faultInjector?.close() + } } val monitorResults = collectServiceMetrics(clock.millis(), metricProducer) diff --git a/settings.gradle.kts b/settings.gradle.kts index cee8887b..427cdb52 100644 --- a/settings.gradle.kts +++ b/settings.gradle.kts @@ -42,7 +42,6 @@ include(":opendc-simulator:opendc-simulator-resources") include(":opendc-simulator:opendc-simulator-power") include(":opendc-simulator:opendc-simulator-network") include(":opendc-simulator:opendc-simulator-compute") -include(":opendc-simulator:opendc-simulator-failures") include(":opendc-telemetry:opendc-telemetry-api") include(":opendc-telemetry:opendc-telemetry-sdk") include(":opendc-telemetry:opendc-telemetry-compute") |
