From 3721831204c2d350b93ea265731c0970cbd8fce4 Mon Sep 17 00:00:00 2001
From: Fabian Mastenbroek
Date: Tue, 24 Aug 2021 12:55:49 +0200
Subject: feat(compute): Add support for SimHost failure

This change adds support for failures in the SimHost implementation.
Failing a host will now cause its virtual machines to enter an error
state; recovering the host starts them again.
---
 .../kotlin/org/opendc/compute/simulator/SimHost.kt | 26 ++++--
 .../org/opendc/compute/simulator/SimHostTest.kt    | 96 ++++++++++++++++++++++
 2 files changed, 117 insertions(+), 5 deletions(-)

diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
index 5ea577f3..be771f6d 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/SimHost.kt
@@ -46,6 +46,7 @@ import org.opendc.simulator.resources.SimResourceInterpreter
 import java.util.*
 import kotlin.coroutines.CoroutineContext
 import kotlin.coroutines.resume
+import kotlin.coroutines.resumeWithException
 
 /**
  * A [Host] that is simulates virtual machines on a physical machine using [SimHypervisor].
@@ -315,10 +316,16 @@ public class SimHost(
 
     override suspend fun fail() {
         _state = HostState.DOWN
+        for (guest in guests.values) {
+            guest.fail()
+        }
     }
 
     override suspend fun recover() {
         _state = HostState.UP
+        for (guest in guests.values) {
+            guest.start()
+        }
     }
 
     /**
@@ -329,7 +336,7 @@ public class SimHost(
 
         suspend fun start() {
             when (state) {
-                ServerState.TERMINATED -> {
+                ServerState.TERMINATED, ServerState.ERROR -> {
                     logger.info { "User requested to start server ${server.uid}" }
                     launch()
                 }
@@ -356,9 +363,15 @@ public class SimHost(
 
         suspend fun terminate() {
             stop()
+            machine.close()
             state = ServerState.DELETED
         }
 
+        suspend fun fail() {
+            stop()
+            state = ServerState.ERROR
+        }
+
         private var job: Job? = null
 
         private suspend fun launch() = suspendCancellableCoroutine<Unit> { cont ->
@@ -366,16 +379,19 @@ public class SimHost(
             val workload = mapper.createWorkload(server)
 
             job = scope.launch {
-                delay(1) // TODO Introduce boot time
-                init()
-                cont.resume(Unit)
+                try {
+                    delay(1) // TODO Introduce boot time
+                    init()
+                    cont.resume(Unit)
+                } catch (e: Throwable) {
+                    cont.resumeWithException(e)
+                }
                 try {
                     machine.run(workload, mapOf("driver" to this@SimHost, "server" to server))
                     exit(null)
                 } catch (cause: Throwable) {
                     exit(cause)
                 } finally {
-                    machine.close()
                     job = null
                 }
             }
diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
index fc96cec8..93a2248a 100644
--- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/SimHostTest.kt
@@ -185,6 +185,102 @@ internal class SimHostTest {
         )
     }
 
+    /**
+     * Test that a host can fail and recover while a server is running on it.
+     */
+    @Test
+    fun testFailure() = runBlockingSimulation {
+        var requestedWork = 0L
+        var grantedWork = 0L
+
+        val meterProvider: MeterProvider = SdkMeterProvider
+            .builder()
+            .setClock(clock.toOtelClock())
+            .build()
+
+        val interpreter = SimResourceInterpreter(coroutineContext, clock)
+        val host = SimHost(
+            uid = UUID.randomUUID(),
+            name = "test",
+            model = machineModel,
+            meta = emptyMap(),
+            coroutineContext,
+            interpreter,
+            meterProvider.get("opendc-compute-simulator"),
+            SimFairShareHypervisorProvider()
+        )
+        val duration = 5 * 60L
+        val image = MockImage(
+            UUID.randomUUID(),
+            "",
+            emptyMap(),
+            mapOf(
+                "workload" to SimTraceWorkload(
+                    sequenceOf(
+                        SimTraceWorkload.Fragment(0, duration * 1000, 2 * 28.0, 2),
+                        SimTraceWorkload.Fragment(duration * 1000L, duration * 1000, 2 * 3500.0, 2),
+                        SimTraceWorkload.Fragment(duration * 2000L, duration * 1000, 0.0, 2),
+                        SimTraceWorkload.Fragment(duration * 3000L, duration * 1000, 2 * 183.0, 2)
+                    ),
+                    offset = 1
+                )
+            )
+        )
+        val flavor = MockFlavor(2, 0)
+        val server = MockServer(UUID.randomUUID(), "a", flavor, image)
+
+        // Set up a metric reader that accumulates the requested and granted CPU work
+        val reader = CoroutineMetricReader(
+            this, listOf(meterProvider as MetricProducer),
+            object : MetricExporter {
+                override fun export(metrics: Collection<MetricData>): CompletableResultCode {
+                    val metricsByName = metrics.associateBy { it.name }
+                    metricsByName["cpu.work.total"]?.let {
+                        requestedWork += it.doubleSummaryData.points.first().sum.toLong()
+                    }
+                    metricsByName["cpu.work.granted"]?.let {
+                        grantedWork += it.doubleSummaryData.points.first().sum.toLong()
+                    }
+                    return CompletableResultCode.ofSuccess()
+                }
+
+                override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess()
+
+                override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess()
+            },
+            exportInterval = duration * 1000L
+        )
+
+        coroutineScope {
+            host.spawn(server)
+            delay(5000L)
+            host.fail()
+            delay(5000L)
+            host.recover()
+
+            suspendCancellableCoroutine<Unit> { cont ->
+                host.addListener(object : HostListener {
+                    override fun onStateChanged(host: Host, server: Server, newState: ServerState) {
+                        if (newState == ServerState.TERMINATED) {
+                            cont.resume(Unit)
+                        }
+                    }
+                })
+            }
+        }
+
+        host.close()
+        // Wait one more export interval to ensure the last cycle is collected
+        delay(1000L * duration)
+
+        reader.close()
+
+        assertAll(
+            { assertEquals(2226039, requestedWork, "Requested work does not match") },
+            { assertEquals(1086039, grantedWork, "Granted work does not match") },
+        )
+    }
+
     private class MockFlavor(
         override val cpuCount: Int,
         override val memorySize: Long
-- 
cgit v1.2.3