From c4016fcfd37550b237f6940eaffb5b4efd607601 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 3 Apr 2020 17:05:05 +0200 Subject: feat: Add initial prototype for failure recovery --- .../opendc/core/failure/CorrelatedFaultInjector.kt | 24 ++++++++++++++++++++-- .../atlarge/opendc/core/failure/FailureDomain.kt | 5 +++++ 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'opendc/opendc-core/src') diff --git a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt index c5189764..f363bf45 100644 --- a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt +++ b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt @@ -44,6 +44,8 @@ public class CorrelatedFaultInjector( private val iatShape: Double, private val sizeScale: Double, private val sizeShape: Double, + private val dScale: Double, + private val dShape: Double, random: Random = Random(0) ) : FaultInjector { /** @@ -84,7 +86,7 @@ public class CorrelatedFaultInjector( } job = this.domain.launch { - while (true) { + while (active.isNotEmpty()) { ensureActive() // Make sure to convert delay from hours to milliseconds @@ -98,10 +100,28 @@ public class CorrelatedFaultInjector( delay(d.toLong()) val n = lognvariate(sizeScale, sizeShape).toInt() - for (failureDomain in active.shuffled(random).take(n)) { + val targets = active.shuffled(random).take(n) + for (failureDomain in targets) { + active -= failureDomain failureDomain.fail() } + + val df = lognvariate(dScale, dShape) * 3600 * 1e6 + + // Handle long overflow + if (simulationContext.clock.millis() + df <= 0) { + return@launch + } + + delay(df.toLong()) + + for (failureDomain in targets) { + failureDomain.recover() + enqueue(failureDomain) + } } + + job = null } } diff --git a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/FailureDomain.kt b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/FailureDomain.kt index 91ca9b83..d56df3c9 100644 --- a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/FailureDomain.kt +++ b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/FailureDomain.kt @@ -39,4 +39,9 @@ public interface FailureDomain { * Fail the domain externally. */ public suspend fun fail() + + /** + * Resume the failure domain. + */ + public suspend fun recover() } -- cgit v1.2.3 From 993da5586c23a8cf9c29f5970cc84284e847b408 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 3 Apr 2020 20:12:24 +0200 Subject: feat: Fix failure duration parameters --- .../atlarge/opendc/core/failure/CorrelatedFaultInjector.kt | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'opendc/opendc-core/src') diff --git a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt index f363bf45..f46ce512 100644 --- a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt +++ b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt @@ -90,7 +90,7 @@ public class CorrelatedFaultInjector( ensureActive() // Make sure to convert delay from hours to milliseconds - val d = lognvariate(iatScale, iatShape) * 3600 * 1e6 + val d = lognvariate(iatScale, iatShape) * 3.6e6 // Handle long overflow if (simulationContext.clock.millis() + d <= 0) { @@ -99,14 +99,19 @@ public class CorrelatedFaultInjector( delay(d.toLong()) + val n = lognvariate(sizeScale, sizeShape).toInt() val targets = active.shuffled(random).take(n) + + println("[${simulationContext.clock.instant()}] FAIL $targets") + + for (failureDomain in targets) { active -= failureDomain failureDomain.fail() } - val df = lognvariate(dScale, dShape) * 3600 * 1e6 + val df = lognvariate(dScale, dShape) * 6e4 // Handle long overflow if (simulationContext.clock.millis() + df <= 0) { @@ -115,8 +120,12 @@ public class CorrelatedFaultInjector( delay(df.toLong()) + println("[${simulationContext.clock.instant()}] RECOVER $targets") + for (failureDomain in targets) { failureDomain.recover() + + // Re-enqueue machine to be failed enqueue(failureDomain) } } -- cgit v1.2.3 From 843a22e09f32f498d7d5b4782b54c5639375ee20 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 3 Apr 2020 20:30:27 +0200 Subject: feat: Move to ERROR state on failure --- .../com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt | 6 ------ 1 file changed, 6 deletions(-) (limited to 'opendc/opendc-core/src') diff --git a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt index f46ce512..2904fbec 100644 --- a/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt +++ b/opendc/opendc-core/src/main/kotlin/com/atlarge/opendc/core/failure/CorrelatedFaultInjector.kt @@ -99,13 +99,9 @@ public class CorrelatedFaultInjector( delay(d.toLong()) - val n = lognvariate(sizeScale, sizeShape).toInt() val targets = active.shuffled(random).take(n) - println("[${simulationContext.clock.instant()}] FAIL $targets") - - for (failureDomain in targets) { active -= failureDomain failureDomain.fail() @@ -120,8 +116,6 @@ public class CorrelatedFaultInjector( delay(df.toLong()) - println("[${simulationContext.clock.instant()}] RECOVER $targets") - for (failureDomain in targets) { failureDomain.recover() -- cgit v1.2.3