/*
 * Copyright (c) 2020 AtLarge Research
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package org.opendc.experiments.base

import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.assertAll
import org.opendc.compute.simulator.service.ServiceTask
import org.opendc.experiments.base.experiment.specs.TraceBasedFailureModelSpec
import org.opendc.simulator.compute.workload.trace.TraceFragment
import java.util.ArrayList

/**
 * An integration test suite for failure injection and checkpointing in the Scenario experiments.
 *
 * Every test schedules a single task on the "single_1_2000.json" topology, replays a
 * trace-based failure model, and checks the timestamps and host metrics collected by the monitor.
 */
class FailuresAndCheckpointingTest {
    /**
     * Failure test 1: Single Task, Single Failure
     * In this test, a single task is scheduled that is interrupted by a failure after 5 min.
     * Because there is no checkpointing, the full task has to be rerun.
     *
     * This means the final runtime is 20 minutes
     *
     * When the task is running, it is using 50% of the cpu.
     * This means that half of the time is active, and half is idle.
     * When the task is failed, all time is idle.
     */
    @Test
    fun testFailures1() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals(20 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            // 15 min running at 50% utilization + 5 min in which the whole minute is idle
            {
                assertEquals(
                    ((15 * 30000) + (5 * 60000)).toLong(),
                    monitor.hostCpuIdleTimes["H01"]?.sum(),
                ) { "Idle time incorrect" }
            },
            { assertEquals((15 * 30000).toLong(), monitor.hostCpuActiveTimes["H01"]?.sum()) { "Active time incorrect" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(0)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(5)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(10)) { "Incorrect energy usage" } },
            {
                assertEquals(
                    (15 * 60 * 150.0) + (5 * 60 * 100.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Failure test 2: Single Task, Failure much later
     * In this test, a single task is scheduled, with a failure trace.
     *
     * However, the first failure occurs after 500 min and should thus not affect the Task.
     * The expected runtime is therefore the plain 10 minutes of the task.
     */
    @Test
    fun testFailures2() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure_2.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals(10 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            { assertEquals((10 * 30000).toLong(), monitor.hostCpuIdleTimes["H01"]?.sum()) { "Idle time incorrect" } },
            { assertEquals((10 * 30000).toLong(), monitor.hostCpuActiveTimes["H01"]?.sum()) { "Active time incorrect" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(0)) { "Incorrect energy usage" } },
            { assertEquals((600 * 150.0), monitor.hostEnergyUsages["H01"]?.sum()) { "Incorrect energy usage" } },
        )
    }

    /**
     * Failure test 3: Single Task, Two Failures
     * In this test, a single task is scheduled and the "two_failures" failure trace is replayed once.
     * Because there is no checkpointing, the full task has to be rerun after a failure.
     *
     * The expected final runtime is 37 minutes: 22 minutes in which the task is running
     * and 15 minutes in which it is failed.
     *
     * When the task is running, it is using 50% of the cpu.
     * This means that half of the time is active, and half is idle.
     * When the task is failed, all time is idle.
     */
    @Test
    fun testFailures3() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/two_failures.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals(37 * 60 * 1000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            // 22 min running at 50% utilization + 15 min in which the whole minute is idle
            {
                assertEquals(
                    ((22 * 30000) + (15 * 60000)).toLong(),
                    monitor.hostCpuIdleTimes["H01"]?.sum(),
                ) { "Idle time incorrect" }
            },
            { assertEquals((22 * 30000).toLong(), monitor.hostCpuActiveTimes["H01"]?.sum()) { "Active time incorrect" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(0)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(5)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(10)) { "Incorrect energy usage" } },
            {
                assertEquals(
                    (22 * 60 * 150.0) + (15 * 60 * 100.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Failure test 4: Single Task, repeated failure
     * In this test, a single task is scheduled and the single-failure trace is replayed repeatedly
     * (`repeat = true`), so the task is interrupted again and again.
     * Because there is no checkpointing, the full task has to be rerun after every failure.
     *
     * The expected final runtime is 95 minutes. The expected energy usage alternates between
     * 5-minute windows in which the task is running and 5-minute windows in which it is failed:
     * ten running windows and nine failed windows in total.
     * NOTE(review): presumably the run ends because the task is given up on after a maximum
     * number of failures - confirm against the scheduler configuration used by runTest.
     *
     * When the task is running, it is using 50% of the cpu.
     * This means that half of the time is active, and half is idle.
     * When the task is failed, all time is idle.
     */
    @Test
    fun testFailures4() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = true,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals(95 * 60000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            {
                assertEquals(
                    ((50 * 60000) + (20 * 60000)).toLong(),
                    monitor.hostCpuIdleTimes["H01"]?.sum(),
                ) { "Idle time incorrect" }
            },
            { assertEquals((25 * 60000).toLong(), monitor.hostCpuActiveTimes["H01"]?.sum()) { "Active time incorrect" } },
            // Energy samples alternate every 5 entries between the running and the failed value
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(0)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(5)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(10)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(15)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(20)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(25)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(30)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(35)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(40)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(45)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(50)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(55)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(60)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(65)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(70)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(75)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(80)) { "Incorrect energy usage" } },
            { assertEquals(6000.0, monitor.hostEnergyUsages["H01"]?.get(85)) { "Incorrect energy usage" } },
            { assertEquals(9000.0, monitor.hostEnergyUsages["H01"]?.get(90)) { "Incorrect energy usage" } },
            { assertEquals(0.0, monitor.hostEnergyUsages["H01"]?.get(95)) { "Incorrect energy usage" } },
            {
                assertEquals(
                    (10 * 300 * 150.0) + (9 * 300 * 100.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 1: Single Task with checkpointing
     * In this test, a single task is scheduled that is interrupted by a failure after 5 min.
     * The system is using checkpointing, taking snapshots every minute.
     *
     * This means that after failure, only 6 minutes of the task is left.
     * However, taking a snapshot takes 1 second, which means 9 seconds have to be added to the total runtime.
     */
    @Test
    fun testCheckpoints1() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            // Task run time + Time node is in failed state + checkpoint time + time waiting to be scheduled
            {
                assertEquals(
                    (10 * 60 * 1000) + (5 * 60 * 1000) + (9 * 1000) + (56 * 1000),
                    monitor.maxTimestamp,
                ) { "Total runtime incorrect" }
            },
            {
                assertEquals(
                    (10 * 60 * 150.0) + (5 * 60 * 100.0) + (9 * 150.0) + (56 * 150.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 2: Single Task with checkpointing, higher cpu demand
     * In this test, a single task is scheduled that is interrupted by a failure after 5 min.
     * The system is using checkpointing, taking snapshots every minute.
     *
     * This means that after failure, only 16 minutes of the task is left.
     * However, taking a snapshot takes 1 second, which means 19 seconds have to be added to the total runtime.
     *
     * This is similar to the previous test, but the cpu demand of taking a snapshot is higher.
     * The cpu demand of taking a snapshot is as high as the highest fragment
     */
    @Test
    fun testCheckpoints2() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 2000.0),
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            {
                assertEquals(
                    (20 * 60000) + (5 * 60 * 1000) + (19 * 1000) + (56 * 1000),
                    monitor.maxTimestamp,
                ) { "Total runtime incorrect" }
            },
            {
                assertEquals(
                    (10 * 60 * 200.0) + (10 * 60 * 150.0) + (5 * 60 * 100.0) + (19 * 200.0) + (56 * 200.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 3: Single Task with checkpointing, higher cpu demand
     * In this test, a single task is scheduled that is interrupted by a failure after 5 min.
     * The system is using checkpointing, taking snapshots every minute.
     *
     * This means that after failure, only 16 minutes of the task is left.
     * However, taking a snapshot takes 1 second, which means 19 seconds have to be added to the total runtime.
     *
     * This is similar to the previous test, but the fragments are reversed
     */
    @Test
    fun testCheckpoints3() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                            TraceFragment(10 * 60 * 1000, 2000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            {
                assertEquals(
                    (20 * 60000) + (5 * 60 * 1000) + (19 * 1000) + (56 * 1000),
                    monitor.maxTimestamp,
                ) { "Total runtime incorrect" }
            },
            {
                assertEquals(
                    (10 * 60 * 200.0) + (10 * 60 * 150.0) + (5 * 60 * 100.0) + (19 * 200.0) + (56 * 150.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 4: Single Task with scaling checkpointing
     * In this test, checkpointing is used, with a scaling factor of 1.5
     *
     * This means that the interval between checkpoints starts at 1 min, but is multiplied by 1.5 every snapshot.
     */
    @Test
    fun testCheckpoints4() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                    checkpointIntervalScaling = 1.5,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            {
                assertEquals(
                    (10 * 60000) + (5 * 60 * 1000) + (4 * 1000) + (14 * 1000),
                    monitor.maxTimestamp,
                ) { "Total runtime incorrect" }
            },
            {
                assertEquals(
                    (10 * 60 * 150.0) + (5 * 60 * 100.0) + (4 * 150.0) + (14 * 150.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 5: Single Task, single failure with checkpointing
     * In this test, a single task is scheduled that is interrupted by a failure after 5 min.
     * The task is checkpointed every minute and every checkpoint takes 1 second.
     *
     * The workload and failure trace are the same as in checkpointing test 1; the expected
     * values are written in aggregated form (965 seconds of total runtime, of which 300 seconds
     * are spent in the failed state).
     */
    @Test
    fun testCheckpoints5() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = false,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals((960 * 1000) + 5000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            {
                assertEquals(
                    (665 * 150.0) + (300 * 100.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }

    /**
     * Checkpointing test 6: Single Task, repeated failure with checkpointing
     * In this test, a single task is scheduled and the single-failure trace is replayed repeatedly
     * (`repeat = true`). The task is checkpointed every minute and every checkpoint takes 1 second.
     *
     * The expected final runtime is 22 minutes and 1 second. The expected energy usage is made up of
     * 5 min running, 5 min failed, 5 min running, 5 min failed and a final 121 seconds running.
     */
    @Test
    fun testCheckpoints6() {
        val workload: ArrayList<ServiceTask> =
            arrayListOf(
                createTestTask(
                    id = 0,
                    fragments =
                        arrayListOf(
                            TraceFragment(10 * 60 * 1000, 1000.0),
                        ),
                    cpuCoreCount = 1,
                    checkpointInterval = 60 * 1000L,
                    checkpointDuration = 1000L,
                ),
            )

        val failureModelSpec =
            TraceBasedFailureModelSpec(
                "src/test/resources/failureTraces/single_failure.parquet",
                repeat = true,
            )

        val topology = createTopology("single_1_2000.json")

        val monitor = runTest(topology, workload, failureModelSpec)

        assertAll(
            { assertEquals((22 * 60000) + 1000, monitor.maxTimestamp) { "Total runtime incorrect" } },
            {
                assertEquals(
                    (300 * 150.0) + (300 * 100.0) + (300 * 150.0) + (300 * 100.0) + (121 * 150.0),
                    monitor.hostEnergyUsages["H01"]?.sum(),
                ) { "Incorrect energy usage" }
            },
        )
    }
}