merge: Fix distributed strategy for TensorFlow experiment (#89)

This pull request fixes an issue where the distributed strategies for the TensorFlow experiments did not work correctly.

## Implementation Notes :hammer_and_pick:

* Limit growth rate for trace construction
* Derive device statistics directly from SimMachine
* Always recompute power usage when a `SimBareMetalMachine` converges
* Add a test case for `MirroredStrategy`
author: Fabian Mastenbroek <mail.fabianm@gmail.com> 2022-06-16 11:08:14 +0200
committer: GitHub <noreply@github.com> 2022-06-16 11:08:14 +0200
commit: 8eab5895dcf21b4a3f585c62db14c9a049c81d98 (patch)
tree: b2b698d85f7397ef67485d52128a9390f40f7252 /opendc-experiments
parent: d146814bbbb86bfcb19ccb94250424703e9179e5 (diff)
parent: 282f199e6f16350123a915b06faff62ca82be91b (diff)
4 files changed, 49 insertions, 20 deletions
diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
index fa2cd9c8..d7b7caad 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
@@ -116,7 +116,7 @@ class CapelinIntegrationTest {
                 { assertEquals(66977508, this@CapelinIntegrationTest.monitor.activeTime) { "Incorrect active time" } },
                 { assertEquals(3160381, this@CapelinIntegrationTest.monitor.stealTime) { "Incorrect steal time" } },
                 { assertEquals(0, this@CapelinIntegrationTest.monitor.lostTime) { "Incorrect lost time" } },
-                { assertEquals(5.840845430827075E9, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } },
+                { assertEquals(5.840939264814157E9, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } },
             )
         } finally {
             runner.close()
@@ -164,7 +164,7 @@ class CapelinIntegrationTest {
             { assertEquals(9741207, this@CapelinIntegrationTest.monitor.activeTime) { "Active time incorrect" } },
             { assertEquals(0, this@CapelinIntegrationTest.monitor.stealTime) { "Steal time incorrect" } },
             { assertEquals(0, this@CapelinIntegrationTest.monitor.lostTime) { "Lost time incorrect" } },
-            { assertEquals(7.011413569311495E8, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } }
+            { assertEquals(7.011676470304312E8, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } }
         )
     }
 
diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
index 90350142..2c79da02 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
@@ -68,13 +68,6 @@ public class SimTFDevice(
     )
 
     /**
-     * Metrics collected by the device.
-     */
-    private var _resourceUsage = 0.0
-    private var _powerUsage = 0.0
-    private var _energyUsage = 0.0
-
-    /**
      * The workload that will be run by the device.
      */
     private val workload = object : SimWorkload, FlowSource {
@@ -121,7 +114,7 @@ public class SimTFDevice(
             ctx = conn
             capacity = conn.capacity
             lastPull = now
-            conn.shouldSourceConverge = true
+            conn.shouldSourceConverge = false
         }
 
         override fun onPull(conn: FlowConnection, now: Long): Long {
@@ -156,12 +149,6 @@ public class SimTFDevice(
                 Long.MAX_VALUE
             }
         }
-
-        override fun onConverge(conn: FlowConnection, now: Long) {
-            _resourceUsage = conn.rate
-            _powerUsage = machine.powerUsage
-            _energyUsage = machine.energyUsage
-        }
     }
 
     init {
@@ -183,7 +170,8 @@ public class SimTFDevice(
     }
 
     override fun getDeviceStats(): TFDeviceStats {
-        return TFDeviceStats(_resourceUsage, _powerUsage, _energyUsage)
+        val resourceUsage = machine.cpus.sumOf { it.rate }
+        return TFDeviceStats(resourceUsage, machine.powerUsage, machine.energyUsage)
     }
 
     override fun close() {
diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
index 7d72b48d..328f1326 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
@@ -26,10 +26,12 @@ import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.Test
 import org.junit.jupiter.api.assertAll
 import org.opendc.experiments.tf20.core.SimTFDevice
+import org.opendc.experiments.tf20.distribute.MirroredStrategy
 import org.opendc.experiments.tf20.distribute.OneDeviceStrategy
 import org.opendc.experiments.tf20.util.MLEnvironmentReader
 import org.opendc.simulator.compute.power.LinearPowerModel
 import org.opendc.simulator.core.runBlockingSimulation
+import java.util.*
 
 /**
  * Integration test suite for the TensorFlow application model in OpenDC.
@@ -61,7 +63,7 @@ class TensorFlowTest {
         val stats = device.getDeviceStats()
         assertAll(
             { assertEquals(3309694252, clock.millis()) },
-            { assertEquals(8.2520933087E8, stats.energyUsage) }
+            { assertEquals(8.27423563E8, stats.energyUsage) }
         )
     }
 
@@ -91,7 +93,46 @@ class TensorFlowTest {
         val stats = device.getDeviceStats()
         assertAll(
             { assertEquals(176230322904, clock.millis()) },
-            { assertEquals(4.296544914744E10, stats.energyUsage) }
+            { assertEquals(4.4057580726E10, stats.energyUsage) }
+        )
+    }
+
+    /**
+     * Smoke test that tests the capabilities of the TensorFlow application model in OpenDC.
+     */
+    @Test
+    fun testSmokeDistribute() = runBlockingSimulation {
+        val envInput = checkNotNull(TensorFlowTest::class.java.getResourceAsStream("/kth.json"))
+        val def = MLEnvironmentReader().readEnvironment(envInput).first()
+
+        val deviceA = SimTFDevice(
+            def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0],
+            LinearPowerModel(250.0, 60.0)
+        )
+
+        val deviceB = SimTFDevice(
+            UUID.randomUUID(), def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0],
+            LinearPowerModel(250.0, 60.0)
+        )
+
+        val strategy = MirroredStrategy(listOf(deviceA, deviceB))
+        val batchSize = 32
+        val model = AlexNet(batchSize.toLong())
+        model.use {
+            it.compile(strategy)
+
+            it.fit(epochs = 9088 / batchSize, batchSize = batchSize)
+        }
+
+        deviceA.close()
+        deviceB.close()
+
+        val statsA = deviceA.getDeviceStats()
+        val statsB = deviceB.getDeviceStats()
+        assertAll(
+            { assertEquals(1704994000, clock.millis()) },
+            { assertEquals(4.262485E8, statsA.energyUsage) },
+            { assertEquals(4.262485E8, statsB.energyUsage) }
         )
     }
 }
diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
index 21d30250..051d5730 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
@@ -69,7 +69,7 @@ internal class SimTFDeviceTest {
 
         assertAll(
             { assertEquals(3681, clock.millis()) },
-            { assertEquals(325.75, stats.energyUsage) }
+            { assertEquals(749.25, stats.energyUsage) }
         )
     }
 }
author	Fabian Mastenbroek <mail.fabianm@gmail.com>	2022-06-16 11:08:14 +0200
committer	GitHub <noreply@github.com>	2022-06-16 11:08:14 +0200
commit	8eab5895dcf21b4a3f585c62db14c9a049c81d98 (patch)
tree	b2b698d85f7397ef67485d52128a9390f40f7252 /opendc-experiments
parent	d146814bbbb86bfcb19ccb94250424703e9179e5 (diff)
parent	282f199e6f16350123a915b06faff62ca82be91b (diff)