summary | refs | log | tree | commit | diff
path: root/opendc-experiments
diff options
context:
space:
mode:
author Fabian Mastenbroek <mail.fabianm@gmail.com> 2022-06-16 11:08:14 +0200
committer GitHub <noreply@github.com> 2022-06-16 11:08:14 +0200
commit 8eab5895dcf21b4a3f585c62db14c9a049c81d98 (patch)
tree b2b698d85f7397ef67485d52128a9390f40f7252 /opendc-experiments
parent d146814bbbb86bfcb19ccb94250424703e9179e5 (diff)
parent 282f199e6f16350123a915b06faff62ca82be91b (diff)
merge: Fix distributed strategy for TensorFlow experiment (#89)
This pull request fixes an issue where the distributed strategies for the TensorFlow experiments did not work correctly.

## Implementation Notes :hammer_and_pick:

* Limit growth rate for trace construction
* Derive device statistics directly from SimMachine
* Always recompute power usage when a `SimBareMetalMachine` converges
* Add a test case for `MirroredStrategy`
Diffstat (limited to 'opendc-experiments')
-rw-r--r-- opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt 4
-rw-r--r-- opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt 18
-rw-r--r-- opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt 45
-rw-r--r-- opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt 2
4 files changed, 49 insertions, 20 deletions
diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
index fa2cd9c8..d7b7caad 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt
@@ -116,7 +116,7 @@ class CapelinIntegrationTest {
{ assertEquals(66977508, this@CapelinIntegrationTest.monitor.activeTime) { "Incorrect active time" } },
{ assertEquals(3160381, this@CapelinIntegrationTest.monitor.stealTime) { "Incorrect steal time" } },
{ assertEquals(0, this@CapelinIntegrationTest.monitor.lostTime) { "Incorrect lost time" } },
- { assertEquals(5.840845430827075E9, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } },
+ { assertEquals(5.840939264814157E9, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } },
)
} finally {
runner.close()
@@ -164,7 +164,7 @@ class CapelinIntegrationTest {
{ assertEquals(9741207, this@CapelinIntegrationTest.monitor.activeTime) { "Active time incorrect" } },
{ assertEquals(0, this@CapelinIntegrationTest.monitor.stealTime) { "Steal time incorrect" } },
{ assertEquals(0, this@CapelinIntegrationTest.monitor.lostTime) { "Lost time incorrect" } },
- { assertEquals(7.011413569311495E8, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } }
+ { assertEquals(7.011676470304312E8, this@CapelinIntegrationTest.monitor.energyUsage, 0.01) { "Incorrect power draw" } }
)
}
diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
index 90350142..2c79da02 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt
@@ -68,13 +68,6 @@ public class SimTFDevice(
)
/**
- * Metrics collected by the device.
- */
- private var _resourceUsage = 0.0
- private var _powerUsage = 0.0
- private var _energyUsage = 0.0
-
- /**
* The workload that will be run by the device.
*/
private val workload = object : SimWorkload, FlowSource {
@@ -121,7 +114,7 @@ public class SimTFDevice(
ctx = conn
capacity = conn.capacity
lastPull = now
- conn.shouldSourceConverge = true
+ conn.shouldSourceConverge = false
}
override fun onPull(conn: FlowConnection, now: Long): Long {
@@ -156,12 +149,6 @@ public class SimTFDevice(
Long.MAX_VALUE
}
}
-
- override fun onConverge(conn: FlowConnection, now: Long) {
- _resourceUsage = conn.rate
- _powerUsage = machine.powerUsage
- _energyUsage = machine.energyUsage
- }
}
init {
@@ -183,7 +170,8 @@ public class SimTFDevice(
}
override fun getDeviceStats(): TFDeviceStats {
- return TFDeviceStats(_resourceUsage, _powerUsage, _energyUsage)
+ val resourceUsage = machine.cpus.sumOf { it.rate }
+ return TFDeviceStats(resourceUsage, machine.powerUsage, machine.energyUsage)
}
override fun close() {
diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
index 7d72b48d..328f1326 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/TensorFlowTest.kt
@@ -26,10 +26,12 @@ import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.assertAll
import org.opendc.experiments.tf20.core.SimTFDevice
+import org.opendc.experiments.tf20.distribute.MirroredStrategy
import org.opendc.experiments.tf20.distribute.OneDeviceStrategy
import org.opendc.experiments.tf20.util.MLEnvironmentReader
import org.opendc.simulator.compute.power.LinearPowerModel
import org.opendc.simulator.core.runBlockingSimulation
+import java.util.*
/**
* Integration test suite for the TensorFlow application model in OpenDC.
@@ -61,7 +63,7 @@ class TensorFlowTest {
val stats = device.getDeviceStats()
assertAll(
{ assertEquals(3309694252, clock.millis()) },
- { assertEquals(8.2520933087E8, stats.energyUsage) }
+ { assertEquals(8.27423563E8, stats.energyUsage) }
)
}
@@ -91,7 +93,46 @@ class TensorFlowTest {
val stats = device.getDeviceStats()
assertAll(
{ assertEquals(176230322904, clock.millis()) },
- { assertEquals(4.296544914744E10, stats.energyUsage) }
+ { assertEquals(4.4057580726E10, stats.energyUsage) }
+ )
+ }
+
+ /**
+ * Smoke test that tests the capabilities of the TensorFlow application model in OpenDC.
+ */
+ @Test
+ fun testSmokeDistribute() = runBlockingSimulation {
+ val envInput = checkNotNull(TensorFlowTest::class.java.getResourceAsStream("/kth.json"))
+ val def = MLEnvironmentReader().readEnvironment(envInput).first()
+
+ val deviceA = SimTFDevice(
+ def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0],
+ LinearPowerModel(250.0, 60.0)
+ )
+
+ val deviceB = SimTFDevice(
+ UUID.randomUUID(), def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0],
+ LinearPowerModel(250.0, 60.0)
+ )
+
+ val strategy = MirroredStrategy(listOf(deviceA, deviceB))
+ val batchSize = 32
+ val model = AlexNet(batchSize.toLong())
+ model.use {
+ it.compile(strategy)
+
+ it.fit(epochs = 9088 / batchSize, batchSize = batchSize)
+ }
+
+ deviceA.close()
+ deviceB.close()
+
+ val statsA = deviceA.getDeviceStats()
+ val statsB = deviceB.getDeviceStats()
+ assertAll(
+ { assertEquals(1704994000, clock.millis()) },
+ { assertEquals(4.262485E8, statsA.energyUsage) },
+ { assertEquals(4.262485E8, statsB.energyUsage) }
)
}
}
diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
index 21d30250..051d5730 100644
--- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
+++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt
@@ -69,7 +69,7 @@ internal class SimTFDeviceTest {
assertAll(
{ assertEquals(3681, clock.millis()) },
- { assertEquals(325.75, stats.energyUsage) }
+ { assertEquals(749.25, stats.energyUsage) }
)
}
}