From f782fe21fe507f0dda74fe7b3932d44d0276a0e8 Mon Sep 17 00:00:00 2001 From: Wenchen Lai Date: Sun, 9 May 2021 19:47:17 +0200 Subject: exp: Model TensorFlow compute devices --- .../opendc/experiments/tf20/core/SimTFDevice.kt | 200 +++++++++++++++++++++ .../org/opendc/experiments/tf20/core/TFDevice.kt | 50 ++++++ .../experiments/tf20/core/SimTFDeviceTest.kt | 62 +++++++ 3 files changed, 312 insertions(+) create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/TFDevice.kt create mode 100644 opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt (limited to 'opendc-experiments') diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt new file mode 100644 index 00000000..f4c18ff1 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.core + +import io.opentelemetry.api.metrics.Meter +import io.opentelemetry.api.metrics.common.Labels +import kotlinx.coroutines.* +import org.opendc.simulator.compute.SimBareMetalMachine +import org.opendc.simulator.compute.SimMachine +import org.opendc.simulator.compute.SimMachineContext +import org.opendc.simulator.compute.SimMachineModel +import org.opendc.simulator.compute.cpufreq.PerformanceScalingGovernor +import org.opendc.simulator.compute.cpufreq.SimpleScalingDriver +import org.opendc.simulator.compute.model.MemoryUnit +import org.opendc.simulator.compute.model.ProcessingUnit +import org.opendc.simulator.compute.power.PowerModel +import org.opendc.simulator.compute.workload.SimWorkload +import org.opendc.simulator.resources.SimResourceCommand +import org.opendc.simulator.resources.SimResourceConsumer +import org.opendc.simulator.resources.SimResourceContext +import org.opendc.simulator.resources.SimResourceEvent +import java.time.Clock +import java.util.* +import kotlin.coroutines.Continuation +import kotlin.coroutines.CoroutineContext +import kotlin.coroutines.resume + +/** + * A [TFDevice] implementation using simulated components. + */ +public class SimTFDevice( + override val uid: UUID, + override val isGpu: Boolean, + context: CoroutineContext, + clock: Clock, + meter: Meter, + private val pu: ProcessingUnit, + private val memory: MemoryUnit, + powerModel: PowerModel +) : TFDevice { + /** + * The scope in which the device runs. + */ + private val scope = CoroutineScope(context + Job()) + + /** + * The [SimMachine] representing the device. + */ + private val machine = SimBareMetalMachine( + scope.coroutineContext, clock, SimMachineModel(listOf(pu), listOf(memory)), + PerformanceScalingGovernor(), SimpleScalingDriver(powerModel) + ) + + /** + * The usage of the device. + */ + private val _usage = meter.doubleValueRecorderBuilder("device.usage") + .setDescription("The amount of device resources used") + .setUnit("MHz") + .build() + .bind(Labels.of("device", uid.toString())) + + /** + * The power draw of the device. + */ + private val _power = meter.doubleValueRecorderBuilder("device.power") + .setDescription("The power draw of the device") + .setUnit("W") + .build() + .bind(Labels.of("device", uid.toString())) + + /** + * The workload that will be run by the device. + */ + private val workload = object : SimWorkload, SimResourceConsumer { + /** + * The resource context to interrupt the workload with. + */ + var ctx: SimResourceContext? = null + + /** + * The capacity of the device. + */ + private var capacity: Double = 0.0 + + /** + * The queue of work to run. + */ + val queue = ArrayDeque() + + /** + * A flag to indicate that the workload is idle. + */ + val isIdle + get() = activeWork == null + + /** + * The active work of the workload. + */ + private var activeWork: Work? = null + + override fun onStart(ctx: SimMachineContext) {} + + override fun getConsumer(ctx: SimMachineContext, cpu: ProcessingUnit): SimResourceConsumer = this + + override fun onNext(ctx: SimResourceContext): SimResourceCommand { + val activeWork = activeWork + if (activeWork != null) { + if (activeWork.consume(activeWork.flops - ctx.remainingWork)) { + this.activeWork = null + } else { + return SimResourceCommand.Consume(activeWork.flops, ctx.capacity) + } + } + + val queue = queue + val head = queue.poll() + return if (head != null) { + this.activeWork = head + SimResourceCommand.Consume(head.flops, ctx.capacity) + } else { + SimResourceCommand.Idle() + } + } + + override fun onEvent(ctx: SimResourceContext, event: SimResourceEvent) { + when (event) { + SimResourceEvent.Start -> { + this.ctx = ctx + this.capacity = ctx.capacity + } + SimResourceEvent.Capacity -> { + this.capacity = ctx.capacity + ctx.interrupt() + } + SimResourceEvent.Run -> { + _usage.record(ctx.speed) + _power.record(machine.powerDraw) + } + else -> {} + } + } + } + + init { + scope.launch { + machine.run(workload) + } + } + + override suspend fun load(dataSize: Long) { + val duration = dataSize / memory.speed * 1000 + delay(duration.toLong()) + } + + override suspend fun compute(flops: Double) = suspendCancellableCoroutine { cont -> + workload.queue.add(Work(flops, cont)) + if (workload.isIdle) { + workload.ctx?.interrupt() + } + } + + override fun close() { + machine.close() + scope.cancel() + } + + private data class Work(var flops: Double, val cont: Continuation) { + fun consume(flops: Double): Boolean { + this.flops -= flops + + if (this.flops <= 0) { + cont.resume(Unit) + return true + } + + return false + } + } +} diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/TFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/TFDevice.kt new file mode 100644 index 00000000..bbc34ed9 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/TFDevice.kt @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.core + +import java.util.* + +/** + * A compute device on which tensor operations are performed. + */ +public interface TFDevice : AutoCloseable { + /** + * The unique identifier of the device. + */ + public val uid: UUID + + /** + * A flag to indicate whether the device is a GPU. + */ + public val isGpu: Boolean + + /** + * Transfer the specified amount of data from memory. + */ + public suspend fun load(dataSize: Long) + + /** + * Perform [flops] amount of computation on the device. + */ + public suspend fun compute(flops: Double) +} diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt new file mode 100644 index 00000000..28a2a319 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.core + +import io.opentelemetry.api.metrics.MeterProvider +import kotlinx.coroutines.coroutineScope +import kotlinx.coroutines.launch +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import org.opendc.simulator.compute.model.MemoryUnit +import org.opendc.simulator.compute.model.ProcessingNode +import org.opendc.simulator.compute.model.ProcessingUnit +import org.opendc.simulator.compute.power.LinearPowerModel +import org.opendc.simulator.core.runBlockingSimulation +import java.util.* + +/** + * Test suite for the [SimTFDevice] class. + */ +internal class SimTFDeviceTest { + @Test + fun testSmoke() = runBlockingSimulation { + val meterProvider: MeterProvider = MeterProvider.noop() + val meter = meterProvider.get("opendc-tf20") + + val puNode = ProcessingNode("NVIDIA", "Tesla V100", "unknown", 1) + val pu = ProcessingUnit(puNode, 0, 960 * 1230.0) + val memory = MemoryUnit("NVIDIA", "Tesla V100", 877.0, 32_000) + + val device = SimTFDevice(UUID.randomUUID(), isGpu = true, coroutineContext, clock, meter, pu, memory, LinearPowerModel(250.0, 100.0)) + + // Load 1 GiB into GPU memory + device.load(1000) + assertEquals(1140, clock.millis()) + + coroutineScope { + launch { device.compute(1e6) } + launch { device.compute(2e6) } + } + assertEquals(3681, clock.millis()) + } +} -- cgit v1.2.3