Implements fixes to run m100 traces with GPUs (#362)

* Updated output format to reduce size * using sum of GPU capacities instead of single max * passing provisioned GPU cores to host view * fix supply update trigger * fixing floating-point error leading to negative demand * fixing double mismatch due to floating-point imprecision * adding additional check if demand can be satisfied in the simple way * adds workload invalidation if remaining duration for all resources is 0 * invalidating flow distributors after demand update * spotless apply * updating tests * exporting power consumption of compute resources directly from GPU instead of PSU * using BigDecimal to avoid floating-point imprecision * rolls back to pass-through version of PSU, before GPU implementation * places flow distributor between PSU and compute resources * adds check to avoid null exception if supply is pushed without demand * fixing task id type * Adds memorizing GPU scheduler * adds boundary for negative remaining work * implemented tests for GPU scheduler filter * Revert "Updated output format to reduce size" This reverts commit 7171de8e0512a863df4962f64560ac7bad1fb48d. * spotless apply --------- Co-authored-by: DanteNiewenhuis <d.niewenhuis@hotmail.com>
author: Niels Thiele <noleu66@posteo.net> 2025-09-15 15:34:38 +0200
committer: GitHub <noreply@github.com> 2025-09-15 15:34:38 +0200
commit: a735f1768677fc996da77b239819c55dcd623f5e (patch)
tree: 703237990dc7d178a7600c2795fbc32d2cd12aa8 /opendc-compute/opendc-compute-simulator/src
parent: 5f539debbe18c9cf5c6c159c098f02f1d239f324 (diff)
6 files changed, 113 insertions, 6 deletions
diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
index fde83ead..8feddf54 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
+++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
@@ -186,6 +186,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver {
                     hv.provisionedCpuCores -= flavor.getCpuCoreCount();
                     hv.instanceCount--;
                     hv.availableMemory += flavor.getMemorySize();
+                    hv.provisionedGpuCores -= flavor.getGpuCoreCount();
                 } else {
                     LOGGER.error("Unknown host {}", host);
                 }
@@ -580,6 +581,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver {
                 hv.instanceCount++;
                 hv.provisionedCpuCores += flavor.getCpuCoreCount();
                 hv.availableMemory -= flavor.getMemorySize();
+                hv.provisionedGpuCores += flavor.getGpuCoreCount();
 
                 activeTasks.put(task, host);
             } catch (Exception cause) {
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
index b7d3b730..1a0cc316 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
@@ -361,7 +361,6 @@ public class SimHost(
         for (gpu in simMachine!!.gpus) {
             gpu.updateCounters(this.clock.millis())
             val counters = simMachine!!.getGpuPerformanceCounters(gpu.id)
-            val powerDraw = simMachine!!.psu.getPowerDraw(ResourceType.GPU, gpu.id)
 
             gpuStats.add(
                 HostGpuStats(
@@ -373,7 +372,7 @@ public class SimHost(
                     counters.demand,
                     counters.supply,
                     counters.supply / gpu.getCapacity(ResourceType.GPU),
-                    powerDraw,
+                    counters.powerDraw,
                 ),
             )
         }
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
index 0376a492..79af6f62 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
@@ -52,6 +52,7 @@ public enum class ComputeSchedulerEnum {
     Timeshift,
     ProvisionedCpuGpuCores,
     ProvisionedCpuGpuCoresInv,
+    GpuTaskMemorizing,
 }
 
 public fun createPrefabComputeScheduler(
@@ -159,5 +160,15 @@ public fun createPrefabComputeScheduler(
                         VGpuWeigher(gpuAllocationRatio, multiplier = -1.0),
                     ),
             )
+        ComputeSchedulerEnum.GpuTaskMemorizing ->
+            MemorizingScheduler(
+                filters =
+                    listOf(
+                        ComputeFilter(),
+                        VCpuFilter(cpuAllocationRatio),
+                        VGpuFilter(gpuAllocationRatio),
+                        RamFilter(ramAllocationRatio),
+                    ),
+            )
     }
 }
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
index 6dc27327..5f517257 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
@@ -42,7 +42,7 @@ public class VGpuCapacityFilter : HostFilter {
 
         return (
             requiredCapacity == null ||
-                ((availableCapacity / availableCores) >= (requiredCapacity / task.flavor.gpuCoreCount))
+                (availableRatio >= (requiredCapacity / task.flavor.gpuCoreCount))
         )
     }
 }
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
index 9f564776..f47013b1 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
@@ -26,9 +26,9 @@ import org.opendc.compute.simulator.service.HostView
 import org.opendc.compute.simulator.service.ServiceTask
 
 /**
- * A [HostFilter] that filters hosts based on the vCPU requirements of a [ServiceTask] and the available vCPUs on the host.
+ * A [HostFilter] that filters hosts based on the vGPU requirements of a [ServiceTask] and the available vGPUs on the host.
  *
- * @param allocationRatio Virtual CPU to physical CPU allocation ratio.
+ * @param allocationRatio Virtual GPU to physical GPU allocation ratio.
  */
 public class VGpuFilter(private val allocationRatio: Double) : HostFilter {
     override fun test(
@@ -36,7 +36,7 @@ public class VGpuFilter(private val allocationRatio: Double) : HostFilter {
         task: ServiceTask,
     ): Boolean {
         val requested = task.flavor.gpuCoreCount
-        val totalCores = host.host.getModel().gpuHostModels().maxOfOrNull { it.gpuCoreCount() } ?: 0
+        val totalCores = host.host.getModel().gpuHostModels()?.sumOf { it.gpuCoreCount() } ?: 0
         val limit = totalCores * allocationRatio
 
         // Do not allow an instance to overcommit against itself, only against other instances
diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
index fe5cea70..65fbfb38 100644
--- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
@@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.Test
 import org.junit.jupiter.api.assertAll
 import org.junit.jupiter.api.assertThrows
+import org.opendc.compute.simulator.host.GpuHostModel
 import org.opendc.compute.simulator.host.HostModel
 import org.opendc.compute.simulator.host.HostState
 import org.opendc.compute.simulator.scheduler.filters.ComputeFilter
@@ -37,6 +38,8 @@ import org.opendc.compute.simulator.scheduler.filters.RamFilter
 import org.opendc.compute.simulator.scheduler.filters.SameHostFilter
 import org.opendc.compute.simulator.scheduler.filters.VCpuCapacityFilter
 import org.opendc.compute.simulator.scheduler.filters.VCpuFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuCapacityFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuFilter
 import org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher
 import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher
 import org.opendc.compute.simulator.scheduler.weights.RamWeigher
@@ -437,6 +440,98 @@ internal class FilterSchedulerTest {
     }
 
     @Test
+    fun testVGPUFilter() {
+        val scheduler =
+            FilterScheduler(
+                filters = listOf(VGpuFilter(1.0)),
+                weighers = emptyList(),
+            )
+
+        val hostA = mockk<HostView>()
+        every { hostA.host.getState() } returns HostState.UP
+        every { hostA.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostA.provisionedGpuCores } returns 0
+        scheduler.addHost(hostA)
+
+        val hostB = mockk<HostView>()
+        every { hostB.host.getState() } returns HostState.UP
+        every { hostB.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostB.provisionedGpuCores } returns 0
+        scheduler.addHost(hostB)
+
+        val req = mockk<SchedulingRequest>()
+        every { req.task.flavor.gpuCoreCount } returns 9
+        every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 9 * 3200.0)
+        every { req.isCancelled } returns false
+
+        // filter selects hostB because hostA does not have enough GPU cores (8 available, 9 requested)
+        assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+    }
+
+    @Test
+    fun testVGPUCapacityFilter() {
+        val scheduler =
+            FilterScheduler(
+                filters = listOf(VGpuCapacityFilter()),
+                weighers = emptyList(),
+            )
+
+        val hostA = mockk<HostView>()
+        every { hostA.host.getState() } returns HostState.UP
+        every { hostA.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostA.availableMemory } returns 512
+        scheduler.addHost(hostA)
+
+        val hostB = mockk<HostView>()
+        every { hostB.host.getState() } returns HostState.UP
+        every { hostB.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostB.availableMemory } returns 512
+        scheduler.addHost(hostB)
+
+        val req = mockk<SchedulingRequest>()
+        every { req.task.flavor.gpuCoreCount } returns 8
+        every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 8 * 3200.0)
+        every { req.isCancelled } returns false
+
+        // filter selects hostB because hostA does not have enough GPU capacity
+        assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+    }
+
+    @Test
     fun testRamWeigher() {
         val scheduler =
             FilterScheduler(
author	Niels Thiele <noleu66@posteo.net>	2025-09-15 15:34:38 +0200
committer	GitHub <noreply@github.com>	2025-09-15 15:34:38 +0200
commit	a735f1768677fc996da77b239819c55dcd623f5e (patch)
tree	703237990dc7d178a7600c2795fbc32d2cd12aa8 /opendc-compute/opendc-compute-simulator/src
parent	5f539debbe18c9cf5c6c159c098f02f1d239f324 (diff)