From a735f1768677fc996da77b239819c55dcd623f5e Mon Sep 17 00:00:00 2001
From: Niels Thiele <noleu66@posteo.net>
Date: Mon, 15 Sep 2025 15:34:38 +0200
Subject: Implements fixes to run m100 traces with GPUs (#362)

* Updated output format to reduce size

* using sum of gpu capacities instead of single max

* passing provisioned GPU cores to host view

* fix supply update trigger

* fixing floating point error, leading to negative demand

* fixing double mismatch due to floating-point imprecision

* adding additional check if demand can be satisfied in the simple way

* adds workload invalidation if remaining duration for all resources is 0

* invalidating flow distributors after demand update

* spotless apply

* updating tests

* exporting power consumption of compute resources directly from gpu instead of PSU

* using BigDecimal to avoid floating-point imprecision

* rolls back to pass-through version of PSU, before GPU implementation

* places flowdistributor between PSU and compute resources

* adds check to avoid null exception if supply is pushed without demand

* fixing task id type

* Adds memorizing GPU scheduler

* adds boundary for negative remaining work

* implemented tests for GPU scheduler filter

* Revert "Updated output format to reduce size"

This reverts commit 7171de8e0512a863df4962f64560ac7bad1fb48d.

* spotless apply

---------

Co-authored-by: DanteNiewenhuis <d.niewenhuis@hotmail.com>
---
 .../simulator/scheduler/FilterSchedulerTest.kt     | 95 ++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'opendc-compute/opendc-compute-simulator/src/test')
diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
index fe5cea70..65fbfb38 100644
--- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
@@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions.assertEquals
 import org.junit.jupiter.api.Test
 import org.junit.jupiter.api.assertAll
 import org.junit.jupiter.api.assertThrows
+import org.opendc.compute.simulator.host.GpuHostModel
 import org.opendc.compute.simulator.host.HostModel
 import org.opendc.compute.simulator.host.HostState
 import org.opendc.compute.simulator.scheduler.filters.ComputeFilter
@@ -37,6 +38,8 @@ import org.opendc.compute.simulator.scheduler.filters.RamFilter
 import org.opendc.compute.simulator.scheduler.filters.SameHostFilter
 import org.opendc.compute.simulator.scheduler.filters.VCpuCapacityFilter
 import org.opendc.compute.simulator.scheduler.filters.VCpuFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuCapacityFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuFilter
 import org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher
 import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher
 import org.opendc.compute.simulator.scheduler.weights.RamWeigher
@@ -436,6 +439,98 @@ internal class FilterSchedulerTest {
         assertEquals(hostB, scheduler.select(mutableListOf(reqB).iterator()).host)
     }
 
+    @Test
+    fun testVGPUFilter() { // scheduler under test: VGpuFilter(1.0) as the only filter, no weighers
+        val scheduler =
+            FilterScheduler(
+                filters = listOf(VGpuFilter(1.0)),
+                weighers = emptyList(),
+            )
+
+        val hostA = mockk<HostView>()
+        every { hostA.host.getState() } returns HostState.UP
+        every { hostA.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostA.provisionedGpuCores } returns 0
+        scheduler.addHost(hostA)
+
+        val hostB = mockk<HostView>()
+        every { hostB.host.getState() } returns HostState.UP
+        every { hostB.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostB.provisionedGpuCores } returns 0
+        scheduler.addHost(hostB)
+
+        val req = mockk<SchedulingRequest>()
+        every { req.task.flavor.gpuCoreCount } returns 9
+        every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 9 * 3200.0)
+        every { req.isCancelled } returns false
+
+        // expects hostB: 9 GPU cores requested; hostA has one GpuHostModel (presumably 8 cores), hostB two
+        assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+    }
+
+    @Test
+    fun testVGPUCapacityFilter() { // scheduler under test: VGpuCapacityFilter() as the only filter, no weighers
+        val scheduler =
+            FilterScheduler(
+                filters = listOf(VGpuCapacityFilter()),
+                weighers = emptyList(),
+            )
+
+        val hostA = mockk<HostView>()
+        every { hostA.host.getState() } returns HostState.UP
+        every { hostA.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostA.availableMemory } returns 512 // NOTE(review): is this memory stub needed here?
+        scheduler.addHost(hostA)
+
+        val hostB = mockk<HostView>()
+        every { hostB.host.getState() } returns HostState.UP
+        every { hostB.host.getModel() } returns
+            HostModel(
+                0.0,
+                0,
+                2048,
+                listOf(
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                    GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+                ),
+            )
+        every { hostB.availableMemory } returns 512 // NOTE(review): is this memory stub needed here?
+        scheduler.addHost(hostB)
+
+        val req = mockk<SchedulingRequest>()
+        every { req.task.flavor.gpuCoreCount } returns 8
+        every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 8 * 3200.0)
+        every { req.isCancelled } returns false
+
+        // expects hostB: requested "gpu-capacity" is 8 * 3200.0; hostA's GpuHostModel is built with 8 * 2600.0
+        assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+    }
+
     @Test
     fun testRamWeigher() {
         val scheduler =
-- 
cgit v1.2.3