From a735f1768677fc996da77b239819c55dcd623f5e Mon Sep 17 00:00:00 2001 From: Niels Thiele Date: Mon, 15 Sep 2025 15:34:38 +0200 Subject: Implements fixes to run m100 traces with GPUs (#362) * Updated output format to reduce size * using sum of gpu capacities instead of single max * passing provisioned GPU cores to host view * fix supply update trigger * fixing floating point error, leading to negative demand * fixing double mismatch, due to floating point imprecision * adding additional check if demand can be satisfied in the simple way * adds workload invalidation if remaining duration for all resources is 0 * invalidating flow distributors after demand update * spotless apply * updating tests * exporting power consumption of compute resources directly from gpu instead of PSU * using big decimal to avoid floating point imprecision * rolls back to pass-through version of PSU, before GPU implementation * places flowdistributor between PSU and compute resources * adds check to avoid null exception if supply is pushed without demand * fixing task id type * Adds memorizing GPU scheduler * adds boundary for negative remaining work * implemented tests for GPU scheduler filter * Revert "Updated output format to reduce size" This reverts commit 7171de8e0512a863df4962f64560ac7bad1fb48d. 
* spotless apply --------- Co-authored-by: DanteNiewenhuis --- .../simulator/scheduler/FilterSchedulerTest.kt | 95 ++++++++++++++++++++++ 1 file changed, 95 insertions(+) (limited to 'opendc-compute/opendc-compute-simulator/src/test') diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt index fe5cea70..65fbfb38 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertAll import org.junit.jupiter.api.assertThrows +import org.opendc.compute.simulator.host.GpuHostModel import org.opendc.compute.simulator.host.HostModel import org.opendc.compute.simulator.host.HostState import org.opendc.compute.simulator.scheduler.filters.ComputeFilter @@ -37,6 +38,8 @@ import org.opendc.compute.simulator.scheduler.filters.RamFilter import org.opendc.compute.simulator.scheduler.filters.SameHostFilter import org.opendc.compute.simulator.scheduler.filters.VCpuCapacityFilter import org.opendc.compute.simulator.scheduler.filters.VCpuFilter +import org.opendc.compute.simulator.scheduler.filters.VGpuCapacityFilter +import org.opendc.compute.simulator.scheduler.filters.VGpuFilter import org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher import org.opendc.compute.simulator.scheduler.weights.RamWeigher @@ -436,6 +439,98 @@ internal class FilterSchedulerTest { assertEquals(hostB, scheduler.select(mutableListOf(reqB).iterator()).host) } + @Test + fun testVGPUFilter() { + val scheduler = 
+ FilterScheduler( + filters = listOf(VGpuFilter(1.0)), + weighers = emptyList(), + ) + + val hostA = mockk() + every { hostA.host.getState() } returns HostState.UP + every { hostA.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 2600.0, 8, 0L, 0.0), + ), + ) + every { hostA.provisionedGpuCores } returns 0 + scheduler.addHost(hostA) + + val hostB = mockk() + every { hostB.host.getState() } returns HostState.UP + every { hostB.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + ), + ) + every { hostB.provisionedGpuCores } returns 0 + scheduler.addHost(hostB) + + val req = mockk() + every { req.task.flavor.gpuCoreCount } returns 9 + every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 9 * 3200.0) + every { req.isCancelled } returns false + + // filter selects hostB because hostA does not have enough GPU capacity + assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host) + } + + @Test + fun testVGPUCapacityFilter() { + val scheduler = + FilterScheduler( + filters = listOf(VGpuCapacityFilter()), + weighers = emptyList(), + ) + + val hostA = mockk() + every { hostA.host.getState() } returns HostState.UP + every { hostA.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 2600.0, 8, 0L, 0.0), + ), + ) + every { hostA.availableMemory } returns 512 + scheduler.addHost(hostA) + + val hostB = mockk() + every { hostB.host.getState() } returns HostState.UP + every { hostB.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + ), + ) + every { hostB.availableMemory } returns 512 + scheduler.addHost(hostB) + + val req = mockk() + every { req.task.flavor.gpuCoreCount } returns 8 + every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 8 * 3200.0) + every { req.isCancelled } returns 
false + + // filter selects hostB because hostA does not have enough GPU capacity + assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host) + } + @Test fun testRamWeigher() { val scheduler = -- cgit v1.2.3