From a735f1768677fc996da77b239819c55dcd623f5e Mon Sep 17 00:00:00 2001 From: Niels Thiele Date: Mon, 15 Sep 2025 15:34:38 +0200 Subject: Implements fixes to run m100 traces with GPUs (#362) * Updated output format to reduce size * using sum of gpu capacities instead of single max * passing provisioned GPU cores to host view * fix supply update trigger * fixing floating-point error, leading to negative demand * fixing double mismatch, due to floating-point imprecision * adding additional check if demand can be satisfied in the simple way * adds workload invalidation if remaining duration for all resources is 0 * invalidating flow distributors after demand update * spotless apply * updating tests * exporting power consumption of compute resources directly from gpu instead of PSU * using big decimal to avoid floating-point imprecision * rolls back to pass-through version of PSU, before GPU implementation * places flowdistributor between PSU and compute resources * adds check to avoid null exception if supply is pushed without demand * fixing task id type * Adds memorizing GPU scheduler * adds boundary for negative remaining work * implemented tests for GPU scheduler filter * Revert "Updated output format to reduce size" This reverts commit 7171de8e0512a863df4962f64560ac7bad1fb48d. 
* spotless apply --------- Co-authored-by: DanteNiewenhuis --- .../compute/simulator/service/ComputeService.java | 2 + .../org/opendc/compute/simulator/host/SimHost.kt | 3 +- .../simulator/scheduler/ComputeSchedulers.kt | 11 +++ .../scheduler/filters/VGpuCapacityFilter.kt | 2 +- .../simulator/scheduler/filters/VGpuFilter.kt | 6 +- .../simulator/scheduler/FilterSchedulerTest.kt | 95 ++++++++++++++++++++++ 6 files changed, 113 insertions(+), 6 deletions(-) (limited to 'opendc-compute/opendc-compute-simulator/src') diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java index fde83ead..8feddf54 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java @@ -186,6 +186,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { hv.provisionedCpuCores -= flavor.getCpuCoreCount(); hv.instanceCount--; hv.availableMemory += flavor.getMemorySize(); + hv.provisionedGpuCores -= flavor.getGpuCoreCount(); } else { LOGGER.error("Unknown host {}", host); } @@ -580,6 +581,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { hv.instanceCount++; hv.provisionedCpuCores += flavor.getCpuCoreCount(); hv.availableMemory -= flavor.getMemorySize(); + hv.provisionedGpuCores += flavor.getGpuCoreCount(); activeTasks.put(task, host); } catch (Exception cause) { diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt index b7d3b730..1a0cc316 100644 --- 
a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt @@ -361,7 +361,6 @@ public class SimHost( for (gpu in simMachine!!.gpus) { gpu.updateCounters(this.clock.millis()) val counters = simMachine!!.getGpuPerformanceCounters(gpu.id) - val powerDraw = simMachine!!.psu.getPowerDraw(ResourceType.GPU, gpu.id) gpuStats.add( HostGpuStats( @@ -373,7 +372,7 @@ public class SimHost( counters.demand, counters.supply, counters.supply / gpu.getCapacity(ResourceType.GPU), - powerDraw, + counters.powerDraw, ), ) } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt index 0376a492..79af6f62 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt @@ -52,6 +52,7 @@ public enum class ComputeSchedulerEnum { Timeshift, ProvisionedCpuGpuCores, ProvisionedCpuGpuCoresInv, + GpuTaskMemorizing, } public fun createPrefabComputeScheduler( @@ -159,5 +160,15 @@ public fun createPrefabComputeScheduler( VGpuWeigher(gpuAllocationRatio, multiplier = -1.0), ), ) + ComputeSchedulerEnum.GpuTaskMemorizing -> + MemorizingScheduler( + filters = + listOf( + ComputeFilter(), + VCpuFilter(cpuAllocationRatio), + VGpuFilter(gpuAllocationRatio), + RamFilter(ramAllocationRatio), + ), + ) } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt index 6dc27327..5f517257 
100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt @@ -42,7 +42,7 @@ public class VGpuCapacityFilter : HostFilter { return ( requiredCapacity == null || - ((availableCapacity / availableCores) >= (requiredCapacity / task.flavor.gpuCoreCount)) + (availableRatio >= (requiredCapacity / task.flavor.gpuCoreCount)) ) } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt index 9f564776..f47013b1 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt @@ -26,9 +26,9 @@ import org.opendc.compute.simulator.service.HostView import org.opendc.compute.simulator.service.ServiceTask /** - * A [HostFilter] that filters hosts based on the vCPU requirements of a [ServiceTask] and the available vCPUs on the host. + * A [HostFilter] that filters hosts based on the vGPU requirements of a [ServiceTask] and the available vGPUs on the host. * - * @param allocationRatio Virtual CPU to physical CPU allocation ratio. + * @param allocationRatio Virtual GPU to physical GPU allocation ratio. 
*/ public class VGpuFilter(private val allocationRatio: Double) : HostFilter { override fun test( @@ -36,7 +36,7 @@ public class VGpuFilter(private val allocationRatio: Double) : HostFilter { task: ServiceTask, ): Boolean { val requested = task.flavor.gpuCoreCount - val totalCores = host.host.getModel().gpuHostModels().maxOfOrNull { it.gpuCoreCount() } ?: 0 + val totalCores = host.host.getModel().gpuHostModels()?.sumOf { it.gpuCoreCount() } ?: 0 val limit = totalCores * allocationRatio // Do not allow an instance to overcommit against itself, only against other instances diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt index fe5cea70..65fbfb38 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt @@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions.assertEquals import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertAll import org.junit.jupiter.api.assertThrows +import org.opendc.compute.simulator.host.GpuHostModel import org.opendc.compute.simulator.host.HostModel import org.opendc.compute.simulator.host.HostState import org.opendc.compute.simulator.scheduler.filters.ComputeFilter @@ -37,6 +38,8 @@ import org.opendc.compute.simulator.scheduler.filters.RamFilter import org.opendc.compute.simulator.scheduler.filters.SameHostFilter import org.opendc.compute.simulator.scheduler.filters.VCpuCapacityFilter import org.opendc.compute.simulator.scheduler.filters.VCpuFilter +import org.opendc.compute.simulator.scheduler.filters.VGpuCapacityFilter +import org.opendc.compute.simulator.scheduler.filters.VGpuFilter import 
org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher import org.opendc.compute.simulator.scheduler.weights.RamWeigher @@ -436,6 +439,98 @@ internal class FilterSchedulerTest { assertEquals(hostB, scheduler.select(mutableListOf(reqB).iterator()).host) } + @Test + fun testVGPUFilter() { + val scheduler = + FilterScheduler( + filters = listOf(VGpuFilter(1.0)), + weighers = emptyList(), + ) + + val hostA = mockk() + every { hostA.host.getState() } returns HostState.UP + every { hostA.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 2600.0, 8, 0L, 0.0), + ), + ) + every { hostA.provisionedGpuCores } returns 0 + scheduler.addHost(hostA) + + val hostB = mockk() + every { hostB.host.getState() } returns HostState.UP + every { hostB.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + ), + ) + every { hostB.provisionedGpuCores } returns 0 + scheduler.addHost(hostB) + + val req = mockk() + every { req.task.flavor.gpuCoreCount } returns 9 + every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 9 * 3200.0) + every { req.isCancelled } returns false + + // filter selects hostB because hostA does not have enough GPU capacity + assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host) + } + + @Test + fun testVGPUCapacityFilter() { + val scheduler = + FilterScheduler( + filters = listOf(VGpuCapacityFilter()), + weighers = emptyList(), + ) + + val hostA = mockk() + every { hostA.host.getState() } returns HostState.UP + every { hostA.host.getModel() } returns + HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 2600.0, 8, 0L, 0.0), + ), + ) + every { hostA.availableMemory } returns 512 + scheduler.addHost(hostA) + + val hostB = mockk() + every { hostB.host.getState() } returns HostState.UP + every { hostB.host.getModel() } returns + 
HostModel( + 0.0, + 0, + 2048, + listOf( + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + GpuHostModel(8 * 3200.0, 8, 0L, 0.0), + ), + ) + every { hostB.availableMemory } returns 512 + scheduler.addHost(hostB) + + val req = mockk() + every { req.task.flavor.gpuCoreCount } returns 8 + every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 8 * 3200.0) + every { req.isCancelled } returns false + + // filter selects hostB because hostA does not have enough GPU capacity + assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host) + } + @Test fun testRamWeigher() { val scheduler = -- cgit v1.2.3