summaryrefslogtreecommitdiff
path: root/opendc-compute/opendc-compute-simulator/src
diff options
context:
space:
mode:
authorNiels Thiele <noleu66@posteo.net>2025-09-15 15:34:38 +0200
committerGitHub <noreply@github.com>2025-09-15 15:34:38 +0200
commita735f1768677fc996da77b239819c55dcd623f5e (patch)
tree703237990dc7d178a7600c2795fbc32d2cd12aa8 /opendc-compute/opendc-compute-simulator/src
parent5f539debbe18c9cf5c6c159c098f02f1d239f324 (diff)
Implements fixes to run m100 traces with GPUs (#362)
* Updated output format to reduce size * using sum of gpu capacities instead of single max * passing provisioned GPU cores to host view * fix supply update trigger * fixing floating-point error, leading to negative demand * fixing double mismatch, due to floating-point imprecision * adding additional check if demand can be satisfied in the simple way * adds workload invalidation if remaining duration for all resources is 0 * invalidating flow distributors after demand update * spotless apply * updating tests * exporting power consumption of compute resources directly from gpu instead of PSU * using big decimal to avoid floating-point imprecision * rolls back to pass-through version of PSU, before GPU implementation * places flowdistributor between PSU and compute resources * adds check to avoid null exception if supply is pushed without demand * fixing task id type * Adds memorizing GPU scheduler * adds boundary for negative remaining work * implemented tests for GPU scheduler filter * Revert "Updated output format to reduce size" This reverts commit 7171de8e0512a863df4962f64560ac7bad1fb48d. * spotless apply --------- Co-authored-by: DanteNiewenhuis <d.niewenhuis@hotmail.com>
Diffstat (limited to 'opendc-compute/opendc-compute-simulator/src')
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java2
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt3
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt11
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt2
-rw-r--r--opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt6
-rw-r--r--opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt95
6 files changed, 113 insertions, 6 deletions
diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
index fde83ead..8feddf54 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
+++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java
@@ -186,6 +186,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver {
hv.provisionedCpuCores -= flavor.getCpuCoreCount();
hv.instanceCount--;
hv.availableMemory += flavor.getMemorySize();
+ hv.provisionedGpuCores -= flavor.getGpuCoreCount();
} else {
LOGGER.error("Unknown host {}", host);
}
@@ -580,6 +581,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver {
hv.instanceCount++;
hv.provisionedCpuCores += flavor.getCpuCoreCount();
hv.availableMemory -= flavor.getMemorySize();
+ hv.provisionedGpuCores += flavor.getGpuCoreCount();
activeTasks.put(task, host);
} catch (Exception cause) {
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
index b7d3b730..1a0cc316 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt
@@ -361,7 +361,6 @@ public class SimHost(
for (gpu in simMachine!!.gpus) {
gpu.updateCounters(this.clock.millis())
val counters = simMachine!!.getGpuPerformanceCounters(gpu.id)
- val powerDraw = simMachine!!.psu.getPowerDraw(ResourceType.GPU, gpu.id)
gpuStats.add(
HostGpuStats(
@@ -373,7 +372,7 @@ public class SimHost(
counters.demand,
counters.supply,
counters.supply / gpu.getCapacity(ResourceType.GPU),
- powerDraw,
+ counters.powerDraw,
),
)
}
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
index 0376a492..79af6f62 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt
@@ -52,6 +52,7 @@ public enum class ComputeSchedulerEnum {
Timeshift,
ProvisionedCpuGpuCores,
ProvisionedCpuGpuCoresInv,
+ GpuTaskMemorizing,
}
public fun createPrefabComputeScheduler(
@@ -159,5 +160,15 @@ public fun createPrefabComputeScheduler(
VGpuWeigher(gpuAllocationRatio, multiplier = -1.0),
),
)
+ ComputeSchedulerEnum.GpuTaskMemorizing ->
+ MemorizingScheduler(
+ filters =
+ listOf(
+ ComputeFilter(),
+ VCpuFilter(cpuAllocationRatio),
+ VGpuFilter(gpuAllocationRatio),
+ RamFilter(ramAllocationRatio),
+ ),
+ )
}
}
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
index 6dc27327..5f517257 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt
@@ -42,7 +42,7 @@ public class VGpuCapacityFilter : HostFilter {
return (
requiredCapacity == null ||
- ((availableCapacity / availableCores) >= (requiredCapacity / task.flavor.gpuCoreCount))
+ (availableRatio >= (requiredCapacity / task.flavor.gpuCoreCount))
)
}
}
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
index 9f564776..f47013b1 100644
--- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
+++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt
@@ -26,9 +26,9 @@ import org.opendc.compute.simulator.service.HostView
import org.opendc.compute.simulator.service.ServiceTask
/**
- * A [HostFilter] that filters hosts based on the vCPU requirements of a [ServiceTask] and the available vCPUs on the host.
+ * A [HostFilter] that filters hosts based on the vGPU requirements of a [ServiceTask] and the available vGPUs on the host.
*
- * @param allocationRatio Virtual CPU to physical CPU allocation ratio.
+ * @param allocationRatio Virtual GPU to physical GPU allocation ratio.
*/
public class VGpuFilter(private val allocationRatio: Double) : HostFilter {
override fun test(
@@ -36,7 +36,7 @@ public class VGpuFilter(private val allocationRatio: Double) : HostFilter {
task: ServiceTask,
): Boolean {
val requested = task.flavor.gpuCoreCount
- val totalCores = host.host.getModel().gpuHostModels().maxOfOrNull { it.gpuCoreCount() } ?: 0
+ val totalCores = host.host.getModel().gpuHostModels()?.sumOf { it.gpuCoreCount() } ?: 0
val limit = totalCores * allocationRatio
// Do not allow an instance to overcommit against itself, only against other instances
diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
index fe5cea70..65fbfb38 100644
--- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
+++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt
@@ -28,6 +28,7 @@ import org.junit.jupiter.api.Assertions.assertEquals
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.assertAll
import org.junit.jupiter.api.assertThrows
+import org.opendc.compute.simulator.host.GpuHostModel
import org.opendc.compute.simulator.host.HostModel
import org.opendc.compute.simulator.host.HostState
import org.opendc.compute.simulator.scheduler.filters.ComputeFilter
@@ -37,6 +38,8 @@ import org.opendc.compute.simulator.scheduler.filters.RamFilter
import org.opendc.compute.simulator.scheduler.filters.SameHostFilter
import org.opendc.compute.simulator.scheduler.filters.VCpuCapacityFilter
import org.opendc.compute.simulator.scheduler.filters.VCpuFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuCapacityFilter
+import org.opendc.compute.simulator.scheduler.filters.VGpuFilter
import org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher
import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher
import org.opendc.compute.simulator.scheduler.weights.RamWeigher
@@ -437,6 +440,98 @@ internal class FilterSchedulerTest {
}
@Test
+ fun testVGPUFilter() {
+ val scheduler =
+ FilterScheduler(
+ filters = listOf(VGpuFilter(1.0)),
+ weighers = emptyList(),
+ )
+
+ val hostA = mockk<HostView>()
+ every { hostA.host.getState() } returns HostState.UP
+ every { hostA.host.getModel() } returns
+ HostModel(
+ 0.0,
+ 0,
+ 2048,
+ listOf(
+ GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+ ),
+ )
+ every { hostA.provisionedGpuCores } returns 0
+ scheduler.addHost(hostA)
+
+ val hostB = mockk<HostView>()
+ every { hostB.host.getState() } returns HostState.UP
+ every { hostB.host.getModel() } returns
+ HostModel(
+ 0.0,
+ 0,
+ 2048,
+ listOf(
+ GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+ GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+ ),
+ )
+ every { hostB.provisionedGpuCores } returns 0
+ scheduler.addHost(hostB)
+
+ val req = mockk<SchedulingRequest>()
+ every { req.task.flavor.gpuCoreCount } returns 9
+ every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 9 * 3200.0)
+ every { req.isCancelled } returns false
+
+ // filter selects hostB because hostA does not have enough GPU cores for the 9 requested
+ assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+ }
+
+ @Test
+ fun testVGPUCapacityFilter() {
+ val scheduler =
+ FilterScheduler(
+ filters = listOf(VGpuCapacityFilter()),
+ weighers = emptyList(),
+ )
+
+ val hostA = mockk<HostView>()
+ every { hostA.host.getState() } returns HostState.UP
+ every { hostA.host.getModel() } returns
+ HostModel(
+ 0.0,
+ 0,
+ 2048,
+ listOf(
+ GpuHostModel(8 * 2600.0, 8, 0L, 0.0),
+ ),
+ )
+ every { hostA.availableMemory } returns 512
+ scheduler.addHost(hostA)
+
+ val hostB = mockk<HostView>()
+ every { hostB.host.getState() } returns HostState.UP
+ every { hostB.host.getModel() } returns
+ HostModel(
+ 0.0,
+ 0,
+ 2048,
+ listOf(
+ GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+ GpuHostModel(8 * 3200.0, 8, 0L, 0.0),
+ ),
+ )
+ every { hostB.availableMemory } returns 512
+ scheduler.addHost(hostB)
+
+ val req = mockk<SchedulingRequest>()
+ every { req.task.flavor.gpuCoreCount } returns 8
+ every { req.task.flavor.meta } returns mapOf("gpu-capacity" to 8 * 3200.0)
+ every { req.isCancelled } returns false
+
+ // filter selects hostB because hostA does not have enough GPU capacity
+ assertEquals(hostB, scheduler.select(mutableListOf(req).iterator()).host)
+ }
+
+ @Test
fun testRamWeigher() {
val scheduler =
FilterScheduler(