From 0203254b709614fa732c114aa25916f61b8b3275 Mon Sep 17 00:00:00 2001
From: Niels Thiele <noleu66@posteo.net>
Date: Sun, 22 Jun 2025 12:31:21 +0200
Subject: Implemented Single GPU Support & outline of host-level allocation
 policies (#342)

* renamed performance counter to distinguish different resource types

* added GPU, modelled similar to CPU

* added GPUs to machine model

* list of GPUs instead of single instance

* renamed memory speed to bandwidth

* enabled parsing of GPU resources

* split powermodel into cpu and GPU powermodel

* added gpu parsing tests

* added idea of host level scheduling

* added tests for multi gpu parsing

* renamed powermodel to cpupowermodel

* clarified naming of cpu and gpu components

* added resource type to flow supplier and edge

* added resourcetype

* added GPU components and resource type to fragments

* added GPU to workload and updated resource usage retrieval

* implemented first version of multi resource

* added name to workload

* renamed performance counters

* removed commented out code

* removed deprecated comments

* included demand and supply into calculations

* resolving rebase mismatches

* moved resource type from flowedge class to common package

* added available resources to machines

* cleaner separation if workload is started of simmachine or vm

* Replaced exception with dedicated enum

* Only looping over resources that are actually used

* using hashmaps to handle resourcetype instead of arrays for readability

* fixed condition

* tracking finished workloads per resource type

* removed resource type from flowedge

* made supply and demand distribution resource specific

* added power model for GPU

* removed unused test setup

* removed deprecated comments

* removed unused parameter

* added ID for GPU

* added GPUs and GPU performance counters (naively)

* implemented capturing of GPU statistics

* added reminders for future implementations

* renamed properties for better identification

* added capturing GPU statistics

* implemented first tests for GPUs

* unified access to performance counters

* added interface for general compute resource handling

* implemented multi resource support in simmachine

* added individual edge to VM per resource

* extended compute resource interface

* implemented multi-resource support in PSU

* implemented generic retrieval of computeresources

* implemented multi-resource support in vm

* made method use more resource specific

* implemented simple GPU tests

* rolled back frequency and demand use

* made naming independent of used resource

* using workloads resources instead of VMs to determine available resource

* implemented determination of used resources in workload

* removed logging statements

* implemented reading from workload

* fixed naming for host-level allocation

* fixed next deadline calculation

* fixed forwarding supply

* reduced memory footprint

* made GPU powermodel nullable

* made Gpu powermodel configurable in topology

* implemented tests for basic gpu scheduler

* added gpu properties

* implemented weights, filter and simple cpu-gpu scheduler

* spotless apply

* spotless apply pt. 2

* fixed capitalization

* spotless kotlin run

* implemented column export

* todo update

* removed code comments

* Merged PerformanceCounter classes into one & removed interface

* removed GPU-specific powermodel

* Rebase master: kept both versions of TopologyFactories

* renamed CpuPowermodel to resource independent Powermodel

Moved it from Cpu package to power package

* implemented default of getResourceType & removed overrides if possible

* split getResourceType into Consumer and Supplier

* added power as resource type

* reduced supply demand from arrayList to single value

* combining GPUs into one large GPU, until full multi-gpu support

* merged distribution policy enum with corresponding factory

* added comment

* post-rebase fixes

* aligned naming

* Added GPU metrics to task output

* Updates power resource type to uppercase.

Standardizes the `ResourceType.Power` enum to `ResourceType.POWER`
for consistency with other resource types and improved readability.

* Removes deprecated test assertions

Removes commented-out assertions in GPU tests.

These assertions are no longer needed and clutter the test code.

* Renames MaxMinFairnessStrategy to Policy

Renames MaxMinFairnessStrategy to MaxMinFairnessPolicy for
clarity and consistency with naming conventions. This change
affects the factory and distributor to use the updated name.

* applies spotless

* nulls GPUs as it is not used
---
 .../opendc/compute/topology/TopologyFactories.kt   | 57 ++++++++++++++++++----
 .../org/opendc/compute/topology/specs/HostSpec.kt  |  5 +-
 .../opendc/compute/topology/specs/TopologySpecs.kt | 17 ++++++-
 3 files changed, 66 insertions(+), 13 deletions(-)

(limited to 'opendc-compute/opendc-compute-topology/src')

diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt
index b6c945d2..b52608a9 100644
--- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt
+++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt
@@ -31,10 +31,13 @@ import org.opendc.compute.topology.specs.HostJSONSpec
 import org.opendc.compute.topology.specs.HostSpec
 import org.opendc.compute.topology.specs.PowerSourceSpec
 import org.opendc.compute.topology.specs.TopologySpec
-import org.opendc.simulator.compute.cpu.getPowerModel
 import org.opendc.simulator.compute.models.CpuModel
+import org.opendc.simulator.compute.models.GpuModel
 import org.opendc.simulator.compute.models.MachineModel
 import org.opendc.simulator.compute.models.MemoryUnit
+import org.opendc.simulator.compute.power.getPowerModel
+import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory
+import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory.DistributionPolicyType
 import java.io.File
 import java.io.InputStream
 
@@ -166,29 +169,63 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec {
         }
 
     val unknownMemoryUnit = MemoryUnit(memory.vendor, memory.modelName, memory.memorySpeed.toMHz(), memory.memorySize.toMiB().toLong())
+    val gpuUnits =
+        List(gpu?.count ?: 0) {
+            GpuModel(
+                globalCoreId++,
+                gpu!!.coreCount,
+                gpu.coreSpeed.toMHz(),
+                gpu.memoryBandwidth.toKibps(),
+                gpu.memorySize.toMiB().toLong(),
+                gpu.vendor,
+                gpu.modelName,
+                gpu.architecture,
+            )
+        }
+
     val machineModel =
         MachineModel(
             units,
             unknownMemoryUnit,
+            gpuUnits,
+            // TODO: Pass through
+            DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness),
+            DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness),
         )
 
-    val powerModel =
+    val cpuPowerModel =
         getPowerModel(
-            powerModel.modelType,
-            powerModel.power.toWatts(),
-            powerModel.maxPower.toWatts(),
-            powerModel.idlePower.toWatts(),
-            powerModel.calibrationFactor,
-            powerModel.asymUtil,
-            powerModel.dvfs,
+            cpuPowerModel.modelType,
+            cpuPowerModel.power.toWatts(),
+            cpuPowerModel.maxPower.toWatts(),
+            cpuPowerModel.idlePower.toWatts(),
+            cpuPowerModel.calibrationFactor,
+            cpuPowerModel.asymUtil,
+            cpuPowerModel.dvfs,
         )
 
+    val gpuPowerModel =
+        if (gpuUnits.isEmpty()) {
+            null
+        } else {
+            getPowerModel(
+                gpuPowerModel.modelType,
+                gpuPowerModel.power.toWatts(),
+                gpuPowerModel.maxPower.toWatts(),
+                gpuPowerModel.idlePower.toWatts(),
+                gpuPowerModel.calibrationFactor,
+                gpuPowerModel.asymUtil,
+                gpuPowerModel.dvfs,
+            )
+        }
+
     val hostSpec =
         HostSpec(
             createUniqueName(this.name, hostNames),
             clusterName,
             machineModel,
-            powerModel,
+            cpuPowerModel,
+            gpuPowerModel,
         )
     return hostSpec
 }
diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt
index e4ec89e1..30a75896 100644
--- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt
+++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt
@@ -22,8 +22,8 @@
 
 package org.opendc.compute.topology.specs
 
-import org.opendc.simulator.compute.cpu.CpuPowerModel
 import org.opendc.simulator.compute.models.MachineModel
+import org.opendc.simulator.compute.power.PowerModel
 
 /**
  * Description of a physical host that will be simulated by OpenDC and host the virtual machines.
@@ -36,7 +36,8 @@ public data class HostSpec(
     val name: String,
     val clusterName: String,
     val model: MachineModel,
-    val cpuPowerModel: CpuPowerModel,
+    val cpuPowerModel: PowerModel,
+    val gpuPowerModel: PowerModel?,
     val embodiedCarbon: Double = 1000.0,
     val expectedLifetime: Double = 5.0,
 )
diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt
index 8cbf818b..62c3906a 100644
--- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt
+++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt
@@ -24,6 +24,7 @@ package org.opendc.compute.topology.specs
 
 import kotlinx.serialization.SerialName
 import kotlinx.serialization.Serializable
+import org.opendc.common.units.DataRate
 import org.opendc.common.units.DataSize
 import org.opendc.common.units.Frequency
 import org.opendc.common.units.Power
@@ -76,7 +77,9 @@ public data class HostJSONSpec(
     val cpu: CPUJSONSpec,
     val count: Int = 1,
     val memory: MemoryJSONSpec,
-    val powerModel: PowerModelSpec = PowerModelSpec.DFLT,
+    val gpu: GPUJSONSpec? = null,
+    val cpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT,
+    val gpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT,
 )
 
 /**
@@ -117,6 +120,18 @@ public data class MemoryJSONSpec(
     val memorySpeed: Frequency = Frequency.ofMHz(-1),
 )
 
+@Serializable
+public data class GPUJSONSpec(
+    val count: Int = 1,
+    val coreCount: Int,
+    val coreSpeed: Frequency,
+    val memorySize: DataSize = DataSize.ofMiB(-1),
+    val memoryBandwidth: DataRate = DataRate.ofKibps(-1),
+    val vendor: String = "unknown",
+    val modelName: String = "unknown",
+    val architecture: String = "unknown",
+)
+
 @Serializable
 public data class PowerModelSpec(
     val modelType: String,
-- 
cgit v1.2.3