From 0203254b709614fa732c114aa25916f61b8b3275 Mon Sep 17 00:00:00 2001
From: Niels Thiele <noleu66@posteo.net>
Date: Sun, 22 Jun 2025 12:31:21 +0200
Subject: Implemented Single GPU Support & outline of host-level allocation
 policies (#342)

* renamed performance counter to distinguish different resource types

* added GPU, modelled similar to CPU

* added GPUs to machine model

* list of GPUs instead of single instance

* renamed memory speed to bandwidth

* enabled parsing of GPU resources

* split powermodel into cpu and GPU powermodel

* added gpu parsing tests

* added idea of host level scheduling

* added tests for multi gpu parsing

* renamed powermodel to cpupowermodel

* clarified naming of cpu and gpu components

* added resource type to flow supplier and edge

* added resourcetype

* added GPU components and resource type to fragments

* added GPU to workload and updated resource usage retrieval

* implemented first version of multi resource

* added name to workload

* renamed performance counters

* removed commented out code

* removed deprecated comments

* included demand and supply into calculations

* resolving rebase mismatches

* moved resource type from flowedge class to common package

* added available resources to machines

* cleaner separation of whether a workload is started by simmachine or vm

* Replaced exception with dedicated enum

* Only looping over resources that are actually used

* using hashmaps to handle resourcetype instead of arrays for readability

* fixed condition

* tracking finished workloads per resource type

* removed resource type from flowedge

* made supply and demand distribution resource specific

* added power model for GPU

* removed unused test setup

* removed deprecated comments

* removed unused parameter

* added ID for GPU

* added GPUs and GPU performance counters (naively)

* implemented capturing of GPU statistics

* added reminders for future implementations

* renamed properties for better identification

* added capturing GPU statistics

* implemented first tests for GPUs

* unified access to performance counters

* added interface for general compute resource handling

* implemented multi resource support in simmachine

* added individual edge to VM per resource

* extended compute resource interface

* implemented multi-resource support in PSU

* implemented generic retrieval of computeresources

* implemented multi-resource support in vm

* made method use more resource specific

* implemented simple GPU tests

* rolled back frequency and demand use

* made naming independent of used resource

* using workloads resources instead of VMs to determine available resource

* implemented determination of used resources in workload

* removed logging statements

* implemented reading from workload

* fixed naming for host-level allocation

* fixed next deadline calculation

* fixed forwarding supply

* reduced memory footprint

* made GPU powermodel nullable

* made GPU powermodel configurable in topology

* implemented tests for basic gpu scheduler

* added gpu properties

* implemented weights, filter and simple cpu-gpu scheduler

* spotless apply

* spotless apply pt. 2

* fixed capitalization

* spotless kotlin run

* implemented column export

* todo update

* removed code comments

* Merged PerformanceCounter classes into one & removed interface

* removed GPU-specific powermodel

* Rebase master: kept both versions of TopologyFactories

* renamed CpuPowermodel to resource independent Powermodel

Moved it from Cpu package to power package

* implemented default of getResourceType & removed overrides if possible

* split getResourceType into Consumer and Supplier

* added power as resource type

* reduced supply demand from arrayList to single value

* combining GPUs into one large GPU, until full multi-gpu support

* merged distribution policy enum with corresponding factory

* added comment

* post-rebase fixes

* aligned naming

* Added GPU metrics to task output

* Updates power resource type to uppercase.

Standardizes the `ResourceType.Power` enum to `ResourceType.POWER`
for consistency with other resource types and improved readability.

* Removes deprecated test assertions

Removes commented-out assertions in GPU tests.

These assertions are no longer needed and clutter the test code.

* Renames MaxMinFairnessStrategy to Policy

Renames MaxMinFairnessStrategy to MaxMinFairnessPolicy for
clarity and consistency with naming conventions. This change
affects the factory and distributor to use the updated name.

* applies spotless

* nulls GPUs as it is not used
---
 .../compute/workload/ComputeWorkloadLoader.kt      | 54 +++++++++++++++++++---
 .../kotlin/org/opendc/compute/workload/Task.kt     |  3 ++
 2 files changed, 50 insertions(+), 7 deletions(-)

(limited to 'opendc-compute/opendc-compute-workload')

diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt
index 80996c0e..7599d4e1 100644
--- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt
+++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt
@@ -33,11 +33,15 @@ import org.opendc.trace.conv.resourceCpuCapacity
 import org.opendc.trace.conv.resourceCpuCount
 import org.opendc.trace.conv.resourceDeadline
 import org.opendc.trace.conv.resourceDuration
+import org.opendc.trace.conv.resourceGpuCapacity
+import org.opendc.trace.conv.resourceGpuCount
+import org.opendc.trace.conv.resourceGpuMemCapacity
 import org.opendc.trace.conv.resourceID
 import org.opendc.trace.conv.resourceMemCapacity
 import org.opendc.trace.conv.resourceNature
 import org.opendc.trace.conv.resourceStateCpuUsage
 import org.opendc.trace.conv.resourceStateDuration
+import org.opendc.trace.conv.resourceStateGpuUsage
 import org.opendc.trace.conv.resourceSubmissionTime
 import java.io.File
 import java.lang.ref.SoftReference
@@ -79,6 +83,8 @@ public class ComputeWorkloadLoader(
         val durationCol = reader.resolve(resourceStateDuration)
         val coresCol = reader.resolve(resourceCpuCount)
         val usageCol = reader.resolve(resourceStateCpuUsage)
+        val gpuCoresCol = reader.resolve(resourceGpuCount)
+        val resourceGpuCapacityCol = reader.resolve(resourceStateGpuUsage)
 
         val fragments = mutableMapOf<String, Builder>()
 
@@ -88,12 +94,23 @@ public class ComputeWorkloadLoader(
                 val durationMs = reader.getDuration(durationCol)!!
                 val cores = reader.getInt(coresCol)
                 val cpuUsage = reader.getDouble(usageCol)
+                val gpuUsage =
+                    if (reader.getDouble(
+                            resourceGpuCapacityCol,
+                        ).isNaN()
+                    ) {
+                        0.0
+                    } else {
+                        reader.getDouble(resourceGpuCapacityCol) // Default to 0 if not present
+                    }
+                val gpuCores = reader.getInt(gpuCoresCol) // Default to 0 if not present
+                val gpuMemory = 0L // Default to 0 if not present
 
                 val builder =
                     fragments.computeIfAbsent(
                         id,
                     ) { Builder(checkpointInterval, checkpointDuration, checkpointIntervalScaling, scalingPolicy, id) }
-                builder.add(durationMs, cpuUsage, cores)
+                builder.add(durationMs, cpuUsage, cores, gpuUsage, gpuCores, gpuMemory)
             }
 
             fragments
@@ -117,6 +134,9 @@ public class ComputeWorkloadLoader(
         val cpuCountCol = reader.resolve(resourceCpuCount)
         val cpuCapacityCol = reader.resolve(resourceCpuCapacity)
         val memCol = reader.resolve(resourceMemCapacity)
+        val gpuCapacityCol = reader.resolve(resourceGpuCapacity) // Assuming GPU capacity is also present
+        val gpuCoreCountCol = reader.resolve(resourceGpuCount) // Assuming GPU cores are also present
+        val gpuMemoryCol = reader.resolve(resourceGpuMemCapacity) // Assuming GPU memory is also present
         val natureCol = reader.resolve(resourceNature)
         val deadlineCol = reader.resolve(resourceDeadline)
 
@@ -135,6 +155,17 @@ public class ComputeWorkloadLoader(
                 val cpuCount = reader.getInt(cpuCountCol)
                 val cpuCapacity = reader.getDouble(cpuCapacityCol)
                 val memCapacity = reader.getDouble(memCol) / 1000.0 // Convert from KB to MB
+                val gpuUsage =
+                    if (reader.getDouble(
+                            gpuCapacityCol,
+                        ).isNaN()
+                    ) {
+                        0.0
+                    } else {
+                        reader.getDouble(gpuCapacityCol) // Default to 0 if not present
+                    }
+                val gpuCoreCount = reader.getInt(gpuCoreCountCol) // Default to 0 if not present
+                val gpuMemory = 0L // currently not implemented
                 val uid = UUID.nameUUIDFromBytes("$id-${counter++}".toByteArray())
                 var nature = reader.getString(natureCol)
                 var deadline = reader.getLong(deadlineCol)
@@ -153,6 +184,9 @@ public class ComputeWorkloadLoader(
                         cpuCount,
                         cpuCapacity,
                         memCapacity.roundToLong(),
+                        gpuCoreCount,
+                        gpuUsage,
+                        gpuMemory,
                         totalLoad,
                         submissionTime,
                         duration,
@@ -224,17 +258,23 @@ public class ComputeWorkloadLoader(
          * Add a fragment to the trace.
          *
          * @param duration The duration of the fragment (in epoch millis).
-         * @param usage CPU usage of this fragment.
-         * @param cores Number of cores used.
+         * @param cpuUsage CPU usage of this fragment.
+         * @param cpuCores Number of cores used.
+         * @param gpuUsage GPU usage of this fragment.
+         * @param gpuCores Number of GPU cores used.
+         * @param gpuMemoryUsage GPU memory usage of this fragment.
          */
         fun add(
             duration: Duration,
-            usage: Double,
-            cores: Int,
+            cpuUsage: Double,
+            cpuCores: Int,
+            gpuUsage: Double = 0.0,
+            gpuCores: Int = 0,
+            gpuMemoryUsage: Long = 0,
         ) {
-            totalLoad += (usage * duration.toMillis()) / 1000 // avg MHz * duration = MFLOPs
+            totalLoad += ((cpuUsage * duration.toMillis()) + (gpuUsage * duration.toMillis())) / 1000 // avg MHz * duration = MFLOPs
 
-            builder.add(duration.toMillis(), usage, cores)
+            builder.add(duration.toMillis(), cpuUsage, cpuCores, gpuUsage, gpuCores, gpuMemoryUsage)
         }
 
         /**
diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt
index 787f271e..228b84ed 100644
--- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt
+++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt
@@ -43,6 +43,9 @@ public data class Task(
     val cpuCount: Int,
     val cpuCapacity: Double,
     val memCapacity: Long,
+    val gpuCount: Int = 0,
+    val gpuCapacity: Double = 0.0,
+    val gpuMemCapacity: Long = 0L,
     val totalLoad: Double,
     var submissionTime: Long,
     val duration: Long,
-- 
cgit v1.2.3