diff options
| author | Niels Thiele <noleu66@posteo.net> | 2025-06-22 12:31:21 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2025-06-22 12:31:21 +0200 |
| commit | 0203254b709614fa732c114aa25916f61b8b3275 (patch) | |
| tree | 63232140a8e60e16e1668a51eb58954d8609fbdc /opendc-compute/opendc-compute-workload | |
| parent | 8f846655347195bf6f22a4a102aa06f0ab127da1 (diff) | |
Implemented Single GPU Support & outline of host-level allocation policies (#342)
* renamed performance counter to distinguish different resource types
* added GPU, modelled similar to CPU
* added GPUs to machine model
* list of GPUs instead of single instance
* renamed memory speed to bandwidth
* enabled parsing of GPU resources
* split powermodel into cpu and GPU powermodel
* added gpu parsing tests
* added idea of host level scheduling
* added tests for multi gpu parsing
* renamed powermodel to cpupowermodel
* clarified naming of cpu and gpu components
* added resource type to flow supplier and edge
* added resourcetype
* added GPU components and resource type to fragments
* added GPU to workload and updated resource usage retrieval
* implemented first version of multi resource
* added name to workload
* renamed performance counters
* removed commented out code
* removed deprecated comments
* included demand and supply into calculations
* resolving rebase mismatches
* moved resource type from flowedge class to common package
* added available resources to machines
* cleaner separation of whether a workload is started by simmachine or vm
* Replaced exception with dedicated enum
* Only looping over resources that are actually used
* using hashmaps to handle resourcetype instead of arrays for readability
* fixed condition
* tracking finished workloads per resource type
* removed resource type from flowedge
* made supply and demand distribution resource specific
* added power model for GPU
* removed unused test setup
* removed deprecated comments
* removed unused parameter
* added ID for GPU
* added GPUs and GPU performance counters (naively)
* implemented capturing of GPU statistics
* added reminders for future implementations
* renamed properties for better identification
* added capturing GPU statistics
* implemented first tests for GPUs
* unified access to performance counters
* added interface for general compute resource handling
* implemented multi resource support in simmachine
* added individual edge to VM per resource
* extended compute resource interface
* implemented multi-resource support in PSU
* implemented generic retrieval of computeresources
* implemented multi-resource support in vm
* made method use more resource specific
* implemented simple GPU tests
* rolled back frequency and demand use
* made naming independent of used resource
* using workloads resources instead of VMs to determine available resource
* implemented determination of used resources in workload
* removed logging statements
* implemented reading from workload
* fixed naming for host-level allocation
* fixed next deadline calculation
* fixed forwarding supply
* reduced memory footprint
* made GPU powermodel nullable
* made GPU powermodel configurable in topology
* implemented tests for basic gpu scheduler
* added gpu properties
* implemented weights, filter and simple cpu-gpu scheduler
* spotless apply
* spotless apply pt. 2
* fixed capitalization
* spotless kotlin run
* implemented column export
* todo update
* removed code comments
* Merged PerformanceCounter classes into one & removed interface
* removed GPU specific powermodel
* Rebase master: kept both versions of TopologyFactories
* renamed CpuPowermodel to resource independent Powermodel
Moved it from Cpu package to power package
* implemented default of getResourceType & removed overrides if possible
* split getResourceType into Consumer and Supplier
* added power as resource type
* reduced supply demand from arrayList to single value
* combining GPUs into one large GPU, until full multi-gpu support
* merged distribution policy enum with corresponding factory
* added comment
* post-rebase fixes
* aligned naming
* Added GPU metrics to task output
* Updates power resource type to uppercase.
Standardizes the `ResourceType.Power` enum to `ResourceType.POWER`
for consistency with other resource types and improved readability.
* Removes deprecated test assertions
Removes commented-out assertions in GPU tests.
These assertions are no longer needed and clutter the test code.
* Renames MaxMinFairnessStrategy to Policy
Renames MaxMinFairnessStrategy to MaxMinFairnessPolicy for
clarity and consistency with naming conventions. This change
affects the factory and distributor to use the updated name.
* applies spotless
* nulls GPUs as it is not used
Diffstat (limited to 'opendc-compute/opendc-compute-workload')
2 files changed, 50 insertions, 7 deletions
diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt index 80996c0e..7599d4e1 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt @@ -33,11 +33,15 @@ import org.opendc.trace.conv.resourceCpuCapacity import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceDeadline import org.opendc.trace.conv.resourceDuration +import org.opendc.trace.conv.resourceGpuCapacity +import org.opendc.trace.conv.resourceGpuCount +import org.opendc.trace.conv.resourceGpuMemCapacity import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceMemCapacity import org.opendc.trace.conv.resourceNature import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateGpuUsage import org.opendc.trace.conv.resourceSubmissionTime import java.io.File import java.lang.ref.SoftReference @@ -79,6 +83,8 @@ public class ComputeWorkloadLoader( val durationCol = reader.resolve(resourceStateDuration) val coresCol = reader.resolve(resourceCpuCount) val usageCol = reader.resolve(resourceStateCpuUsage) + val gpuCoresCol = reader.resolve(resourceGpuCount) + val resourceGpuCapacityCol = reader.resolve(resourceStateGpuUsage) val fragments = mutableMapOf<String, Builder>() @@ -88,12 +94,23 @@ public class ComputeWorkloadLoader( val durationMs = reader.getDuration(durationCol)!! 
val cores = reader.getInt(coresCol) val cpuUsage = reader.getDouble(usageCol) + val gpuUsage = + if (reader.getDouble( + resourceGpuCapacityCol, + ).isNaN() + ) { + 0.0 + } else { + reader.getDouble(resourceGpuCapacityCol) // Default to 0 if not present + } + val gpuCores = reader.getInt(gpuCoresCol) // Default to 0 if not present + val gpuMemory = 0L // Default to 0 if not present val builder = fragments.computeIfAbsent( id, ) { Builder(checkpointInterval, checkpointDuration, checkpointIntervalScaling, scalingPolicy, id) } - builder.add(durationMs, cpuUsage, cores) + builder.add(durationMs, cpuUsage, cores, gpuUsage, gpuCores, gpuMemory) } fragments @@ -117,6 +134,9 @@ public class ComputeWorkloadLoader( val cpuCountCol = reader.resolve(resourceCpuCount) val cpuCapacityCol = reader.resolve(resourceCpuCapacity) val memCol = reader.resolve(resourceMemCapacity) + val gpuCapacityCol = reader.resolve(resourceGpuCapacity) // Assuming GPU capacity is also present + val gpuCoreCountCol = reader.resolve(resourceGpuCount) // Assuming GPU cores are also present + val gpuMemoryCol = reader.resolve(resourceGpuMemCapacity) // Assuming GPU memory is also present val natureCol = reader.resolve(resourceNature) val deadlineCol = reader.resolve(resourceDeadline) @@ -135,6 +155,17 @@ public class ComputeWorkloadLoader( val cpuCount = reader.getInt(cpuCountCol) val cpuCapacity = reader.getDouble(cpuCapacityCol) val memCapacity = reader.getDouble(memCol) / 1000.0 // Convert from KB to MB + val gpuUsage = + if (reader.getDouble( + gpuCapacityCol, + ).isNaN() + ) { + 0.0 + } else { + reader.getDouble(gpuCapacityCol) // Default to 0 if not present// Default to 0 if not present + } + val gpuCoreCount = reader.getInt(gpuCoreCountCol) // Default to 0 if not present + val gpuMemory = 0L // currently not implemented val uid = UUID.nameUUIDFromBytes("$id-${counter++}".toByteArray()) var nature = reader.getString(natureCol) var deadline = reader.getLong(deadlineCol) @@ -153,6 +184,9 @@ public 
class ComputeWorkloadLoader( cpuCount, cpuCapacity, memCapacity.roundToLong(), + gpuCoreCount, + gpuUsage, + gpuMemory, totalLoad, submissionTime, duration, @@ -224,17 +258,23 @@ public class ComputeWorkloadLoader( * Add a fragment to the trace. * * @param duration The duration of the fragment (in epoch millis). - * @param usage CPU usage of this fragment. - * @param cores Number of cores used. + * @param cpuUsage CPU usage of this fragment. + * @param cpuCores Number of cores used. + * @param gpuUsage GPU usage of this fragment. + * @param gpuCores Number of GPU cores used. + * @param gpuMemoryUsage GPU memory usage of this fragment. */ fun add( duration: Duration, - usage: Double, - cores: Int, + cpuUsage: Double, + cpuCores: Int, + gpuUsage: Double = 0.0, + gpuCores: Int = 0, + gpuMemoryUsage: Long = 0, ) { - totalLoad += (usage * duration.toMillis()) / 1000 // avg MHz * duration = MFLOPs + totalLoad += ((cpuUsage * duration.toMillis()) + (gpuUsage * duration.toMillis())) / 1000 // avg MHz * duration = MFLOPs - builder.add(duration.toMillis(), usage, cores) + builder.add(duration.toMillis(), cpuUsage, cpuCores, gpuUsage, gpuCores, gpuMemoryUsage) } /** diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt index 787f271e..228b84ed 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt @@ -43,6 +43,9 @@ public data class Task( val cpuCount: Int, val cpuCapacity: Double, val memCapacity: Long, + val gpuCount: Int = 0, + val gpuCapacity: Double = 0.0, + val gpuMemCapacity: Long = 0L, val totalLoad: Double, var submissionTime: Long, val duration: Long, |
