From 0203254b709614fa732c114aa25916f61b8b3275 Mon Sep 17 00:00:00 2001 From: Niels Thiele Date: Sun, 22 Jun 2025 12:31:21 +0200 Subject: Implemented Single GPU Support & outline of host-level allocation policies (#342) * renamed performance counter to distinguish different resource types * added GPU, modelled similar to CPU * added GPUs to machine model * list of GPUs instead of single instance * renamed memory speed to bandwidth * enabled parsing of GPU resources * split powermodel into cpu and GPU powermodel * added gpu parsing tests * added idea of host level scheduling * added tests for multi gpu parsing * renamed powermodel to cpupowermodel * clarified naming of cpu and gpu components * added resource type to flow supplier and edge * added resourcetype * added GPU components and resource type to fragments * added GPU to workload and updated resource usage retrieval * implemented first version of multi resource * added name to workload * renamed performance counters * removed commented out code * removed deprecated comments * included demand and supply into calculations * resolving rebase mismatches * moved resource type from flowedge class to common package * added available resources to machines * cleaner separation if workload is started of simmachine or vm * Replaced exception with dedicated enum * Only looping over resources that are actually used * using hashmaps to handle resourcetype instead of arrays for readability * fixed condition * tracking finished workloads per resource type * removed resource type from flowedge * made supply and demand distribution resource specific * added power model for GPU * removed unused test setup * removed deprecated comments * removed unused parameter * added ID for GPU * added GPUs and GPU performance counters (naively) * implemented capturing of GPU statistics * added reminders for future implementations * renamed properties for better identification * added capturing GPU statistics * implemented first tests for GPUs * 
unified access to performance counters * added interface for general compute resource handling * implemented multi resource support in simmachine * added individual edge to VM per resource * extended compute resource interface * implemented multi-resource support in PSU * implemented generic retrieval of computeresources * implemented multi-resource support in vm * made method use more resource specific * implemented simple GPU tests * rolled back frequency and demand use * made naming independent of used resource * using workloads resources instead of VMs to determine available resource * implemented determination of used resources in workload * removed logging statements * implemented reading from workload * fixed naming for host-level allocation * fixed next deadline calculation * fixed forwarding supply * reduced memory footprint * made GPU powermodel nullable * made GPU powermodel configurable in topology * implemented tests for basic gpu scheduler * added gpu properties * implemented weights, filter and simple cpu-gpu scheduler * spotless apply * spotless apply pt. 2 * fixed capitalization * spotless kotlin run * implemented column export * todo update * removed code comments * Merged PerformanceCounter classes into one & removed interface * removed GPU specific powermodel * Rebase master: kept both versions of TopologyFactories * renamed CpuPowermodel to resource independent Powermodel Moved it from Cpu package to power package * implemented default of getResourceType & removed overrides if possible * split getResourceType into Consumer and Supplier * added power as resource type * reduced supply demand from arrayList to single value * combining GPUs into one large GPU, until full multi-gpu support * merged distribution policy enum with corresponding factory * added comment * post-rebase fixes * aligned naming * Added GPU metrics to task output * Updates power resource type to uppercase. 
Standardizes the `ResourceType.Power` enum to `ResourceType.POWER` for consistency with other resource types and improved readability. * Removes deprecated test assertions Removes commented-out assertions in GPU tests. These assertions are no longer needed and clutter the test code. * Renames MaxMinFairnessStrategy to Policy Renames MaxMinFairnessStrategy to MaxMinFairnessPolicy for clarity and consistency with naming conventions. This change affects the factory and distributor to use the updated name. * applies spotless * nulls GPUs as it is not used --- .../opendc/compute/topology/TopologyFactories.kt | 57 ++++++++++++++++++---- .../org/opendc/compute/topology/specs/HostSpec.kt | 5 +- .../opendc/compute/topology/specs/TopologySpecs.kt | 17 ++++++- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'opendc-compute/opendc-compute-topology/src') diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt index b6c945d2..b52608a9 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt @@ -31,10 +31,13 @@ import org.opendc.compute.topology.specs.HostJSONSpec import org.opendc.compute.topology.specs.HostSpec import org.opendc.compute.topology.specs.PowerSourceSpec import org.opendc.compute.topology.specs.TopologySpec -import org.opendc.simulator.compute.cpu.getPowerModel import org.opendc.simulator.compute.models.CpuModel +import org.opendc.simulator.compute.models.GpuModel import org.opendc.simulator.compute.models.MachineModel import org.opendc.simulator.compute.models.MemoryUnit +import org.opendc.simulator.compute.power.getPowerModel +import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory 
+import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory.DistributionPolicyType import java.io.File import java.io.InputStream @@ -166,29 +169,63 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec { } val unknownMemoryUnit = MemoryUnit(memory.vendor, memory.modelName, memory.memorySpeed.toMHz(), memory.memorySize.toMiB().toLong()) + val gpuUnits = + List(gpu?.count ?: 0) { + GpuModel( + globalCoreId++, + gpu!!.coreCount, + gpu.coreSpeed.toMHz(), + gpu.memoryBandwidth.toKibps(), + gpu.memorySize.toMiB().toLong(), + gpu.vendor, + gpu.modelName, + gpu.architecture, + ) + } + val machineModel = MachineModel( units, unknownMemoryUnit, + gpuUnits, + // TODO: Pass through + DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness), + DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness), ) - val powerModel = + val cpuPowerModel = getPowerModel( - powerModel.modelType, - powerModel.power.toWatts(), - powerModel.maxPower.toWatts(), - powerModel.idlePower.toWatts(), - powerModel.calibrationFactor, - powerModel.asymUtil, - powerModel.dvfs, + cpuPowerModel.modelType, + cpuPowerModel.power.toWatts(), + cpuPowerModel.maxPower.toWatts(), + cpuPowerModel.idlePower.toWatts(), + cpuPowerModel.calibrationFactor, + cpuPowerModel.asymUtil, + cpuPowerModel.dvfs, ) + val gpuPowerModel = + if (gpuUnits.isEmpty()) { + null + } else { + getPowerModel( + gpuPowerModel.modelType, + gpuPowerModel.power.toWatts(), + gpuPowerModel.maxPower.toWatts(), + gpuPowerModel.idlePower.toWatts(), + gpuPowerModel.calibrationFactor, + gpuPowerModel.asymUtil, + gpuPowerModel.dvfs, + ) + } + val hostSpec = HostSpec( createUniqueName(this.name, hostNames), clusterName, machineModel, - powerModel, + cpuPowerModel, + gpuPowerModel, ) return hostSpec } diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt 
b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt index e4ec89e1..30a75896 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt @@ -22,8 +22,8 @@ package org.opendc.compute.topology.specs -import org.opendc.simulator.compute.cpu.CpuPowerModel import org.opendc.simulator.compute.models.MachineModel +import org.opendc.simulator.compute.power.PowerModel /** * Description of a physical host that will be simulated by OpenDC and host the virtual machines. @@ -36,7 +36,8 @@ public data class HostSpec( val name: String, val clusterName: String, val model: MachineModel, - val cpuPowerModel: CpuPowerModel, + val cpuPowerModel: PowerModel, + val gpuPowerModel: PowerModel?, val embodiedCarbon: Double = 1000.0, val expectedLifetime: Double = 5.0, ) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt index 8cbf818b..62c3906a 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt @@ -24,6 +24,7 @@ package org.opendc.compute.topology.specs import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable +import org.opendc.common.units.DataRate import org.opendc.common.units.DataSize import org.opendc.common.units.Frequency import org.opendc.common.units.Power @@ -76,7 +77,9 @@ public data class HostJSONSpec( val cpu: CPUJSONSpec, val count: Int = 1, val memory: MemoryJSONSpec, - val powerModel: PowerModelSpec = PowerModelSpec.DFLT, + val gpu: GPUJSONSpec? 
= null, + val cpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT, + val gpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT, ) /** @@ -117,6 +120,18 @@ public data class MemoryJSONSpec( val memorySpeed: Frequency = Frequency.ofMHz(-1), ) +@Serializable +public data class GPUJSONSpec( + val count: Int = 1, + val coreCount: Int, + val coreSpeed: Frequency, + val memorySize: DataSize = DataSize.ofMiB(-1), + val memoryBandwidth: DataRate = DataRate.ofKibps(-1), + val vendor: String = "unknown", + val modelName: String = "unknown", + val architecture: String = "unknown", +) + @Serializable public data class PowerModelSpec( val modelType: String, -- cgit v1.2.3