From 0203254b709614fa732c114aa25916f61b8b3275 Mon Sep 17 00:00:00 2001 From: Niels Thiele Date: Sun, 22 Jun 2025 12:31:21 +0200 Subject: Implemented Single GPU Support & outline of host-level allocation policies (#342) * renamed performance counter to distinguish different resource types * added GPU, modelled similar to CPU * added GPUs to machine model * list of GPUs instead of single instance * renamed memory speed to bandwidth * enabled parsing of GPU resources * split powermodel into cpu and GPU powermodel * added gpu parsing tests * added idea of host level scheduling * added tests for multi gpu parsing * renamed powermodel to cpupowermodel * clarified naming of cpu and gpu components * added resource type to flow supplier and edge * added resourcetype * added GPU components and resource type to fragments * added GPU to workload and updated resource usage retrieval * implemented first version of multi resource * added name to workload * renamed performance counters * removed commented out code * removed deprecated comments * included demand and supply into calculations * resolving rebase mismatches * moved resource type from flowedge class to common package * added available resources to machines * cleaner separation if workload is started of simmachine or vm * Replaced exception with dedicated enum * Only looping over resources that are actually used * using hashmaps to handle resourcetype instead of arrays for readability * fixed condition * tracking finished workloads per resource type * removed resource type from flowedge * made supply and demand distribution resource specific * added power model for GPU * removed unused test setup * removed deprecated comments * removed unused parameter * added ID for GPU * added GPUs and GPU performance counters (naively) * implemented capturing of GPU statistics * added reminders for future implementations * renamed properties for better identification * added capturing GPU statistics * implemented first tests for GPUs * 
unified access to performance counters * added interface for general compute resource handling * implemented multi resource support in simmachine * added individual edge to VM per resource * extended compute resource interface * implemented multi-resource support in PSU * implemented generic retrieval of computeresources * implemented multi-resource support in vm * made method use more resource specific * implemented simple GPU tests * rolled back frequency and demand use * made naming independent of used resource * using workloads resources instead of VMs to determine available resource * implemented determination of used resources in workload * removed logging statements * implemented reading from workload * fixed naming for host-level allocation * fixed next deadline calculation * fixed forwarding supply * reduced memory footprint * made GPU powermodel nullable * made Gpu powermodel configurable in topology * implemented tests for basic gpu scheduler * added gpu properties * implemented weights, filter and simple cpu-gpu scheduler * spotless apply * spotless apply pt. 2 * fixed capitalization * spotless kotlin run * implemented column export * todo update * removed code comments * Merged PerformanceCounter classes into one & removed interface * removed GPU specific powermodel * Rebase master: kept both versions of TopologyFactories * renamed CpuPowermodel to resource independent Powermodel Moved it from Cpu package to power package * implemented default of getResourceType & removed overrides if possible * split getResourceType into Consumer and Supplier * added power as resource type * reduced supply demand from arrayList to single value * combining GPUs into one large GPU, until full multi-gpu support * merged distribution policy enum with corresponding factory * added comment * post-rebase fixes * aligned naming * Added GPU metrics to task output * Updates power resource type to uppercase. 
Standardizes the `ResourceType.Power` enum to `ResourceType.POWER` for consistency with other resource types and improved readability. * Removes deprecated test assertions Removes commented-out assertions in GPU tests. These assertions are no longer needed and clutter the test code. * Renames MaxMinFairnessStrategy to Policy Renames MaxMinFairnessStrategy to MaxMinFairnessPolicy for clarity and consistency with naming conventions. This change affects the factory and distributor to use the updated name. * applies spotless * nulls GPUs as it is not used --- .../org/opendc/trace/conv/ResourceColumns.kt | 18 +++++++++++++++ .../org/opendc/trace/conv/ResourceStateColumns.kt | 6 +++++ .../opendc/OdcVmResourceStateTableReader.kt | 9 ++++++++ .../opendc/OdcVmResourceStateTableWriter.kt | 14 ++++++++++- .../formats/opendc/OdcVmResourceTableReader.kt | 8 +++++++ .../formats/opendc/OdcVmResourceTableWriter.kt | 10 ++++++++ .../trace/formats/opendc/parquet/Resource.kt | 2 ++ .../opendc/parquet/ResourceRecordMaterializer.kt | 18 +++++++++++++++ .../trace/formats/opendc/parquet/ResourceState.kt | 2 ++ .../parquet/ResourceStateRecordMaterializer.kt | 27 +++++++++++++++++++++- 10 files changed, 112 insertions(+), 2 deletions(-) (limited to 'opendc-trace/opendc-trace-api') diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt index d0f56bff..181ca8e8 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt @@ -72,6 +72,24 @@ public val resourceCpuCapacity: String = "cpu_capacity" @JvmField public val resourceMemCapacity: String = "mem_capacity" +/** + * Number of GPU cores for the resource. + */ +@JvmField +public val resourceGpuCount: String = "gpu_count" + +/** + * Total GPU capacity of the resource in MHz. 
+ */ +@JvmField +public val resourceGpuCapacity: String = "gpu_capacity" + +/** + * Total GPU memory capacity of the resource in MB. + */ +@JvmField +public val resourceGpuMemCapacity: String = "gpu_mem_capacity" + /** * Nature of the task. Delayable, interruptible, etc. */ diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt index eede6bd6..f4ab7759 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt @@ -95,3 +95,9 @@ public val resourceStateNetRx: String = "net_rx" */ @JvmField public val resourceStateNetTx: String = "net_tx" + +/** + * Total GPU usage of the resource in MHz. + */ +@JvmField +public val resourceStateGpuUsage: String = "gpu_usage" diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt index 39475f9f..d474e0ec 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableReader.kt @@ -24,9 +24,11 @@ package org.opendc.trace.formats.opendc import org.opendc.trace.TableReader import org.opendc.trace.conv.resourceCpuCount +import org.opendc.trace.conv.resourceGpuCount import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateGpuUsage import org.opendc.trace.conv.resourceStateTimestamp import org.opendc.trace.formats.opendc.parquet.ResourceState import 
org.opendc.trace.util.parquet.LocalParquetReader @@ -60,6 +62,9 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea private val colDuration = 2 private val colCpuCount = 3 private val colCpuUsage = 4 + private val colGpuCount = 5 + private val colGpuUsage = 6 + private val colMemoryCapacity = 7 override fun resolve(name: String): Int { return when (name) { @@ -68,6 +73,8 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea resourceStateDuration -> colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage + resourceGpuCount -> colGpuCount + resourceStateGpuUsage -> colGpuUsage else -> -1 } } @@ -85,6 +92,7 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea val record = checkNotNull(record) { "Reader in invalid state" } return when (index) { colCpuCount -> record.cpuCount + colGpuCount -> record.gpuCount else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -101,6 +109,7 @@ internal class OdcVmResourceStateTableReader(private val reader: LocalParquetRea val record = checkNotNull(record) { "Reader in invalid state" } return when (index) { colCpuUsage -> record.cpuUsage + colGpuUsage -> record.gpuUsage else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt index 1421d77c..c6f117d2 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceStateTableWriter.kt @@ -25,9 +25,11 @@ package org.opendc.trace.formats.opendc import org.apache.parquet.hadoop.ParquetWriter import org.opendc.trace.TableWriter 
import org.opendc.trace.conv.resourceCpuCount +import org.opendc.trace.conv.resourceGpuCount import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateGpuUsage import org.opendc.trace.conv.resourceStateTimestamp import org.opendc.trace.formats.opendc.parquet.ResourceState import java.time.Duration @@ -47,6 +49,8 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter= lastTimestamp) { "Records need to be ordered by (id, timestamp)" } - writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage)) + writer.write(ResourceState(localID, localTimestamp, localDuration, localCpuCount, localCpuUsage, localGpuCount, localGpuUsage)) lastId = localID lastTimestamp = localTimestamp @@ -76,6 +82,8 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter colDuration resourceCpuCount -> colCpuCount resourceStateCpuUsage -> colCpuUsage + resourceGpuCount -> colGpuCount + resourceStateGpuUsage -> colGpuUsage else -> -1 } } @@ -94,6 +102,7 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter localCpuCount = value + colGpuCount -> localGpuCount = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -119,6 +128,7 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter localCpuUsage = value + colGpuUsage -> localGpuUsage = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -206,4 +216,6 @@ internal class OdcVmResourceStateTableWriter(private val writer: ParquetWriter colMemCapacity resourceNature -> colNature resourceDeadline -> colDeadline + resourceGpuCount -> colGpuCount + resourceGpuCapacity -> colGpuCapacity else -> -1 } } @@ -101,6 +107,7 @@ internal class OdcVmResourceTableReader(private val reader: LocalParquetReader record.cpuCount 
+ colGpuCount -> record.gpuCount else -> throw IllegalArgumentException("Invalid column") } } @@ -124,6 +131,7 @@ internal class OdcVmResourceTableReader(private val reader: LocalParquetReader record.cpuCapacity colMemCapacity -> record.memCapacity + colGpuCapacity -> record.gpuCapacity else -> throw IllegalArgumentException("Invalid column") } } diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceTableWriter.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceTableWriter.kt index 2b8db7f1..310d3dfc 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceTableWriter.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/OdcVmResourceTableWriter.kt @@ -53,6 +53,8 @@ internal class OdcVmResourceTableWriter(private val writer: ParquetWriter localCpuCount = value + colGpuCount -> localGpuCount = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -142,6 +149,7 @@ internal class OdcVmResourceTableWriter(private val writer: ParquetWriter localCpuCapacity = value colMemCapacity -> localMemCapacity = value + colGpuCapacity -> localGpuCapacity = value else -> throw IllegalArgumentException("Invalid column or type [index $index]") } } @@ -220,4 +228,6 @@ internal class OdcVmResourceTableWriter(private val writer: ParquetWriter + object : PrimitiveConverter() { + override fun addInt(value: Int) { + localGpuCount = value + } + } + "gpu_capacity" -> + object : PrimitiveConverter() { + override fun addDouble(value: Double) { + localGpuCapacity = value + } + } "nature" -> object : PrimitiveConverter() { override fun addBinary(value: Binary) { @@ -120,6 +134,8 @@ internal class ResourceRecordMaterializer(schema: MessageType) : RecordMateriali localCpuCount = 0 localCpuCapacity = 0.0 localMemCapacity = 0.0 + localGpuCount = 0 + localGpuCapacity = 0.0 localNature = null 
localDeadline = -1 } @@ -137,6 +153,8 @@ internal class ResourceRecordMaterializer(schema: MessageType) : RecordMateriali localCpuCount, localCpuCapacity, localMemCapacity, + localGpuCount, + localGpuCapacity, localNature, localDeadline, ) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt index 64ab9dca..10fc6be4 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceState.kt @@ -31,4 +31,6 @@ internal class ResourceState( val duration: Duration, val cpuCount: Int, val cpuUsage: Double, + val gpuCount: Int, + val gpuUsage: Double, ) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt index 8ff0e476..9ad786d5 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/formats/opendc/parquet/ResourceStateRecordMaterializer.kt @@ -43,6 +43,8 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate private var localDuration = Duration.ZERO private var localCpuCount = 0 private var localCpuUsage = 0.0 + private var localGpuCount = 0 + private var localGpuUsage = 0.0 /** * Root converter for the record. 
@@ -85,6 +87,18 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localCpuUsage = value } } + "gpu_count", "gpu_cores" -> + object : PrimitiveConverter() { + override fun addInt(value: Int) { + localGpuCount = value + } + } + "gpu_usage", "gpuUsage" -> + object : PrimitiveConverter() { + override fun addDouble(value: Double) { + localGpuUsage = value + } + } "flops" -> object : PrimitiveConverter() { override fun addLong(value: Long) { @@ -101,6 +115,8 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate localDuration = Duration.ZERO localCpuCount = 0 localCpuUsage = 0.0 + localGpuCount = 0 + localGpuUsage = 0.0 } override fun end() {} @@ -108,7 +124,16 @@ internal class ResourceStateRecordMaterializer(schema: MessageType) : RecordMate override fun getConverter(fieldIndex: Int): Converter = converters[fieldIndex] } - override fun getCurrentRecord(): ResourceState = ResourceState(localId, localTimestamp, localDuration, localCpuCount, localCpuUsage) + override fun getCurrentRecord(): ResourceState = + ResourceState( + localId, + localTimestamp, + localDuration, + localCpuCount, + localCpuUsage, + localGpuCount, + localGpuUsage, + ) override fun getRootConverter(): GroupConverter = root } -- cgit v1.2.3