diff options
Diffstat (limited to 'opendc-compute')
35 files changed, 997 insertions, 98 deletions
diff --git a/opendc-compute/opendc-compute-api/src/main/kotlin/org/opendc/compute/api/Flavor.kt b/opendc-compute/opendc-compute-api/src/main/kotlin/org/opendc/compute/api/Flavor.kt index e88379f6..a54a0130 100644 --- a/opendc-compute/opendc-compute-api/src/main/kotlin/org/opendc/compute/api/Flavor.kt +++ b/opendc-compute/opendc-compute-api/src/main/kotlin/org/opendc/compute/api/Flavor.kt @@ -30,10 +30,15 @@ public interface Flavor : Resource { /** * The number of (virtual) processing cores to use. */ - public val coreCount: Int + public val cpuCoreCount: Int /** * The amount of RAM available to the task (in MB). */ public val memorySize: Long + + /** + * The amount of gpu cores available to the task. + */ + public val gpuCoreCount: Int } diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/GpuHostModel.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/GpuHostModel.java new file mode 100644 index 00000000..97aaa820 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/GpuHostModel.java @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.host; + +/** + * A model for a GPU in a host. + * + * @param gpuCoreCapacity The capacity of the GPU cores hz. + * @param gpuCoreCount The number of GPU cores. + * @param GpuMemoryCapacity The capacity of the GPU memory in GB. + * @param GpuMemorySpeed The speed of the GPU memory in GB/s. + */ +public record GpuHostModel(double gpuCoreCapacity, int gpuCoreCount, long GpuMemoryCapacity, double GpuMemorySpeed) {} diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/HostModel.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/HostModel.java index 1ea73ea6..6464a56c 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/HostModel.java +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/host/HostModel.java @@ -22,11 +22,24 @@ package org.opendc.compute.simulator.host; +import java.util.List; + /** * Record describing the static machine properties of the host. * - * @param cpuCapacity The total CPU capacity of the host in MHz. - * @param coreCount The number of logical processing cores available for this host. + * @param cpuCapacity The total CPU capacity of the host in MHz. + * @param coreCount The number of logical processing cores available for this host. * @param memoryCapacity The amount of memory available for this host in MB. 
*/ -public record HostModel(double cpuCapacity, int coreCount, long memoryCapacity) {} +public record HostModel(double cpuCapacity, int coreCount, long memoryCapacity, List<GpuHostModel> gpuHostModels) { + /** + * Create a new host model. + * + * @param cpuCapacity The total CPU capacity of the host in MHz. + * @param coreCount The number of logical processing cores available for this host. + * @param memoryCapacity The amount of memory available for this host in MB. + */ + public HostModel(double cpuCapacity, int coreCount, long memoryCapacity) { + this(cpuCapacity, coreCount, memoryCapacity, null); + } +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java index 2b4306af..835c7186 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ComputeService.java @@ -198,7 +198,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { HostView hv = hostToView.get(host); final ServiceFlavor flavor = task.getFlavor(); if (hv != null) { - hv.provisionedCores -= flavor.getCoreCount(); + hv.provisionedCpuCores -= flavor.getCpuCoreCount(); hv.instanceCount--; hv.availableMemory += flavor.getMemorySize(); } else { @@ -496,7 +496,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { if (result.getResultType() == SchedulingResultType.FAILURE) { LOGGER.trace("Task {} selected for scheduling but no capacity available for it at the moment", task); - if (flavor.getMemorySize() > maxMemory || flavor.getCoreCount() > maxCores) { + if (flavor.getMemorySize() > maxMemory || flavor.getCpuCoreCount() > maxCores) { // Remove the incoming image taskQueue.remove(req); tasksPending--; @@ -531,7 
+531,7 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { attemptsSuccess++; hv.instanceCount++; - hv.provisionedCores += flavor.getCoreCount(); + hv.provisionedCpuCores += flavor.getCpuCoreCount(); hv.availableMemory -= flavor.getMemorySize(); activeTasks.put(task, host); @@ -612,12 +612,12 @@ public final class ComputeService implements AutoCloseable, CarbonReceiver { @NotNull public ServiceFlavor newFlavor( - @NotNull String name, int cpuCount, long memorySize, @NotNull Map<String, ?> meta) { + @NotNull String name, int cpuCount, long memorySize, int gpuCoreCount, @NotNull Map<String, ?> meta) { checkOpen(); final ComputeService service = this.service; UUID uid = new UUID(service.clock.millis(), service.random.nextLong()); - ServiceFlavor flavor = new ServiceFlavor(service, uid, name, cpuCount, memorySize, meta); + ServiceFlavor flavor = new ServiceFlavor(service, uid, name, cpuCount, memorySize, gpuCoreCount, meta); // service.flavorById.put(uid, flavor); // service.flavors.add(flavor); diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/HostView.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/HostView.java index 7c548add..c07f58c7 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/HostView.java +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/HostView.java @@ -31,7 +31,8 @@ public class HostView { private final SimHost host; int instanceCount; long availableMemory; - int provisionedCores; + int provisionedCpuCores; + int provisionedGpuCores; /** * Scheduler bookkeeping @@ -83,8 +84,12 @@ public class HostView { /** * Return the provisioned cores on the host. 
*/ - public int getProvisionedCores() { - return provisionedCores; + public int getProvisionedCpuCores() { + return provisionedCpuCores; + } + + public int getProvisionedGpuCores() { + return provisionedGpuCores; } @Override diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ServiceFlavor.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ServiceFlavor.java index eddde87e..8a4359b4 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ServiceFlavor.java +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/service/ServiceFlavor.java @@ -36,22 +36,31 @@ public final class ServiceFlavor implements Flavor { private final ComputeService service; private final UUID uid; private final String name; - private final int coreCount; + private final int cpuCoreCount; private final long memorySize; + private final int gpuCoreCount; private final Map<String, ?> meta; - ServiceFlavor(ComputeService service, UUID uid, String name, int coreCount, long memorySize, Map<String, ?> meta) { + ServiceFlavor( + ComputeService service, + UUID uid, + String name, + int cpuCoreCount, + long memorySize, + int gpuCoreCount, + Map<String, ?> meta) { this.service = service; this.uid = uid; this.name = name; - this.coreCount = coreCount; + this.cpuCoreCount = cpuCoreCount; this.memorySize = memorySize; + this.gpuCoreCount = gpuCoreCount; this.meta = meta; } @Override - public int getCoreCount() { - return coreCount; + public int getCpuCoreCount() { + return cpuCoreCount; } @Override @@ -59,6 +68,11 @@ public final class ServiceFlavor implements Flavor { return memorySize; } + @Override + public int getGpuCoreCount() { + return gpuCoreCount; + } + @NotNull @Override public UUID getUid() { diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/GuestGpuStats.java 
b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/GuestGpuStats.java new file mode 100644 index 00000000..1aba13e3 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/GuestGpuStats.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.telemetry; + +/** + * Statistics about the GPUs of a guest. + * + * @param activeTime The cumulative time (in seconds) that the GPUs of the guest were actively running. + * @param idleTime The cumulative time (in seconds) the GPUs of the guest were idle. + * @param stealTime The cumulative GPU time (in seconds) that the guest was ready to run, but not granted time by the host. + * @param lostTime The cumulative GPU time (in seconds) that was lost due to interference with other machines. 
+ * @param capacity The available GPU capacity of the guest (in MHz). + * @param usage Amount of GPU resources (in MHz) actually used by the guest. + * @param utilization The utilization of the GPU resources (in %) relative to the total GPU capacity. + */ +public record GuestGpuStats( + long activeTime, + long idleTime, + long stealTime, + long lostTime, + double capacity, + double usage, + double demand, + double utilization) {} diff --git a/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/HostGpuStats.java b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/HostGpuStats.java new file mode 100644 index 00000000..e42d7704 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/java/org/opendc/compute/simulator/telemetry/HostGpuStats.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2022 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.telemetry; + +/** + * Statistics about the GPUs of a host. + * + * @param activeTime The cumulative time (in seconds) that the GPUs of the host were actively running. + * @param idleTime The cumulative time (in seconds) the GPUs of the host were idle. + * @param stealTime The cumulative GPU time (in seconds) that virtual machines were ready to run, but were not able to. + * @param lostTime The cumulative GPU time (in seconds) that was lost due to interference between virtual machines. + * @param capacity The available GPU capacity of the host (in MHz). + * @param demand Amount of GPU resources (in MHz) the guests would use if there were no GPU contention or GPU + * limits. + * @param usage Amount of GPU resources (in MHz) actually used by the host. + * @param utilization The utilization of the GPU resources (in %) relative to the total GPU capacity. 
+ */ +public record HostGpuStats( + long activeTime, + long idleTime, + long stealTime, + long lostTime, + double capacity, + double demand, + double usage, + double utilization) {} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt index d23794ab..effe3d5b 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/host/SimHost.kt @@ -22,19 +22,22 @@ package org.opendc.compute.simulator.host +import org.opendc.common.ResourceType import org.opendc.compute.api.Flavor import org.opendc.compute.api.TaskState import org.opendc.compute.simulator.internal.Guest import org.opendc.compute.simulator.internal.GuestListener import org.opendc.compute.simulator.service.ServiceTask import org.opendc.compute.simulator.telemetry.GuestCpuStats +import org.opendc.compute.simulator.telemetry.GuestGpuStats import org.opendc.compute.simulator.telemetry.GuestSystemStats import org.opendc.compute.simulator.telemetry.HostCpuStats +import org.opendc.compute.simulator.telemetry.HostGpuStats import org.opendc.compute.simulator.telemetry.HostSystemStats -import org.opendc.simulator.compute.cpu.CpuPowerModel import org.opendc.simulator.compute.machine.SimMachine import org.opendc.simulator.compute.models.MachineModel import org.opendc.simulator.compute.models.MemoryUnit +import org.opendc.simulator.compute.power.PowerModel import org.opendc.simulator.engine.engine.FlowEngine import org.opendc.simulator.engine.graph.FlowDistributor import java.time.Duration @@ -57,7 +60,8 @@ public class SimHost( private val clock: InstantSource, private val engine: FlowEngine, private val machineModel: MachineModel, - private val cpuPowerModel: CpuPowerModel, + private val cpuPowerModel: PowerModel, + 
private val gpuPowerModel: PowerModel?, private val embodiedCarbon: Double, private val expectedLifetime: Double, private val powerDistributor: FlowDistributor, @@ -81,11 +85,22 @@ public class SimHost( field = value } + private val gpuHostModels: List<GpuHostModel>? = + machineModel.gpuModels?.map { gpumodel -> + return@map GpuHostModel( + gpumodel.totalCoreCapacity, + gpumodel.coreCount, + gpumodel.memorySize, + gpumodel.memoryBandwidth, + ) + } + private val model: HostModel = HostModel( machineModel.cpuModel.totalCapacity, machineModel.cpuModel.coreCount, machineModel.memory.size, + gpuHostModels, ) private var simMachine: SimMachine? = null @@ -136,6 +151,7 @@ public class SimHost( this.machineModel, this.powerDistributor, this.cpuPowerModel, + this.gpuPowerModel, ) { cause -> hostState = if (cause != null) HostState.ERROR else HostState.DOWN } @@ -207,7 +223,7 @@ public class SimHost( public fun canFit(task: ServiceTask): Boolean { val sufficientMemory = model.memoryCapacity >= task.flavor.memorySize - val enoughCpus = model.coreCount >= task.flavor.coreCount + val enoughCpus = model.coreCount >= task.flavor.cpuCoreCount val canFit = simMachine!!.canFit(task.flavor.toMachineModel()) return sufficientMemory && enoughCpus && canFit @@ -324,14 +340,14 @@ public class SimHost( val counters = simMachine!!.performanceCounters return HostCpuStats( - counters.cpuActiveTime, - counters.cpuIdleTime, - counters.cpuStealTime, - counters.cpuLostTime, - counters.cpuCapacity, - counters.cpuDemand, - counters.cpuSupply, - counters.cpuSupply / cpuLimit, + counters.activeTime, + counters.idleTime, + counters.stealTime, + counters.lostTime, + counters.capacity, + counters.demand, + counters.supply, + counters.supply / cpuLimit, ) } @@ -340,6 +356,33 @@ public class SimHost( return guest.getCpuStats() } + public fun getGpuStats(): List<HostGpuStats> { + val gpuStats = mutableListOf<HostGpuStats>() + for (gpu in simMachine!!.gpus) { + gpu.updateCounters(this.clock.millis()) + val 
counters = simMachine!!.getGpuPerformanceCounters(gpu.id) + + gpuStats.add( + HostGpuStats( + counters.activeTime, + counters.idleTime, + counters.stealTime, + counters.lostTime, + counters.capacity, + counters.demand, + counters.supply, + counters.supply / gpu.getCapacity(ResourceType.GPU), + ), + ) + } + return gpuStats + } + + public fun getGpuStats(task: ServiceTask): List<GuestGpuStats> { + val guest = requireNotNull(taskToGuestMap[task]) { "Unknown task ${task.name} at host $name" } + return guest.getGpuStats() + } + override fun hashCode(): Int = name.hashCode() override fun equals(other: Any?): Boolean { @@ -352,7 +395,13 @@ public class SimHost( * Convert flavor to machine model. */ private fun Flavor.toMachineModel(): MachineModel { - return MachineModel(simMachine!!.machineModel.cpuModel, MemoryUnit("Generic", "Generic", 3200.0, memorySize)) + return MachineModel( + simMachine!!.machineModel.cpuModel, + MemoryUnit("Generic", "Generic", 3200.0, memorySize), + simMachine!!.machineModel.gpuModels, + simMachine!!.machineModel.cpuDistributionStrategy, + simMachine!!.machineModel.gpuDistributionStrategy, + ) } /** diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt index fe8cbf2f..a980f6cb 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/internal/Guest.kt @@ -27,6 +27,7 @@ import org.opendc.compute.api.TaskState import org.opendc.compute.simulator.host.SimHost import org.opendc.compute.simulator.service.ServiceTask import org.opendc.compute.simulator.telemetry.GuestCpuStats +import org.opendc.compute.simulator.telemetry.GuestGpuStats import org.opendc.compute.simulator.telemetry.GuestSystemStats import 
org.opendc.simulator.compute.machine.SimMachine import org.opendc.simulator.compute.workload.ChainWorkload @@ -64,6 +65,7 @@ public class Guest( private var lastReport = clock.millis() private var bootTime: Instant? = null private val cpuLimit = simMachine.cpu.cpuModel.totalCapacity + private val gpuLimit = simMachine.gpus?.firstOrNull()?.gpuModel?.totalCoreCapacity ?: 0.0 /** * Start the guest. @@ -242,20 +244,43 @@ public class Guest( */ public fun getCpuStats(): GuestCpuStats { virtualMachine!!.updateCounters(this.clock.millis()) - val counters = virtualMachine!!.performanceCounters + val counters = virtualMachine!!.cpuPerformanceCounters return GuestCpuStats( - counters.cpuActiveTime / 1000L, - counters.cpuIdleTime / 1000L, - counters.cpuStealTime / 1000L, - counters.cpuLostTime / 1000L, - counters.cpuCapacity, - counters.cpuSupply, - counters.cpuDemand, - counters.cpuSupply / cpuLimit, + counters.activeTime / 1000L, + counters.idleTime / 1000L, + counters.stealTime / 1000L, + counters.lostTime / 1000L, + counters.capacity, + counters.supply, + counters.demand, + counters.supply / cpuLimit, ) } + public fun getGpuStats(): List<GuestGpuStats> { + virtualMachine!!.updateCounters(this.clock.millis()) + val counters = virtualMachine!!.gpuPerformanceCounters + + val gpuStats = mutableListOf<GuestGpuStats>() + for (gpuCounter in counters) { + gpuStats.add( + GuestGpuStats( + gpuCounter.activeTime / 1000L, + gpuCounter.idleTime / 1000L, + gpuCounter.stealTime / 1000L, + gpuCounter.lostTime / 1000L, + gpuCounter.capacity, + gpuCounter.supply, + gpuCounter.demand, + // Assuming similar scaling as CPU + gpuCounter.supply / gpuLimit, + ), + ) + } + return gpuStats + } + /** * Helper function to track the uptime and downtime of the guest. 
*/ diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt index 675ce3a9..791ab692 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/provisioner/HostsProvisioningStep.kt @@ -127,6 +127,7 @@ public class HostsProvisioningStep internal constructor( engine, hostSpec.model, hostSpec.cpuPowerModel, + hostSpec.gpuPowerModel, hostSpec.embodiedCarbon, hostSpec.expectedLifetime, hostDistributor, diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt index e70cec58..0376a492 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/ComputeSchedulers.kt @@ -27,11 +27,13 @@ package org.opendc.compute.simulator.scheduler import org.opendc.compute.simulator.scheduler.filters.ComputeFilter import org.opendc.compute.simulator.scheduler.filters.RamFilter import org.opendc.compute.simulator.scheduler.filters.VCpuFilter +import org.opendc.compute.simulator.scheduler.filters.VGpuFilter import org.opendc.compute.simulator.scheduler.timeshift.TimeshiftScheduler import org.opendc.compute.simulator.scheduler.weights.CoreRamWeigher import org.opendc.compute.simulator.scheduler.weights.InstanceCountWeigher import org.opendc.compute.simulator.scheduler.weights.RamWeigher import org.opendc.compute.simulator.scheduler.weights.VCpuWeigher +import 
org.opendc.compute.simulator.scheduler.weights.VGpuWeigher import java.time.InstantSource import java.util.SplittableRandom import java.util.random.RandomGenerator @@ -48,6 +50,8 @@ public enum class ComputeSchedulerEnum { Random, TaskNumMemorizing, Timeshift, + ProvisionedCpuGpuCores, + ProvisionedCpuGpuCoresInv, } public fun createPrefabComputeScheduler( @@ -68,6 +72,7 @@ public fun createPrefabComputeScheduler( ): ComputeScheduler { val cpuAllocationRatio = 1.0 val ramAllocationRatio = 1.5 + val gpuAllocationRatio = 1.0 return when (name) { ComputeSchedulerEnum.Mem -> FilterScheduler( @@ -128,5 +133,31 @@ public fun createPrefabComputeScheduler( clock = clock, random = SplittableRandom(seeder.nextLong()), ) + ComputeSchedulerEnum.ProvisionedCpuGpuCores -> + FilterScheduler( + filters = + listOf( + ComputeFilter(), + VCpuFilter(cpuAllocationRatio), + VGpuFilter(gpuAllocationRatio), + RamFilter(ramAllocationRatio), + ), + weighers = listOf(VCpuWeigher(cpuAllocationRatio, multiplier = 1.0), VGpuWeigher(gpuAllocationRatio, multiplier = 1.0)), + ) + ComputeSchedulerEnum.ProvisionedCpuGpuCoresInv -> + FilterScheduler( + filters = + listOf( + ComputeFilter(), + VCpuFilter(cpuAllocationRatio), + VGpuFilter(gpuAllocationRatio), + RamFilter(ramAllocationRatio), + ), + weighers = + listOf( + VCpuWeigher(cpuAllocationRatio, multiplier = -1.0), + VGpuWeigher(gpuAllocationRatio, multiplier = -1.0), + ), + ) } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuCapacityFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuCapacityFilter.kt index 4e63baf4..7fa7a051 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuCapacityFilter.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuCapacityFilter.kt @@ -40,7 +40,7 
@@ public class VCpuCapacityFilter : HostFilter { return ( requiredCapacity == null || (availableCapacity / host.host.getModel().coreCount) - >= (requiredCapacity / task.flavor.coreCount) + >= (requiredCapacity / task.flavor.cpuCoreCount) ) } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuFilter.kt index c179a7bf..89739658 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuFilter.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VCpuFilter.kt @@ -35,7 +35,7 @@ public class VCpuFilter(private val allocationRatio: Double) : HostFilter { host: HostView, task: ServiceTask, ): Boolean { - val requested = task.flavor.coreCount + val requested = task.flavor.cpuCoreCount val totalCores = host.host.getModel().coreCount val limit = totalCores * allocationRatio @@ -44,7 +44,7 @@ public class VCpuFilter(private val allocationRatio: Double) : HostFilter { return false } - val availableCores = limit - host.provisionedCores + val availableCores = limit - host.provisionedCpuCores return availableCores >= requested } } diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt new file mode 100644 index 00000000..6dc27327 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuCapacityFilter.kt @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation 
files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.scheduler.filters + +import org.opendc.compute.simulator.service.HostView +import org.opendc.compute.simulator.service.ServiceTask +import kotlin.collections.maxOfOrNull + +/** + * A [HostFilter] that filters hosts based on the vCPU speed requirements of a [ServiceTask] and the available + * capacity on the host. + */ +public class VGpuCapacityFilter : HostFilter { + override fun test( + host: HostView, + task: ServiceTask, + ): Boolean { + val requiredCapacity = task.flavor.meta["gpu-capacity"] as? 
Double + val availableCapacity = (host.host.getModel().gpuHostModels().maxOfOrNull { it.gpuCoreCapacity() } ?: 0).toDouble() + val availableCores = (host.host.getModel().gpuHostModels().maxOfOrNull { it -> it.gpuCoreCount } ?: -1).toDouble() + val availableRatio = availableCapacity / availableCores + + return ( + requiredCapacity == null || + ((availableCapacity / availableCores) >= (requiredCapacity / task.flavor.gpuCoreCount)) + ) + } +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt new file mode 100644 index 00000000..9f564776 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/filters/VGpuFilter.kt @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.scheduler.filters + +import org.opendc.compute.simulator.service.HostView +import org.opendc.compute.simulator.service.ServiceTask + +/** + * A [HostFilter] that filters hosts based on the vCPU requirements of a [ServiceTask] and the available vCPUs on the host. + * + * @param allocationRatio Virtual CPU to physical CPU allocation ratio. + */ +public class VGpuFilter(private val allocationRatio: Double) : HostFilter { + override fun test( + host: HostView, + task: ServiceTask, + ): Boolean { + val requested = task.flavor.gpuCoreCount + val totalCores = host.host.getModel().gpuHostModels().maxOfOrNull { it.gpuCoreCount() } ?: 0 + val limit = totalCores * allocationRatio + + // Do not allow an instance to overcommit against itself, only against other instances + if (requested > totalCores) { + return false + } + + val availableCores = limit - host.provisionedGpuCores + return availableCores >= requested + } +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuCapacityWeigher.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuCapacityWeigher.kt index 4f52e11a..d9b094fb 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuCapacityWeigher.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuCapacityWeigher.kt @@ -35,7 +35,7 @@ public class VCpuCapacityWeigher(override val multiplier: Double = 1.0) : HostWe ): Double { val model = host.host.getModel() val requiredCapacity = 
task.flavor.meta["cpu-capacity"] as? Double ?: 0.0 - return model.cpuCapacity - requiredCapacity / task.flavor.coreCount + return model.cpuCapacity - requiredCapacity / task.flavor.cpuCoreCount } override fun toString(): String = "VCpuWeigher" diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuWeigher.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuWeigher.kt index 3f9a7f03..d882c237 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuWeigher.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VCpuWeigher.kt @@ -39,7 +39,7 @@ public class VCpuWeigher(private val allocationRatio: Double, override val multi host: HostView, task: ServiceTask, ): Double { - return allocationRatio - host.provisionedCores + return allocationRatio - host.provisionedCpuCores } override fun toString(): String = "VCpuWeigher" diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuCapacityWeigher.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuCapacityWeigher.kt new file mode 100644 index 00000000..35f2c7b9 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuCapacityWeigher.kt @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.compute.simulator.scheduler.weights + +import org.opendc.compute.simulator.service.HostView +import org.opendc.compute.simulator.service.ServiceTask + +/** + * A [HostWeigher] that weighs the hosts based on the difference between the required vGPU capacity and the available GPU capacity. + */ +public class VGpuCapacityWeigher(override val multiplier: Double = 1.0) : HostWeigher { + override fun getWeight( + host: HostView, + task: ServiceTask, + ): Double { + val model = host.host.getModel() + val requiredCapacity = task.flavor.meta["gpu-capacity"] as? 
Double ?: 0.0 + val availableCapacity = model.gpuHostModels.maxOfOrNull { it.gpuCoreCapacity } ?: 0.0 + return availableCapacity - requiredCapacity / task.flavor.gpuCoreCount + } + + override fun toString(): String = "VGpuWeigher" +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuWeigher.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuWeigher.kt new file mode 100644 index 00000000..7397bf10 --- /dev/null +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/scheduler/weights/VGpuWeigher.kt @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +package org.opendc.compute.simulator.scheduler.weights + +import org.opendc.compute.simulator.service.HostView +import org.opendc.compute.simulator.service.ServiceTask + +/** + * A [HostWeigher] that weighs the hosts based on the remaining number of vGPUs available. + * + * @param allocationRatio Virtual GPU to physical GPU allocation ratio. + */ +public class VGpuWeigher(private val allocationRatio: Double, override val multiplier: Double = 1.0) : HostWeigher { + init { + require(allocationRatio > 0.0) { "Allocation ratio must be greater than zero" } + } + + override fun getWeight( + host: HostView, + task: ServiceTask, + ): Double { + return allocationRatio - host.provisionedGpuCores + } + + override fun toString(): String = "VGpuWeigher" +} diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltHostExportColumns.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltHostExportColumns.kt index 00f7854d..affaab58 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltHostExportColumns.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltHostExportColumns.kt @@ -144,6 +144,48 @@ public object DfltHostExportColumns { field = Types.required(INT64).named("cpu_time_lost"), ) { it.cpuLostTime } + // TODO: support multiple GPUs + + public val GPU_CAPACITY: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_capacity"), + ) { it.gpuCapacities.getOrNull(0) } + + public val GPU_USAGE: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_usage"), + ) { it.gpuUsages.getOrNull(0) } + + public val GPU_DEMAND: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_demand"), + ) { it.gpuDemands.getOrNull(0) } + + 
public val GPU_UTILIZATION: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_utilization"), + ) { it.gpuUtilizations.getOrNull(0) } + + public val GPU_TIME_ACTIVE: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_active"), + ) { it.gpuActiveTimes.getOrNull(0) } + + public val GPU_TIME_IDLE: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_idle"), + ) { it.gpuIdleTimes.getOrNull(0) } + + public val GPU_TIME_STEAL: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_steal"), + ) { it.gpuStealTimes.getOrNull(0) } + + public val GPU_TIME_LOST: ExportColumn<HostTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_lost"), + ) { it.gpuLostTimes.getOrNull(0) } + public val POWER_DRAW: ExportColumn<HostTableReader> = ExportColumn( field = Types.required(FLOAT).named("power_draw"), diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltTaskExportColumns.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltTaskExportColumns.kt index f533eb1f..ad7a1d52 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltTaskExportColumns.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/DfltTaskExportColumns.kt @@ -132,6 +132,43 @@ public object DfltTaskExportColumns { field = Types.required(INT64).named("cpu_time_lost"), ) { it.cpuLostTime } + // TODO: support multiple GPUs + + public val GPU_CAPACITY: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_capacity"), + ) { it.gpuLimits?.getOrNull(0) } + + public val GPU_USAGE: ExportColumn<TaskTableReader> = + ExportColumn( + field = 
Types.optional(FLOAT).named("gpu_usage"), + ) { it.gpuUsages?.getOrNull(0) } + + public val GPU_DEMAND: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(FLOAT).named("gpu_demand"), + ) { it.gpuDemands?.getOrNull(0) } + + public val GPU_TIME_ACTIVE: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_active"), + ) { it.gpuActiveTimes?.getOrNull(0) } + + public val GPU_TIME_IDLE: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_idle"), + ) { it.gpuIdleTimes?.getOrNull(0) } + + public val GPU_TIME_STEAL: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_steal"), + ) { it.gpuStealTimes?.getOrNull(0) } + + public val GPU_TIME_LOST: ExportColumn<TaskTableReader> = + ExportColumn( + field = Types.optional(INT64).named("gpu_time_lost"), + ) { it.gpuLostTimes?.getOrNull(0) } + public val UP_TIME: ExportColumn<TaskTableReader> = ExportColumn( field = Types.required(INT64).named("uptime"), diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/ParquetComputeMonitor.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/ParquetComputeMonitor.kt index a626c41b..4fb930e1 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/ParquetComputeMonitor.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/parquet/ParquetComputeMonitor.kt @@ -44,6 +44,7 @@ public class ParquetComputeMonitor( private val batteryExporter: Exporter<BatteryTableReader>?, private val serviceExporter: Exporter<ServiceTableReader>?, ) : ComputeMonitor, AutoCloseable { + // FIXME: Include GPU override fun record(reader: HostTableReader) { hostExporter?.write(reader) } diff --git 
a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReader.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReader.kt index ff0115df..fbffd508 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReader.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReader.kt @@ -113,6 +113,51 @@ public interface HostTableReader : Exportable { public val cpuLostTime: Long /** + * The capacity of the GPUs in the host (in MHz). + */ + public val gpuCapacities: ArrayList<Double> + + /** + * The capacity of the GPUs in the host (in MHz). They are inserted by GPU ID. + */ + public val gpuLimits: ArrayList<Double> + + /** + * The usage per GPU in the host (in MHz). They are inserted by GPU ID. + */ + public val gpuUsages: ArrayList<Double> + + /** + * The demand per GPU of the guests (in MHz). They are inserted by GPU ID. + */ + public val gpuDemands: ArrayList<Double> + + /** + * The utilization of each GPU in the host. They are inserted by GPU ID. + */ + public val gpuUtilizations: ArrayList<Double> + + /** + * The duration (in ms) that the respective GPU was active in the host. They are inserted by GPU ID. + */ + public val gpuActiveTimes: ArrayList<Long> + + /** + * The duration (in ms) that a GPU was idle in the host. They are inserted by GPU ID. + */ + public val gpuIdleTimes: ArrayList<Long> + + /** + * The duration (in ms) that a vGPU wanted to run, but no capacity was available. They are inserted by GPU ID. + */ + public val gpuStealTimes: ArrayList<Long> + + /** + * The duration (in ms) of GPU time that was lost due to interference. They are inserted by GPU ID. + */ + public val gpuLostTimes: ArrayList<Long> + + /** + * The current power draw of the host in W. 
*/ public val powerDraw: Double diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReaderImpl.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReaderImpl.kt index 6e1dac48..cb25358a 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReaderImpl.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/host/HostTableReaderImpl.kt @@ -49,6 +49,7 @@ public class HostTableReaderImpl( _tasksActive = table.tasksActive _guestsError = table.guestsError _guestsInvalid = table.guestsInvalid + // CPU stats _cpuCapacity = table.cpuCapacity _cpuDemand = table.cpuDemand _cpuUsage = table.cpuUsage @@ -57,6 +58,18 @@ public class HostTableReaderImpl( _cpuIdleTime = table.cpuIdleTime _cpuStealTime = table.cpuStealTime _cpuLostTime = table.cpuLostTime + // GPU stats + _gpuCapacities = table.gpuCapacities + _gpuLimits = table.gpuLimits + _gpuDemands = table.gpuDemands + _gpuUsages = table.gpuUsages + _gpuUtilizations = table.gpuUtilizations + _gpuActiveTimes = table.gpuActiveTimes + _gpuIdleTimes = table.gpuIdleTimes + _gpuStealTimes = table.gpuStealTimes + _gpuLostTimes = table.gpuLostTimes + + // energy & carbon stats _powerDraw = table.powerDraw _energyUsage = table.energyUsage _embodiedCarbon = table.embodiedCarbon @@ -135,6 +148,65 @@ public class HostTableReaderImpl( private var _cpuLostTime = 0L private var previousCpuLostTime = 0L + override val gpuCapacities: ArrayList<Double> + get() = _gpuCapacities + private var _gpuCapacities: ArrayList<Double> = ArrayList() + + override val gpuLimits: ArrayList<Double> + get() = _gpuLimits + private var _gpuLimits: ArrayList<Double> = ArrayList() + + override val gpuUsages: ArrayList<Double> + get() = _gpuUsages + private var _gpuUsages: ArrayList<Double> = 
ArrayList() + + override val gpuDemands: ArrayList<Double> + get() = _gpuDemands + private var _gpuDemands: ArrayList<Double> = ArrayList() + + override val gpuUtilizations: ArrayList<Double> + get() = _gpuUtilizations + private var _gpuUtilizations: ArrayList<Double> = ArrayList() + + // half of the CPU stats + override val gpuActiveTimes: ArrayList<Long> +// get() = _gpuActiveTimes.zip(previousGpuActiveTimes) { current, previous -> current - previous} as ArrayList<Long> + get() = + (0 until _gpuActiveTimes.size).map { + i -> + (_gpuActiveTimes.getOrNull(i) ?: 0L) - (previousGpuActiveTimes.getOrNull(i) ?: 0L) + } as ArrayList<Long> + private var _gpuActiveTimes: ArrayList<Long> = ArrayList() + private var previousGpuActiveTimes: ArrayList<Long> = ArrayList() + + override val gpuIdleTimes: ArrayList<Long> +// get() = _gpuIdleTimes.zip(previousGpuIdleTimes) { current, previous -> current - previous} as ArrayList<Long> + get() = + (0 until _gpuIdleTimes.size).map { + i -> + (_gpuIdleTimes.getOrNull(i) ?: 0L) - (previousGpuIdleTimes.getOrNull(i) ?: 0L) + } as ArrayList<Long> + private var _gpuIdleTimes: ArrayList<Long> = ArrayList() + private var previousGpuIdleTimes: ArrayList<Long> = ArrayList() + + override val gpuStealTimes: ArrayList<Long> + get() = + (0 until _gpuStealTimes.size).map { + i -> + (_gpuStealTimes.getOrNull(i) ?: 0L) - (previousGpuStealTimes.getOrNull(i) ?: 0L) + } as ArrayList<Long> + private var _gpuStealTimes: ArrayList<Long> = ArrayList() + private var previousGpuStealTimes: ArrayList<Long> = ArrayList() + + override val gpuLostTimes: ArrayList<Long> + get() = + (0 until _gpuLostTimes.size).map { + i -> + (_gpuLostTimes.getOrNull(i) ?: 0L) - (previousGpuLostTimes.getOrNull(i) ?: 0L) + } as ArrayList<Long> + private var _gpuLostTimes: ArrayList<Long> = ArrayList() + private var previousGpuLostTimes: ArrayList<Long> = ArrayList() + override val powerDraw: Double get() = _powerDraw private var _powerDraw = 0.0 @@ -168,6 +240,7 @@ public class 
HostTableReaderImpl( override fun record(now: Instant) { val hostCpuStats = host.getCpuStats() val hostSysStats = host.getSystemStats() + val hostGpuStats = host.getGpuStats() _timestamp = now _timestampAbsolute = now + startTime @@ -184,6 +257,16 @@ public class HostTableReaderImpl( _cpuIdleTime = hostCpuStats.idleTime _cpuStealTime = hostCpuStats.stealTime _cpuLostTime = hostCpuStats.lostTime + // GPU stats + _gpuLimits = hostGpuStats.map { it.capacity } as ArrayList<Double> + _gpuDemands = hostGpuStats.map { it.demand } as ArrayList<Double> + _gpuUsages = hostGpuStats.map { it.usage } as ArrayList<Double> + _gpuUtilizations = hostGpuStats.map { it.utilization } as ArrayList<Double> + _gpuActiveTimes = hostGpuStats.map { it.activeTime } as ArrayList<Long> + _gpuIdleTimes = hostGpuStats.map { it.idleTime } as ArrayList<Long> + _gpuStealTimes = hostGpuStats.map { it.stealTime } as ArrayList<Long> + _gpuLostTimes = hostGpuStats.map { it.lostTime } as ArrayList<Long> + // energy & carbon stats _powerDraw = hostSysStats.powerDraw _energyUsage = hostSysStats.energyUsage _embodiedCarbon = hostSysStats.embodiedCarbon @@ -202,6 +285,10 @@ public class HostTableReaderImpl( previousCpuIdleTime = _cpuIdleTime previousCpuStealTime = _cpuStealTime previousCpuLostTime = _cpuLostTime + previousGpuActiveTimes = _gpuActiveTimes + previousGpuIdleTimes = _gpuIdleTimes + previousGpuStealTimes = _gpuStealTimes + previousGpuLostTimes = _gpuLostTimes previousEnergyUsage = _energyUsage previousUptime = _uptime previousDowntime = _downtime diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt index b0745dd6..f71587c7 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt +++ 
b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReader.kt @@ -32,6 +32,7 @@ import java.time.Instant * An interface that is used to read a row of a task trace entry. */ public interface TaskTableReader : Exportable { + // TODO: find better way for more resources public fun copy(): TaskTableReader public fun setValues(table: TaskTableReader) @@ -130,6 +131,41 @@ public interface TaskTableReader : Exportable { public val cpuLostTime: Long /** + * The capacity of the GPUs of the host on which the task is running (in MHz). + */ + public val gpuLimits: DoubleArray? + + /** + * The amount of GPUs allocated to the task (in MHz). + */ + public val gpuUsages: DoubleArray? + + /** + * The GPU demanded by this task (in MHz). + */ + public val gpuDemands: DoubleArray? + + /** + * The duration (in seconds) that a GPU was active in the task. + */ + public val gpuActiveTimes: LongArray? + + /** + * The duration (in seconds) that a GPU was idle in the task. + */ + public val gpuIdleTimes: LongArray? + + /** + * The duration (in seconds) that a vGPU wanted to run, but no capacity was available. + */ + public val gpuStealTimes: LongArray? + + /** + * The duration (in seconds) of GPU time that was lost due to interference. + */ + public val gpuLostTimes: LongArray? + + /** * The state of the task */ public val taskState: TaskState? 
diff --git a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt index d63202a9..6128c9a2 100644 --- a/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt +++ b/opendc-compute/opendc-compute-simulator/src/main/kotlin/org/opendc/compute/simulator/telemetry/table/task/TaskTableReaderImpl.kt @@ -63,6 +63,15 @@ public class TaskTableReaderImpl( _cpuIdleTime = table.cpuIdleTime _cpuStealTime = table.cpuStealTime _cpuLostTime = table.cpuLostTime + // GPU stats + _gpuLimits = table.gpuLimits + _gpuDemands = table.gpuDemands + _gpuUsages = table.gpuUsages + _gpuActiveTimes = table.gpuActiveTimes + _gpuIdleTimes = table.gpuIdleTimes + _gpuStealTimes = table.gpuStealTimes + _gpuLostTimes = table.gpuLostTimes + _uptime = table.uptime _downtime = table.downtime _numFailures = table.numFailures @@ -84,7 +93,7 @@ public class TaskTableReaderImpl( task.name, "vm", "x86", - task.flavor.coreCount, + task.flavor.cpuCoreCount, task.flavor.memorySize, ) @@ -168,6 +177,74 @@ public class TaskTableReaderImpl( private var _cpuLostTime = 0L private var previousCpuLostTime = 0L + override val gpuLimits: DoubleArray? + get() = _gpuLimits ?: DoubleArray(0) + private var _gpuLimits: DoubleArray? = null + + override val gpuUsages: DoubleArray? + get() = _gpuUsages ?: DoubleArray(0) + private var _gpuUsages: DoubleArray? = null + + override val gpuDemands: DoubleArray? + get() = _gpuDemands ?: DoubleArray(0) + private var _gpuDemands: DoubleArray? = null + + override val gpuActiveTimes: LongArray? 
+ get() { + val current = _gpuActiveTimes ?: return LongArray(0) + val previous = previousGpuActiveTimes + + return if (previous == null || current.size != previous.size) { // not sure if I like the second clause + current + } else { + LongArray(current.size) { i -> current[i] - previous[i] } + } + } + private var _gpuActiveTimes: LongArray? = null + private var previousGpuActiveTimes: LongArray? = null + + override val gpuIdleTimes: LongArray? + get() { + val current = _gpuIdleTimes ?: return LongArray(0) + val previous = previousGpuIdleTimes + + return if (previous == null || current.size != previous.size) { // not sure if I like the second clause + current + } else { + LongArray(current.size) { i -> current[i] - previous[i] } + } + } + private var _gpuIdleTimes: LongArray? = null + private var previousGpuIdleTimes: LongArray? = null + + override val gpuStealTimes: LongArray? + get() { + val current = _gpuStealTimes ?: return LongArray(0) + val previous = previousGpuStealTimes + + return if (previous == null || current.size != previous.size) { + current + } else { + LongArray(current.size) { i -> current[i] - previous[i] } + } + } + private var _gpuStealTimes: LongArray? = null + private var previousGpuStealTimes: LongArray? = null + + override val gpuLostTimes: LongArray? + get() { + val current = _gpuLostTimes ?: return LongArray(0) + val previous = previousGpuLostTimes + + return if (previous == null || current.size != previous.size) { + current + } else { + LongArray(current.size) { i -> current[i] - previous[i] } + } + } + private var _gpuLostTimes: LongArray? = null + private var previousGpuLostTimes: LongArray? = null + override val taskState: TaskState? get() = _taskState private var _taskState: TaskState? 
= null @@ -192,6 +269,7 @@ public class TaskTableReaderImpl( val cpuStats = simHost?.getCpuStats(task) val sysStats = simHost?.getSystemStats(task) + val gpuStats = simHost?.getGpuStats(task) _hostName = task.hostName @@ -214,6 +292,26 @@ public class TaskTableReaderImpl( _scheduleTime = task.scheduledAt _finishTime = task.finishedAt + if (gpuStats != null && gpuStats.isNotEmpty()) { + val size = gpuStats.size + _gpuLimits = DoubleArray(size) { i -> gpuStats[i].capacity } + _gpuDemands = DoubleArray(size) { i -> gpuStats[i].demand } + _gpuUsages = DoubleArray(size) { i -> gpuStats[i].usage } + _gpuActiveTimes = LongArray(size) { i -> gpuStats[i].activeTime } + _gpuIdleTimes = LongArray(size) { i -> gpuStats[i].idleTime } + _gpuStealTimes = LongArray(size) { i -> gpuStats[i].stealTime } + _gpuLostTimes = LongArray(size) { i -> gpuStats[i].lostTime } + } else { + _gpuIdleTimes = null + _gpuStealTimes = null + _gpuLostTimes = null + _gpuIdleTimes = null + _gpuLimits = null + _gpuUsages = null + _gpuDemands = null + _gpuActiveTimes = null + } + _taskState = task.state } @@ -227,6 +325,10 @@ public class TaskTableReaderImpl( previousCpuIdleTime = _cpuIdleTime previousCpuStealTime = _cpuStealTime previousCpuLostTime = _cpuLostTime + previousGpuActiveTimes = _gpuActiveTimes + previousGpuIdleTimes = _gpuIdleTimes + previousGpuStealTimes = _gpuStealTimes + previousGpuLostTimes = _gpuLostTimes simHost = null _cpuLimit = 0.0 diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt index 04a20f49..5109f828 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/FilterSchedulerTest.kt @@ -78,7 +78,7 @@ internal 
class FilterSchedulerTest { ) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -103,7 +103,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -134,7 +134,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -159,7 +159,7 @@ internal class FilterSchedulerTest { scheduler.addHost(host) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -180,7 +180,7 @@ internal class FilterSchedulerTest { scheduler.addHost(host) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -209,7 +209,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -232,7 +232,7 @@ internal class FilterSchedulerTest { scheduler.addHost(host) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount 
} returns 2 every { req.task.flavor.memorySize } returns 2300 every { req.isCancelled } returns false @@ -250,18 +250,18 @@ internal class FilterSchedulerTest { val hostA = mockk<HostView>() every { hostA.host.getState() } returns HostState.UP every { hostA.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) - every { hostA.provisionedCores } returns 3 + every { hostA.provisionedCpuCores } returns 3 val hostB = mockk<HostView>() every { hostB.host.getState() } returns HostState.UP every { hostB.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) - every { hostB.provisionedCores } returns 0 + every { hostB.provisionedCpuCores } returns 0 scheduler.addHost(hostA) scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -279,12 +279,12 @@ internal class FilterSchedulerTest { val host = mockk<HostView>() every { host.host.getState() } returns HostState.UP every { host.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) - every { host.provisionedCores } returns 0 + every { host.provisionedCpuCores } returns 0 scheduler.addHost(host) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 8 + every { req.task.flavor.cpuCoreCount } returns 8 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -312,7 +312,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.task.flavor.meta } returns mapOf("cpu-capacity" to 2 * 3200.0) every { req.isCancelled } returns false @@ -342,7 +342,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { 
req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -358,7 +358,7 @@ internal class FilterSchedulerTest { ) val reqA = mockk<SchedulingRequest>() - every { reqA.task.flavor.coreCount } returns 2 + every { reqA.task.flavor.cpuCoreCount } returns 2 every { reqA.task.flavor.memorySize } returns 1024 every { reqA.isCancelled } returns false val taskA = mockk<ServiceTask>() @@ -369,19 +369,19 @@ internal class FilterSchedulerTest { every { hostA.host.getState() } returns HostState.UP every { hostA.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) every { hostA.host.getInstances() } returns emptySet() - every { hostA.provisionedCores } returns 3 + every { hostA.provisionedCpuCores } returns 3 val hostB = mockk<HostView>() every { hostB.host.getState() } returns HostState.UP every { hostB.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) every { hostB.host.getInstances() } returns setOf(reqA.task) - every { hostB.provisionedCores } returns 0 + every { hostB.provisionedCpuCores } returns 0 scheduler.addHost(hostA) scheduler.addHost(hostB) val reqB = mockk<SchedulingRequest>() - every { reqB.task.flavor.coreCount } returns 2 + every { reqB.task.flavor.cpuCoreCount } returns 2 every { reqB.task.flavor.memorySize } returns 1024 every { reqB.task.meta } returns emptyMap() every { reqB.isCancelled } returns false @@ -402,7 +402,7 @@ internal class FilterSchedulerTest { ) val reqA = mockk<SchedulingRequest>() - every { reqA.task.flavor.coreCount } returns 2 + every { reqA.task.flavor.cpuCoreCount } returns 2 every { reqA.task.flavor.memorySize } returns 1024 every { reqA.isCancelled } returns false val taskA = mockk<ServiceTask>() @@ -413,19 +413,19 @@ internal class FilterSchedulerTest { every { hostA.host.getState() } returns HostState.UP every { hostA.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) every { 
hostA.host.getInstances() } returns setOf(reqA.task) - every { hostA.provisionedCores } returns 3 + every { hostA.provisionedCpuCores } returns 3 val hostB = mockk<HostView>() every { hostB.host.getState() } returns HostState.UP every { hostB.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) every { hostB.host.getInstances() } returns emptySet() - every { hostB.provisionedCores } returns 0 + every { hostB.provisionedCpuCores } returns 0 scheduler.addHost(hostA) scheduler.addHost(hostB) val reqB = mockk<SchedulingRequest>() - every { reqB.task.flavor.coreCount } returns 2 + every { reqB.task.flavor.cpuCoreCount } returns 2 every { reqB.task.flavor.memorySize } returns 1024 every { reqB.task.meta } returns emptyMap() every { reqB.isCancelled } returns false @@ -459,7 +459,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -488,7 +488,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -506,18 +506,18 @@ internal class FilterSchedulerTest { val hostA = mockk<HostView>() every { hostA.host.getState() } returns HostState.UP every { hostA.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) - every { hostA.provisionedCores } returns 2 + every { hostA.provisionedCpuCores } returns 2 val hostB = mockk<HostView>() every { hostB.host.getState() } returns HostState.UP every { hostB.host.getModel() } returns HostModel(4 * 2600.0, 4, 2048) - every { hostB.provisionedCores } returns 0 + every { hostB.provisionedCpuCores } returns 0 scheduler.addHost(hostA) scheduler.addHost(hostB) val req 
= mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -546,7 +546,7 @@ internal class FilterSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/MemorizingSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/MemorizingSchedulerTest.kt index 92d5008b..6b9b0048 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/MemorizingSchedulerTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/MemorizingSchedulerTest.kt @@ -43,7 +43,7 @@ internal class MemorizingSchedulerTest { ) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -67,7 +67,7 @@ internal class MemorizingSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false @@ -101,7 +101,7 @@ internal class MemorizingSchedulerTest { scheduler.addHost(hostB) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false val skipped = 
slot<Int>() @@ -129,7 +129,7 @@ internal class MemorizingSchedulerTest { scheduler.addHost(host) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 2300 every { req.isCancelled } returns false val skipped = slot<Int>() diff --git a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/TimeshiftSchedulerTest.kt b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/TimeshiftSchedulerTest.kt index 46c6425e..02f83eaf 100644 --- a/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/TimeshiftSchedulerTest.kt +++ b/opendc-compute/opendc-compute-simulator/src/test/kotlin/org/opendc/compute/simulator/scheduler/TimeshiftSchedulerTest.kt @@ -48,7 +48,7 @@ class TimeshiftSchedulerTest { ) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false every { req.task.nature } returns TaskNature(true) @@ -76,7 +76,7 @@ class TimeshiftSchedulerTest { ) val req = mockk<SchedulingRequest>() - every { req.task.flavor.coreCount } returns 2 + every { req.task.flavor.cpuCoreCount } returns 2 every { req.task.flavor.memorySize } returns 1024 every { req.isCancelled } returns false every { req.task.nature } returns TaskNature(true) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt index b6c945d2..b52608a9 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt +++ 
b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/TopologyFactories.kt @@ -31,10 +31,13 @@ import org.opendc.compute.topology.specs.HostJSONSpec import org.opendc.compute.topology.specs.HostSpec import org.opendc.compute.topology.specs.PowerSourceSpec import org.opendc.compute.topology.specs.TopologySpec -import org.opendc.simulator.compute.cpu.getPowerModel import org.opendc.simulator.compute.models.CpuModel +import org.opendc.simulator.compute.models.GpuModel import org.opendc.simulator.compute.models.MachineModel import org.opendc.simulator.compute.models.MemoryUnit +import org.opendc.simulator.compute.power.getPowerModel +import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory +import org.opendc.simulator.engine.graph.distributionPolicies.DistributionPolicyFactory.DistributionPolicyType import java.io.File import java.io.InputStream @@ -166,29 +169,63 @@ private fun HostJSONSpec.toHostSpec(clusterName: String): HostSpec { } val unknownMemoryUnit = MemoryUnit(memory.vendor, memory.modelName, memory.memorySpeed.toMHz(), memory.memorySize.toMiB().toLong()) + val gpuUnits = + List(gpu?.count ?: 0) { + GpuModel( + globalCoreId++, + gpu!!.coreCount, + gpu.coreSpeed.toMHz(), + gpu.memoryBandwidth.toKibps(), + gpu.memorySize.toMiB().toLong(), + gpu.vendor, + gpu.modelName, + gpu.architecture, + ) + } + val machineModel = MachineModel( units, unknownMemoryUnit, + gpuUnits, + // TODO: Pass through + DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness), + DistributionPolicyFactory.getDistributionStrategy(DistributionPolicyType.MaxMinFairness), ) - val powerModel = + val cpuPowerModel = getPowerModel( - powerModel.modelType, - powerModel.power.toWatts(), - powerModel.maxPower.toWatts(), - powerModel.idlePower.toWatts(), - powerModel.calibrationFactor, - powerModel.asymUtil, - powerModel.dvfs, + cpuPowerModel.modelType, + cpuPowerModel.power.toWatts(), + 
cpuPowerModel.maxPower.toWatts(), + cpuPowerModel.idlePower.toWatts(), + cpuPowerModel.calibrationFactor, + cpuPowerModel.asymUtil, + cpuPowerModel.dvfs, ) + val gpuPowerModel = + if (gpuUnits.isEmpty()) { + null + } else { + getPowerModel( + gpuPowerModel.modelType, + gpuPowerModel.power.toWatts(), + gpuPowerModel.maxPower.toWatts(), + gpuPowerModel.idlePower.toWatts(), + gpuPowerModel.calibrationFactor, + gpuPowerModel.asymUtil, + gpuPowerModel.dvfs, + ) + } + val hostSpec = HostSpec( createUniqueName(this.name, hostNames), clusterName, machineModel, - powerModel, + cpuPowerModel, + gpuPowerModel, ) return hostSpec } diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt index e4ec89e1..30a75896 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/HostSpec.kt @@ -22,8 +22,8 @@ package org.opendc.compute.topology.specs -import org.opendc.simulator.compute.cpu.CpuPowerModel import org.opendc.simulator.compute.models.MachineModel +import org.opendc.simulator.compute.power.PowerModel /** * Description of a physical host that will be simulated by OpenDC and host the virtual machines. 
@@ -36,7 +36,8 @@ public data class HostSpec( val name: String, val clusterName: String, val model: MachineModel, - val cpuPowerModel: CpuPowerModel, + val cpuPowerModel: PowerModel, + val gpuPowerModel: PowerModel?, val embodiedCarbon: Double = 1000.0, val expectedLifetime: Double = 5.0, ) diff --git a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt index 8cbf818b..62c3906a 100644 --- a/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt +++ b/opendc-compute/opendc-compute-topology/src/main/kotlin/org/opendc/compute/topology/specs/TopologySpecs.kt @@ -24,6 +24,7 @@ package org.opendc.compute.topology.specs import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable +import org.opendc.common.units.DataRate import org.opendc.common.units.DataSize import org.opendc.common.units.Frequency import org.opendc.common.units.Power @@ -76,7 +77,9 @@ public data class HostJSONSpec( val cpu: CPUJSONSpec, val count: Int = 1, val memory: MemoryJSONSpec, - val powerModel: PowerModelSpec = PowerModelSpec.DFLT, + val gpu: GPUJSONSpec? 
= null, + val cpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT, + val gpuPowerModel: PowerModelSpec = PowerModelSpec.DFLT, ) /** @@ -118,6 +121,18 @@ public data class MemoryJSONSpec( ) @Serializable +public data class GPUJSONSpec( + val count: Int = 1, + val coreCount: Int, + val coreSpeed: Frequency, + val memorySize: DataSize = DataSize.ofMiB(-1), + val memoryBandwidth: DataRate = DataRate.ofKibps(-1), + val vendor: String = "unknown", + val modelName: String = "unknown", + val architecture: String = "unknown", +) + +@Serializable public data class PowerModelSpec( val modelType: String, val power: Power = Power.ofWatts(400), diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt index 80996c0e..7599d4e1 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/ComputeWorkloadLoader.kt @@ -33,11 +33,15 @@ import org.opendc.trace.conv.resourceCpuCapacity import org.opendc.trace.conv.resourceCpuCount import org.opendc.trace.conv.resourceDeadline import org.opendc.trace.conv.resourceDuration +import org.opendc.trace.conv.resourceGpuCapacity +import org.opendc.trace.conv.resourceGpuCount +import org.opendc.trace.conv.resourceGpuMemCapacity import org.opendc.trace.conv.resourceID import org.opendc.trace.conv.resourceMemCapacity import org.opendc.trace.conv.resourceNature import org.opendc.trace.conv.resourceStateCpuUsage import org.opendc.trace.conv.resourceStateDuration +import org.opendc.trace.conv.resourceStateGpuUsage import org.opendc.trace.conv.resourceSubmissionTime import java.io.File import java.lang.ref.SoftReference @@ -79,6 +83,8 @@ public class ComputeWorkloadLoader( val durationCol = reader.resolve(resourceStateDuration) val 
coresCol = reader.resolve(resourceCpuCount) val usageCol = reader.resolve(resourceStateCpuUsage) + val gpuCoresCol = reader.resolve(resourceGpuCount) + val resourceGpuCapacityCol = reader.resolve(resourceStateGpuUsage) val fragments = mutableMapOf<String, Builder>() @@ -88,12 +94,23 @@ public class ComputeWorkloadLoader( val durationMs = reader.getDuration(durationCol)!! val cores = reader.getInt(coresCol) val cpuUsage = reader.getDouble(usageCol) + val gpuUsage = + if (reader.getDouble( + resourceGpuCapacityCol, + ).isNaN() + ) { + 0.0 + } else { + reader.getDouble(resourceGpuCapacityCol) // Default to 0 if not present + } + val gpuCores = reader.getInt(gpuCoresCol) // Default to 0 if not present + val gpuMemory = 0L // Default to 0 if not present val builder = fragments.computeIfAbsent( id, ) { Builder(checkpointInterval, checkpointDuration, checkpointIntervalScaling, scalingPolicy, id) } - builder.add(durationMs, cpuUsage, cores) + builder.add(durationMs, cpuUsage, cores, gpuUsage, gpuCores, gpuMemory) } fragments @@ -117,6 +134,9 @@ public class ComputeWorkloadLoader( val cpuCountCol = reader.resolve(resourceCpuCount) val cpuCapacityCol = reader.resolve(resourceCpuCapacity) val memCol = reader.resolve(resourceMemCapacity) + val gpuCapacityCol = reader.resolve(resourceGpuCapacity) // Assuming GPU capacity is also present + val gpuCoreCountCol = reader.resolve(resourceGpuCount) // Assuming GPU cores are also present + val gpuMemoryCol = reader.resolve(resourceGpuMemCapacity) // Assuming GPU memory is also present val natureCol = reader.resolve(resourceNature) val deadlineCol = reader.resolve(resourceDeadline) @@ -135,6 +155,17 @@ public class ComputeWorkloadLoader( val cpuCount = reader.getInt(cpuCountCol) val cpuCapacity = reader.getDouble(cpuCapacityCol) val memCapacity = reader.getDouble(memCol) / 1000.0 // Convert from KB to MB + val gpuUsage = + if (reader.getDouble( + gpuCapacityCol, + ).isNaN() + ) { + 0.0 + } else { + reader.getDouble(gpuCapacityCol) // 
Default to 0 if not present// Default to 0 if not present + } + val gpuCoreCount = reader.getInt(gpuCoreCountCol) // Default to 0 if not present + val gpuMemory = 0L // currently not implemented val uid = UUID.nameUUIDFromBytes("$id-${counter++}".toByteArray()) var nature = reader.getString(natureCol) var deadline = reader.getLong(deadlineCol) @@ -153,6 +184,9 @@ public class ComputeWorkloadLoader( cpuCount, cpuCapacity, memCapacity.roundToLong(), + gpuCoreCount, + gpuUsage, + gpuMemory, totalLoad, submissionTime, duration, @@ -224,17 +258,23 @@ public class ComputeWorkloadLoader( * Add a fragment to the trace. * * @param duration The duration of the fragment (in epoch millis). - * @param usage CPU usage of this fragment. - * @param cores Number of cores used. + * @param cpuUsage CPU usage of this fragment. + * @param cpuCores Number of cores used. + * @param gpuUsage GPU usage of this fragment. + * @param gpuCores Number of GPU cores used. + * @param gpuMemoryUsage GPU memory usage of this fragment. 
*/ fun add( duration: Duration, - usage: Double, - cores: Int, + cpuUsage: Double, + cpuCores: Int, + gpuUsage: Double = 0.0, + gpuCores: Int = 0, + gpuMemoryUsage: Long = 0, ) { - totalLoad += (usage * duration.toMillis()) / 1000 // avg MHz * duration = MFLOPs + totalLoad += ((cpuUsage * duration.toMillis()) + (gpuUsage * duration.toMillis())) / 1000 // avg MHz * duration = MFLOPs - builder.add(duration.toMillis(), usage, cores) + builder.add(duration.toMillis(), cpuUsage, cpuCores, gpuUsage, gpuCores, gpuMemoryUsage) } /** diff --git a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt index 787f271e..228b84ed 100644 --- a/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt +++ b/opendc-compute/opendc-compute-workload/src/main/kotlin/org/opendc/compute/workload/Task.kt @@ -43,6 +43,9 @@ public data class Task( val cpuCount: Int, val cpuCapacity: Double, val memCapacity: Long, + val gpuCount: Int = 0, + val gpuCapacity: Double = 0.0, + val gpuMemCapacity: Long = 0L, val totalLoad: Double, var submissionTime: Long, val duration: Long, |
