From c7eec7904e08029b3ab31d3e7b21afa1ea9ab7e6 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 4 May 2022 16:24:53 +0200 Subject: refactor(compute/service): Remove OpenTelemetry from "compute" modules This change removes the OpenTelemetry integration from the OpenDC Compute modules. Previously, we chose to integrate OpenTelemetry to provide a unified way to report metrics to the users. Although this worked as expected, the overhead of OpenTelemetry when collecting metrics during simulation was considerable, and it offered few further optimization opportunities (other than providing a separate API implementation). Furthermore, since we were tied to OpenTelemetry's SDK implementation, we experienced issues with throttling and registering multiple instruments. We will instead use another approach, where we expose the core metrics in OpenDC via specialized interfaces (see the preceding commits) such that access is fast and can be done without having to interface with OpenTelemetry. In addition, we will provide an adapter that is able to forward these metrics to OpenTelemetry implementations, so we can still integrate with the wider ecosystem. 
--- .../opendc-experiments-capelin/build.gradle.kts | 3 --- .../org/opendc/experiments/capelin/CapelinBenchmarks.kt | 6 +----- .../opendc-experiments-capelin/src/jmh/resources/topology.txt | 5 +++++ .../main/kotlin/org/opendc/experiments/capelin/Portfolio.kt | 5 +---- .../org/opendc/experiments/capelin/CapelinIntegrationTest.kt | 11 +++-------- 5 files changed, 10 insertions(+), 20 deletions(-) create mode 100644 opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt (limited to 'opendc-experiments') diff --git a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts index 9495f4ca..39cf101d 100644 --- a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts @@ -37,8 +37,6 @@ dependencies { implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) implementation(projects.opendcCompute.opendcComputeSimulator) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) - implementation(projects.opendcTelemetry.opendcTelemetryCompute) implementation(libs.config) implementation(libs.kotlin.logging) @@ -46,7 +44,6 @@ dependencies { implementation(libs.jackson.module.kotlin) implementation(libs.jackson.dataformat.csv) implementation(kotlin("reflect")) - implementation(libs.opentelemetry.semconv) runtimeOnly(projects.opendcTrace.opendcTraceOpendc) diff --git a/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt b/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt index 83b8c0c6..fd2c26f0 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt +++ 
b/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt @@ -22,14 +22,12 @@ package org.opendc.experiments.capelin -import kotlinx.coroutines.ExperimentalCoroutinesApi import org.opendc.compute.service.scheduler.FilterScheduler import org.opendc.compute.service.scheduler.filters.ComputeFilter import org.opendc.compute.service.scheduler.filters.RamFilter import org.opendc.compute.service.scheduler.filters.VCpuFilter import org.opendc.compute.service.scheduler.weights.CoreRamWeigher import org.opendc.compute.workload.* -import org.opendc.compute.workload.telemetry.NoopTelemetryManager import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.topology.clusterTopology @@ -46,7 +44,6 @@ import java.util.concurrent.TimeUnit @Fork(1) @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) @Measurement(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS) -@OptIn(ExperimentalCoroutinesApi::class) class CapelinBenchmarks { private lateinit var vms: List private lateinit var topology: Topology @@ -59,7 +56,7 @@ class CapelinBenchmarks { val loader = ComputeWorkloadLoader(File("src/test/resources/trace")) val source = trace("bitbrains-small") vms = source.resolve(loader, Random(1L)).vms - topology = checkNotNull(object {}.javaClass.getResourceAsStream("/env/topology.txt")).use { clusterTopology(it) } + topology = checkNotNull(object {}.javaClass.getResourceAsStream("/topology.txt")).use { clusterTopology(it) } } @Benchmark @@ -71,7 +68,6 @@ class CapelinBenchmarks { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) diff --git a/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt b/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt new file mode 100644 index 00000000..6b347bff --- /dev/null +++ 
b/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt @@ -0,0 +1,5 @@ +ClusterID;ClusterName;Cores;Speed;Memory;numberOfHosts;memoryCapacityPerHost;coreCountPerHost +A01;A01;32;3.2;2048;1;256;32 +B01;B01;48;2.93;1256;6;64;8 +C01;C01;32;3.2;2048;2;128;16 + diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt index 6fd85e8c..0de8aa7b 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt @@ -30,7 +30,7 @@ import org.opendc.compute.workload.ComputeWorkloadLoader import org.opendc.compute.workload.createComputeScheduler import org.opendc.compute.workload.export.parquet.ParquetComputeMonitor import org.opendc.compute.workload.grid5000 -import org.opendc.compute.workload.telemetry.NoopTelemetryManager +import org.opendc.compute.workload.telemetry.ComputeMetricReader import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.model.OperationalPhenomena import org.opendc.experiments.capelin.model.Topology @@ -39,7 +39,6 @@ import org.opendc.experiments.capelin.topology.clusterTopology import org.opendc.harness.dsl.Experiment import org.opendc.harness.dsl.anyOf import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.compute.ComputeMetricReader import java.io.File import java.time.Duration import java.util.* @@ -99,11 +98,9 @@ abstract class Portfolio(name: String) : Experiment(name) { else null val (vms, interferenceModel) = workload.source.resolve(workloadLoader, seeder) - val telemetry = NoopTelemetryManager() val runner = ComputeServiceHelper( coroutineContext, clock, - telemetry, computeScheduler, failureModel, 
interferenceModel?.withSeed(repeat.toLong()) diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt index 62cdf123..fa2cd9c8 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt @@ -33,14 +33,13 @@ import org.opendc.compute.service.scheduler.filters.RamFilter import org.opendc.compute.service.scheduler.filters.VCpuFilter import org.opendc.compute.service.scheduler.weights.CoreRamWeigher import org.opendc.compute.workload.* -import org.opendc.compute.workload.telemetry.NoopTelemetryManager +import org.opendc.compute.workload.telemetry.ComputeMetricReader +import org.opendc.compute.workload.telemetry.ComputeMonitor +import org.opendc.compute.workload.telemetry.table.HostTableReader import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.topology.clusterTopology import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.compute.ComputeMetricReader -import org.opendc.telemetry.compute.ComputeMonitor -import org.opendc.telemetry.compute.table.HostTableReader import java.io.File import java.time.Duration import java.util.* @@ -86,7 +85,6 @@ class CapelinIntegrationTest { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) val topology = createTopology() @@ -136,7 +134,6 @@ class CapelinIntegrationTest { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) val topology = createTopology("single") @@ -182,7 +179,6 @@ class CapelinIntegrationTest { val simulator = 
ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler, interferenceModel = interferenceModel?.withSeed(seed.toLong()) ) @@ -226,7 +222,6 @@ class CapelinIntegrationTest { val simulator = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler, grid5000(Duration.ofDays(7)) ) -- cgit v1.2.3 From 0b584e261fdf34d662129b1b47f00711c0ce0779 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 6 May 2022 09:27:45 +0200 Subject: refactor(workflow/service): Remove OpenTelemetry from "FaaS" modules This change removes the OpenTelemetry integration from the OpenDC FaaS modules. Previously, we chose to integrate OpenTelemetry to provide a unified way to report metrics to the users. See the previous commit removing it from the "Compute" modules for the reasoning behind this change. --- .../opendc-experiments-serverless20/build.gradle.kts | 1 - .../experiments/serverless/ServerlessExperiment.kt | 16 +++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'opendc-experiments') diff --git a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts index b96647a6..a6391986 100644 --- a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts @@ -33,7 +33,6 @@ dependencies { implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcFaas.opendcFaasService) implementation(projects.opendcFaas.opendcFaasSimulator) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(libs.kotlin.logging) implementation(libs.config) } diff --git a/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt 
b/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt index 3312d6c0..1c357f67 100644 --- a/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt +++ b/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt @@ -23,8 +23,6 @@ package org.opendc.experiments.serverless import com.typesafe.config.ConfigFactory -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.delay import kotlinx.coroutines.launch @@ -44,7 +42,6 @@ import org.opendc.simulator.compute.model.MemoryUnit import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.sdk.toOtelClock import java.io.File import java.time.Duration import java.util.* @@ -76,17 +73,18 @@ public class ServerlessExperiment : Experiment("Serverless") { private val coldStartModel by anyOf(ColdStartModel.LAMBDA, ColdStartModel.AZURE, ColdStartModel.GOOGLE) override fun doRun(repeat: Int): Unit = runBlockingSimulation { - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val trace = ServerlessTraceReader().parse(File(config.getString("trace-path"))) val traceById = trace.associateBy { it.id } val delayInjector = StochasticDelayInjector(coldStartModel, Random()) val deployer = SimFunctionDeployer(clock, this, createMachineModel(), delayInjector) { FunctionTraceWorkload(traceById.getValue(it.name)) } val service = - FaaSService(coroutineContext, clock, meterProvider, deployer, routingPolicy, FunctionTerminationPolicyFixed(coroutineContext, clock, timeout = Duration.ofMinutes(10))) + FaaSService( + coroutineContext, 
+ clock, + deployer, + routingPolicy, + FunctionTerminationPolicyFixed(coroutineContext, clock, timeout = Duration.ofMinutes(10)) + ) val client = service.newClient() coroutineScope { -- cgit v1.2.3 From 0e8ad565a78dd194e687003e5ccc8ccf9b28667f Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 6 May 2022 10:22:35 +0200 Subject: refactor(exp/tf20): Remove OpenTelemetry from TF20 experiment This change removes the OpenTelemetry integration from the OpenDC Tensorflow 2020 experiments. Previously, we chose to integrate OpenTelemetry to provide a unified way to report metrics to the users. See the previous commit removing it from the "Compute" modules for the reasoning behind this change. --- .../opendc-experiments-tf20/build.gradle.kts | 1 - .../opendc/experiments/tf20/TensorFlowExperiment.kt | 13 ++----------- .../org/opendc/experiments/tf20/core/SimTFDevice.kt | 18 +----------------- .../opendc/experiments/tf20/core/SimTFDeviceTest.kt | 14 +++++++++----- 4 files changed, 12 insertions(+), 34 deletions(-) (limited to 'opendc-experiments') diff --git a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts index 5762ce64..f61c8fef 100644 --- a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts @@ -32,7 +32,6 @@ dependencies { api(projects.opendcHarness.opendcHarnessApi) implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(projects.opendcCommon) implementation(libs.kotlin.logging) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt index 2153a862..19236029 100644 --- 
a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt @@ -22,8 +22,6 @@ package org.opendc.experiments.tf20 -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider import org.opendc.experiments.tf20.core.SimTFDevice import org.opendc.experiments.tf20.distribute.* import org.opendc.experiments.tf20.keras.AlexNet @@ -32,7 +30,6 @@ import org.opendc.harness.dsl.Experiment import org.opendc.harness.dsl.anyOf import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.sdk.toOtelClock /** * Experiments with the TensorFlow simulation model. @@ -49,17 +46,11 @@ public class TensorFlowExperiment : Experiment(name = "tf20") { private val batchSize by anyOf(16, 32, 64, 128) override fun doRun(repeat: Int): Unit = runBlockingSimulation { - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val meter = meterProvider.get("opendc-tf20") - val envInput = checkNotNull(TensorFlowExperiment::class.java.getResourceAsStream(environmentFile)) val def = MLEnvironmentReader().readEnvironment(envInput).first() val device = SimTFDevice( - def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, meter, def.model.cpus[0], - def.model.memory[0], LinearPowerModel(250.0, 60.0) + def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0], + LinearPowerModel(250.0, 60.0) ) val strategy = OneDeviceStrategy(device) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt index 99948c8e..d2105196 100644 --- 
a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt @@ -22,7 +22,6 @@ package org.opendc.experiments.tf20.core -import io.opentelemetry.api.metrics.Meter import kotlinx.coroutines.* import org.opendc.simulator.compute.SimBareMetalMachine import org.opendc.simulator.compute.SimMachine @@ -50,7 +49,6 @@ public class SimTFDevice( override val isGpu: Boolean, context: CoroutineContext, clock: Clock, - meter: Meter, pu: ProcessingUnit, private val memory: MemoryUnit, powerModel: PowerModel @@ -69,21 +67,9 @@ public class SimTFDevice( ) /** - * The usage of the device. + * Metrics collected by the device. */ - private val _usage = meter.histogramBuilder("device.usage") - .setDescription("The amount of device resources used") - .setUnit("MHz") - .build() private var _resourceUsage = 0.0 - - /** - * The power draw of the device. 
- */ - private val _power = meter.histogramBuilder("device.power") - .setDescription("The power draw of the device") - .setUnit("W") - .build() private var _powerUsage = 0.0 private var _energyUsage = 0.0 @@ -171,9 +157,7 @@ public class SimTFDevice( } override fun onConverge(conn: FlowConnection, now: Long) { - _usage.record(conn.rate) _resourceUsage = conn.rate - _power.record(machine.psu.powerDraw) _powerUsage = machine.powerUsage _energyUsage = machine.energyUsage } diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt index 0d5fbebb..fd18a3a7 100644 --- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt @@ -22,7 +22,6 @@ package org.opendc.experiments.tf20.core -import io.opentelemetry.api.metrics.MeterProvider import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.launch import org.junit.jupiter.api.Assertions.assertAll @@ -41,14 +40,19 @@ import java.util.* internal class SimTFDeviceTest { @Test fun testSmoke() = runBlockingSimulation { - val meterProvider: MeterProvider = MeterProvider.noop() - val meter = meterProvider.get("opendc-tf20") - val puNode = ProcessingNode("NVIDIA", "Tesla V100", "unknown", 1) val pu = ProcessingUnit(puNode, 0, 960 * 1230.0) val memory = MemoryUnit("NVIDIA", "Tesla V100", 877.0, 32_000) - val device = SimTFDevice(UUID.randomUUID(), isGpu = true, coroutineContext, clock, meter, pu, memory, LinearPowerModel(250.0, 100.0)) + val device = SimTFDevice( + UUID.randomUUID(), + isGpu = true, + coroutineContext, + clock, + pu, + memory, + LinearPowerModel(250.0, 100.0) + ) // Load 1 GiB into GPU memory device.load(1000) -- cgit v1.2.3