diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2022-05-06 19:04:03 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-05-06 19:04:03 +0200 |
| commit | c3d8d967f82f39f1ef461d5687eb68fb867336c5 (patch) | |
| tree | 2e9938f63c42e5d02fe203e049377d1d17b5d782 /opendc-experiments | |
| parent | a9657e4fa3b15e2c1c11884b5a250b0861bcc21d (diff) | |
| parent | 260e2228afea08868e8f7f07233b1861b2d7f0c7 (diff) | |
merge: Move OpenTelemetry integration outside core modules (#81)
This change removes the OpenTelemetry integration from the OpenDC modules.
Previously, we chose to integrate OpenTelemetry to provide a unified way to
report metrics to the users.
Although this worked as expected, the overhead of the OpenTelemetry when
collecting metrics during simulation was considerable and lacked more
optimization opportunities (other than providing a separate API
implementation). Furthermore, since we were tied to OpenTelemetry's SDK
implementation, we experienced issues with throttling and registering
multiple instruments.
We will instead use another approach, where we expose the core metrics
in OpenDC via specialized interfaces (see #80) such that
access is fast and can be done without having to interface with
OpenTelemetry. In addition, we will provide an adapter to that is able
to forward these metrics to OpenTelemetry implementations, so we can
still integrate with the wider ecosystem.
## Implementation Notes :hammer_and_pick:
* Remove OpenTelemetry from "compute" modules
* Remove OpenTelemetry from "workflow" modules
* Remove OpenTelemetry from "FaaS" modules
* Remove OpenTelemetry from TF20 experiment
* Remove dependency on OpenTelemetry SDK
## External Dependencies :four_leaf_clover:
* N/A
## Breaking API Changes :warning:
* Metrics are not anymore directly exposed via OpenTelemetry. Instead, an adapter needs to be used to access the data via OpenTelemetry.
Diffstat (limited to 'opendc-experiments')
11 files changed, 29 insertions, 64 deletions
diff --git a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts index 9495f4ca..39cf101d 100644 --- a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts @@ -37,8 +37,6 @@ dependencies { implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) implementation(projects.opendcCompute.opendcComputeSimulator) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) - implementation(projects.opendcTelemetry.opendcTelemetryCompute) implementation(libs.config) implementation(libs.kotlin.logging) @@ -46,7 +44,6 @@ dependencies { implementation(libs.jackson.module.kotlin) implementation(libs.jackson.dataformat.csv) implementation(kotlin("reflect")) - implementation(libs.opentelemetry.semconv) runtimeOnly(projects.opendcTrace.opendcTraceOpendc) diff --git a/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt b/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt index 83b8c0c6..fd2c26f0 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/jmh/kotlin/org/opendc/experiments/capelin/CapelinBenchmarks.kt @@ -22,14 +22,12 @@ package org.opendc.experiments.capelin -import kotlinx.coroutines.ExperimentalCoroutinesApi import org.opendc.compute.service.scheduler.FilterScheduler import org.opendc.compute.service.scheduler.filters.ComputeFilter import org.opendc.compute.service.scheduler.filters.RamFilter import org.opendc.compute.service.scheduler.filters.VCpuFilter import org.opendc.compute.service.scheduler.weights.CoreRamWeigher import org.opendc.compute.workload.* -import org.opendc.compute.workload.telemetry.NoopTelemetryManager import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.topology.clusterTopology @@ -46,7 +44,6 @@ import java.util.concurrent.TimeUnit @Fork(1) @Warmup(iterations = 2, time = 5, timeUnit = TimeUnit.SECONDS) @Measurement(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS) -@OptIn(ExperimentalCoroutinesApi::class) class CapelinBenchmarks { private lateinit var vms: List<VirtualMachine> private lateinit var topology: Topology @@ -59,7 +56,7 @@ class CapelinBenchmarks { val loader = ComputeWorkloadLoader(File("src/test/resources/trace")) val source = trace("bitbrains-small") vms = source.resolve(loader, Random(1L)).vms - topology = checkNotNull(object {}.javaClass.getResourceAsStream("/env/topology.txt")).use { clusterTopology(it) } + topology = checkNotNull(object {}.javaClass.getResourceAsStream("/topology.txt")).use { clusterTopology(it) } } @Benchmark @@ -71,7 +68,6 @@ class CapelinBenchmarks { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) diff --git a/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt b/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt new file mode 100644 index 00000000..6b347bff --- /dev/null +++ b/opendc-experiments/opendc-experiments-capelin/src/jmh/resources/topology.txt @@ -0,0 +1,5 @@ +ClusterID;ClusterName;Cores;Speed;Memory;numberOfHosts;memoryCapacityPerHost;coreCountPerHost +A01;A01;32;3.2;2048;1;256;32 +B01;B01;48;2.93;1256;6;64;8 +C01;C01;32;3.2;2048;2;128;16 + diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt index 6fd85e8c..0de8aa7b 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt @@ -30,7 +30,7 @@ import org.opendc.compute.workload.ComputeWorkloadLoader import org.opendc.compute.workload.createComputeScheduler import org.opendc.compute.workload.export.parquet.ParquetComputeMonitor import org.opendc.compute.workload.grid5000 -import org.opendc.compute.workload.telemetry.NoopTelemetryManager +import org.opendc.compute.workload.telemetry.ComputeMetricReader import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.model.OperationalPhenomena import org.opendc.experiments.capelin.model.Topology @@ -39,7 +39,6 @@ import org.opendc.experiments.capelin.topology.clusterTopology import org.opendc.harness.dsl.Experiment import org.opendc.harness.dsl.anyOf import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.compute.ComputeMetricReader import java.io.File import java.time.Duration import java.util.* @@ -99,11 +98,9 @@ abstract class Portfolio(name: String) : Experiment(name) { else null val (vms, interferenceModel) = workload.source.resolve(workloadLoader, seeder) - val telemetry = NoopTelemetryManager() val runner = ComputeServiceHelper( coroutineContext, clock, - telemetry, computeScheduler, failureModel, interferenceModel?.withSeed(repeat.toLong()) diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt index 62cdf123..fa2cd9c8 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt @@ -33,14 +33,13 @@ import org.opendc.compute.service.scheduler.filters.RamFilter import org.opendc.compute.service.scheduler.filters.VCpuFilter import org.opendc.compute.service.scheduler.weights.CoreRamWeigher import org.opendc.compute.workload.* -import org.opendc.compute.workload.telemetry.NoopTelemetryManager +import org.opendc.compute.workload.telemetry.ComputeMetricReader +import org.opendc.compute.workload.telemetry.ComputeMonitor +import org.opendc.compute.workload.telemetry.table.HostTableReader import org.opendc.compute.workload.topology.Topology import org.opendc.compute.workload.topology.apply import org.opendc.experiments.capelin.topology.clusterTopology import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.compute.ComputeMetricReader -import org.opendc.telemetry.compute.ComputeMonitor -import org.opendc.telemetry.compute.table.HostTableReader import java.io.File import java.time.Duration import java.util.* @@ -86,7 +85,6 @@ class CapelinIntegrationTest { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) val topology = createTopology() @@ -136,7 +134,6 @@ class CapelinIntegrationTest { val runner = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler ) val topology = createTopology("single") @@ -182,7 +179,6 @@ class CapelinIntegrationTest { val simulator = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler, interferenceModel = interferenceModel?.withSeed(seed.toLong()) ) @@ -226,7 +222,6 @@ class CapelinIntegrationTest { val simulator = ComputeServiceHelper( coroutineContext, clock, - NoopTelemetryManager(), computeScheduler, grid5000(Duration.ofDays(7)) ) diff --git a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts index b96647a6..a6391986 100644 --- a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts @@ -33,7 +33,6 @@ dependencies { implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcFaas.opendcFaasService) implementation(projects.opendcFaas.opendcFaasSimulator) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(libs.kotlin.logging) implementation(libs.config) } diff --git a/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt b/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt index 3312d6c0..1c357f67 100644 --- a/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt +++ b/opendc-experiments/opendc-experiments-serverless20/src/main/kotlin/org/opendc/experiments/serverless/ServerlessExperiment.kt @@ -23,8 +23,6 @@ package org.opendc.experiments.serverless import com.typesafe.config.ConfigFactory -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.delay import kotlinx.coroutines.launch @@ -44,7 +42,6 @@ import org.opendc.simulator.compute.model.MemoryUnit import org.opendc.simulator.compute.model.ProcessingNode import org.opendc.simulator.compute.model.ProcessingUnit import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.sdk.toOtelClock import java.io.File import java.time.Duration import java.util.* @@ -76,17 +73,18 @@ public class ServerlessExperiment : Experiment("Serverless") { private val coldStartModel by anyOf(ColdStartModel.LAMBDA, ColdStartModel.AZURE, ColdStartModel.GOOGLE) override fun doRun(repeat: Int): Unit = runBlockingSimulation { - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val trace = ServerlessTraceReader().parse(File(config.getString("trace-path"))) val traceById = trace.associateBy { it.id } val delayInjector = StochasticDelayInjector(coldStartModel, Random()) val deployer = SimFunctionDeployer(clock, this, createMachineModel(), delayInjector) { FunctionTraceWorkload(traceById.getValue(it.name)) } val service = - FaaSService(coroutineContext, clock, meterProvider, deployer, routingPolicy, FunctionTerminationPolicyFixed(coroutineContext, clock, timeout = Duration.ofMinutes(10))) + FaaSService( + coroutineContext, + clock, + deployer, + routingPolicy, + FunctionTerminationPolicyFixed(coroutineContext, clock, timeout = Duration.ofMinutes(10)) + ) val client = service.newClient() coroutineScope { diff --git a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts index 5762ce64..f61c8fef 100644 --- a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts @@ -32,7 +32,6 @@ dependencies { api(projects.opendcHarness.opendcHarnessApi) implementation(projects.opendcSimulator.opendcSimulatorCore) implementation(projects.opendcSimulator.opendcSimulatorCompute) - implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(projects.opendcCommon) implementation(libs.kotlin.logging) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt index 2153a862..19236029 100644 --- a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/TensorFlowExperiment.kt @@ -22,8 +22,6 @@ package org.opendc.experiments.tf20 -import io.opentelemetry.api.metrics.MeterProvider -import io.opentelemetry.sdk.metrics.SdkMeterProvider import org.opendc.experiments.tf20.core.SimTFDevice import org.opendc.experiments.tf20.distribute.* import org.opendc.experiments.tf20.keras.AlexNet @@ -32,7 +30,6 @@ import org.opendc.harness.dsl.Experiment import org.opendc.harness.dsl.anyOf import org.opendc.simulator.compute.power.LinearPowerModel import org.opendc.simulator.core.runBlockingSimulation -import org.opendc.telemetry.sdk.toOtelClock /** * Experiments with the TensorFlow simulation model. @@ -49,17 +46,11 @@ public class TensorFlowExperiment : Experiment(name = "tf20") { private val batchSize by anyOf(16, 32, 64, 128) override fun doRun(repeat: Int): Unit = runBlockingSimulation { - val meterProvider: MeterProvider = SdkMeterProvider - .builder() - .setClock(clock.toOtelClock()) - .build() - val meter = meterProvider.get("opendc-tf20") - val envInput = checkNotNull(TensorFlowExperiment::class.java.getResourceAsStream(environmentFile)) val def = MLEnvironmentReader().readEnvironment(envInput).first() val device = SimTFDevice( - def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, meter, def.model.cpus[0], - def.model.memory[0], LinearPowerModel(250.0, 60.0) + def.uid, def.meta["gpu"] as Boolean, coroutineContext, clock, def.model.cpus[0], def.model.memory[0], + LinearPowerModel(250.0, 60.0) ) val strategy = OneDeviceStrategy(device) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt index 99948c8e..d2105196 100644 --- a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/core/SimTFDevice.kt @@ -22,7 +22,6 @@ package org.opendc.experiments.tf20.core -import io.opentelemetry.api.metrics.Meter import kotlinx.coroutines.* import org.opendc.simulator.compute.SimBareMetalMachine import org.opendc.simulator.compute.SimMachine @@ -50,7 +49,6 @@ public class SimTFDevice( override val isGpu: Boolean, context: CoroutineContext, clock: Clock, - meter: Meter, pu: ProcessingUnit, private val memory: MemoryUnit, powerModel: PowerModel @@ -69,21 +67,9 @@ public class SimTFDevice( ) /** - * The usage of the device. + * Metrics collected by the device. */ - private val _usage = meter.histogramBuilder("device.usage") - .setDescription("The amount of device resources used") - .setUnit("MHz") - .build() private var _resourceUsage = 0.0 - - /** - * The power draw of the device. - */ - private val _power = meter.histogramBuilder("device.power") - .setDescription("The power draw of the device") - .setUnit("W") - .build() private var _powerUsage = 0.0 private var _energyUsage = 0.0 @@ -171,9 +157,7 @@ public class SimTFDevice( } override fun onConverge(conn: FlowConnection, now: Long) { - _usage.record(conn.rate) _resourceUsage = conn.rate - _power.record(machine.psu.powerDraw) _powerUsage = machine.powerUsage _energyUsage = machine.energyUsage } diff --git a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt index 0d5fbebb..fd18a3a7 100644 --- a/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt +++ b/opendc-experiments/opendc-experiments-tf20/src/test/kotlin/org/opendc/experiments/tf20/core/SimTFDeviceTest.kt @@ -22,7 +22,6 @@ package org.opendc.experiments.tf20.core -import io.opentelemetry.api.metrics.MeterProvider import kotlinx.coroutines.coroutineScope import kotlinx.coroutines.launch import org.junit.jupiter.api.Assertions.assertAll @@ -41,14 +40,19 @@ import java.util.* internal class SimTFDeviceTest { @Test fun testSmoke() = runBlockingSimulation { - val meterProvider: MeterProvider = MeterProvider.noop() - val meter = meterProvider.get("opendc-tf20") - val puNode = ProcessingNode("NVIDIA", "Tesla V100", "unknown", 1) val pu = ProcessingUnit(puNode, 0, 960 * 1230.0) val memory = MemoryUnit("NVIDIA", "Tesla V100", 877.0, 32_000) - val device = SimTFDevice(UUID.randomUUID(), isGpu = true, coroutineContext, clock, meter, pu, memory, LinearPowerModel(250.0, 100.0)) + val device = SimTFDevice( + UUID.randomUUID(), + isGpu = true, + coroutineContext, + clock, + pu, + memory, + LinearPowerModel(250.0, 100.0) + ) // Load 1 GiB into GPU memory device.load(1000) |
