diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-07 16:04:46 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-09-07 16:04:46 +0200 |
| commit | 3eda751b725448139217dc1929dca1fc354e2a4e (patch) | |
| tree | 11d933753c515140a6ae846fe96448ad64b165aa /opendc-telemetry | |
| parent | eb4de7f832c6d26725e0d7c29644c704ea82604e (diff) | |
| parent | 18ff316a6b6ab984ebf8283ea48ed98ec69d8295 (diff) | |
merge: Prepare for risk analysis experiments
This pull request adds the necessary code in preparation for the risk analysis experiments:
- Track provisioning time
- Track host up/down time
- Track guest up/down time
- Support overcommitted memory
- Do not fail inactive guests
- Mark unschedulable server as terminated
- Make ExperimentMonitor optional for trace processing
- Report up/downtime metrics in experiment monitor
- Move metric collection outside Capelin code
- Resolve kotlin-reflect incompatibility
- Restructure input reading classes
**Breaking API Changes**
- `ExperimentMonitor` replaced in favour of `ComputeMonitor`
Diffstat (limited to 'opendc-telemetry')
7 files changed, 473 insertions, 0 deletions
diff --git a/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts new file mode 100644 index 00000000..6a3de9bc --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +description = "Telemetry for OpenDC Compute" + +/* Build configuration */ +plugins { + `kotlin-library-conventions` +} + +dependencies { + api(platform(projects.opendcPlatform)) + api(projects.opendcTelemetry.opendcTelemetrySdk) + + implementation(projects.opendcCompute.opendcComputeSimulator) + implementation(libs.opentelemetry.semconv) + implementation(libs.kotlin.logging) +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt new file mode 100644 index 00000000..95e7ff9e --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import io.opentelemetry.sdk.common.CompletableResultCode +import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.export.MetricExporter +import io.opentelemetry.semconv.resource.attributes.ResourceAttributes +import org.opendc.compute.service.driver.Host +import org.opendc.telemetry.compute.table.HostData +import java.time.Clock + +/** + * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. + */ +public class ComputeMetricExporter( + private val clock: Clock, + private val hosts: Map<String, Host>, + private val monitor: ComputeMonitor +) : MetricExporter { + + override fun export(metrics: Collection<MetricData>): CompletableResultCode { + return try { + reportHostMetrics(metrics) + reportServiceMetrics(metrics) + CompletableResultCode.ofSuccess() + } catch (e: Throwable) { + CompletableResultCode.ofFailure() + } + } + + private var lastHostMetrics: Map<String, HBuffer> = emptyMap() + private val hostMetricsSingleton = HBuffer() + + private fun reportHostMetrics(metrics: Collection<MetricData>) { + val hostMetrics = mutableMapOf<String, HBuffer>() + + for (metric in metrics) { + when (metric.name) { + "cpu.demand" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuDemand = v } + "cpu.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuUsage = v } + "power.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.powerDraw = v } + "cpu.work.total" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.totalWork = v } + "cpu.work.granted" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.grantedWork = v } + "cpu.work.overcommit" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.overcommittedWork = v } + "cpu.work.interference" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.interferedWork = v } + "guests.active" -> mapLongSum(metric, hostMetrics) { m, v -> m.instanceCount = v.toInt() } + "host.time.up" -> mapLongSum(metric, hostMetrics) { m, v -> m.uptime = v } + "host.time.down" -> mapLongSum(metric, hostMetrics) { m, v -> m.downtime = v } + } + } + + for ((id, hostMetric) in hostMetrics) { + val lastHostMetric = lastHostMetrics.getOrDefault(id, hostMetricsSingleton) + val host = hosts[id] ?: continue + + monitor.record( + HostData( + clock.millis(), + host, + hostMetric.totalWork - lastHostMetric.totalWork, + hostMetric.grantedWork - lastHostMetric.grantedWork, + hostMetric.overcommittedWork - lastHostMetric.overcommittedWork, + hostMetric.interferedWork - lastHostMetric.interferedWork, + hostMetric.cpuUsage, + hostMetric.cpuDemand, + hostMetric.instanceCount, + hostMetric.powerDraw, + hostMetric.uptime - lastHostMetric.uptime, + hostMetric.downtime - lastHostMetric.downtime, + ) + ) + } + + lastHostMetrics = hostMetrics + } + + private fun mapDoubleSummary(data: MetricData, hostMetrics: MutableMap<String, HBuffer>, block: (HBuffer, Double) -> Unit) { + val points = data.doubleSummaryData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + val avg = (point.percentileValues[0].value + point.percentileValues[1].value) / 2 + block(hostMetric, avg) + } + } + + private fun mapLongSum(data: MetricData?, hostMetrics: MutableMap<String, HBuffer>, block: (HBuffer, Long) -> Unit) { + val points = data?.longSumData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + block(hostMetric, point.value) + } + } + + private fun mapDoubleSum(data: MetricData?, hostMetrics: MutableMap<String, HBuffer>, block: (HBuffer, Double) -> Unit) { + val points = data?.doubleSumData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + block(hostMetric, point.value) + } + } + + /** + * A buffer for host metrics before they are reported. + */ + private class HBuffer { + var totalWork: Double = 0.0 + var grantedWork: Double = 0.0 + var overcommittedWork: Double = 0.0 + var interferedWork: Double = 0.0 + var cpuUsage: Double = 0.0 + var cpuDemand: Double = 0.0 + var instanceCount: Int = 0 + var powerDraw: Double = 0.0 + var uptime: Long = 0 + var downtime: Long = 0 + } + + private fun reportServiceMetrics(metrics: Collection<MetricData>) { + monitor.record(extractServiceMetrics(clock.millis(), metrics)) + } + + override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess() + + override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess() +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt new file mode 100644 index 00000000..ec303b37 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import org.opendc.compute.api.Server +import org.opendc.compute.api.ServerState +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostState +import org.opendc.telemetry.compute.table.HostData +import org.opendc.telemetry.compute.table.ServerData +import org.opendc.telemetry.compute.table.ServiceData + +/** + * A monitor that tracks the metrics and events of the OpenDC Compute service. + */ +public interface ComputeMonitor { + /** + * This method is invoked when the state of a [Server] changes. + */ + public fun onStateChange(timestamp: Long, server: Server, newState: ServerState) {} + + /** + * This method is invoked when the state of a [Host] changes. + */ + public fun onStateChange(time: Long, host: Host, newState: HostState) {} + + /** + * Record the specified [data]. + */ + public fun record(data: ServerData) {} + + /** + * Record the specified [data]. + */ + public fun record(data: HostData) {} + + /** + * Record the specified [data]. + */ + public fun record(data: ServiceData) {} +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt new file mode 100644 index 00000000..d3d983b9 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.export.MetricProducer +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.coroutineScope +import org.opendc.compute.service.ComputeService +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostListener +import org.opendc.compute.service.driver.HostState +import org.opendc.telemetry.compute.table.ServiceData +import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader +import java.time.Clock + +/** + * Attach the specified monitor to the OpenDC Compute service. + */ +public suspend fun withMonitor( + scheduler: ComputeService, + clock: Clock, + metricProducer: MetricProducer, + monitor: ComputeMonitor, + exportInterval: Long = 5L * 60 * 1000, /* Every 5 min (which is the granularity of the workload trace) */ + block: suspend CoroutineScope.() -> Unit +): Unit = coroutineScope { + // Monitor host events + for (host in scheduler.hosts) { + monitor.onStateChange(clock.millis(), host, HostState.UP) + host.addListener(object : HostListener { + override fun onStateChanged(host: Host, newState: HostState) { + monitor.onStateChange(clock.millis(), host, newState) + } + }) + } + + val reader = CoroutineMetricReader( + this, + listOf(metricProducer), + ComputeMetricExporter(clock, scheduler.hosts.associateBy { it.uid.toString() }, monitor), + exportInterval + ) + + try { + block(this) + } finally { + reader.close() + } +} + +/** + * Collect the metrics of the compute service. + */ +public fun collectServiceMetrics(timestamp: Long, metricProducer: MetricProducer): ServiceData { + return extractServiceMetrics(timestamp, metricProducer.collectAllMetrics()) +} + +/** + * Extract a [ServiceData] object from the specified list of metric data. + */ +public fun extractServiceMetrics(timestamp: Long, metrics: Collection<MetricData>): ServiceData { + var submittedVms = 0 + var queuedVms = 0 + var unscheduledVms = 0 + var runningVms = 0 + var finishedVms = 0 + var hosts = 0 + var availableHosts = 0 + + for (metric in metrics) { + val points = metric.longSumData.points + + if (points.isEmpty()) { + continue + } + + val value = points.first().value.toInt() + when (metric.name) { + "servers.submitted" -> submittedVms = value + "servers.waiting" -> queuedVms = value + "servers.unscheduled" -> unscheduledVms = value + "servers.active" -> runningVms = value + "servers.finished" -> finishedVms = value + "hosts.total" -> hosts = value + "hosts.available" -> availableHosts = value + } + } + + return ServiceData(timestamp, hosts, availableHosts, submittedVms, runningVms, finishedVms, queuedVms, unscheduledVms) +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt new file mode 100644 index 00000000..8e6c34d0 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +import org.opendc.compute.service.driver.Host + +/** + * A trace entry for a particular host. + */ +public data class HostData( + public val timestamp: Long, + public val host: Host, + public val totalWork: Double, + public val grantedWork: Double, + public val overcommittedWork: Double, + public val interferedWork: Double, + public val cpuUsage: Double, + public val cpuDemand: Double, + public val instanceCount: Int, + public val powerDraw: Double, + public val uptime: Long, + public val downtime: Long, +) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt new file mode 100644 index 00000000..2a9fa8a6 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +import org.opendc.compute.api.Server + +/** + * A trace entry for a particular server. + */ +public data class ServerData( + public val timestamp: Long, + public val server: Server, + public val uptime: Long, + public val downtime: Long, +) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt new file mode 100644 index 00000000..f6ff5db5 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +/** + * A trace entry for the compute service. + */ +public data class ServiceData( + public val timestamp: Long, + public val hostCount: Int, + public val activeHostCount: Int, + public val instanceCount: Int, + public val runningInstanceCount: Int, + public val finishedInstanceCount: Int, + public val queuedInstanceCount: Int, + public val failedInstanceCount: Int +) |
