From aaedd4f3eed83d0c3ebc829fec08a1749a2bfba4 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Fri, 27 Aug 2021 16:41:55 +0200 Subject: refactor(capelin): Move metric collection outside Capelin code This change moves the metric collection outside the Capelin codebase in a separate module so other modules can also benefit from the compute metric collection code. --- .../opendc-telemetry-compute/build.gradle.kts | 37 +++++ .../telemetry/compute/ComputeMetricExporter.kt | 149 +++++++++++++++++++++ .../org/opendc/telemetry/compute/ComputeMonitor.kt | 61 +++++++++ .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 111 +++++++++++++++ .../org/opendc/telemetry/compute/table/HostData.kt | 43 ++++++ .../opendc/telemetry/compute/table/ServerData.kt | 35 +++++ .../opendc/telemetry/compute/table/ServiceData.kt | 37 +++++ 7 files changed, 473 insertions(+) create mode 100644 opendc-telemetry/opendc-telemetry-compute/build.gradle.kts create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts new file mode 100644 index 00000000..6a3de9bc --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +description = "Telemetry for OpenDC Compute" + +/* Build configuration */ +plugins { + `kotlin-library-conventions` +} + +dependencies { + api(platform(projects.opendcPlatform)) + api(projects.opendcTelemetry.opendcTelemetrySdk) + + implementation(projects.opendcCompute.opendcComputeSimulator) + implementation(libs.opentelemetry.semconv) + implementation(libs.kotlin.logging) +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt new file mode 100644 index 00000000..95e7ff9e --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import io.opentelemetry.sdk.common.CompletableResultCode +import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.export.MetricExporter +import io.opentelemetry.semconv.resource.attributes.ResourceAttributes +import org.opendc.compute.service.driver.Host +import org.opendc.telemetry.compute.table.HostData +import java.time.Clock + +/** + * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. + */ +public class ComputeMetricExporter( + private val clock: Clock, + private val hosts: Map, + private val monitor: ComputeMonitor +) : MetricExporter { + + override fun export(metrics: Collection): CompletableResultCode { + return try { + reportHostMetrics(metrics) + reportServiceMetrics(metrics) + CompletableResultCode.ofSuccess() + } catch (e: Throwable) { + CompletableResultCode.ofFailure() + } + } + + private var lastHostMetrics: Map = emptyMap() + private val hostMetricsSingleton = HBuffer() + + private fun reportHostMetrics(metrics: Collection) { + val hostMetrics = mutableMapOf() + + for (metric in metrics) { + when (metric.name) { + "cpu.demand" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuDemand = v } + "cpu.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuUsage = v } + "power.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.powerDraw = v } + "cpu.work.total" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.totalWork = v } + "cpu.work.granted" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.grantedWork = v } + "cpu.work.overcommit" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.overcommittedWork = v } + "cpu.work.interference" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.interferedWork = v } + "guests.active" -> mapLongSum(metric, hostMetrics) { m, v -> m.instanceCount = v.toInt() } + "host.time.up" -> mapLongSum(metric, hostMetrics) { m, v -> m.uptime = v } + "host.time.down" -> mapLongSum(metric, hostMetrics) { m, v -> m.downtime = v } + } + } + + for ((id, hostMetric) in hostMetrics) { + val lastHostMetric = lastHostMetrics.getOrDefault(id, hostMetricsSingleton) + val host = hosts[id] ?: continue + + monitor.record( + HostData( + clock.millis(), + host, + hostMetric.totalWork - lastHostMetric.totalWork, + hostMetric.grantedWork - lastHostMetric.grantedWork, + hostMetric.overcommittedWork - lastHostMetric.overcommittedWork, + hostMetric.interferedWork - lastHostMetric.interferedWork, + hostMetric.cpuUsage, + hostMetric.cpuDemand, + hostMetric.instanceCount, + hostMetric.powerDraw, + hostMetric.uptime - lastHostMetric.uptime, + hostMetric.downtime - lastHostMetric.downtime, + ) + ) + } + + lastHostMetrics = hostMetrics + } + + private fun mapDoubleSummary(data: MetricData, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { + val points = data.doubleSummaryData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + val avg = (point.percentileValues[0].value + point.percentileValues[1].value) / 2 + block(hostMetric, avg) + } + } + + private fun mapLongSum(data: MetricData?, hostMetrics: MutableMap, block: (HBuffer, Long) -> Unit) { + val points = data?.longSumData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + block(hostMetric, point.value) + } + } + + private fun mapDoubleSum(data: MetricData?, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { + val points = data?.doubleSumData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + block(hostMetric, point.value) + } + } + + /** + * A buffer for host metrics before they are reported. + */ + private class HBuffer { + var totalWork: Double = 0.0 + var grantedWork: Double = 0.0 + var overcommittedWork: Double = 0.0 + var interferedWork: Double = 0.0 + var cpuUsage: Double = 0.0 + var cpuDemand: Double = 0.0 + var instanceCount: Int = 0 + var powerDraw: Double = 0.0 + var uptime: Long = 0 + var downtime: Long = 0 + } + + private fun reportServiceMetrics(metrics: Collection) { + monitor.record(extractServiceMetrics(clock.millis(), metrics)) + } + + override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess() + + override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess() +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt new file mode 100644 index 00000000..ec303b37 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import org.opendc.compute.api.Server +import org.opendc.compute.api.ServerState +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostState +import org.opendc.telemetry.compute.table.HostData +import org.opendc.telemetry.compute.table.ServerData +import org.opendc.telemetry.compute.table.ServiceData + +/** + * A monitor that tracks the metrics and events of the OpenDC Compute service. + */ +public interface ComputeMonitor { + /** + * This method is invoked when the state of a [Server] changes. + */ + public fun onStateChange(timestamp: Long, server: Server, newState: ServerState) {} + + /** + * This method is invoked when the state of a [Host] changes. + */ + public fun onStateChange(time: Long, host: Host, newState: HostState) {} + + /** + * Record the specified [data]. + */ + public fun record(data: ServerData) {} + + /** + * Record the specified [data]. + */ + public fun record(data: HostData) {} + + /** + * Record the specified [data]. + */ + public fun record(data: ServiceData) {} +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt new file mode 100644 index 00000000..d3d983b9 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.export.MetricProducer +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.coroutineScope +import org.opendc.compute.service.ComputeService +import org.opendc.compute.service.driver.Host +import org.opendc.compute.service.driver.HostListener +import org.opendc.compute.service.driver.HostState +import org.opendc.telemetry.compute.table.ServiceData +import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader +import java.time.Clock + +/** + * Attach the specified monitor to the OpenDC Compute service. + */ +public suspend fun withMonitor( + scheduler: ComputeService, + clock: Clock, + metricProducer: MetricProducer, + monitor: ComputeMonitor, + exportInterval: Long = 5L * 60 * 1000, /* Every 5 min (which is the granularity of the workload trace) */ + block: suspend CoroutineScope.() -> Unit +): Unit = coroutineScope { + // Monitor host events + for (host in scheduler.hosts) { + monitor.onStateChange(clock.millis(), host, HostState.UP) + host.addListener(object : HostListener { + override fun onStateChanged(host: Host, newState: HostState) { + monitor.onStateChange(clock.millis(), host, newState) + } + }) + } + + val reader = CoroutineMetricReader( + this, + listOf(metricProducer), + ComputeMetricExporter(clock, scheduler.hosts.associateBy { it.uid.toString() }, monitor), + exportInterval + ) + + try { + block(this) + } finally { + reader.close() + } +} + +/** + * Collect the metrics of the compute service. + */ +public fun collectServiceMetrics(timestamp: Long, metricProducer: MetricProducer): ServiceData { + return extractServiceMetrics(timestamp, metricProducer.collectAllMetrics()) +} + +/** + * Extract a [ServiceData] object from the specified list of metric data. + */ +public fun extractServiceMetrics(timestamp: Long, metrics: Collection): ServiceData { + var submittedVms = 0 + var queuedVms = 0 + var unscheduledVms = 0 + var runningVms = 0 + var finishedVms = 0 + var hosts = 0 + var availableHosts = 0 + + for (metric in metrics) { + val points = metric.longSumData.points + + if (points.isEmpty()) { + continue + } + + val value = points.first().value.toInt() + when (metric.name) { + "servers.submitted" -> submittedVms = value + "servers.waiting" -> queuedVms = value + "servers.unscheduled" -> unscheduledVms = value + "servers.active" -> runningVms = value + "servers.finished" -> finishedVms = value + "hosts.total" -> hosts = value + "hosts.available" -> availableHosts = value + } + } + + return ServiceData(timestamp, hosts, availableHosts, submittedVms, runningVms, finishedVms, queuedVms, unscheduledVms) +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt new file mode 100644 index 00000000..8e6c34d0 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +import org.opendc.compute.service.driver.Host + +/** + * A trace entry for a particular host. + */ +public data class HostData( + public val timestamp: Long, + public val host: Host, + public val totalWork: Double, + public val grantedWork: Double, + public val overcommittedWork: Double, + public val interferedWork: Double, + public val cpuUsage: Double, + public val cpuDemand: Double, + public val instanceCount: Int, + public val powerDraw: Double, + public val uptime: Long, + public val downtime: Long, +) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt new file mode 100644 index 00000000..2a9fa8a6 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +import org.opendc.compute.api.Server + +/** + * A trace entry for a particular server. + */ +public data class ServerData( + public val timestamp: Long, + public val server: Server, + public val uptime: Long, + public val downtime: Long, +) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt new file mode 100644 index 00000000..f6ff5db5 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +/** + * A trace entry for the compute service. + */ +public data class ServiceData( + public val timestamp: Long, + public val hostCount: Int, + public val activeHostCount: Int, + public val instanceCount: Int, + public val runningInstanceCount: Int, + public val finishedInstanceCount: Int, + public val queuedInstanceCount: Int, + public val failedInstanceCount: Int +) -- cgit v1.2.3 From 144d9d0c118097900c086b7fb8b1cf22a788592b Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Mon, 13 Sep 2021 12:22:32 +0200 Subject: build(telemetry): Update to OpenTelemetry 1.6.0 This change updates the opentelemetry-java library to version 1.6.0. --- .../org/opendc/telemetry/compute/ComputeMetricExporter.kt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt index 95e7ff9e..57d43c60 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -59,7 +59,7 @@ public class ComputeMetricExporter( when (metric.name) { "cpu.demand" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuDemand = v } "cpu.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuUsage = v } - "power.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.powerDraw = v } + "power.usage" -> mapDoubleHistogram(metric, hostMetrics) { m, v -> m.powerDraw = v } "cpu.work.total" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.totalWork = v } "cpu.work.granted" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.grantedWork = v } "cpu.work.overcommit" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.overcommittedWork = v } @@ -105,6 +105,15 @@ public class ComputeMetricExporter( } } + private fun mapDoubleHistogram(data: MetricData, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { + val points = data.doubleHistogramData?.points ?: emptyList() + for (point in points) { + val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } + block(hostMetric, point.sum / point.count) + } + } + private fun mapLongSum(data: MetricData?, hostMetrics: MutableMap, block: (HBuffer, Long) -> Unit) { val points = data?.longSumData?.points ?: emptyList() for (point in points) { -- cgit v1.2.3 From 5f0b6b372487d79594cf59010822e160f351e0be Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Mon, 13 Sep 2021 14:48:02 +0200 Subject: refactor(telemetry): Simplify CoroutineMetricReader This change simplifies the CoroutineMetricReader implementation by removing the seperation of reader and exporter jobs. --- .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 3 +- .../sdk/metrics/export/CoroutineMetricReader.kt | 52 +++++++--------------- 2 files changed, 19 insertions(+), 36 deletions(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt index d3d983b9..01df0e69 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -33,6 +33,7 @@ import org.opendc.compute.service.driver.HostState import org.opendc.telemetry.compute.table.ServiceData import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import java.time.Clock +import java.time.Duration /** * Attach the specified monitor to the OpenDC Compute service. @@ -42,7 +43,7 @@ public suspend fun withMonitor( clock: Clock, metricProducer: MetricProducer, monitor: ComputeMonitor, - exportInterval: Long = 5L * 60 * 1000, /* Every 5 min (which is the granularity of the workload trace) */ + exportInterval: Duration = Duration.ofMinutes(5), /* Every 5 min (which is the granularity of the workload trace) */ block: suspend CoroutineScope.() -> Unit ): Unit = coroutineScope { // Monitor host events diff --git a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt index 9ee16fac..8f19ab81 100644 --- a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt +++ b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt @@ -26,14 +26,8 @@ import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricExporter import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* -import kotlinx.coroutines.channels.Channel -import kotlinx.coroutines.flow.consumeAsFlow -import kotlinx.coroutines.flow.launchIn -import kotlinx.coroutines.flow.onEach import mu.KotlinLogging -import java.util.* -import kotlin.coroutines.resume -import kotlin.coroutines.suspendCoroutine +import java.time.Duration /** * A helper class to read the metrics from a list of [MetricProducer]s and automatically export the metrics every @@ -44,56 +38,44 @@ import kotlin.coroutines.suspendCoroutine * @param scope The [CoroutineScope] to run the reader in. * @param producers The metric producers to gather metrics from. * @param exporter The export to export the metrics to. - * @param exportInterval The export interval in milliseconds. + * @param exportInterval The export interval. */ public class CoroutineMetricReader( scope: CoroutineScope, private val producers: List, private val exporter: MetricExporter, - private val exportInterval: Long = 60_000 + private val exportInterval: Duration = Duration.ofMinutes(1) ) : AutoCloseable { private val logger = KotlinLogging.logger {} - private val chan = Channel>(Channel.RENDEZVOUS) /** - * The metric reader job. + * The background job that is responsible for collecting the metrics every cycle. */ - private val readerJob = scope.launch { + private val job = scope.launch { + val intervalMs = exportInterval.toMillis() + while (isActive) { - delay(exportInterval) + delay(intervalMs) val metrics = mutableListOf() for (producer in producers) { metrics.addAll(producer.collectAllMetrics()) } - chan.send(Collections.unmodifiableList(metrics)) - } - } - /** - * The exporter job runs in the background to actually export the metrics. - */ - private val exporterJob = chan.consumeAsFlow() - .onEach { metrics -> - suspendCoroutine { cont -> - try { - val result = exporter.export(metrics) - result.whenComplete { - if (!result.isSuccess) { - logger.trace { "Exporter failed" } - } - cont.resume(Unit) + try { + val result = exporter.export(metrics) + result.whenComplete { + if (!result.isSuccess) { + logger.trace { "Exporter failed" } } - } catch (cause: Throwable) { - logger.warn(cause) { "Exporter threw an Exception" } - cont.resume(Unit) } + } catch (cause: Throwable) { + logger.warn(cause) { "Exporter threw an Exception" } } } - .launchIn(scope) + } override fun close() { - readerJob.cancel() - exporterJob.cancel() + job.cancel() } } -- cgit v1.2.3 From 3ca64e0110adab65526a0ccfd5b252e9f047ab10 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 14 Sep 2021 14:41:05 +0200 Subject: refactor(telemetry): Create separate MeterProvider per service/host This change refactors the telemetry implementation by creating a separate MeterProvider per service or host. This means we have to keep track of multiple metric producers, but that we can attach resource information to each of the MeterProviders like we would in a real world scenario. --- .../opendc-telemetry-compute/build.gradle.kts | 1 - .../telemetry/compute/ComputeMetricExporter.kt | 295 +++++++++++++++------ .../org/opendc/telemetry/compute/ComputeMonitor.kt | 14 - .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 44 --- .../org/opendc/telemetry/compute/HostAttributes.kt | 51 ++++ .../org/opendc/telemetry/compute/table/HostData.kt | 4 +- .../org/opendc/telemetry/compute/table/HostInfo.kt | 28 ++ .../opendc/telemetry/compute/table/ServerData.kt | 5 +- .../opendc/telemetry/compute/table/ServerInfo.kt | 37 +++ 9 files changed, 328 insertions(+), 151 deletions(-) create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/HostAttributes.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostInfo.kt create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerInfo.kt (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts index 6a3de9bc..cd8cb57a 100644 --- a/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts +++ b/opendc-telemetry/opendc-telemetry-compute/build.gradle.kts @@ -31,7 +31,6 @@ dependencies { api(platform(projects.opendcPlatform)) api(projects.opendcTelemetry.opendcTelemetrySdk) - implementation(projects.opendcCompute.opendcComputeSimulator) implementation(libs.opentelemetry.semconv) implementation(libs.kotlin.logging) } diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt index 57d43c60..408d1325 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -22,137 +22,260 @@ package org.opendc.telemetry.compute +import io.opentelemetry.api.common.AttributeKey +import io.opentelemetry.api.common.Attributes import io.opentelemetry.sdk.common.CompletableResultCode -import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.data.* import io.opentelemetry.sdk.metrics.export.MetricExporter +import io.opentelemetry.sdk.resources.Resource import io.opentelemetry.semconv.resource.attributes.ResourceAttributes -import org.opendc.compute.service.driver.Host import org.opendc.telemetry.compute.table.HostData +import org.opendc.telemetry.compute.table.HostInfo +import org.opendc.telemetry.compute.table.ServerData +import org.opendc.telemetry.compute.table.ServerInfo import java.time.Clock /** * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. */ -public class ComputeMetricExporter( - private val clock: Clock, - private val hosts: Map, - private val monitor: ComputeMonitor -) : MetricExporter { - +public class ComputeMetricExporter(private val clock: Clock, private val monitor: ComputeMonitor) : MetricExporter { override fun export(metrics: Collection): CompletableResultCode { return try { - reportHostMetrics(metrics) reportServiceMetrics(metrics) + reportHostMetrics(metrics) + reportServerMetrics(metrics) CompletableResultCode.ofSuccess() } catch (e: Throwable) { CompletableResultCode.ofFailure() } } - private var lastHostMetrics: Map = emptyMap() - private val hostMetricsSingleton = HBuffer() + override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess() + + override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess() + + private fun reportServiceMetrics(metrics: Collection) { + monitor.record(extractServiceMetrics(clock.millis(), metrics)) + } + + private val hosts = mutableMapOf() + private val servers = mutableMapOf() private fun reportHostMetrics(metrics: Collection) { - val hostMetrics = mutableMapOf() + val hosts = hosts + val servers = servers + + for (metric in metrics) { + val resource = metric.resource + val hostId = resource.attributes[HOST_ID] ?: continue + val agg = hosts.computeIfAbsent(hostId) { HostAggregator(resource) } + agg.accept(metric) + } + + val monitor = monitor + val now = clock.millis() + for ((_, server) in servers) { + server.record(monitor, now) + } + } + + private fun reportServerMetrics(metrics: Collection) { + val hosts = hosts for (metric in metrics) { + val resource = metric.resource + val host = resource.attributes[HOST_ID]?.let { hosts[it]?.host } + when (metric.name) { - "cpu.demand" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuDemand = v } - "cpu.usage" -> mapDoubleSummary(metric, hostMetrics) { m, v -> m.cpuUsage = v } - "power.usage" -> mapDoubleHistogram(metric, hostMetrics) { m, v -> m.powerDraw = v } - "cpu.work.total" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.totalWork = v } - "cpu.work.granted" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.grantedWork = v } - "cpu.work.overcommit" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.overcommittedWork = v } - "cpu.work.interference" -> mapDoubleSum(metric, hostMetrics) { m, v -> m.interferedWork = v } - "guests.active" -> mapLongSum(metric, hostMetrics) { m, v -> m.instanceCount = v.toInt() } - "host.time.up" -> mapLongSum(metric, hostMetrics) { m, v -> m.uptime = v } - "host.time.down" -> mapLongSum(metric, hostMetrics) { m, v -> m.downtime = v } + "scheduler.duration" -> mapByServer(metric.doubleHistogramData.points, host) { agg, point -> + agg.schedulingLatency = point.sum / point.count + } + "guest.time.running" -> mapByServer(metric.longSumData.points, host) { agg, point -> + agg.uptime = point.value + } + "guest.time.error" -> mapByServer(metric.longSumData.points, host) { agg, point -> + agg.downtime = point.value + } } } - for ((id, hostMetric) in hostMetrics) { - val lastHostMetric = lastHostMetrics.getOrDefault(id, hostMetricsSingleton) - val host = hosts[id] ?: continue + val monitor = monitor + val now = clock.millis() + for ((_, host) in hosts) { + host.record(monitor, now) + } + } + + /** + * Helper function to map a metric by the server. + */ + private inline fun

mapByServer(points: Collection

, host: HostInfo? = null, block: (ServerAggregator, P) -> Unit) { + for (point in points) { + val serverId = point.attributes[ResourceAttributes.HOST_ID] ?: continue + val agg = servers.computeIfAbsent(serverId) { ServerAggregator(point.attributes) } + + if (host != null) { + agg.host = host + } + + block(agg, point) + } + } + + /** + * An aggregator for host metrics before they are reported. + */ + private class HostAggregator(resource: Resource) { + /** + * The static information about this host. + */ + val host = HostInfo( + resource.attributes[HOST_ID]!!, + resource.attributes[HOST_NAME]!!, + resource.attributes[HOST_ARCH]!!, + resource.attributes[HOST_NCPUS]!!.toInt(), + resource.attributes[HOST_MEM_CAPACITY]!!, + ) + + private var totalWork: Double = 0.0 + private var previousTotalWork = 0.0 + private var grantedWork: Double = 0.0 + private var previousGrantedWork = 0.0 + private var overcommittedWork: Double = 0.0 + private var previousOvercommittedWork = 0.0 + private var interferedWork: Double = 0.0 + private var previousInterferedWork = 0.0 + private var cpuUsage: Double = 0.0 + private var cpuDemand: Double = 0.0 + private var instanceCount: Int = 0 + private var powerDraw: Double = 0.0 + private var uptime: Long = 0 + private var previousUptime = 0L + private var downtime: Long = 0 + private var previousDowntime = 0L + fun record(monitor: ComputeMonitor, now: Long) { monitor.record( HostData( - clock.millis(), + now, host, - hostMetric.totalWork - lastHostMetric.totalWork, - hostMetric.grantedWork - lastHostMetric.grantedWork, - hostMetric.overcommittedWork - lastHostMetric.overcommittedWork, - hostMetric.interferedWork - lastHostMetric.interferedWork, - hostMetric.cpuUsage, - hostMetric.cpuDemand, - hostMetric.instanceCount, - hostMetric.powerDraw, - hostMetric.uptime - lastHostMetric.uptime, - hostMetric.downtime - lastHostMetric.downtime, + totalWork - previousTotalWork, + grantedWork - previousGrantedWork, + overcommittedWork - previousOvercommittedWork, + interferedWork - previousInterferedWork, + cpuUsage, + cpuDemand, + instanceCount, + powerDraw, + uptime - previousUptime, + downtime - previousDowntime, ) ) - } - - lastHostMetrics = hostMetrics - } - private fun mapDoubleSummary(data: MetricData, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { - val points = data.doubleSummaryData?.points ?: emptyList() - for (point in points) { - val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue - val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } - val avg = (point.percentileValues[0].value + point.percentileValues[1].value) / 2 - block(hostMetric, avg) + previousTotalWork = totalWork + previousGrantedWork = grantedWork + previousOvercommittedWork = overcommittedWork + previousInterferedWork = interferedWork + previousUptime = uptime + previousDowntime = downtime + reset() } - } - private fun mapDoubleHistogram(data: MetricData, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { - val points = data.doubleHistogramData?.points ?: emptyList() - for (point in points) { - val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue - val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } - block(hostMetric, point.sum / point.count) + /** + * Accept the [MetricData] for this host. + */ + fun accept(data: MetricData) { + when (data.name) { + "cpu.work.total" -> totalWork = data.doubleSumData.points.first().value + "cpu.work.granted" -> grantedWork = data.doubleSumData.points.first().value + "cpu.work.overcommit" -> overcommittedWork = data.doubleSumData.points.first().value + "cpu.work.interference" -> interferedWork = data.doubleSumData.points.first().value + "power.usage" -> powerDraw = acceptHistogram(data) + "cpu.usage" -> cpuUsage = acceptHistogram(data) + "cpu.demand" -> cpuDemand = acceptHistogram(data) + "guests.active" -> instanceCount = data.longSumData.points.first().value.toInt() + "host.time.up" -> uptime = data.longSumData.points.first().value + "host.time.down" -> downtime = data.longSumData.points.first().value + } } - } - private fun mapLongSum(data: MetricData?, hostMetrics: MutableMap, block: (HBuffer, Long) -> Unit) { - val points = data?.longSumData?.points ?: emptyList() - for (point in points) { - val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue - val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } - block(hostMetric, point.value) + private fun acceptHistogram(data: MetricData): Double { + return when (data.type) { + MetricDataType.HISTOGRAM -> { + val point = data.doubleHistogramData.points.first() + point.sum / point.count + } + MetricDataType.SUMMARY -> { + val point = data.doubleSummaryData.points.first() + point.sum / point.count + } + else -> error("Invalid metric type") + } } - } - private fun mapDoubleSum(data: MetricData?, hostMetrics: MutableMap, block: (HBuffer, Double) -> Unit) { - val points = data?.doubleSumData?.points ?: emptyList() - for (point in points) { - val uid = point.attributes[ResourceAttributes.HOST_ID] ?: continue - val hostMetric = hostMetrics.computeIfAbsent(uid) { HBuffer() } - block(hostMetric, point.value) + private fun reset() { + totalWork = 0.0 + grantedWork = 0.0 + overcommittedWork = 0.0 + interferedWork = 0.0 + cpuUsage = 0.0 + cpuDemand = 0.0 + instanceCount = 0 + powerDraw = 0.0 + uptime = 0L + downtime = 0L } } /** - * A buffer for host metrics before they are reported. + * An aggregator for server metrics before they are reported. */ - private class HBuffer { - var totalWork: Double = 0.0 - var grantedWork: Double = 0.0 - var overcommittedWork: Double = 0.0 - var interferedWork: Double = 0.0 - var cpuUsage: Double = 0.0 - var cpuDemand: Double = 0.0 - var instanceCount: Int = 0 - var powerDraw: Double = 0.0 - var uptime: Long = 0 - var downtime: Long = 0 - } + private class ServerAggregator(attributes: Attributes) { + /** + * The static information about this server. + */ + val server = ServerInfo( + attributes[ResourceAttributes.HOST_ID]!!, + attributes[ResourceAttributes.HOST_NAME]!!, + attributes[ResourceAttributes.HOST_TYPE]!!, + attributes[ResourceAttributes.HOST_ARCH]!!, + attributes[ResourceAttributes.HOST_IMAGE_ID]!!, + attributes[ResourceAttributes.HOST_IMAGE_NAME]!!, + attributes[AttributeKey.longKey("host.num_cpus")]!!.toInt(), + attributes[AttributeKey.longKey("host.mem_capacity")]!!, + ) - private fun reportServiceMetrics(metrics: Collection) { - monitor.record(extractServiceMetrics(clock.millis(), metrics)) - } + /** + * The [HostInfo] of the host on which the server is hosted. + */ + var host: HostInfo? = null - override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess() + @JvmField var uptime: Long = 0 + private var previousUptime = 0L + @JvmField var downtime: Long = 0 + private var previousDowntime = 0L + @JvmField var schedulingLatency = 0.0 - override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess() + fun record(monitor: ComputeMonitor, now: Long) { + monitor.record( + ServerData( + now, + server, + null, + uptime - previousUptime, + downtime - previousDowntime, + ) + ) + + previousUptime = uptime + previousDowntime = downtime + reset() + } + + private fun reset() { + host = null + uptime = 0L + downtime = 0L + } + } } diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt index ec303b37..d51bcab4 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMonitor.kt @@ -22,10 +22,6 @@ package org.opendc.telemetry.compute -import org.opendc.compute.api.Server -import org.opendc.compute.api.ServerState -import org.opendc.compute.service.driver.Host -import org.opendc.compute.service.driver.HostState import org.opendc.telemetry.compute.table.HostData import org.opendc.telemetry.compute.table.ServerData import org.opendc.telemetry.compute.table.ServiceData @@ -34,16 +30,6 @@ import org.opendc.telemetry.compute.table.ServiceData * A monitor that tracks the metrics and events of the OpenDC Compute service. */ public interface ComputeMonitor { - /** - * This method is invoked when the state of a [Server] changes. - */ - public fun onStateChange(timestamp: Long, server: Server, newState: ServerState) {} - - /** - * This method is invoked when the state of a [Host] changes. - */ - public fun onStateChange(time: Long, host: Host, newState: HostState) {} - /** * Record the specified [data]. */ diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt index 01df0e69..1f309f1b 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -24,51 +24,7 @@ package org.opendc.telemetry.compute import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricProducer -import kotlinx.coroutines.CoroutineScope -import kotlinx.coroutines.coroutineScope -import org.opendc.compute.service.ComputeService -import org.opendc.compute.service.driver.Host -import org.opendc.compute.service.driver.HostListener -import org.opendc.compute.service.driver.HostState import org.opendc.telemetry.compute.table.ServiceData -import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader -import java.time.Clock -import java.time.Duration - -/** - * Attach the specified monitor to the OpenDC Compute service. - */ -public suspend fun withMonitor( - scheduler: ComputeService, - clock: Clock, - metricProducer: MetricProducer, - monitor: ComputeMonitor, - exportInterval: Duration = Duration.ofMinutes(5), /* Every 5 min (which is the granularity of the workload trace) */ - block: suspend CoroutineScope.() -> Unit -): Unit = coroutineScope { - // Monitor host events - for (host in scheduler.hosts) { - monitor.onStateChange(clock.millis(), host, HostState.UP) - host.addListener(object : HostListener { - override fun onStateChanged(host: Host, newState: HostState) { - monitor.onStateChange(clock.millis(), host, newState) - } - }) - } - - val reader = CoroutineMetricReader( - this, - listOf(metricProducer), - ComputeMetricExporter(clock, scheduler.hosts.associateBy { it.uid.toString() }, monitor), - exportInterval - ) - - try { - block(this) - } finally { - reader.close() - } -} /** * Collect the metrics of the compute service. diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/HostAttributes.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/HostAttributes.kt new file mode 100644 index 00000000..7dca6186 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/HostAttributes.kt @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +@file:JvmName("HostAttributes") +package org.opendc.telemetry.compute + +import io.opentelemetry.api.common.AttributeKey + +/** + * The identifier of the node hosting virtual machines. + */ +public val HOST_ID: AttributeKey = AttributeKey.stringKey("node.id") + +/** + * The name of the node hosting virtual machines. + */ +public val HOST_NAME: AttributeKey = AttributeKey.stringKey("node.name") + +/** + * The CPU architecture of the host node. + */ +public val HOST_ARCH: AttributeKey = AttributeKey.stringKey("node.arch") + +/** + * The number of CPUs in the host node. + */ +public val HOST_NCPUS: AttributeKey = AttributeKey.longKey("node.num_cpus") + +/** + * The amount of memory installed in the host node in MiB. + */ +public val HOST_MEM_CAPACITY: AttributeKey = AttributeKey.longKey("node.mem_capacity") diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt index 8e6c34d0..e3ecda3d 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt @@ -22,14 +22,12 @@ package org.opendc.telemetry.compute.table -import org.opendc.compute.service.driver.Host - /** * A trace entry for a particular host. */ public data class HostData( public val timestamp: Long, - public val host: Host, + public val host: HostInfo, public val totalWork: Double, public val grantedWork: Double, public val overcommittedWork: Double, diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostInfo.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostInfo.kt new file mode 100644 index 00000000..d9a5906b --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostInfo.kt @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +/** + * Information about a host exposed to the telemetry service. + */ +public data class HostInfo(val id: String, val name: String, val arch: String, val cpuCount: Int, val memCapacity: Long) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt index 2a9fa8a6..7fde86d9 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt @@ -22,14 +22,13 @@ package org.opendc.telemetry.compute.table -import org.opendc.compute.api.Server - /** * A trace entry for a particular server. */ public data class ServerData( public val timestamp: Long, - public val server: Server, + public val server: ServerInfo, + public val host: HostInfo?, public val uptime: Long, public val downtime: Long, ) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerInfo.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerInfo.kt new file mode 100644 index 00000000..b16e5f3d --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerInfo.kt @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute.table + +/** + * Static information about a server exposed to the telemetry service. + */ +public data class ServerInfo( + val id: String, + val name: String, + val type: String, + val arch: String, + val imageId: String, + val imageName: String, + val cpuCount: Int, + val memCapacity: Long +) -- cgit v1.2.3 From 8d899e29dbd757f6df320212d6e0d77ce8216ab9 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 14 Sep 2021 15:38:38 +0200 Subject: refactor(telemetry): Standardize compute scheduler metrics This change updates the OpenDC compute service implementation with multiple meters that follow the OpenTelemetry conventions. --- .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 59 ++++++++++++++-------- .../opendc/telemetry/compute/table/ServiceData.kt | 14 ++--- 2 files changed, 45 insertions(+), 28 deletions(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt index 1f309f1b..f3690ee8 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -22,6 +22,7 @@ package org.opendc.telemetry.compute +import io.opentelemetry.api.common.AttributeKey import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricProducer import org.opendc.telemetry.compute.table.ServiceData @@ -37,32 +38,48 @@ public fun collectServiceMetrics(timestamp: Long, metricProducer: MetricProducer * Extract a [ServiceData] object from the specified list of metric data. */ public fun extractServiceMetrics(timestamp: Long, metrics: Collection): ServiceData { - var submittedVms = 0 - var queuedVms = 0 - var unscheduledVms = 0 - var runningVms = 0 - var finishedVms = 0 - var hosts = 0 - var availableHosts = 0 + val resultKey = AttributeKey.stringKey("result") + val stateKey = AttributeKey.stringKey("state") - for (metric in metrics) { - val points = metric.longSumData.points + var hostsUp = 0 + var hostsDown = 0 - if (points.isEmpty()) { - continue - } + var serversPending = 0 + var serversActive = 0 - val value = points.first().value.toInt() + var attemptsSuccess = 0 + var attemptsFailure = 0 + var attemptsError = 0 + + for (metric in metrics) { when (metric.name) { - "servers.submitted" -> submittedVms = value - "servers.waiting" -> queuedVms = value - "servers.unscheduled" -> unscheduledVms = value - "servers.active" -> runningVms = value - "servers.finished" -> finishedVms = value - "hosts.total" -> hosts = value - "hosts.available" -> availableHosts = value + "scheduler.hosts" -> { + for (point in metric.longSumData.points) { + when (point.attributes[stateKey]) { + "up" -> hostsUp = point.value.toInt() + "down" -> hostsDown = point.value.toInt() + } + } + } + "scheduler.servers" -> { + for (point in metric.longSumData.points) { + when (point.attributes[stateKey]) { + "pending" -> serversPending = point.value.toInt() + "active" -> serversActive = point.value.toInt() + } + } + } + "scheduler.attempts" -> { + for (point in metric.longSumData.points) { + when (point.attributes[resultKey]) { + "success" -> attemptsSuccess = point.value.toInt() + "failure" -> attemptsFailure = point.value.toInt() + "error" -> attemptsError = point.value.toInt() + } + } + } } } - return ServiceData(timestamp, hosts, availableHosts, submittedVms, runningVms, finishedVms, queuedVms, unscheduledVms) + return ServiceData(timestamp, hostsUp, hostsDown, serversPending, serversActive, attemptsSuccess, attemptsFailure, attemptsError) } diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt index f6ff5db5..da2ebdf4 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt @@ -27,11 +27,11 @@ package org.opendc.telemetry.compute.table */ public data class ServiceData( public val timestamp: Long, - public val hostCount: Int, - public val activeHostCount: Int, - public val instanceCount: Int, - public val runningInstanceCount: Int, - public val finishedInstanceCount: Int, - public val queuedInstanceCount: Int, - public val failedInstanceCount: Int + public val hostsUp: Int, + public val hostsDown: Int, + public val serversPending: Int, + public val serversActive: Int, + public val attemptsSuccess: Int, + public val attemptsFailure: Int, + public val attemptsError: Int ) -- cgit v1.2.3 From 0d8bccc68705d036fbf60f312d9c34ca4392c6b2 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Tue, 7 Sep 2021 17:30:46 +0200 Subject: refactor(telemetry): Standardize SimHost metrics This change standardizes the metrics emitted by SimHost instances and their guests based on the OpenTelemetry semantic conventions. We now also report CPU time as opposed to CPU work as this metric is more commonly used. --- .../telemetry/compute/ComputeMetricAggregator.kt | 448 +++++++++++++++++++++ .../telemetry/compute/ComputeMetricExporter.kt | 243 +---------- .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 55 +-- .../org/opendc/telemetry/compute/table/HostData.kt | 33 +- .../opendc/telemetry/compute/table/ServerData.kt | 19 +- .../opendc/telemetry/compute/table/ServiceData.kt | 18 +- .../sdk/metrics/export/CoroutineMetricReader.kt | 2 +- 7 files changed, 512 insertions(+), 306 deletions(-) create mode 100644 opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt new file mode 100644 index 00000000..e9449634 --- /dev/null +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.telemetry.compute + +import io.opentelemetry.api.common.AttributeKey +import io.opentelemetry.api.common.Attributes +import io.opentelemetry.sdk.metrics.data.MetricData +import io.opentelemetry.sdk.metrics.data.PointData +import io.opentelemetry.sdk.resources.Resource +import io.opentelemetry.semconv.resource.attributes.ResourceAttributes +import org.opendc.telemetry.compute.table.* +import java.time.Instant +import kotlin.math.roundToLong + +/** + * Helper class responsible for aggregating [MetricData] into [ServiceData], [HostData] and [ServerData]. + */ +public class ComputeMetricAggregator { + private val _service = ServiceAggregator() + private val _hosts = mutableMapOf() + private val _servers = mutableMapOf() + + /** + * Process the specified [metrics] for this cycle. + */ + public fun process(metrics: Collection) { + val service = _service + val hosts = _hosts + val servers = _servers + + for (metric in metrics) { + val resource = metric.resource + + when (metric.name) { + // ComputeService + "scheduler.hosts" -> { + for (point in metric.longSumData.points) { + when (point.attributes[STATE_KEY]) { + "up" -> service.hostsUp = point.value.toInt() + "down" -> service.hostsDown = point.value.toInt() + } + } + } + "scheduler.servers" -> { + for (point in metric.longSumData.points) { + when (point.attributes[STATE_KEY]) { + "pending" -> service.serversPending = point.value.toInt() + "active" -> service.serversActive = point.value.toInt() + } + } + } + "scheduler.attempts" -> { + for (point in metric.longSumData.points) { + when (point.attributes[RESULT_KEY]) { + "success" -> service.attemptsSuccess = point.value.toInt() + "failure" -> service.attemptsFailure = point.value.toInt() + "error" -> service.attemptsError = point.value.toInt() + } + } + } + "scheduler.latency" -> { + for (point in metric.doubleHistogramData.points) { + val server = getServer(servers, point) ?: continue + server.schedulingLatency = (point.sum / point.count).roundToLong() + } + } + + // SimHost + "system.guests" -> { + val agg = getHost(hosts, resource) ?: continue + + for (point in metric.longSumData.points) { + when (point.attributes[STATE_KEY]) { + "terminated" -> agg.guestsTerminated = point.value.toInt() + "running" -> agg.guestsRunning = point.value.toInt() + "error" -> agg.guestsRunning = point.value.toInt() + "invalid" -> agg.guestsInvalid = point.value.toInt() + } + } + } + "system.cpu.limit" -> { + val agg = getHost(hosts, resource) ?: continue + + for (point in metric.doubleGaugeData.points) { + val server = getServer(servers, point) + + if (server != null) { + server.cpuLimit = point.value + server.host = agg.host + } else { + agg.cpuLimit = point.value + } + } + } + "system.cpu.usage" -> { + val agg = getHost(hosts, resource) ?: continue + agg.cpuUsage = metric.doubleGaugeData.points.first().value + } + "system.cpu.demand" -> { + val agg = getHost(hosts, resource) ?: continue + agg.cpuDemand = metric.doubleGaugeData.points.first().value + } + "system.cpu.utilization" -> { + val agg = getHost(hosts, resource) ?: continue + agg.cpuUtilization = metric.doubleGaugeData.points.first().value + } + "system.cpu.time" -> { + val agg = getHost(hosts, resource) ?: continue + + for (point in metric.longSumData.points) { + val server = getServer(servers, point) + val state = point.attributes[STATE_KEY] + if (server != null) { + when (state) { + "active" -> server.cpuActiveTime = point.value + "idle" -> server.cpuIdleTime = point.value + "steal" -> server.cpuStealTime = point.value + "lost" -> server.cpuLostTime = point.value + } + server.host = agg.host + } else { + when (state) { + "active" -> agg.cpuActiveTime = point.value + "idle" -> agg.cpuIdleTime = point.value + "steal" -> agg.cpuStealTime = point.value + "lost" -> agg.cpuLostTime = point.value + } + } + } + } + "system.power.usage" -> { + val agg = getHost(hosts, resource) ?: continue + agg.powerUsage = metric.doubleGaugeData.points.first().value + } + "system.power.total" -> { + val agg = getHost(hosts, resource) ?: continue + agg.powerTotal = metric.doubleSumData.points.first().value + } + "system.time" -> { + val agg = getHost(hosts, resource) ?: continue + + for (point in metric.longSumData.points) { + val server = getServer(servers, point) + + if (server != null) { + when (point.attributes[STATE_KEY]) { + "up" -> server.uptime = point.value + "down" -> server.downtime = point.value + } + server.host = agg.host + } else { + when (point.attributes[STATE_KEY]) { + "up" -> agg.uptime = point.value + "down" -> agg.downtime = point.value + } + } + } + } + "system.time.boot" -> { + val agg = getHost(hosts, resource) ?: continue + + for (point in metric.longGaugeData.points) { + val server = getServer(servers, point) + + if (server != null) { + server.bootTime = point.value + server.host = agg.host + } else { + agg.bootTime = point.value + } + } + } + } + } + } + + /** + * Collect the data via the [monitor]. + */ + public fun collect(now: Instant, monitor: ComputeMonitor) { + monitor.record(_service.collect(now)) + + for (host in _hosts.values) { + monitor.record(host.collect(now)) + } + + for (server in _servers.values) { + monitor.record(server.collect(now)) + } + } + + /** + * Obtain the [HostAggregator] for the specified [resource]. + */ + private fun getHost(hosts: MutableMap, resource: Resource): HostAggregator? { + val id = resource.attributes[HOST_ID] + return if (id != null) { + hosts.computeIfAbsent(id) { HostAggregator(resource) } + } else { + null + } + } + + /** + * Obtain the [ServerAggregator] for the specified [point]. + */ + private fun getServer(servers: MutableMap, point: PointData): ServerAggregator? { + val id = point.attributes[ResourceAttributes.HOST_ID] + return if (id != null) { + servers.computeIfAbsent(id) { ServerAggregator(point.attributes) } + } else { + null + } + } + + /** + * An aggregator for service metrics before they are reported. + */ + internal class ServiceAggregator { + @JvmField var hostsUp = 0 + @JvmField var hostsDown = 0 + + @JvmField var serversPending = 0 + @JvmField var serversActive = 0 + + @JvmField var attemptsSuccess = 0 + @JvmField var attemptsFailure = 0 + @JvmField var attemptsError = 0 + + /** + * Finish the aggregation for this cycle. + */ + fun collect(now: Instant): ServiceData = toServiceData(now) + + /** + * Convert the aggregator state to an immutable [ServiceData]. + */ + private fun toServiceData(now: Instant): ServiceData { + return ServiceData(now, hostsUp, hostsDown, serversPending, serversActive, attemptsSuccess, attemptsFailure, attemptsError) + } + } + + /** + * An aggregator for host metrics before they are reported. + */ + internal class HostAggregator(resource: Resource) { + /** + * The static information about this host. + */ + val host = HostInfo( + resource.attributes[HOST_ID]!!, + resource.attributes[HOST_NAME] ?: "", + resource.attributes[HOST_ARCH] ?: "", + resource.attributes[HOST_NCPUS]?.toInt() ?: 0, + resource.attributes[HOST_MEM_CAPACITY] ?: 0, + ) + + @JvmField var guestsTerminated = 0 + @JvmField var guestsRunning = 0 + @JvmField var guestsError = 0 + @JvmField var guestsInvalid = 0 + + @JvmField var cpuLimit = 0.0 + @JvmField var cpuUsage = 0.0 + @JvmField var cpuDemand = 0.0 + @JvmField var cpuUtilization = 0.0 + + @JvmField var cpuActiveTime = 0L + @JvmField var cpuIdleTime = 0L + @JvmField var cpuStealTime = 0L + @JvmField var cpuLostTime = 0L + private var previousCpuActiveTime = 0L + private var previousCpuIdleTime = 0L + private var previousCpuStealTime = 0L + private var previousCpuLostTime = 0L + + @JvmField var powerUsage = 0.0 + @JvmField var powerTotal = 0.0 + private var previousPowerTotal = 0.0 + + @JvmField var uptime = 0L + private var previousUptime = 0L + @JvmField var downtime = 0L + private var previousDowntime = 0L + @JvmField var bootTime = Long.MIN_VALUE + + /** + * Finish the aggregation for this cycle. + */ + fun collect(now: Instant): HostData { + val data = toHostData(now) + + // Reset intermediate state for next aggregation + previousCpuActiveTime = cpuActiveTime + previousCpuIdleTime = cpuIdleTime + previousCpuStealTime = cpuStealTime + previousCpuLostTime = cpuLostTime + previousPowerTotal = powerTotal + previousUptime = uptime + previousDowntime = downtime + + guestsTerminated = 0 + guestsRunning = 0 + guestsError = 0 + guestsInvalid = 0 + + cpuLimit = 0.0 + cpuUsage = 0.0 + cpuDemand = 0.0 + cpuUtilization = 0.0 + + powerUsage = 0.0 + + return data + } + + /** + * Convert the aggregator state to an immutable [HostData] instance. + */ + private fun toHostData(now: Instant): HostData { + return HostData( + now, + host, + guestsTerminated, + guestsRunning, + guestsError, + guestsInvalid, + cpuLimit, + cpuUsage, + cpuDemand, + cpuUtilization, + cpuActiveTime - previousCpuActiveTime, + cpuIdleTime - previousCpuIdleTime, + cpuStealTime - previousCpuStealTime, + cpuLostTime - previousCpuLostTime, + powerUsage, + powerTotal - previousPowerTotal, + uptime - previousUptime, + downtime - previousDowntime, + if (bootTime != Long.MIN_VALUE) Instant.ofEpochMilli(bootTime) else null + ) + } + } + + /** + * An aggregator for server metrics before they are reported. + */ + internal class ServerAggregator(attributes: Attributes) { + /** + * The static information about this server. + */ + val server = ServerInfo( + attributes[ResourceAttributes.HOST_ID]!!, + attributes[ResourceAttributes.HOST_NAME]!!, + attributes[ResourceAttributes.HOST_TYPE]!!, + attributes[ResourceAttributes.HOST_ARCH]!!, + attributes[ResourceAttributes.HOST_IMAGE_ID]!!, + attributes[ResourceAttributes.HOST_IMAGE_NAME]!!, + attributes[AttributeKey.longKey("host.num_cpus")]!!.toInt(), + attributes[AttributeKey.longKey("host.mem_capacity")]!!, + ) + + /** + * The [HostInfo] of the host on which the server is hosted. + */ + var host: HostInfo? = null + + @JvmField var uptime: Long = 0 + private var previousUptime = 0L + @JvmField var downtime: Long = 0 + private var previousDowntime = 0L + @JvmField var bootTime: Long = 0 + @JvmField var schedulingLatency = 0L + @JvmField var cpuLimit = 0.0 + @JvmField var cpuActiveTime = 0L + @JvmField var cpuIdleTime = 0L + @JvmField var cpuStealTime = 0L + @JvmField var cpuLostTime = 0L + private var previousCpuActiveTime = 0L + private var previousCpuIdleTime = 0L + private var previousCpuStealTime = 0L + private var previousCpuLostTime = 0L + + /** + * Finish the aggregation for this cycle. + */ + fun collect(now: Instant): ServerData { + val data = toServerData(now) + + previousUptime = uptime + previousDowntime = downtime + previousCpuActiveTime = cpuActiveTime + previousCpuIdleTime = cpuIdleTime + previousCpuStealTime = cpuStealTime + previousCpuLostTime = cpuLostTime + + host = null + cpuLimit = 0.0 + + return data + } + + /** + * Convert the aggregator state into an immutable [ServerData]. + */ + private fun toServerData(now: Instant): ServerData { + return ServerData( + now, + server, + host, + uptime - previousUptime, + downtime - previousDowntime, + if (bootTime != Long.MIN_VALUE) Instant.ofEpochMilli(bootTime) else null, + schedulingLatency, + cpuLimit, + cpuActiveTime - previousCpuActiveTime, + cpuIdleTime - previousCpuIdleTime, + cpuStealTime - previousCpuStealTime, + cpuLostTime - previousCpuLostTime + ) + } + } + + private companion object { + private val STATE_KEY = AttributeKey.stringKey("state") + private val RESULT_KEY = AttributeKey.stringKey("result") + } +} diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt index 408d1325..ea96f721 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -22,28 +22,24 @@ package org.opendc.telemetry.compute -import io.opentelemetry.api.common.AttributeKey -import io.opentelemetry.api.common.Attributes import io.opentelemetry.sdk.common.CompletableResultCode import io.opentelemetry.sdk.metrics.data.* import io.opentelemetry.sdk.metrics.export.MetricExporter -import io.opentelemetry.sdk.resources.Resource -import io.opentelemetry.semconv.resource.attributes.ResourceAttributes -import org.opendc.telemetry.compute.table.HostData -import org.opendc.telemetry.compute.table.HostInfo -import org.opendc.telemetry.compute.table.ServerData -import org.opendc.telemetry.compute.table.ServerInfo import java.time.Clock /** * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. */ public class ComputeMetricExporter(private val clock: Clock, private val monitor: ComputeMonitor) : MetricExporter { + /** + * A [ComputeMetricAggregator] that actually performs the aggregation. + */ + private val agg = ComputeMetricAggregator() + override fun export(metrics: Collection): CompletableResultCode { return try { - reportServiceMetrics(metrics) - reportHostMetrics(metrics) - reportServerMetrics(metrics) + agg.process(metrics) + agg.collect(clock.instant(), monitor) CompletableResultCode.ofSuccess() } catch (e: Throwable) { CompletableResultCode.ofFailure() @@ -53,229 +49,4 @@ public class ComputeMetricExporter(private val clock: Clock, private val monitor override fun flush(): CompletableResultCode = CompletableResultCode.ofSuccess() override fun shutdown(): CompletableResultCode = CompletableResultCode.ofSuccess() - - private fun reportServiceMetrics(metrics: Collection) { - monitor.record(extractServiceMetrics(clock.millis(), metrics)) - } - - private val hosts = mutableMapOf() - private val servers = mutableMapOf() - - private fun reportHostMetrics(metrics: Collection) { - val hosts = hosts - val servers = servers - - for (metric in metrics) { - val resource = metric.resource - val hostId = resource.attributes[HOST_ID] ?: continue - val agg = hosts.computeIfAbsent(hostId) { HostAggregator(resource) } - agg.accept(metric) - } - - val monitor = monitor - val now = clock.millis() - for ((_, server) in servers) { - server.record(monitor, now) - } - } - - private fun reportServerMetrics(metrics: Collection) { - val hosts = hosts - - for (metric in metrics) { - val resource = metric.resource - val host = resource.attributes[HOST_ID]?.let { hosts[it]?.host } - - when (metric.name) { - "scheduler.duration" -> mapByServer(metric.doubleHistogramData.points, host) { agg, point -> - agg.schedulingLatency = point.sum / point.count - } - "guest.time.running" -> mapByServer(metric.longSumData.points, host) { agg, point -> - agg.uptime = point.value - } - "guest.time.error" -> mapByServer(metric.longSumData.points, host) { agg, point -> - agg.downtime = point.value - } - } - } - - val monitor = monitor - val now = clock.millis() - for ((_, host) in hosts) { - host.record(monitor, now) - } - } - - /** - * Helper function to map a metric by the server. - */ - private inline fun

mapByServer(points: Collection

, host: HostInfo? = null, block: (ServerAggregator, P) -> Unit) { - for (point in points) { - val serverId = point.attributes[ResourceAttributes.HOST_ID] ?: continue - val agg = servers.computeIfAbsent(serverId) { ServerAggregator(point.attributes) } - - if (host != null) { - agg.host = host - } - - block(agg, point) - } - } - - /** - * An aggregator for host metrics before they are reported. - */ - private class HostAggregator(resource: Resource) { - /** - * The static information about this host. - */ - val host = HostInfo( - resource.attributes[HOST_ID]!!, - resource.attributes[HOST_NAME]!!, - resource.attributes[HOST_ARCH]!!, - resource.attributes[HOST_NCPUS]!!.toInt(), - resource.attributes[HOST_MEM_CAPACITY]!!, - ) - - private var totalWork: Double = 0.0 - private var previousTotalWork = 0.0 - private var grantedWork: Double = 0.0 - private var previousGrantedWork = 0.0 - private var overcommittedWork: Double = 0.0 - private var previousOvercommittedWork = 0.0 - private var interferedWork: Double = 0.0 - private var previousInterferedWork = 0.0 - private var cpuUsage: Double = 0.0 - private var cpuDemand: Double = 0.0 - private var instanceCount: Int = 0 - private var powerDraw: Double = 0.0 - private var uptime: Long = 0 - private var previousUptime = 0L - private var downtime: Long = 0 - private var previousDowntime = 0L - - fun record(monitor: ComputeMonitor, now: Long) { - monitor.record( - HostData( - now, - host, - totalWork - previousTotalWork, - grantedWork - previousGrantedWork, - overcommittedWork - previousOvercommittedWork, - interferedWork - previousInterferedWork, - cpuUsage, - cpuDemand, - instanceCount, - powerDraw, - uptime - previousUptime, - downtime - previousDowntime, - ) - ) - - previousTotalWork = totalWork - previousGrantedWork = grantedWork - previousOvercommittedWork = overcommittedWork - previousInterferedWork = interferedWork - previousUptime = uptime - previousDowntime = downtime - reset() - } - - /** - * Accept the [MetricData] for this host. - */ - fun accept(data: MetricData) { - when (data.name) { - "cpu.work.total" -> totalWork = data.doubleSumData.points.first().value - "cpu.work.granted" -> grantedWork = data.doubleSumData.points.first().value - "cpu.work.overcommit" -> overcommittedWork = data.doubleSumData.points.first().value - "cpu.work.interference" -> interferedWork = data.doubleSumData.points.first().value - "power.usage" -> powerDraw = acceptHistogram(data) - "cpu.usage" -> cpuUsage = acceptHistogram(data) - "cpu.demand" -> cpuDemand = acceptHistogram(data) - "guests.active" -> instanceCount = data.longSumData.points.first().value.toInt() - "host.time.up" -> uptime = data.longSumData.points.first().value - "host.time.down" -> downtime = data.longSumData.points.first().value - } - } - - private fun acceptHistogram(data: MetricData): Double { - return when (data.type) { - MetricDataType.HISTOGRAM -> { - val point = data.doubleHistogramData.points.first() - point.sum / point.count - } - MetricDataType.SUMMARY -> { - val point = data.doubleSummaryData.points.first() - point.sum / point.count - } - else -> error("Invalid metric type") - } - } - - private fun reset() { - totalWork = 0.0 - grantedWork = 0.0 - overcommittedWork = 0.0 - interferedWork = 0.0 - cpuUsage = 0.0 - cpuDemand = 0.0 - instanceCount = 0 - powerDraw = 0.0 - uptime = 0L - downtime = 0L - } - } - - /** - * An aggregator for server metrics before they are reported. - */ - private class ServerAggregator(attributes: Attributes) { - /** - * The static information about this server. - */ - val server = ServerInfo( - attributes[ResourceAttributes.HOST_ID]!!, - attributes[ResourceAttributes.HOST_NAME]!!, - attributes[ResourceAttributes.HOST_TYPE]!!, - attributes[ResourceAttributes.HOST_ARCH]!!, - attributes[ResourceAttributes.HOST_IMAGE_ID]!!, - attributes[ResourceAttributes.HOST_IMAGE_NAME]!!, - attributes[AttributeKey.longKey("host.num_cpus")]!!.toInt(), - attributes[AttributeKey.longKey("host.mem_capacity")]!!, - ) - - /** - * The [HostInfo] of the host on which the server is hosted. - */ - var host: HostInfo? = null - - @JvmField var uptime: Long = 0 - private var previousUptime = 0L - @JvmField var downtime: Long = 0 - private var previousDowntime = 0L - @JvmField var schedulingLatency = 0.0 - - fun record(monitor: ComputeMonitor, now: Long) { - monitor.record( - ServerData( - now, - server, - null, - uptime - previousUptime, - downtime - previousDowntime, - ) - ) - - previousUptime = uptime - previousDowntime = downtime - reset() - } - - private fun reset() { - host = null - uptime = 0L - downtime = 0L - } - } } diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt index f3690ee8..25d346fb 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -22,64 +22,31 @@ package org.opendc.telemetry.compute -import io.opentelemetry.api.common.AttributeKey import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricProducer import org.opendc.telemetry.compute.table.ServiceData +import java.time.Instant /** * Collect the metrics of the compute service. */ -public fun collectServiceMetrics(timestamp: Long, metricProducer: MetricProducer): ServiceData { +public fun collectServiceMetrics(timestamp: Instant, metricProducer: MetricProducer): ServiceData { return extractServiceMetrics(timestamp, metricProducer.collectAllMetrics()) } /** * Extract a [ServiceData] object from the specified list of metric data. */ -public fun extractServiceMetrics(timestamp: Long, metrics: Collection): ServiceData { - val resultKey = AttributeKey.stringKey("result") - val stateKey = AttributeKey.stringKey("state") - - var hostsUp = 0 - var hostsDown = 0 - - var serversPending = 0 - var serversActive = 0 - - var attemptsSuccess = 0 - var attemptsFailure = 0 - var attemptsError = 0 - - for (metric in metrics) { - when (metric.name) { - "scheduler.hosts" -> { - for (point in metric.longSumData.points) { - when (point.attributes[stateKey]) { - "up" -> hostsUp = point.value.toInt() - "down" -> hostsDown = point.value.toInt() - } - } - } - "scheduler.servers" -> { - for (point in metric.longSumData.points) { - when (point.attributes[stateKey]) { - "pending" -> serversPending = point.value.toInt() - "active" -> serversActive = point.value.toInt() - } - } - } - "scheduler.attempts" -> { - for (point in metric.longSumData.points) { - when (point.attributes[resultKey]) { - "success" -> attemptsSuccess = point.value.toInt() - "failure" -> attemptsFailure = point.value.toInt() - "error" -> attemptsError = point.value.toInt() - } - } - } +public fun extractServiceMetrics(timestamp: Instant, metrics: Collection): ServiceData { + lateinit var serviceData: ServiceData + val agg = ComputeMetricAggregator() + val monitor = object : ComputeMonitor { + override fun record(data: ServiceData) { + serviceData = data } } - return ServiceData(timestamp, hostsUp, hostsDown, serversPending, serversActive, attemptsSuccess, attemptsFailure, attemptsError) + agg.process(metrics) + agg.collect(timestamp, monitor) + return serviceData } diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt index e3ecda3d..8e787b97 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/HostData.kt @@ -22,20 +22,29 @@ package org.opendc.telemetry.compute.table +import java.time.Instant + /** * A trace entry for a particular host. */ public data class HostData( - public val timestamp: Long, - public val host: HostInfo, - public val totalWork: Double, - public val grantedWork: Double, - public val overcommittedWork: Double, - public val interferedWork: Double, - public val cpuUsage: Double, - public val cpuDemand: Double, - public val instanceCount: Int, - public val powerDraw: Double, - public val uptime: Long, - public val downtime: Long, + val timestamp: Instant, + val host: HostInfo, + val guestsTerminated: Int, + val guestsRunning: Int, + val guestsError: Int, + val guestsInvalid: Int, + val cpuLimit: Double, + val cpuUsage: Double, + val cpuDemand: Double, + val cpuUtilization: Double, + val cpuActiveTime: Long, + val cpuIdleTime: Long, + val cpuStealTime: Long, + val cpuLostTime: Long, + val powerUsage: Double, + val powerTotal: Double, + val uptime: Long, + val downtime: Long, + val bootTime: Instant? ) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt index 7fde86d9..c48bff3a 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServerData.kt @@ -22,13 +22,22 @@ package org.opendc.telemetry.compute.table +import java.time.Instant + /** * A trace entry for a particular server. */ public data class ServerData( - public val timestamp: Long, - public val server: ServerInfo, - public val host: HostInfo?, - public val uptime: Long, - public val downtime: Long, + val timestamp: Instant, + val server: ServerInfo, + val host: HostInfo?, + val uptime: Long, + val downtime: Long, + val bootTime: Instant?, + val schedulingLatency: Long, + val cpuLimit: Double, + val cpuActiveTime: Long, + val cpuIdleTime: Long, + val cpuStealTime: Long, + val cpuLostTime: Long, ) diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt index da2ebdf4..6db1399d 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/table/ServiceData.kt @@ -22,16 +22,18 @@ package org.opendc.telemetry.compute.table +import java.time.Instant + /** * A trace entry for the compute service. */ public data class ServiceData( - public val timestamp: Long, - public val hostsUp: Int, - public val hostsDown: Int, - public val serversPending: Int, - public val serversActive: Int, - public val attemptsSuccess: Int, - public val attemptsFailure: Int, - public val attemptsError: Int + val timestamp: Instant, + val hostsUp: Int, + val hostsDown: Int, + val serversPending: Int, + val serversActive: Int, + val attemptsSuccess: Int, + val attemptsFailure: Int, + val attemptsError: Int ) diff --git a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt index 8f19ab81..07f0ff7f 100644 --- a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt +++ b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt @@ -44,7 +44,7 @@ public class CoroutineMetricReader( scope: CoroutineScope, private val producers: List, private val exporter: MetricExporter, - private val exportInterval: Duration = Duration.ofMinutes(1) + private val exportInterval: Duration = Duration.ofMinutes(5) ) : AutoCloseable { private val logger = KotlinLogging.logger {} -- cgit v1.2.3 From 5fa0cf915ecf643e94a0de972125e8f862308f80 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 22 Sep 2021 11:22:00 +0200 Subject: fix(telemetry): Ensure shutdown of exporter is called This change updates the CoroutineMetricReader to ensure that the exporter is shutdown when the metric reader fails or is shutdown. --- .../sdk/metrics/export/CoroutineMetricReader.kt | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt index 07f0ff7f..1de235e7 100644 --- a/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt +++ b/opendc-telemetry/opendc-telemetry-sdk/src/main/kotlin/org/opendc/telemetry/sdk/metrics/export/CoroutineMetricReader.kt @@ -22,7 +22,6 @@ package org.opendc.telemetry.sdk.metrics.export -import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricExporter import io.opentelemetry.sdk.metrics.export.MetricProducer import kotlinx.coroutines.* @@ -54,24 +53,25 @@ public class CoroutineMetricReader( private val job = scope.launch { val intervalMs = exportInterval.toMillis() - while (isActive) { - delay(intervalMs) + try { + while (isActive) { + delay(intervalMs) - val metrics = mutableListOf() - for (producer in producers) { - metrics.addAll(producer.collectAllMetrics()) - } + val metrics = producers.flatMap(MetricProducer::collectAllMetrics) - try { - val result = exporter.export(metrics) - result.whenComplete { - if (!result.isSuccess) { - logger.trace { "Exporter failed" } + try { + val result = exporter.export(metrics) + result.whenComplete { + if (!result.isSuccess) { + logger.warn { "Exporter failed" } + } } + } catch (cause: Throwable) { + logger.warn(cause) { "Exporter threw an Exception" } } - } catch (cause: Throwable) { - logger.warn(cause) { "Exporter threw an Exception" } } + } finally { + exporter.shutdown() } } -- cgit v1.2.3 From 30cd010f1f98262aa7f264bb3c3eb6028b8495c5 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Wed, 22 Sep 2021 12:43:01 +0200 Subject: refactor(telemetry): Do not require clock for ComputeMetricExporter This change drops the requirement for a clock parameter when constructing a ComputeMetricExporter, since it will now derive the timestamp from the recorded metrics. --- .../telemetry/compute/ComputeMetricAggregator.kt | 52 +++++++++++++++++++--- .../telemetry/compute/ComputeMetricExporter.kt | 6 +-- .../kotlin/org/opendc/telemetry/compute/Helpers.kt | 15 ++----- 3 files changed, 51 insertions(+), 22 deletions(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt index e9449634..679d5944 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt @@ -55,6 +55,9 @@ public class ComputeMetricAggregator { // ComputeService "scheduler.hosts" -> { for (point in metric.longSumData.points) { + // Record the timestamp for the service + service.recordTimestamp(point) + when (point.attributes[STATE_KEY]) { "up" -> service.hostsUp = point.value.toInt() "down" -> service.hostsDown = point.value.toInt() @@ -163,12 +166,16 @@ public class ComputeMetricAggregator { val server = getServer(servers, point) if (server != null) { + server.recordTimestamp(point) + when (point.attributes[STATE_KEY]) { "up" -> server.uptime = point.value "down" -> server.downtime = point.value } server.host = agg.host } else { + agg.recordTimestamp(point) + when (point.attributes[STATE_KEY]) { "up" -> agg.uptime = point.value "down" -> agg.downtime = point.value @@ -197,15 +204,15 @@ public class ComputeMetricAggregator { /** * Collect the data via the [monitor]. */ - public fun collect(now: Instant, monitor: ComputeMonitor) { - monitor.record(_service.collect(now)) + public fun collect(monitor: ComputeMonitor) { + monitor.record(_service.collect()) for (host in _hosts.values) { - monitor.record(host.collect(now)) + monitor.record(host.collect()) } for (server in _servers.values) { - monitor.record(server.collect(now)) + monitor.record(server.collect()) } } @@ -237,6 +244,8 @@ public class ComputeMetricAggregator { * An aggregator for service metrics before they are reported. */ internal class ServiceAggregator { + private var timestamp = Long.MIN_VALUE + @JvmField var hostsUp = 0 @JvmField var hostsDown = 0 @@ -250,7 +259,10 @@ public class ComputeMetricAggregator { /** * Finish the aggregation for this cycle. */ - fun collect(now: Instant): ServiceData = toServiceData(now) + fun collect(): ServiceData { + val now = Instant.ofEpochMilli(timestamp) + return toServiceData(now) + } /** * Convert the aggregator state to an immutable [ServiceData]. @@ -258,6 +270,13 @@ public class ComputeMetricAggregator { private fun toServiceData(now: Instant): ServiceData { return ServiceData(now, hostsUp, hostsDown, serversPending, serversActive, attemptsSuccess, attemptsFailure, attemptsError) } + + /** + * Record the timestamp of a [point] for this aggregator. + */ + fun recordTimestamp(point: PointData) { + timestamp = point.epochNanos / 1_000_000L // ns to ms + } } /** @@ -275,6 +294,8 @@ public class ComputeMetricAggregator { resource.attributes[HOST_MEM_CAPACITY] ?: 0, ) + private var timestamp = Long.MIN_VALUE + @JvmField var guestsTerminated = 0 @JvmField var guestsRunning = 0 @JvmField var guestsError = 0 @@ -307,7 +328,8 @@ public class ComputeMetricAggregator { /** * Finish the aggregation for this cycle. */ - fun collect(now: Instant): HostData { + fun collect(): HostData { + val now = Instant.ofEpochMilli(timestamp) val data = toHostData(now) // Reset intermediate state for next aggregation @@ -360,6 +382,13 @@ public class ComputeMetricAggregator { if (bootTime != Long.MIN_VALUE) Instant.ofEpochMilli(bootTime) else null ) } + + /** + * Record the timestamp of a [point] for this aggregator. + */ + fun recordTimestamp(point: PointData) { + timestamp = point.epochNanos / 1_000_000L // ns to ms + } } /** @@ -385,6 +414,7 @@ public class ComputeMetricAggregator { */ var host: HostInfo? = null + private var timestamp = Long.MIN_VALUE @JvmField var uptime: Long = 0 private var previousUptime = 0L @JvmField var downtime: Long = 0 @@ -404,7 +434,8 @@ public class ComputeMetricAggregator { /** * Finish the aggregation for this cycle. */ - fun collect(now: Instant): ServerData { + fun collect(): ServerData { + val now = Instant.ofEpochMilli(timestamp) val data = toServerData(now) previousUptime = uptime @@ -439,6 +470,13 @@ public class ComputeMetricAggregator { cpuLostTime - previousCpuLostTime ) } + + /** + * Record the timestamp of a [point] for this aggregator. + */ + fun recordTimestamp(point: PointData) { + timestamp = point.epochNanos / 1_000_000L // ns to ms + } } private companion object { diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt index ea96f721..580cc6fb 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -25,12 +25,11 @@ package org.opendc.telemetry.compute import io.opentelemetry.sdk.common.CompletableResultCode import io.opentelemetry.sdk.metrics.data.* import io.opentelemetry.sdk.metrics.export.MetricExporter -import java.time.Clock /** * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. */ -public class ComputeMetricExporter(private val clock: Clock, private val monitor: ComputeMonitor) : MetricExporter { +public abstract class ComputeMetricExporter : MetricExporter, ComputeMonitor { /** * A [ComputeMetricAggregator] that actually performs the aggregation. */ @@ -39,7 +38,8 @@ public class ComputeMetricExporter(private val clock: Clock, private val monitor override fun export(metrics: Collection): CompletableResultCode { return try { agg.process(metrics) - agg.collect(clock.instant(), monitor) + agg.collect(this) + CompletableResultCode.ofSuccess() } catch (e: Throwable) { CompletableResultCode.ofFailure() diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt index 25d346fb..ce89061b 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/Helpers.kt @@ -22,22 +22,13 @@ package org.opendc.telemetry.compute -import io.opentelemetry.sdk.metrics.data.MetricData import io.opentelemetry.sdk.metrics.export.MetricProducer import org.opendc.telemetry.compute.table.ServiceData -import java.time.Instant /** * Collect the metrics of the compute service. */ -public fun collectServiceMetrics(timestamp: Instant, metricProducer: MetricProducer): ServiceData { - return extractServiceMetrics(timestamp, metricProducer.collectAllMetrics()) -} - -/** - * Extract a [ServiceData] object from the specified list of metric data. - */ -public fun extractServiceMetrics(timestamp: Instant, metrics: Collection): ServiceData { +public fun collectServiceMetrics(metricProducer: MetricProducer): ServiceData { lateinit var serviceData: ServiceData val agg = ComputeMetricAggregator() val monitor = object : ComputeMonitor { @@ -46,7 +37,7 @@ public fun extractServiceMetrics(timestamp: Instant, metrics: Collection Date: Thu, 23 Sep 2021 14:46:57 +0200 Subject: fix(telemetry): Report cause of compute exporter failure --- .../kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt | 2 +- .../kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'opendc-telemetry') diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt index 679d5944..738ec38b 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricAggregator.kt @@ -412,7 +412,7 @@ public class ComputeMetricAggregator { /** * The [HostInfo] of the host on which the server is hosted. */ - var host: HostInfo? = null + @JvmField var host: HostInfo? = null private var timestamp = Long.MIN_VALUE @JvmField var uptime: Long = 0 diff --git a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt index 580cc6fb..3ab6c7b2 100644 --- a/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt +++ b/opendc-telemetry/opendc-telemetry-compute/src/main/kotlin/org/opendc/telemetry/compute/ComputeMetricExporter.kt @@ -25,11 +25,17 @@ package org.opendc.telemetry.compute import io.opentelemetry.sdk.common.CompletableResultCode import io.opentelemetry.sdk.metrics.data.* import io.opentelemetry.sdk.metrics.export.MetricExporter +import mu.KotlinLogging /** * A [MetricExporter] that redirects data to a [ComputeMonitor] implementation. */ public abstract class ComputeMetricExporter : MetricExporter, ComputeMonitor { + /** + * The logging instance for this exporter. + */ + private val logger = KotlinLogging.logger {} + /** * A [ComputeMetricAggregator] that actually performs the aggregation. */ @@ -42,6 +48,7 @@ public abstract class ComputeMetricExporter : MetricExporter, ComputeMonitor { CompletableResultCode.ofSuccess() } catch (e: Throwable) { + logger.warn(e) { "Failed to export results" } CompletableResultCode.ofFailure() } } -- cgit v1.2.3