diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-07 17:30:46 +0200 |
|---|---|---|
| committer | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2021-09-17 16:52:29 +0200 |
| commit | 0d8bccc68705d036fbf60f312d9c34ca4392c6b2 (patch) | |
| tree | faa50b8bf29976531e2ba757269ceb746195737d /opendc-experiments | |
| parent | 8d899e29dbd757f6df320212d6e0d77ce8216ab9 (diff) | |
refactor(telemetry): Standardize SimHost metrics
This change standardizes the metrics emitted by SimHost instances and
their guests based on the OpenTelemetry semantic conventions. We now
also report CPU time as opposed to CPU work as this metric is more
commonly used.
Diffstat (limited to 'opendc-experiments')
6 files changed, 101 insertions, 65 deletions
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt index 3ec424f1..6261ebbf 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/Portfolio.kt @@ -149,9 +149,10 @@ abstract class Portfolio(name: String) : Experiment(name) { } finally { simulator.close() metricReader.close() + monitor.close() } - val monitorResults = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val monitorResults = collectServiceMetrics(clock.instant(), simulator.producers[0]) logger.debug { "Scheduler " + "Success=${monitorResults.attemptsSuccess} " + diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetDataWriter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetDataWriter.kt index 5684bde9..e3d15c3b 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetDataWriter.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetDataWriter.kt @@ -27,7 +27,6 @@ import org.apache.avro.Schema import org.apache.avro.generic.GenericData import org.apache.avro.generic.GenericRecordBuilder import org.apache.parquet.avro.AvroParquetWriter -import org.apache.parquet.example.Paper.schema import org.apache.parquet.hadoop.ParquetFileWriter import org.apache.parquet.hadoop.ParquetWriter import org.apache.parquet.hadoop.metadata.CompressionCodecName diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetHostDataWriter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetHostDataWriter.kt index fa00fc35..36207045 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetHostDataWriter.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetHostDataWriter.kt @@ -44,20 +44,31 @@ public class ParquetHostDataWriter(path: File, bufferSize: Int) : } override fun convert(builder: GenericRecordBuilder, data: HostData) { - builder["timestamp"] = data.timestamp + builder["timestamp"] = data.timestamp.toEpochMilli() + builder["host_id"] = data.host.id - builder["powered_on"] = true + builder["num_cpus"] = data.host.cpuCount + builder["mem_capacity"] = data.host.memCapacity + builder["uptime"] = data.uptime builder["downtime"] = data.downtime - builder["total_work"] = data.totalWork - builder["granted_work"] = data.grantedWork - builder["overcommitted_work"] = data.overcommittedWork - builder["interfered_work"] = data.interferedWork - builder["cpu_usage"] = data.cpuUsage - builder["cpu_demand"] = data.cpuDemand - builder["power_draw"] = data.powerDraw - builder["num_instances"] = data.instanceCount - builder["num_cpus"] = data.host.cpuCount + val bootTime = data.bootTime + if (bootTime != null) { + builder["boot_time"] = bootTime.toEpochMilli() + } + + builder["cpu_limit"] = data.cpuLimit + builder["cpu_time_active"] = data.cpuActiveTime + builder["cpu_time_idle"] = data.cpuIdleTime + builder["cpu_time_steal"] = data.cpuStealTime + builder["cpu_time_lost"] = data.cpuLostTime + + builder["power_total"] = data.powerTotal + + builder["guests_terminated"] = data.guestsTerminated + builder["guests_running"] = data.guestsRunning + builder["guests_error"] = data.guestsError + builder["guests_invalid"] = data.guestsInvalid } override fun toString(): String = "host-writer" @@ -69,18 +80,21 @@ public class ParquetHostDataWriter(path: File, bufferSize: Int) : .fields() .requiredLong("timestamp") .requiredString("host_id") - .requiredBoolean("powered_on") + .requiredInt("num_cpus") + .requiredLong("mem_capacity") .requiredLong("uptime") .requiredLong("downtime") - .requiredDouble("total_work") - .requiredDouble("granted_work") - .requiredDouble("overcommitted_work") - .requiredDouble("interfered_work") - .requiredDouble("cpu_usage") - .requiredDouble("cpu_demand") - .requiredDouble("power_draw") - .requiredInt("num_instances") - .requiredInt("num_cpus") + .optionalLong("boot_time") + .requiredDouble("cpu_limit") + .requiredLong("cpu_time_active") + .requiredLong("cpu_time_idle") + .requiredLong("cpu_time_steal") + .requiredLong("cpu_time_lost") + .requiredDouble("power_total") + .requiredInt("guests_terminated") + .requiredInt("guests_running") + .requiredInt("guests_error") + .requiredInt("guests_invalid") .endRecord() } } diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServerDataWriter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServerDataWriter.kt index bb2db4b7..c5a5e7c0 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServerDataWriter.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServerDataWriter.kt @@ -40,18 +40,31 @@ public class ParquetServerDataWriter(path: File, bufferSize: Int) : override fun buildWriter(builder: AvroParquetWriter.Builder<GenericData.Record>): ParquetWriter<GenericData.Record> { return builder .withDictionaryEncoding("server_id", true) - .withDictionaryEncoding("state", true) + .withDictionaryEncoding("host_id", true) .build() } override fun convert(builder: GenericRecordBuilder, data: ServerData) { - builder["timestamp"] = data.timestamp - builder["server_id"] = data.server - // builder["state"] = data.server.state + builder["timestamp"] = data.timestamp.toEpochMilli() + + builder["server_id"] = data.server.id + builder["host_id"] = data.host?.id + builder["num_vcpus"] = data.server.cpuCount + builder["mem_capacity"] = data.server.memCapacity + builder["uptime"] = data.uptime builder["downtime"] = data.downtime - // builder["num_vcpus"] = data.server.flavor.cpuCount - // builder["mem_capacity"] = data.server.flavor.memorySize + val bootTime = data.bootTime + if (bootTime != null) { + builder["boot_time"] = bootTime.toEpochMilli() + } + builder["scheduling_latency"] = data.schedulingLatency + + builder["cpu_limit"] = data.cpuLimit + builder["cpu_time_active"] = data.cpuActiveTime + builder["cpu_time_idle"] = data.cpuIdleTime + builder["cpu_time_steal"] = data.cpuStealTime + builder["cpu_time_lost"] = data.cpuLostTime } override fun toString(): String = "server-writer" @@ -63,11 +76,18 @@ public class ParquetServerDataWriter(path: File, bufferSize: Int) : .fields() .requiredLong("timestamp") .requiredString("server_id") - .requiredString("state") - .requiredLong("uptime") - .requiredLong("downtime") + .optionalString("host_id") .requiredInt("num_vcpus") .requiredLong("mem_capacity") + .requiredLong("uptime") + .requiredLong("downtime") + .optionalLong("boot_time") + .requiredLong("scheduling_latency") + .requiredDouble("cpu_limit") + .requiredLong("cpu_time_active") + .requiredLong("cpu_time_idle") + .requiredLong("cpu_time_steal") + .requiredLong("cpu_time_lost") .endRecord() } } diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServiceDataWriter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServiceDataWriter.kt index 29b48878..d9ca55cb 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServiceDataWriter.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/export/parquet/ParquetServiceDataWriter.kt @@ -35,7 +35,7 @@ public class ParquetServiceDataWriter(path: File, bufferSize: Int) : ParquetDataWriter<ServiceData>(path, SCHEMA, bufferSize) { override fun convert(builder: GenericRecordBuilder, data: ServiceData) { - builder["timestamp"] = data.timestamp + builder["timestamp"] = data.timestamp.toEpochMilli() builder["hosts_up"] = data.hostsUp builder["hosts_down"] = data.hostsDown builder["servers_pending"] = data.serversPending diff --git a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt index 81405acf..727530e3 100644 --- a/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt +++ b/opendc-experiments/opendc-experiments-capelin/src/test/kotlin/org/opendc/experiments/capelin/CapelinIntegrationTest.kt @@ -50,7 +50,6 @@ import org.opendc.telemetry.sdk.metrics.export.CoroutineMetricReader import java.io.File import java.time.Duration import java.util.* -import kotlin.math.roundToLong /** * An integration test suite for the Capelin experiments. @@ -102,7 +101,7 @@ class CapelinIntegrationTest { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) println( "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + @@ -118,11 +117,11 @@ class CapelinIntegrationTest { { assertEquals(0, serviceMetrics.serversActive, "All VMs should finish after a run") }, { assertEquals(0, serviceMetrics.attemptsFailure, "No VM should be unscheduled") }, { assertEquals(0, serviceMetrics.serversPending, "No VM should not be in the queue") }, - { assertEquals(220346412191, monitor.totalWork) { "Incorrect requested burst" } }, - { assertEquals(206667852689, monitor.totalGrantedWork) { "Incorrect granted burst" } }, - { assertEquals(1151612221, monitor.totalOvercommittedWork) { "Incorrect overcommitted burst" } }, - { assertEquals(0, monitor.totalInterferedWork) { "Incorrect interfered burst" } }, - { assertEquals(9.088769763540529E7, monitor.totalPowerDraw, 0.01) { "Incorrect power draw" } }, + { assertEquals(223856043, monitor.idleTime) { "Incorrect idle time" } }, + { assertEquals(66481557, monitor.activeTime) { "Incorrect active time" } }, + { assertEquals(360441, monitor.stealTime) { "Incorrect steal time" } }, + { assertEquals(0, monitor.lostTime) { "Incorrect lost time" } }, + { assertEquals(5.418336360461193E9, monitor.energyUsage, 0.01) { "Incorrect power draw" } }, ) } @@ -151,7 +150,7 @@ class CapelinIntegrationTest { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) println( "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + @@ -163,10 +162,10 @@ class CapelinIntegrationTest { // Note that these values have been verified beforehand assertAll( - { assertEquals(39183965664, monitor.totalWork) { "Total work incorrect" } }, - { assertEquals(35649907631, monitor.totalGrantedWork) { "Total granted work incorrect" } }, - { assertEquals(1043642275, monitor.totalOvercommittedWork) { "Total overcommitted work incorrect" } }, - { assertEquals(0, monitor.totalInterferedWork) { "Total interfered work incorrect" } } + { assertEquals(9597804, monitor.idleTime) { "Idle time incorrect" } }, + { assertEquals(11140596, monitor.activeTime) { "Active time incorrect" } }, + { assertEquals(326138, monitor.stealTime) { "Steal time incorrect" } }, + { assertEquals(0, monitor.lostTime) { "Lost time incorrect" } } ) } @@ -202,7 +201,7 @@ class CapelinIntegrationTest { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) println( "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + @@ -214,10 +213,10 @@ class CapelinIntegrationTest { // Note that these values have been verified beforehand assertAll( - { assertEquals(39183965664, monitor.totalWork) { "Total work incorrect" } }, - { assertEquals(35649907631, monitor.totalGrantedWork) { "Total granted work incorrect" } }, - { assertEquals(1043642275, monitor.totalOvercommittedWork) { "Total overcommitted work incorrect" } }, - { assertEquals(2960974524, monitor.totalInterferedWork) { "Total interfered work incorrect" } } + { assertEquals(9597804, monitor.idleTime) { "Idle time incorrect" } }, + { assertEquals(11140596, monitor.activeTime) { "Active time incorrect" } }, + { assertEquals(326138, monitor.stealTime) { "Steal time incorrect" } }, + { assertEquals(925305, monitor.lostTime) { "Lost time incorrect" } } ) } @@ -247,7 +246,7 @@ class CapelinIntegrationTest { metricReader.close() } - val serviceMetrics = collectServiceMetrics(clock.millis(), simulator.producers[0]) + val serviceMetrics = collectServiceMetrics(clock.instant(), simulator.producers[0]) println( "Scheduler " + "Success=${serviceMetrics.attemptsSuccess} " + @@ -259,10 +258,11 @@ class CapelinIntegrationTest { // Note that these values have been verified beforehand assertAll( - { assertEquals(38385856700, monitor.totalWork) { "Total requested work incorrect" } }, - { assertEquals(34886670127, monitor.totalGrantedWork) { "Total granted work incorrect" } }, - { assertEquals(979997628, monitor.totalOvercommittedWork) { "Total overcommitted work incorrect" } }, - { assertEquals(0, monitor.totalInterferedWork) { "Total interfered work incorrect" } } + { assertEquals(9836315, monitor.idleTime) { "Idle time incorrect" } }, + { assertEquals(10902085, monitor.activeTime) { "Active time incorrect" } }, + { assertEquals(306249, monitor.stealTime) { "Steal time incorrect" } }, + { assertEquals(0, monitor.lostTime) { "Lost time incorrect" } }, + { assertEquals(2540877457, monitor.uptime) { "Uptime incorrect" } } ) } @@ -286,18 +286,20 @@ class CapelinIntegrationTest { } class TestExperimentReporter : ComputeMonitor { - var totalWork = 0L - var totalGrantedWork = 0L - var totalOvercommittedWork = 0L - var totalInterferedWork = 0L - var totalPowerDraw = 0.0 + var idleTime = 0L + var activeTime = 0L + var stealTime = 0L + var lostTime = 0L + var energyUsage = 0.0 + var uptime = 0L override fun record(data: HostData) { - this.totalWork += data.totalWork.roundToLong() - totalGrantedWork += data.grantedWork.roundToLong() - totalOvercommittedWork += data.overcommittedWork.roundToLong() - totalInterferedWork += data.interferedWork.roundToLong() - totalPowerDraw += data.powerDraw + idleTime += data.cpuIdleTime + activeTime += data.cpuActiveTime + stealTime += data.cpuStealTime + lostTime += data.cpuLostTime + energyUsage += data.powerTotal + uptime += data.uptime } } } |
