From 5b8dfc78496452bd23fab59e3ead84a8941da779 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Thu, 6 Oct 2022 22:42:31 +0200 Subject: feat(web/server): Add support for accounting simulation time This change updates the Quarkus-based web server to add support for tracking and limiting the simulation minutes used by the user in order to prevent misuse of shared resources. --- .../kotlin/org/opendc/web/runner/JobManager.kt | 10 +++++--- .../kotlin/org/opendc/web/runner/OpenDCRunner.kt | 27 ++++++++++++++++++---- .../opendc/web/runner/internal/JobManagerImpl.kt | 15 ++++++------ 3 files changed, 37 insertions(+), 15 deletions(-) (limited to 'opendc-web/opendc-web-runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/JobManager.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/JobManager.kt index 50aa03d8..d6c06889 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/JobManager.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/JobManager.kt @@ -42,18 +42,22 @@ public interface JobManager { /** * Update the heartbeat of the specified job. + * + * @param id The identifier of the job. + * @param runtime The total runtime of the job. + * @return `true` if the job can continue, `false` if the job has been cancelled. */ - public fun heartbeat(id: Long) + public fun heartbeat(id: Long, runtime: Int): Boolean /** * Mark the job as failed. */ - public fun fail(id: Long) + public fun fail(id: Long, runtime: Int) /** * Persist the specified results for the specified job. */ - public fun finish(id: Long, results: Map) + public fun finish(id: Long, runtime: Int, results: Map) public companion object { /** diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt index 226bad47..d4996198 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt @@ -48,6 +48,8 @@ import org.opendc.web.proto.runner.Topology import org.opendc.web.runner.internal.WebComputeMonitor import java.io.File import java.time.Duration +import java.time.Instant +import java.time.temporal.ChronoUnit import java.util.Random import java.util.UUID import java.util.concurrent.Executors @@ -144,9 +146,15 @@ public class OpenDCRunner( override fun compute() { val id = job.id val scenario = job.scenario + val startTime = Instant.now() + val currentThread = Thread.currentThread() val heartbeat = scheduler.scheduleWithFixedDelay( - { manager.heartbeat(id) }, + { + if (!manager.heartbeat(id, startTime.secondsSince())) { + currentThread.interrupt() + } + }, 0, heartbeatInterval.toMillis(), TimeUnit.MILLISECONDS @@ -163,12 +171,14 @@ public class OpenDCRunner( } val results = invokeAll(jobs).map { it.rawResult } - logger.info { "Finished simulation for job $id" } - heartbeat.cancel(true) + val duration = startTime.secondsSince() + logger.info { "Finished simulation for job $id (in $duration seconds)" } + manager.finish( id, + duration, mapOf( "total_requested_burst" to results.map { it.totalActiveTime + it.totalIdleTime }, "total_granted_burst" to results.map { it.totalActiveTime }, @@ -190,19 +200,26 @@ public class OpenDCRunner( } catch (e: Exception) { // Check whether the job failed due to exceeding its time budget if (Thread.interrupted()) { - logger.info { "Simulation job $id exceeded time limit" } + logger.info { "Simulation job $id exceeded time limit (${startTime.secondsSince()} seconds)" } } else { logger.info(e) { "Simulation job $id failed" } } try { heartbeat.cancel(true) - manager.fail(id) + manager.fail(id, startTime.secondsSince()) } catch (e: Throwable) { logger.error(e) { "Failed to update job" } } } } + + /** + * Calculate the seconds since the specified instant. + */ + private fun Instant.secondsSince(): Int { + return ChronoUnit.SECONDS.between(this, Instant.now()).toInt() + } } /** diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/JobManagerImpl.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/JobManagerImpl.kt index 39a6851c..5b1b7132 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/JobManagerImpl.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/JobManagerImpl.kt @@ -37,22 +37,23 @@ internal class JobManagerImpl(private val client: OpenDCRunnerClient) : JobManag override fun claim(id: Long): Boolean { return try { - client.jobs.update(id, Job.Update(JobState.CLAIMED)) + client.jobs.update(id, Job.Update(JobState.CLAIMED, 0)) true } catch (e: IllegalStateException) { false } } - override fun heartbeat(id: Long) { - client.jobs.update(id, Job.Update(JobState.RUNNING)) + override fun heartbeat(id: Long, runtime: Int): Boolean { + val res = client.jobs.update(id, Job.Update(JobState.RUNNING, runtime)) + return res?.state != JobState.FAILED } - override fun fail(id: Long) { - client.jobs.update(id, Job.Update(JobState.FAILED)) + override fun fail(id: Long, runtime: Int) { + client.jobs.update(id, Job.Update(JobState.FAILED, runtime)) } - override fun finish(id: Long, results: Map) { - client.jobs.update(id, Job.Update(JobState.FINISHED, results)) + override fun finish(id: Long, runtime: Int, results: Map) { + client.jobs.update(id, Job.Update(JobState.FINISHED, runtime)) } } -- cgit v1.2.3 From 3624e37c847f6b4d45ad1abd6437c6e7cdb28dcd Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Mon, 10 Oct 2022 11:15:40 +0200 Subject: fix(web/runner): Fix service metric reporting This change resolves an issue in the web runner where the finished VMs would always be reported as zero. --- .../web/runner/internal/WebComputeMonitor.kt | 33 ++++++++-------------- 1 file changed, 12 insertions(+), 21 deletions(-) (limited to 'opendc-web/opendc-web-runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/WebComputeMonitor.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/WebComputeMonitor.kt index 4db70d3d..d6722115 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/WebComputeMonitor.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/internal/WebComputeMonitor.kt @@ -24,8 +24,9 @@ package org.opendc.web.runner.internal import org.opendc.experiments.compute.telemetry.ComputeMonitor import org.opendc.experiments.compute.telemetry.table.HostTableReader +import org.opendc.experiments.compute.telemetry.table.ServiceData import org.opendc.experiments.compute.telemetry.table.ServiceTableReader -import kotlin.math.max +import org.opendc.experiments.compute.telemetry.table.toServiceData import kotlin.math.roundToLong /** @@ -76,30 +77,20 @@ internal class WebComputeMonitor : ComputeMonitor { val count: Long ) - private var serviceMetrics: AggregateServiceMetrics = AggregateServiceMetrics() + private lateinit var serviceData: ServiceData override fun record(reader: ServiceTableReader) { - serviceMetrics = AggregateServiceMetrics( - max(reader.attemptsSuccess, serviceMetrics.vmTotalCount), - max(reader.serversPending, serviceMetrics.vmWaitingCount), - max(reader.serversActive, serviceMetrics.vmActiveCount), - max(0, serviceMetrics.vmInactiveCount), - max(reader.attemptsFailure, serviceMetrics.vmFailedCount) - ) + serviceData = reader.toServiceData() } - private data class AggregateServiceMetrics( - val vmTotalCount: Int = 0, - val vmWaitingCount: Int = 0, - val vmActiveCount: Int = 0, - val vmInactiveCount: Int = 0, - val vmFailedCount: Int = 0 - ) - /** * Collect the results of the simulation. */ fun collectResults(): Results { + val hostAggregateMetrics = hostAggregateMetrics + val hostMetrics = hostMetrics + val serviceData = serviceData + return Results( hostAggregateMetrics.totalActiveTime, hostAggregateMetrics.totalIdleTime, @@ -112,10 +103,10 @@ internal class WebComputeMonitor : ComputeMonitor { hostAggregateMetrics.totalPowerDraw, hostAggregateMetrics.totalFailureSlices.roundToLong(), hostAggregateMetrics.totalFailureVmSlices.roundToLong(), - serviceMetrics.vmTotalCount, - serviceMetrics.vmWaitingCount, - serviceMetrics.vmInactiveCount, - serviceMetrics.vmFailedCount + serviceData.serversTotal, + serviceData.serversPending, + serviceData.serversTotal - serviceData.serversPending - serviceData.serversActive, + serviceData.attemptsError + serviceData.attemptsFailure ) } -- cgit v1.2.3 From 70630a40b54218ab8f8fd7a5e2dfb424d3dec378 Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Mon, 10 Oct 2022 11:53:25 +0200 Subject: fix(web/runner): Increase default job timeout This change fixes an issue with the OpenDC web runner where the default job timeout was set to 10 ms instead of 10 minutes. For longer simulations, this would cause the job to be terminated. --- .../src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'opendc-web/opendc-web-runner') diff --git a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt index d4996198..1bc4e938 100644 --- a/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt +++ b/opendc-web/opendc-web-runner/src/main/kotlin/org/opendc/web/runner/OpenDCRunner.kt @@ -74,7 +74,7 @@ public class OpenDCRunner( private val manager: JobManager, private val tracePath: File, parallelism: Int = Runtime.getRuntime().availableProcessors(), - private val jobTimeout: Duration = Duration.ofMillis(10), + private val jobTimeout: Duration = Duration.ofMinutes(10), private val pollInterval: Duration = Duration.ofSeconds(30), private val heartbeatInterval: Duration = Duration.ofMinutes(1) ) : Runnable { -- cgit v1.2.3