diff options
| author | Fabian Mastenbroek <mail.fabianm@gmail.com> | 2022-05-02 16:06:44 +0200 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-05-02 16:06:44 +0200 |
| commit | c78285f6346236053979aa98113ba9e6d7efb21e (patch) | |
| tree | 44221b3a39516a235a0b41adf525a79a60abb998 /opendc-trace/opendc-trace-api | |
| parent | 44ddd27a745f2dfe4b6ffef1b7657d156dd61489 (diff) | |
| parent | e4d3a8add5388182cf7a12b1099678a0b769b106 (diff) | |
merge: Add support for SQL via Apache Calcite (#78)
This pull request integrates initial support for SQL queries via Apache Calcite into the OpenDC codebase.
Our vision is that users of OpenDC should be able to use SQL queries to access and process most
of the experiment data generated by simulations.
This pull request moves towards this goal by adding the ability to query workload traces supported
by OpenDC using SQL. We also provide a CLI for querying the data in workload traces via `opendc-trace-tools`:
```bash
opendc-trace-tools query -i data/bitbrains-small -f opendc-vm "SELECT MAX(cpu_count) FROM resource_states"
```
## Implementation Notes :hammer_and_pick:
* Add Calcite (SQL) integration
* Add support for writing via SQL
* Support custom Parquet ReadSupport implementations
* Read records using low-level Parquet API
* Do not use Avro when exporting experiment data
* Do not use Avro when reading WTF trace
* Drop dependency on Avro
* Add support for projections
## External Dependencies :four_leaf_clover:
* Apache Calcite
## Breaking API Changes :warning:
* The existing code for reading Parquet traces using Apache Avro has been removed.
* `TraceFormat.newReader` now takes an additional nullable `projection` parameter; pass `null` to read all columns without projection.
Diffstat (limited to 'opendc-trace/opendc-trace-api')
9 files changed, 48 insertions, 44 deletions
diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/Table.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/Table.kt index b0181cbc..05d0234a 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/Table.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/Table.kt @@ -42,9 +42,11 @@ public interface Table { public val partitionKeys: List<TableColumn<*>> /** - * Open a [TableReader] for this table. + * Open a [TableReader] for a projection of this table. + * + * @param projection The list of columns to fetch from the table or `null` if no projection is performed. */ - public fun newReader(): TableReader + public fun newReader(projection: List<TableColumn<*>>? = null): TableReader /** * Open a [TableWriter] for this table. diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/TableColumn.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/TableColumn.kt index 776c40c0..b77a2982 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/TableColumn.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/TableColumn.kt @@ -33,7 +33,7 @@ public class TableColumn<out T>(public val name: String, type: Class<T>) { /** * The type of the column. */ - private val type: Class<*> = type + public val type: Class<*> = type /** * Determine whether the type of the column is a subtype of [column]. 
diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/InterferenceGroupColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/InterferenceGroupColumns.kt index 532f6d24..5e8859e4 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/InterferenceGroupColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/InterferenceGroupColumns.kt @@ -24,22 +24,21 @@ package org.opendc.trace.conv import org.opendc.trace.TableColumn -import org.opendc.trace.column /** * Members of the interference group. */ @JvmField -public val INTERFERENCE_GROUP_MEMBERS: TableColumn<Set<String>> = column("interference_group:members") +public val INTERFERENCE_GROUP_MEMBERS: TableColumn<Set<String>> = column("members") /** * Target load after which the interference occurs. */ @JvmField -public val INTERFERENCE_GROUP_TARGET: TableColumn<Double> = column("interference_group:target") +public val INTERFERENCE_GROUP_TARGET: TableColumn<Double> = column("target") /** * Performance score when the interference occurs. */ @JvmField -public val INTERFERENCE_GROUP_SCORE: TableColumn<Double> = column("interference_group:score") +public val INTERFERENCE_GROUP_SCORE: TableColumn<Double> = column("score") diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt index e9fc5d44..e602e534 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceColumns.kt @@ -24,47 +24,46 @@ package org.opendc.trace.conv import org.opendc.trace.TableColumn -import org.opendc.trace.column import java.time.Instant /** * Identifier of the resource. 
*/ @JvmField -public val RESOURCE_ID: TableColumn<String> = column("resource:id") +public val RESOURCE_ID: TableColumn<String> = column("id") /** * The cluster to which the resource belongs. */ @JvmField -public val RESOURCE_CLUSTER_ID: TableColumn<String> = column("resource:cluster_id") +public val RESOURCE_CLUSTER_ID: TableColumn<String> = column("cluster_id") /** * Start time for the resource. */ @JvmField -public val RESOURCE_START_TIME: TableColumn<Instant> = column("resource:start_time") +public val RESOURCE_START_TIME: TableColumn<Instant> = column("start_time") /** * End time for the resource. */ @JvmField -public val RESOURCE_STOP_TIME: TableColumn<Instant> = column("resource:stop_time") +public val RESOURCE_STOP_TIME: TableColumn<Instant> = column("stop_time") /** * Number of CPUs for the resource. */ @JvmField -public val RESOURCE_CPU_COUNT: TableColumn<Int> = column("resource:cpu_count") +public val RESOURCE_CPU_COUNT: TableColumn<Int> = column("cpu_count") /** * Total CPU capacity of the resource in MHz. */ @JvmField -public val RESOURCE_CPU_CAPACITY: TableColumn<Double> = column("resource:cpu_capacity") +public val RESOURCE_CPU_CAPACITY: TableColumn<Double> = column("cpu_capacity") /** * Memory capacity for the resource in KB. 
*/ @JvmField -public val RESOURCE_MEM_CAPACITY: TableColumn<Double> = column("resource:mem_capacity") +public val RESOURCE_MEM_CAPACITY: TableColumn<Double> = column("mem_capacity") diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt index d5bbafd7..3a44f817 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/ResourceStateColumns.kt @@ -24,7 +24,6 @@ package org.opendc.trace.conv import org.opendc.trace.TableColumn -import org.opendc.trace.column import java.time.Duration import java.time.Instant @@ -32,70 +31,70 @@ import java.time.Instant * The timestamp at which the state was recorded. */ @JvmField -public val RESOURCE_STATE_TIMESTAMP: TableColumn<Instant> = column("resource_state:timestamp") +public val RESOURCE_STATE_TIMESTAMP: TableColumn<Instant> = column("timestamp") /** * Duration for the state. */ @JvmField -public val RESOURCE_STATE_DURATION: TableColumn<Duration> = column("resource_state:duration") +public val RESOURCE_STATE_DURATION: TableColumn<Duration> = column("duration") /** * A flag to indicate that the resource is powered on. */ @JvmField -public val RESOURCE_STATE_POWERED_ON: TableColumn<Boolean> = column("resource_state:powered_on") +public val RESOURCE_STATE_POWERED_ON: TableColumn<Boolean> = column("powered_on") /** * Total CPU usage of the resource in MHz. */ @JvmField -public val RESOURCE_STATE_CPU_USAGE: TableColumn<Double> = column("resource_state:cpu_usage") +public val RESOURCE_STATE_CPU_USAGE: TableColumn<Double> = column("cpu_usage") /** * Total CPU usage of the resource in percentage. 
*/ @JvmField -public val RESOURCE_STATE_CPU_USAGE_PCT: TableColumn<Double> = column("resource_state:cpu_usage_pct") +public val RESOURCE_STATE_CPU_USAGE_PCT: TableColumn<Double> = column("cpu_usage_pct") /** * Total CPU demand of the resource in MHz. */ @JvmField -public val RESOURCE_STATE_CPU_DEMAND: TableColumn<Double> = column("resource_state:cpu_demand") +public val RESOURCE_STATE_CPU_DEMAND: TableColumn<Double> = column("cpu_demand") /** * CPU ready percentage. */ @JvmField -public val RESOURCE_STATE_CPU_READY_PCT: TableColumn<Double> = column("resource_state:cpu_ready_pct") +public val RESOURCE_STATE_CPU_READY_PCT: TableColumn<Double> = column("cpu_ready_pct") /** * Memory usage of the resource in KB. */ @JvmField -public val RESOURCE_STATE_MEM_USAGE: TableColumn<Double> = column("resource_state:mem_usage") +public val RESOURCE_STATE_MEM_USAGE: TableColumn<Double> = column("mem_usage") /** * Disk read throughput of the resource in KB/s. */ @JvmField -public val RESOURCE_STATE_DISK_READ: TableColumn<Double> = column("resource_state:disk_read") +public val RESOURCE_STATE_DISK_READ: TableColumn<Double> = column("disk_read") /** * Disk write throughput of the resource in KB/s. */ @JvmField -public val RESOURCE_STATE_DISK_WRITE: TableColumn<Double> = column("resource_state:disk_write") +public val RESOURCE_STATE_DISK_WRITE: TableColumn<Double> = column("disk_write") /** * Network receive throughput of the resource in KB/s. */ @JvmField -public val RESOURCE_STATE_NET_RX: TableColumn<Double> = column("resource_state:net_rx") +public val RESOURCE_STATE_NET_RX: TableColumn<Double> = column("net_rx") /** * Network transmit throughput of the resource in KB/s. 
*/ @JvmField -public val RESOURCE_STATE_NET_TX: TableColumn<Double> = column("resource_state:net_tx") +public val RESOURCE_STATE_NET_TX: TableColumn<Double> = column("net_tx") diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TableColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TableColumns.kt index 31a58360..a58505e9 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TableColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TableColumns.kt @@ -21,7 +21,9 @@ */ @file:JvmName("TableColumns") -package org.opendc.trace +package org.opendc.trace.conv + +import org.opendc.trace.TableColumn /** * Construct a [TableColumn] with the specified [name] and type [T]. diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TaskColumns.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TaskColumns.kt index 397c0794..e6daafb7 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TaskColumns.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/conv/TaskColumns.kt @@ -24,7 +24,6 @@ package org.opendc.trace.conv import org.opendc.trace.TableColumn -import org.opendc.trace.column import java.time.Duration import java.time.Instant @@ -32,70 +31,70 @@ import java.time.Instant * A column containing the task identifier. */ @JvmField -public val TASK_ID: TableColumn<String> = column("task:id") +public val TASK_ID: TableColumn<String> = column("id") /** * A column containing the identifier of the workflow. */ @JvmField -public val TASK_WORKFLOW_ID: TableColumn<String> = column("task:workflow_id") +public val TASK_WORKFLOW_ID: TableColumn<String> = column("workflow_id") /** * A column containing the submission time of the task. 
*/ @JvmField -public val TASK_SUBMIT_TIME: TableColumn<Instant> = column("task:submit_time") +public val TASK_SUBMIT_TIME: TableColumn<Instant> = column("submit_time") /** * A column containing the wait time of the task. */ @JvmField -public val TASK_WAIT_TIME: TableColumn<Instant> = column("task:wait_time") +public val TASK_WAIT_TIME: TableColumn<Instant> = column("wait_time") /** * A column containing the runtime time of the task. */ @JvmField -public val TASK_RUNTIME: TableColumn<Duration> = column("task:runtime") +public val TASK_RUNTIME: TableColumn<Duration> = column("runtime") /** * A column containing the parents of a task. */ @JvmField -public val TASK_PARENTS: TableColumn<Set<String>> = column("task:parents") +public val TASK_PARENTS: TableColumn<Set<String>> = column("parents") /** * A column containing the children of a task. */ @JvmField -public val TASK_CHILDREN: TableColumn<Set<String>> = column("task:children") +public val TASK_CHILDREN: TableColumn<Set<String>> = column("children") /** * A column containing the requested CPUs of a task. */ @JvmField -public val TASK_REQ_NCPUS: TableColumn<Int> = column("task:req_ncpus") +public val TASK_REQ_NCPUS: TableColumn<Int> = column("req_ncpus") /** * A column containing the allocated CPUs of a task. */ @JvmField -public val TASK_ALLOC_NCPUS: TableColumn<Int> = column("task:alloc_ncpus") +public val TASK_ALLOC_NCPUS: TableColumn<Int> = column("alloc_ncpus") /** * A column containing the status of a task. */ @JvmField -public val TASK_STATUS: TableColumn<Int> = column("task:status") +public val TASK_STATUS: TableColumn<Int> = column("status") /** * A column containing the group id of a task. */ @JvmField -public val TASK_GROUP_ID: TableColumn<Int> = column("task:group_id") +public val TASK_GROUP_ID: TableColumn<Int> = column("group_id") /** * A column containing the user id of a task. 
*/ @JvmField -public val TASK_USER_ID: TableColumn<Int> = column("task:user_id") +public val TASK_USER_ID: TableColumn<Int> = column("user_id") diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/internal/TableImpl.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/internal/TableImpl.kt index 24551edb..b848e19a 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/internal/TableImpl.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/internal/TableImpl.kt @@ -43,7 +43,9 @@ internal class TableImpl(val trace: TraceImpl, override val name: String) : Tabl override val partitionKeys: List<TableColumn<*>> get() = details.partitionKeys - override fun newReader(): TableReader = trace.format.newReader(trace.path, name) + override fun newReader(projection: List<TableColumn<*>>?): TableReader { + return trace.format.newReader(trace.path, name, projection) + } override fun newWriter(): TableWriter = trace.format.newWriter(trace.path, name) diff --git a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/spi/TraceFormat.kt b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/spi/TraceFormat.kt index f2e610db..47761e0f 100644 --- a/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/spi/TraceFormat.kt +++ b/opendc-trace/opendc-trace-api/src/main/kotlin/org/opendc/trace/spi/TraceFormat.kt @@ -22,6 +22,7 @@ package org.opendc.trace.spi +import org.opendc.trace.TableColumn import org.opendc.trace.TableReader import org.opendc.trace.TableWriter import java.nio.file.Path @@ -68,10 +69,11 @@ public interface TraceFormat { * * @param path The path to the trace to open. * @param table The name of the table to open a [TableReader] for. + * @param projection The list of [TableColumn]s to project or `null` if no projection is performed. * @throws IllegalArgumentException If [table] does not exist. * @return A [TableReader] instance for the table. 
*/ - public fun newReader(path: Path, table: String): TableReader + public fun newReader(path: Path, table: String, projection: List<TableColumn<*>>?): TableReader /** * Open a [TableWriter] for the specified [table]. |
