17 files changed, 527 insertions, 83 deletions
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b0e8cc38..28e5846b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,6 +12,9 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         java: [8, 16]
+        include:
+          - os: windows-latest
+            java: 16
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
index 1d7fdd3e..22f713f6 100644
--- a/gradle/libs.versions.toml
+++ b/gradle/libs.versions.toml
@@ -5,6 +5,7 @@
 slf4j = "1.7.30"
 log4j = "2.14.1"
 opentelemetry-main = "1.2.0"
 opentelemetry-metrics = "1.2.0-alpha"
+hadoop = "3.3.0"
 
 [libraries]
 kotlinx-coroutines = { module = "org.jetbrains.kotlinx:kotlinx-coroutines-core", version = "1.5.0" }
@@ -45,4 +46,5 @@ kotlinx-benchmark-runtime-jvm = { module = "org.jetbrains.kotlinx:kotlinx-benchm
 # Other
 mongodb = { module = "org.mongodb:mongodb-driver-sync", version = "4.2.3" }
 classgraph = { module = "io.github.classgraph:classgraph", version = "4.8.105" }
-hadoop-client = { module = "org.apache.hadoop:hadoop-client", version = "3.3.0" }
+hadoop-common = { module = "org.apache.hadoop:hadoop-common", version.ref = "hadoop" }
+hadoop-mapreduce-client-core = { module = "org.apache.hadoop:hadoop-mapreduce-client-core", version.ref = "hadoop" }
diff --git a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts
index 0dade513..324cae3e 100644
--- a/opendc-experiments/opendc-experiments-capelin/build.gradle.kts
+++ b/opendc-experiments/opendc-experiments-capelin/build.gradle.kts
@@ -44,10 +44,5 @@ dependencies {
     implementation(libs.clikt)
     implementation(libs.parquet)
 
-    implementation(libs.hadoop.client) {
-        exclude(group = "org.slf4j", module = "slf4j-log4j12")
-        exclude(group = "log4j")
-    }
-
     testImplementation(libs.log4j.slf4j)
 }
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/telemetry/parquet/ParquetEventWriter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/telemetry/parquet/ParquetEventWriter.kt
index 4fa6ae66..d8f7ff75 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/telemetry/parquet/ParquetEventWriter.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/telemetry/parquet/ParquetEventWriter.kt
@@ -25,10 +25,10 @@ package org.opendc.experiments.capelin.telemetry.parquet
 import mu.KotlinLogging
 import org.apache.avro.Schema
 import org.apache.avro.generic.GenericData
-import org.apache.hadoop.fs.Path
 import org.apache.parquet.avro.AvroParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.opendc.experiments.capelin.telemetry.Event
+import org.opendc.format.util.LocalOutputFile
 import java.io.Closeable
 import java.io.File
 import java.util.concurrent.ArrayBlockingQueue
@@ -52,8 +52,7 @@ public open class ParquetEventWriter<in T : Event>(
     /**
      * The writer to write the Parquet file.
      */
-    @Suppress("DEPRECATION")
-    private val writer = AvroParquetWriter.builder<GenericData.Record>(Path(path.absolutePath))
+    private val writer = AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(path))
         .withSchema(schema)
         .withCompressionCodec(CompressionCodecName.SNAPPY)
         .withPageSize(4 * 1024 * 1024) // For compression
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20ParquetTraceReader.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20ParquetTraceReader.kt
index a8462a51..7f25137e 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20ParquetTraceReader.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20ParquetTraceReader.kt
@@ -38,7 +38,6 @@ import java.util.TreeSet
  * @param performanceInterferenceModel The performance model covering the workload in the VM trace.
  * @param run The run to which this reader belongs.
  */
-@OptIn(ExperimentalStdlibApi::class)
 public class Sc20ParquetTraceReader(
     rawReaders: List<Sc20RawParquetTraceReader>,
     performanceInterferenceModel: Map<String, PerformanceInterferenceModel>,
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20RawParquetTraceReader.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20RawParquetTraceReader.kt
index bd27cf02..54151c9f 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20RawParquetTraceReader.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20RawParquetTraceReader.kt
@@ -24,10 +24,9 @@
 package org.opendc.experiments.capelin.trace
 
 import mu.KotlinLogging
 import org.apache.avro.generic.GenericData
-import org.apache.hadoop.fs.Path
-import org.apache.parquet.avro.AvroParquetReader
 import org.opendc.format.trace.TraceEntry
 import org.opendc.format.trace.TraceReader
+import org.opendc.format.util.LocalParquetReader
 import org.opendc.simulator.compute.workload.SimTraceWorkload
 import org.opendc.simulator.compute.workload.SimWorkload
 import java.io.File
@@ -40,16 +39,12 @@ private val logger = KotlinLogging.logger {}
  *
  * @param path The directory of the traces.
  */
-@OptIn(ExperimentalStdlibApi::class)
 public class Sc20RawParquetTraceReader(private val path: File) {
     /**
      * Read the fragments into memory.
      */
     private fun parseFragments(path: File): Map<String, List<SimTraceWorkload.Fragment>> {
-        @Suppress("DEPRECATION")
-        val reader = AvroParquetReader.builder<GenericData.Record>(Path(path.absolutePath, "trace.parquet"))
-            .disableCompatibility()
-            .build()
+        val reader = LocalParquetReader<GenericData.Record>(File(path, "trace.parquet"))
 
         val fragments = mutableMapOf<String, MutableList<SimTraceWorkload.Fragment>>()
@@ -81,10 +76,7 @@ public class Sc20RawParquetTraceReader(private val path: File) {
     /**
      * Read the metadata into a workload.
      */
     private fun parseMeta(path: File, fragments: Map<String, List<SimTraceWorkload.Fragment>>): List<TraceEntry<SimWorkload>> {
-        @Suppress("DEPRECATION")
-        val metaReader = AvroParquetReader.builder<GenericData.Record>(Path(path.absolutePath, "meta.parquet"))
-            .disableCompatibility()
-            .build()
+        val metaReader = LocalParquetReader<GenericData.Record>(File(path, "meta.parquet"))
 
         var counter = 0
         val entries = mutableListOf<TraceEntry<SimWorkload>>()
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20StreamingParquetTraceReader.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20StreamingParquetTraceReader.kt
index c5294b55..6792c2ab 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20StreamingParquetTraceReader.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20StreamingParquetTraceReader.kt
@@ -24,7 +24,6 @@
 package org.opendc.experiments.capelin.trace
 
 import mu.KotlinLogging
 import org.apache.avro.generic.GenericData
-import org.apache.hadoop.fs.Path
 import org.apache.parquet.avro.AvroParquetReader
 import org.apache.parquet.filter2.compat.FilterCompat
 import org.apache.parquet.filter2.predicate.FilterApi
@@ -33,6 +32,7 @@ import org.apache.parquet.filter2.predicate.UserDefinedPredicate
 import org.apache.parquet.io.api.Binary
 import org.opendc.format.trace.TraceEntry
 import org.opendc.format.trace.TraceReader
+import org.opendc.format.util.LocalInputFile
 import org.opendc.simulator.compute.interference.IMAGE_PERF_INTERFERENCE_MODEL
 import org.opendc.simulator.compute.interference.PerformanceInterferenceModel
 import org.opendc.simulator.compute.workload.SimTraceWorkload
@@ -54,7 +54,6 @@ private val logger = KotlinLogging.logger {}
  * @param traceFile The directory of the traces.
  * @param performanceInterferenceModel The performance model covering the workload in the VM trace.
  */
-@OptIn(ExperimentalStdlibApi::class)
 public class Sc20StreamingParquetTraceReader(
     traceFile: File,
     performanceInterferenceModel: PerformanceInterferenceModel? = null,
@@ -96,10 +95,10 @@ public class Sc20StreamingParquetTraceReader(
     /**
      * The thread to read the records in.
      */
     private val readerThread = thread(start = true, name = "sc20-reader") {
-        @Suppress("DEPRECATION")
-        val reader = AvroParquetReader.builder<GenericData.Record>(Path(traceFile.absolutePath, "trace.parquet"))
+        val reader = AvroParquetReader
+            .builder<GenericData.Record>(LocalInputFile(File(traceFile, "trace.parquet")))
             .disableCompatibility()
-            .run { if (filter != null) withFilter(filter) else this }
+            .withFilter(filter)
             .build()
 
         try {
@@ -164,10 +163,10 @@ public class Sc20StreamingParquetTraceReader(
         val entries = mutableMapOf<String, GenericData.Record>()
         val buffers = mutableMapOf<String, MutableList<MutableList<SimTraceWorkload.Fragment>>>()
 
-        @Suppress("DEPRECATION")
-        val metaReader = AvroParquetReader.builder<GenericData.Record>(Path(traceFile.absolutePath, "meta.parquet"))
+        val metaReader = AvroParquetReader
+            .builder<GenericData.Record>(LocalInputFile(File(traceFile, "meta.parquet")))
             .disableCompatibility()
-            .run { if (filter != null) withFilter(filter) else this }
+            .withFilter(filter)
             .build()
 
         while (true) {
@@ -178,7 +177,7 @@
         metaReader.close()
 
-        val selection = if (selectedVms.isEmpty()) entries.keys else selectedVms
+        val selection = selectedVms.ifEmpty { entries.keys }
 
         // Create the entry iterator
         iterator = selection.asSequence()
diff --git a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20TraceConverter.kt b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20TraceConverter.kt
index 1f9e289c..d0031a66 100644
--- a/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20TraceConverter.kt
+++ b/opendc-experiments/opendc-experiments-capelin/src/main/kotlin/org/opendc/experiments/capelin/trace/Sc20TraceConverter.kt
@@ -38,11 +38,11 @@ import me.tongfei.progressbar.ProgressBar
 import org.apache.avro.Schema
 import org.apache.avro.SchemaBuilder
 import org.apache.avro.generic.GenericData
-import org.apache.hadoop.fs.Path
 import org.apache.parquet.avro.AvroParquetWriter
 import org.apache.parquet.hadoop.ParquetWriter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.opendc.format.trace.sc20.Sc20VmPlacementReader
+import org.opendc.format.util.LocalOutputFile
 import java.io.BufferedReader
 import java.io.File
 import java.io.FileReader
@@ -109,16 +109,14 @@ public class TraceConverterCli : CliktCommand(name = "trace-converter") {
             traceParquet.delete()
         }
 
-        @Suppress("DEPRECATION")
-        val metaWriter = AvroParquetWriter.builder<GenericData.Record>(Path(metaParquet.toURI()))
+        val metaWriter = AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(metaParquet))
             .withSchema(metaSchema)
             .withCompressionCodec(CompressionCodecName.SNAPPY)
             .withPageSize(4 * 1024 * 1024) // For compression
             .withRowGroupSize(16 * 1024 * 1024) // For write buffering (Page size)
             .build()
 
-        @Suppress("DEPRECATION")
-        val writer = AvroParquetWriter.builder<GenericData.Record>(Path(traceParquet.toURI()))
+        val writer = AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(traceParquet))
             .withSchema(schema)
             .withCompressionCodec(CompressionCodecName.SNAPPY)
             .withPageSize(4 * 1024 * 1024) // For compression
diff --git a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts
index 88479765..7d68cb3a 100644
--- a/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts
+++ b/opendc-experiments/opendc-experiments-serverless20/build.gradle.kts
@@ -37,10 +37,4 @@ dependencies {
     implementation(projects.opendcTelemetry.opendcTelemetrySdk)
     implementation(libs.kotlin.logging)
     implementation(libs.config)
-
-    implementation(libs.parquet)
-    implementation(libs.hadoop.client) {
-        exclude(group = "org.slf4j", module = "slf4j-log4j12")
-        exclude(group = "log4j")
-    }
 }
diff --git a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts
index 64483bd4..b088045b 100644
--- a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts
+++ b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts
@@ -38,9 +38,4 @@ dependencies {
     implementation(projects.opendcUtils)
     implementation(libs.kotlin.logging)
 
-    implementation(libs.parquet)
-    implementation(libs.hadoop.client) {
-        exclude(group = "org.slf4j", module = "slf4j-log4j12")
-        exclude(group = "log4j")
-    }
 }
diff --git a/opendc-format/build.gradle.kts b/opendc-format/build.gradle.kts
index e95cb666..e19e0ec8 100644
--- a/opendc-format/build.gradle.kts
+++ b/opendc-format/build.gradle.kts
@@ -40,9 +40,29 @@ dependencies {
     }
     implementation(kotlin("reflect"))
 
-    implementation(libs.parquet)
-    implementation(libs.hadoop.client) {
+    /* This configuration is necessary for a slim dependency on Apache Parquet */
+    implementation(libs.parquet) {
+        exclude(group = "org.apache.hadoop")
+    }
+    runtimeOnly(libs.hadoop.common) {
         exclude(group = "org.slf4j", module = "slf4j-log4j12")
         exclude(group = "log4j")
+        exclude(group = "org.apache.hadoop")
+        exclude(group = "org.apache.curator")
+        exclude(group = "org.apache.zookeeper")
+        exclude(group = "org.apache.kerby")
+        exclude(group = "org.apache.httpcomponents")
+        exclude(group = "org.apache.htrace")
+        exclude(group = "commons-cli")
+        exclude(group = "javax.servlet")
+        exclude(group = "org.eclipse.jetty")
+        exclude(group = "com.sun.jersey")
+        exclude(group = "com.jcraft")
+        exclude(group = "dnsjava")
+    }
+    runtimeOnly(libs.hadoop.mapreduce.client.core) {
+        isTransitive = false
     }
+
+    testRuntimeOnly(libs.slf4j.simple)
 }
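Note: the exclusions above trim hadoop-common down to the pieces Parquet touches at run time, so the Hadoop footprint stays out of consumers' compile classpaths. A minimal sketch of what a consuming module's build script reduces to, assuming the type-safe project accessor pattern used elsewhere in these scripts (the `projects.opendcFormat` accessor is illustrative):

    dependencies {
        implementation(libs.parquet)          // compile-time Parquet API only
        implementation(projects.opendcFormat) // local InputFile/OutputFile helpers; Hadoop runtime bits come along
    }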
diff --git a/opendc-format/src/main/kotlin/org/opendc/format/trace/wtf/WtfTraceReader.kt b/opendc-format/src/main/kotlin/org/opendc/format/trace/wtf/WtfTraceReader.kt
index feadf61f..dde1b340 100644
--- a/opendc-format/src/main/kotlin/org/opendc/format/trace/wtf/WtfTraceReader.kt
+++ b/opendc-format/src/main/kotlin/org/opendc/format/trace/wtf/WtfTraceReader.kt
@@ -23,15 +23,16 @@
 package org.opendc.format.trace.wtf
 
 import org.apache.avro.generic.GenericRecord
-import org.apache.hadoop.fs.Path
-import org.apache.parquet.avro.AvroParquetReader
 import org.opendc.format.trace.TraceEntry
 import org.opendc.format.trace.TraceReader
+import org.opendc.format.util.LocalParquetReader
 import org.opendc.simulator.compute.workload.SimFlopsWorkload
 import org.opendc.workflow.api.Job
 import org.opendc.workflow.api.Task
 import org.opendc.workflow.api.WORKFLOW_TASK_CORES
 import org.opendc.workflow.api.WORKFLOW_TASK_DEADLINE
+import java.io.File
+import java.nio.file.Path
 import java.util.UUID
 import kotlin.math.min
@@ -41,13 +42,20 @@ import kotlin.math.min
  *
  * @param path The path to the trace.
  */
-public class WtfTraceReader(path: String) : TraceReader<Job> {
+public class WtfTraceReader(path: Path) : TraceReader<Job> {
     /**
      * The internal iterator to use for this reader.
      */
     private val iterator: Iterator<TraceEntry<Job>>
 
     /**
+     * Construct a [TraceReader] from the specified [path].
+     *
+     * @param path The path to the trace.
+     */
+    public constructor(path: File) : this(path.toPath())
+
+    /**
      * Initialize the reader.
      */
     init {
@@ -56,43 +64,43 @@ public class WtfTraceReader(path: Path) : TraceReader<Job> {
         val tasks = mutableMapOf<Long, Task>()
         val taskDependencies = mutableMapOf<Task, List<Long>>()
 
-        @Suppress("DEPRECATION")
-        val reader = AvroParquetReader.builder<GenericRecord>(Path(path, "tasks/schema-1.0")).build()
+        LocalParquetReader<GenericRecord>(path.resolve("tasks/schema-1.0")).use { reader ->
+            while (true) {
+                val nextRecord = reader.read() ?: break
 
-        while (true) {
-            val nextRecord = reader.read() ?: break
+                val workflowId = nextRecord.get("workflow_id") as Long
+                val taskId = nextRecord.get("id") as Long
+                val submitTime = nextRecord.get("ts_submit") as Long
+                val runtime = nextRecord.get("runtime") as Long
+                val cores = (nextRecord.get("resource_amount_requested") as Double).toInt()
 
-            val workflowId = nextRecord.get("workflow_id") as Long
-            val taskId = nextRecord.get("id") as Long
-            val submitTime = nextRecord.get("ts_submit") as Long
-            val runtime = nextRecord.get("runtime") as Long
-            val cores = (nextRecord.get("resource_amount_requested") as Double).toInt()
-            @Suppress("UNCHECKED_CAST")
-            val dependencies = (nextRecord.get("parents") as ArrayList<GenericRecord>).map {
-                it.get("item") as Long
-            }
+                @Suppress("UNCHECKED_CAST")
+                val dependencies = (nextRecord.get("parents") as ArrayList<GenericRecord>).map {
+                    it.get("item") as Long
+                }
 
-            val flops: Long = 4100 * (runtime / 1000) * cores
+                val flops: Long = 4100 * (runtime / 1000) * cores
 
-            val workflow = workflows.getOrPut(workflowId) {
-                Job(UUID(0L, workflowId), "<unnamed>", HashSet())
-            }
-            val workload = SimFlopsWorkload(flops)
-            val task = Task(
-                UUID(0L, taskId),
-                "<unnamed>",
-                HashSet(),
-                mapOf(
-                    "workload" to workload,
-                    WORKFLOW_TASK_CORES to cores,
-                    WORKFLOW_TASK_DEADLINE to runtime
+                val workflow = workflows.getOrPut(workflowId) {
+                    Job(UUID(0L, workflowId), "<unnamed>", HashSet())
+                }
+                val workload = SimFlopsWorkload(flops)
+                val task = Task(
+                    UUID(0L, taskId),
+                    "<unnamed>",
+                    HashSet(),
+                    mapOf(
+                        "workload" to workload,
+                        WORKFLOW_TASK_CORES to cores,
+                        WORKFLOW_TASK_DEADLINE to runtime
+                    )
                 )
-            )
 
-            starts.merge(workflowId, submitTime, ::min)
-            (workflow.tasks as MutableSet<Task>).add(task)
-            tasks[taskId] = task
-            taskDependencies[task] = dependencies
+                starts.merge(workflowId, submitTime, ::min)
+                (workflow.tasks as MutableSet<Task>).add(task)
+                tasks[taskId] = task
+                taskDependencies[task] = dependencies
+            }
         }
 
         // Fix dependencies and dependents for all tasks
diff --git a/opendc-format/src/main/kotlin/org/opendc/format/util/LocalInputFile.kt b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalInputFile.kt
new file mode 100644
index 00000000..92319ace
--- /dev/null
+++ b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalInputFile.kt
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 AtLarge Research
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package org.opendc.format.util
+
+import org.apache.parquet.io.InputFile
+import org.apache.parquet.io.SeekableInputStream
+import java.io.EOFException
+import java.io.File
+import java.nio.ByteBuffer
+import java.nio.channels.FileChannel
+import java.nio.file.Path
+import java.nio.file.StandardOpenOption
+
+/**
+ * An [InputFile] on the local filesystem.
+ */
+public class LocalInputFile(private val path: Path) : InputFile {
+    /**
+     * The [FileChannel] used for accessing the input path.
+     */
+    private val channel = FileChannel.open(path, StandardOpenOption.READ)
+
+    /**
+     * Construct a [LocalInputFile] for the specified [file].
+     */
+    public constructor(file: File) : this(file.toPath())
+
+    override fun getLength(): Long = channel.size()
+
+    override fun newStream(): SeekableInputStream = object : SeekableInputStream() {
+        override fun read(buf: ByteBuffer): Int {
+            return channel.read(buf)
+        }
+
+        override fun read(): Int {
+            val single = ByteBuffer.allocate(1)
+            var read: Int
+
+            // ReadableByteChannel#read might read zero bytes so continue until we read at least one byte
+            do {
+                read = channel.read(single)
+            } while (read == 0)
+
+            return if (read == -1) {
+                read
+            } else {
+                single.get(0).toInt() and 0xff
+            }
+        }
+
+        override fun getPos(): Long {
+            return channel.position()
+        }
+
+        override fun seek(newPos: Long) {
+            channel.position(newPos)
+        }
+
+        override fun readFully(bytes: ByteArray) {
+            readFully(ByteBuffer.wrap(bytes))
+        }
+
+        override fun readFully(bytes: ByteArray, start: Int, len: Int) {
+            readFully(ByteBuffer.wrap(bytes, start, len))
+        }
+
+        override fun readFully(buf: ByteBuffer) {
+            var remainder = buf.remaining()
+            while (remainder > 0) {
+                val read = channel.read(buf)
+                remainder -= read
+
+                if (read == -1 && remainder > 0) {
+                    throw EOFException()
+                }
+            }
+        }
+
+        override fun close() {
+            channel.close()
+        }
+
+        override fun toString(): String = "NioSeekableInputStream"
+    }
+
+    override fun toString(): String = "LocalInputFile[path=$path]"
+}
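For orientation, this is how the new class slots into Parquet's `InputFile` abstraction at the call sites changed earlier in this commit; a minimal, self-contained sketch with an illustrative file name:

    import org.apache.avro.generic.GenericData
    import org.apache.parquet.avro.AvroParquetReader
    import org.opendc.format.util.LocalInputFile
    import java.io.File

    fun main() {
        // Build an Avro-backed reader on top of the NIO-based InputFile;
        // no org.apache.hadoop.fs.Path is involved anymore.
        AvroParquetReader.builder<GenericData.Record>(LocalInputFile(File("trace.parquet")))
            .disableCompatibility()
            .build()
            .use { reader ->
                while (true) {
                    val record = reader.read() ?: break
                    println(record)
                }
            }
    }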
diff --git a/opendc-format/src/main/kotlin/org/opendc/format/util/LocalOutputFile.kt b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalOutputFile.kt
new file mode 100644
index 00000000..657bca5a
--- /dev/null
+++ b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalOutputFile.kt
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2021 AtLarge Research
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package org.opendc.format.util
+
+import org.apache.parquet.io.OutputFile
+import org.apache.parquet.io.PositionOutputStream
+import java.io.File
+import java.io.OutputStream
+import java.nio.file.Files
+import java.nio.file.Path
+import java.nio.file.StandardOpenOption
+
+/**
+ * An [OutputFile] on the local filesystem.
+ */
+public class LocalOutputFile(private val path: Path) : OutputFile {
+    /**
+     * Construct a [LocalOutputFile] from the specified [file]
+     */
+    public constructor(file: File) : this(file.toPath())
+
+    override fun create(blockSizeHint: Long): PositionOutputStream {
+        val output = Files.newOutputStream(path, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE)
+        return NioPositionOutputStream(output)
+    }
+
+    override fun createOrOverwrite(blockSizeHint: Long): PositionOutputStream {
+        val output = Files.newOutputStream(path, StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)
+        return NioPositionOutputStream(output)
+    }
+
+    override fun supportsBlockSize(): Boolean = false
+
+    override fun defaultBlockSize(): Long =
+        throw UnsupportedOperationException("Local filesystem does not have default block size")
+
+    override fun getPath(): String = path.toString()
+
+    /**
+     * Implementation of [PositionOutputStream] for an [OutputStream].
+     */
+    private class NioPositionOutputStream(private val output: OutputStream) : PositionOutputStream() {
+        /**
+         * The current position in the file.
+         */
+        private var _pos = 0L
+
+        override fun getPos(): Long = _pos
+
+        override fun write(b: Int) {
+            output.write(b)
+            _pos++
+        }
+
+        override fun write(b: ByteArray) {
+            output.write(b)
+            _pos += b.size
+        }
+
+        override fun write(b: ByteArray, off: Int, len: Int) {
+            output.write(b, off, len)
+            _pos += len
+        }
+
+        override fun flush() {
+            output.flush()
+        }
+
+        override fun close() {
+            output.close()
+        }
+
+        override fun toString(): String = "NioPositionOutputStream[output=$output]"
+    }
+}
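On the write side, a matching minimal sketch (schema and file name illustrative) of pairing LocalOutputFile with AvroParquetWriter, mirroring the testSmoke case added further below:

    import org.apache.avro.SchemaBuilder
    import org.apache.avro.generic.GenericData
    import org.apache.parquet.avro.AvroParquetWriter
    import org.apache.parquet.hadoop.ParquetFileWriter
    import org.opendc.format.util.LocalOutputFile
    import java.io.File

    fun main() {
        // A single-column Avro schema, purely for demonstration
        val schema = SchemaBuilder.record("example").fields()
            .name("field").type().intType().noDefault()
            .endRecord()

        AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(File("out.parquet")))
            .withSchema(schema)
            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE) // plain create() fails on an existing file
            .build()
            .use { writer ->
                val record = GenericData.Record(schema)
                record.put("field", 42)
                writer.write(record)
            }
    }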
diff --git a/opendc-format/src/main/kotlin/org/opendc/format/util/LocalParquetReader.kt b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalParquetReader.kt
new file mode 100644
index 00000000..5083f3e1
--- /dev/null
+++ b/opendc-format/src/main/kotlin/org/opendc/format/util/LocalParquetReader.kt
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 AtLarge Research
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package org.opendc.format.util
+
+import org.apache.parquet.avro.AvroParquetReader
+import org.apache.parquet.hadoop.ParquetReader
+import org.apache.parquet.io.InputFile
+import java.io.File
+import java.io.IOException
+import java.nio.file.Files
+import java.nio.file.Path
+import kotlin.io.path.isDirectory
+
+/**
+ * A helper class to read Parquet files.
+ *
+ * @param path The path to the Parquet file or directory to read.
+ */
+public class LocalParquetReader<out T>(path: Path) : AutoCloseable {
+    /**
+     * The input files to process.
+     */
+    private val filesIterator = if (path.isDirectory())
+        Files.list(path)
+            .filter { !it.isDirectory() }
+            .sorted()
+            .map { LocalInputFile(it) }
+            .iterator()
+    else
+        listOf(LocalInputFile(path)).iterator()
+
+    /**
+     * The Parquet reader to use.
+     */
+    private var reader: ParquetReader<T>? = null
+
+    /**
+     * Construct a [LocalParquetReader] for the specified [file].
+     */
+    public constructor(file: File) : this(file.toPath())
+
+    /**
+     * Read a single entry in the Parquet file.
+     */
+    public fun read(): T? {
+        return try {
+            val next = reader?.read()
+            if (next != null) {
+                next
+            } else {
+                initReader()
+
+                if (reader == null)
+                    null
+                else
+                    read()
+            }
+        } catch (e: InterruptedException) {
+            throw IOException(e)
+        }
+    }
+
+    /**
+     * Close the Parquet reader.
+     */
+    override fun close() {
+        reader?.close()
+    }
+
+    /**
+     * Initialize the next reader.
+     */
+    private fun initReader() {
+        reader?.close()
+
+        this.reader = if (filesIterator.hasNext()) {
+            createReader(filesIterator.next())
+        } else {
+            null
+        }
+    }
+
+    /**
+     * Create a Parquet reader for the specified file.
+     */
+    private fun createReader(input: InputFile): ParquetReader<T> {
+        return AvroParquetReader
+            .builder<T>(input)
+            .disableCompatibility()
+            .build()
+    }
+}
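A usage sketch for the directory case, which the tests below do not exercise directly (directory name illustrative): LocalParquetReader chains the Parquet files in sorted order and returns null once the last one is exhausted, which is the behaviour WtfTraceReader relies on above.

    import org.apache.avro.generic.GenericData
    import org.opendc.format.util.LocalParquetReader
    import java.io.File

    fun main() {
        // Iterate over all Parquet files in a trace directory as one logical stream
        LocalParquetReader<GenericData.Record>(File("wtf-trace/tasks/schema-1.0")).use { reader ->
            while (true) {
                val record = reader.read() ?: break
                println(record)
            }
        }
    }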
diff --git a/opendc-format/src/test/kotlin/org/opendc/format/trace/wtf/WtfTraceReaderTest.kt b/opendc-format/src/test/kotlin/org/opendc/format/trace/wtf/WtfTraceReaderTest.kt
index bcfa7553..31ae03e0 100644
--- a/opendc-format/src/test/kotlin/org/opendc/format/trace/wtf/WtfTraceReaderTest.kt
+++ b/opendc-format/src/test/kotlin/org/opendc/format/trace/wtf/WtfTraceReaderTest.kt
@@ -24,6 +24,7 @@ package org.opendc.format.trace.wtf
 
 import org.junit.jupiter.api.Assertions.*
 import org.junit.jupiter.api.Test
+import java.io.File
 
 /**
  * Test suite for the [WtfTraceReader] class.
@@ -34,7 +35,7 @@ class WtfTraceReaderTest {
      */
     @Test
     fun testParseWtf() {
-        val reader = WtfTraceReader("src/test/resources/wtf-trace")
+        val reader = WtfTraceReader(File("src/test/resources/wtf-trace"))
         var entry = reader.next()
         assertEquals(0, entry.start)
         assertEquals(23, entry.workload.tasks.size)
diff --git a/opendc-format/src/test/kotlin/org/opendc/format/util/ParquetTest.kt b/opendc-format/src/test/kotlin/org/opendc/format/util/ParquetTest.kt
new file mode 100644
index 00000000..e496dd96
--- /dev/null
+++ b/opendc-format/src/test/kotlin/org/opendc/format/util/ParquetTest.kt
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2021 AtLarge Research
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+package org.opendc.format.util
+
+import org.apache.avro.SchemaBuilder
+import org.apache.avro.generic.GenericData
+import org.apache.parquet.avro.AvroParquetReader
+import org.apache.parquet.avro.AvroParquetWriter
+import org.apache.parquet.hadoop.ParquetFileWriter
+import org.junit.jupiter.api.*
+import org.junit.jupiter.api.Assertions.assertEquals
+import java.io.File
+import java.nio.file.FileAlreadyExistsException
+import java.nio.file.NoSuchFileException
+
+/**
+ * Test suite for the Parquet helper classes.
+ */
+internal class ParquetTest {
+    private val schema = SchemaBuilder
+        .record("test")
+        .namespace("org.opendc.format.util")
+        .fields()
+        .name("field").type().intType().noDefault()
+        .endRecord()
+
+    private lateinit var file: File
+
+    /**
+     * Setup the test
+     */
+    @BeforeEach
+    fun setUp() {
+        file = File.createTempFile("opendc", "parquet")
+    }
+
+    /**
+     * Tear down the test.
+     */
+    @AfterEach
+    fun tearDown() {
+        file.delete()
+    }
+
+    /**
+     * Initial test to verify whether the Parquet writer works.
+     */
+    @Test
+    fun testSmoke() {
+        val n = 4
+        val writer = AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(file))
+            .withSchema(schema)
+            .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
+            .build()
+
+        try {
+            repeat(n) { i ->
+                val record = GenericData.Record(schema)
+                record.put("field", i)
+                writer.write(record)
+            }
+        } finally {
+            writer.close()
+        }
+
+        val reader = AvroParquetReader.builder<GenericData.Record>(LocalInputFile(file))
+            .build()
+
+        var counter = 0
+        try {
+            while (true) {
+                val record = reader.read() ?: break
+                assertEquals(counter++, record.get("field"))
+            }
+        } finally {
+            reader.close()
+        }
+
+        assertEquals(n, counter)
+    }
+
+    /**
+     * Test if overwriting fails if not specified.
+     */
+    @Test
+    fun testOverwrite() {
+        assertThrows<FileAlreadyExistsException> {
+            AvroParquetWriter.builder<GenericData.Record>(LocalOutputFile(file))
+                .withSchema(schema)
+                .build()
+        }
+    }
+
+    /**
+     * Test non-existent file.
+     */
+    @Test
+    fun testNonExistent() {
+        file.delete()
+        assertThrows<NoSuchFileException> {
+            AvroParquetReader.builder<GenericData.Record>(LocalInputFile(file))
+                .build()
+        }
+    }
+}
