From ea5e79fc77072e6151ee7952581b97e35a2027fb Mon Sep 17 00:00:00 2001 From: Fabian Mastenbroek Date: Sun, 1 May 2022 22:54:08 +0200 Subject: perf(trace/opendc): Read records using low-level API This change updates the OpenDC VM format reader implementation to use the low-level record reading APIs provided by the `parquet-mr` library for improved performance. Previously, we used the `parquet-avro` library to read/write Avro records in Parquet format, but that library carries considerable overhead. --- .../trace/util/parquet/LocalParquetReader.kt | 6 ++- .../trace/util/parquet/LocalParquetWriter.kt | 55 ++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetWriter.kt (limited to 'opendc-trace/opendc-trace-parquet') diff --git a/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetReader.kt b/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetReader.kt index bb2bb10d..3e6f19a2 100644 --- a/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetReader.kt +++ b/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetReader.kt @@ -40,8 +40,10 @@ import kotlin.io.path.isDirectory * @param path The path to the Parquet file or directory to read. * @param factory Function to construct a [ParquetReader] for a local [InputFile]. */ -public class LocalParquetReader(path: Path, - private val factory: (InputFile) -> ParquetReader = avro()) : AutoCloseable { +public class LocalParquetReader( + path: Path, + private val factory: (InputFile) -> ParquetReader = avro() +) : AutoCloseable { /** * The input files to process. 
*/ diff --git a/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetWriter.kt b/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetWriter.kt new file mode 100644 index 00000000..b5eb1deb --- /dev/null +++ b/opendc-trace/opendc-trace-parquet/src/main/kotlin/org/opendc/trace/util/parquet/LocalParquetWriter.kt @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.trace.util.parquet + +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.api.WriteSupport +import org.apache.parquet.io.OutputFile +import java.nio.file.Path + +/** + * Helper class for writing Parquet records to local disk. 
+ */
+public class LocalParquetWriter {
+    /**
+     * A [ParquetWriter.Builder] implementation supporting custom [OutputFile]s and [WriteSupport] implementations.
+     *
+     * The constructor is `internal`; code outside this module obtains instances through
+     * [LocalParquetWriter.builder].
+     *
+     * @param T The type of the records handled by [writeSupport].
+     * @param output The [OutputFile] passed to the [ParquetWriter.Builder] base class.
+     * @param writeSupport The [WriteSupport] instance returned from [getWriteSupport].
+     */
+    public class Builder<T> internal constructor(
+        output: OutputFile,
+        private val writeSupport: WriteSupport<T>
+    ) : ParquetWriter.Builder<T, Builder<T>>(output) {
+        /**
+         * Return this builder. The base class is self-typed, so this gives it the concrete [Builder] type.
+         */
+        override fun self(): Builder<T> = this
+
+        /**
+         * Return the [WriteSupport] supplied at construction; [conf] is not consulted.
+         */
+        override fun getWriteSupport(conf: Configuration): WriteSupport<T> = writeSupport
+    }
+
+    public companion object {
+        /**
+         * Create a [Builder] instance that writes a Parquet file at the specified [path].
+         *
+         * @param T The type of the records handled by [writeSupport].
+         * @param path The local path, wrapped in a [LocalOutputFile] before being handed to the [Builder].
+         * @param writeSupport The [WriteSupport] the resulting [Builder] will use.
+         * @return A new [Builder] for [path].
+         */
+        @JvmStatic
+        public fun <T> builder(path: Path, writeSupport: WriteSupport<T>): Builder<T> =
+            Builder(LocalOutputFile(path), writeSupport)
+    }
+}
-- 
cgit v1.2.3