From 052f2b4a80c737e0a3fc38c5facdec5472931aeb Mon Sep 17 00:00:00 2001 From: Wenchen Lai Date: Fri, 7 May 2021 21:07:40 +0200 Subject: exp: Add simple network model for TensorFlow experiments --- .../opendc-experiments-tf20/build.gradle.kts | 1 + .../org/opendc/experiments/tf20/network/Message.kt | 39 +++++++++ .../opendc/experiments/tf20/network/MessageType.kt | 31 +++++++ .../experiments/tf20/network/NetworkController.kt | 94 ++++++++++++++++++++++ .../opendc/experiments/tf20/network/NetworkNode.kt | 28 +++++++ 5 files changed, 193 insertions(+) create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/Message.kt create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/MessageType.kt create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkController.kt create mode 100644 opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkNode.kt (limited to 'opendc-experiments/opendc-experiments-tf20') diff --git a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts index 3ee5462a..64483bd4 100644 --- a/opendc-experiments/opendc-experiments-tf20/build.gradle.kts +++ b/opendc-experiments/opendc-experiments-tf20/build.gradle.kts @@ -35,6 +35,7 @@ dependencies { implementation(projects.opendcSimulator.opendcSimulatorCompute) implementation(projects.opendcTelemetry.opendcTelemetrySdk) implementation(projects.opendcFormat) + implementation(projects.opendcUtils) implementation(libs.kotlin.logging) implementation(libs.parquet) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/Message.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/Message.kt new file mode 100644 index 00000000..d6360873 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/Message.kt @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.network + +/** + * A communication message between TensorFlow worker and master nodes. + * + * @property from The source node. + * @property to The destination node. + * @property type The type of message sent. + * @property dataSize message data size. + */ +public data class Message( + val from: NetworkNode, + val to: NetworkNode, + val type: MessageType, + val dataSize: Long, + val iterations: Int +) diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/MessageType.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/MessageType.kt new file mode 100644 index 00000000..8be16261 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/MessageType.kt @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.network + +/** + * Enumeration of the types of messages exchanged between worker and master nodes during TensorFlow execution. + */ +public enum class MessageType { + REQUEST, + WEIGHTS +} diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkController.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkController.kt new file mode 100644 index 00000000..75b11423 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkController.kt @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.network + +import kotlinx.coroutines.channels.Channel +import org.opendc.utils.TimerScheduler +import java.time.Clock +import kotlin.coroutines.CoroutineContext + +/** + * The network controller represents a simple network model between the worker and master nodes during + * TensorFlow execution. + */ +public class NetworkController(context: CoroutineContext, clock: Clock) : AutoCloseable { + /** + * The scheduler for the message. + */ + private val scheduler = TimerScheduler(context, clock) + + /** + * The outbound communication channels. + */ + private val channels = mutableMapOf>() + + /** + * A map of the bandwidth between the different nodes. + */ + private val bandwidthMatrix: MutableMap, Long> = mutableMapOf() + + /** + * A counter representing the amount of messages sent via the controller. + */ + private var messageCounter = 0 + + /** + * Add the specified link to this controller. + */ + public fun addLink(node: NetworkNode): Channel { + val channel = Channel(Channel.UNLIMITED) + channels[node] = channel + return channel + } + + /** + * Add a connection between two links. + */ + public fun addConnection(node1: NetworkNode, node2: NetworkNode, bandwidth: Long) { + bandwidthMatrix[Pair(node1, node2)] = bandwidth + } + + /** + * Route the specified [message]. + */ + public fun send(message: Message) { + val from = message.from + val to = message.to + val bandwidth = bandwidthMatrix[Pair(from, to)] ?: bandwidthMatrix[Pair(to, from)] ?: 1 + val size = message.dataSize / 1_000_000 + val delayTime = size / bandwidth + (0..5).random() + + messageCounter++ + + val target = channels[to] ?: return // Drop if destination not found + + scheduler.startSingleTimer(message, delayTime) { target.offer(message) } + } + + /** + * Stop the network controller. + */ + override fun close() { + scheduler.close() + } +} diff --git a/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkNode.kt b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkNode.kt new file mode 100644 index 00000000..46fb5ce9 --- /dev/null +++ b/opendc-experiments/opendc-experiments-tf20/src/main/kotlin/org/opendc/experiments/tf20/network/NetworkNode.kt @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2021 AtLarge Research + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +package org.opendc.experiments.tf20.network + +/** + * A node represents a machine with which other nodes can communicate. + */ +public data class NetworkNode(val hostname: String) -- cgit v1.2.3