summaryrefslogtreecommitdiff
path: root/main.tex
diff options
context:
space:
mode:
Diffstat (limited to 'main.tex')
-rw-r--r--main.tex190
1 files changed, 116 insertions, 74 deletions
diff --git a/main.tex b/main.tex
index 5d3dfec..c600fb7 100644
--- a/main.tex
+++ b/main.tex
@@ -11,7 +11,7 @@
As a result, datacenters become extremely complex and hard to operate, with millions of CPUs, GPUs, etc.
\end{tcolorbox}
\begin{center}
- \includegraphics[width=\linewidth]{images/datacenter_complexity.pdf}
+ \includegraphics[width=\linewidth]{images/datacenter_complexity.png}
\end{center}
\tiny
\textbf{Figure 1.1:} Society depends on datacenters to keep running, and therefore we cannot afford to let these systems break down or experience significant performance-related issues.
@@ -19,7 +19,6 @@
Left to right: a Google datacenter, server racks, Ada Lovelace AD102 GPU architecture.
\end{frame}
-
\begin{frame}\frametitle{Problem Statement}
\begin{tcolorbox}[title=DCDTs lack predictive analytics]
We need Datacenter Digital Twins (DCDT) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
@@ -33,7 +32,7 @@
\tiny
\textbf{Figure 1.2:} Where does our work fit within the field of datacenter digital twinning?
There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin.
- \textcolor{ForestGreen}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis itself within \myCircledGreen{E}.}
+ \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis itself within \myCircledGreen{E}.}
\end{frame}
\begin{frame}\frametitle{Research Questions}
@@ -50,18 +49,19 @@
\end{tcolorbox}
\begin{tcolorbox}[title=Research Question 3]
- How to evaluate and validate a datacenter digital twin architecture in relation to system requirements?
+ % no "and validate?"
+ How to validate and evaluate a datacenter digital twin architecture in relation to system requirements?
\end{tcolorbox}
\end{frame}
-
\begin{frame}\frametitle{\textbf{RQ1}: Literature Review I}
- \begin{tcolorbox}[title=Results]
+ \begin{tcolorbox}[title=Main Finding I]
The literature on DCDTs is scarce.
Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}).
Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}.
- Most lack crucial predictive DC behaviour modelling.
+ Most lack predictive modelling of DC operations.
\end{tcolorbox}
+ \vspace{-0.1cm}
\input{images/table.tex}
% Research on DTs for datacenters have been separate, siloed efforts focused on either datacenter cooling, network performance, power consumption or visualization efforts.
% CFD usually means Navier-Stokes equations.
@@ -72,8 +72,7 @@
% Mandatory: split the figure into 2: top and bottom, and that way you can fill in the entire slide nicely.
\begin{tcolorbox}[title=A holistic DCDT system model]
- We propose a generic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. To answer \textbf{RQ2}, we design a ref. arch. for \emph{Operations Model}.
- We introduce the \emph{Digital Thread}: a bridge between software and reality.
+ We propose a generic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce the concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment.
\end{tcolorbox}
\begin{center}
\vspace{-0.1cm}
@@ -85,7 +84,7 @@
% Not really a digital twin per se.
\tiny
- \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations model} simulates the behaviour of the DC.
+ \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC.
% Consider splitting the figure into 2 a.k.a. top and bottom.
% By the AIAA definition, the DT mimics the structure and behaviour.
% Data Lake -> Data Storage
@@ -93,19 +92,31 @@
\end{frame}
\begin{frame}\frametitle{\textbf{RQ2}: Reference Architecture}
+ % Make Kafka logos clearly defined --> add a legend with icons?
+ \hspace{-0.3cm}
\begin{minipage}[b]{0.45\linewidth}
- \begin{tcolorbox}[title=Use cases]
-
- \end{tcolorbox}
- \vspace{1cm}
+ \begin{center}
+ % Change to Datacenter (Physical Twin)
+ \includegraphics[width=1.15\textwidth]{images/ref_architecture.pdf}
+ \end{center}
+ \vspace{-0.15cm}
+ \tiny
+ \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture.
+ The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19} over several iterations in the past months.
+ \vspace{0.2cm}
\end{minipage}
+ \hspace{0.6cm}
\begin{minipage}[b]{0.45\linewidth}
\begin{center}
- \includegraphics[width=1.25\textwidth]{images/ref_architecture.pdf}
+ \includegraphics[width=1.17\linewidth]{images/implementation.png}
\end{center}
\vspace{-0.2cm}
\tiny
- \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. \end{minipage}
+ \textbf{Figure 1.5:} The prototype -- \emph{Sunfish}, and its components based on \textbf{Figure 1.4}.
+ The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}.
+ \vspace{0.1cm}
+ \end{minipage}
+
% We decided to use discrete-event simulation, as opposed to computational fluid dynamics because of the high overheads of development time needed for CFD.
% CFD simply takes too long to run, making it unfeasible for real-time analytics and simulation.
% Citing ExaDigit: [CFD] they are also more computationally expensive, generally making real-time operation unfeasible.
@@ -113,37 +124,34 @@
\end{frame}
% You should skip \hfill completely or in favour of \hspace very minimally.
\begin{frame}\frametitle{\textbf{RQ3}: Experimental Setup}
- % The software stack of \emph{Sunfish} includes state-of-the-art software.
- %The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache, as advised in~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}.
-
+ \hspace{-0.3cm}
\begin{minipage}[b]{0.45\linewidth}
- \begin{tcolorbox}[title=Setup Recipe]
- \scriptsize
- \textbf{Step 1.} Ensure Redis and PostgreSQL servers are up and alive.\newline
-
- \textbf{Step 2.} Run a Confluent Kafka setup: Kafka Connect, Schema Registry and a Kafka server.\newline
-
- \textbf{Step 3.} Start the Python HTTP Server, and the Python Redis Monitor.\newline
-
- \textbf{Step 4.} Run the (modified) OpenDC (physical twin) with example experiment.\newline
-
- \textbf{Step 5.} \emph{Sunfish} will automatically start a second OpenDC instance, and start the data analysis.
+ \begin{tcolorbox}[title=Problem, colbacktitle=red!70!black,colback=red!20!white]
+ We cannot just go and test digital twins on large systems, because we do not have large systems at hand.
+ Moreover, real-world experimentation is costly and unsustainable in the long run~\cite{DBLP:conf/ccgrid/MastenbroekAJLB21}.
\end{tcolorbox}
\vspace{0.5cm}
+ \begin{tcolorbox}[title=Solution, colbacktitle=Green!70!black, colback=Green!20!white]
+ \scriptsize
+ The way we test our reference architecture prototype is by using multiple simulators.
+ We use an additional OpenDC process to play the role of a real datacenter.
+ \end{tcolorbox}
+ \vspace{1cm}
\end{minipage}
- \hspace{0.35cm}
+ \hspace{0.25cm}
\begin{minipage}[b]{0.45\linewidth}
\vspace{-0.2cm}
\begin{center}
- \includegraphics[width=1.2\linewidth]{images/predictive_analyticsv2.pdf}
+ \includegraphics[width=1.2\linewidth]{images/predictive_analyticsv3.pdf}
\end{center}
\tiny
- \vspace{-0.4cm}
- \textbf{Figure 1.5:} We can't just go and test digital twins on large systems as large systems often aren't at hand.
- Answering \textbf{RQ3} we provide a novel way to evaluate datacenter digital twins through discrete-event simulation instead.
+ \vspace{-0.2cm}
+ \textbf{Figure 1.6:} The experimental setup.
+ Answering \textbf{RQ3} we provide a novel way to evaluate datacenter digital twins through discrete-event simulation.
\end{minipage}
\end{frame}
+
\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results I}
% You have some model, and this can be based on multiple traces.
%Get insight from CINECA --> you get a probability of certain hosts failing.
@@ -151,59 +159,85 @@
%If you incorporate that? If you can make the case that because of our new digital twin we can incorporate such models, anomaly/failure detection, from CINECA.
%If we had that in, we can reach these kinds of gains.
% @Mateusz there is really not a possibility to incorporate CINECA's models, so to address Dante's feedback, I created this experiment.
-
- \begin{tcolorbox}[title=Failure Detection: Main Finding I]
- On average, \emph{Sunfish} can detect 14.5\% of unexpected failures in the physical twin.
- We show, that digital twinning \emph{can} be used for failure detection.
-
+ % If a single host crashes for the entire workload, that's not really that bad.
+ % If a lot of hosts suddenly crash but for a really short time, that's terrible.
+ % Failures that are more intensive are worse than failures with long duration.
+ \begin{tcolorbox}[title=Main Finding II]
+ We posit digital twinning can be used for failure detection to the benefit of DC operators.
+ We replicate an experiment from DyTwin~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24} designed by Milojicic \etal to show our system can reliably detect \emph{unexpected} host failures.
\end{tcolorbox}
+ \hspace{-0.2cm}
\begin{minipage}[b]{0.45\linewidth}
\begin{center}
- \includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf}
+ \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_152341.pdf}
+ \end{center}
+ \vspace{-0.3cm}
+ \tiny
+ \textbf{Figure 1.7a:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures.
+ We use a threshold based on predictions done by the simulator and a statistical distribution.
+ \end{minipage}
+ \hspace{0.6cm}
+ \begin{minipage}[b]{0.45\linewidth}
+ \begin{center}
+ \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_161052.pdf}
\end{center}
\vspace{-0.3cm}
\tiny
- \textbf{Figure 1.5:} Experiment 1 Setup: The Digital Twin estimates the failures based on the Normal Distribution \emph{N\textasciitilde($\mu$,$\sigma$)} with $\mu = 1.5$ and $\sigma = 0.5$.
- ``Real'' OpenDC failures come from a WhatsApp user reports.
+ \textbf{Figure 1.7b:} Experiment 1b. The mean failure detection rate is around 15\%. Even though this seems low, if we look at \textbf{Fig. E.1} (see Extra Slides), this simply means around 15\% of failures are unexpected.
\end{minipage}
% Explain what the axis are in the figure caption.
% Talk about the experimental setup in the figure.
% Give more reliable results than just numbers -- do statistical testing, i.e., standard deviation, confidence intervals.
\end{frame}
-
\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results II}
- \begin{tcolorbox}[title=Failure Prediction: Main Finding II]
- Here explain what did you find.
+ \begin{tcolorbox}[title=Main Finding III]
+ \emph{Sunfish} is capable of dynamic adjustments to the physical twin at runtime, and can lower the mean number of failed tasks.
\end{tcolorbox}
-
+ \hspace{0.2cm}
+ \begin{minipage}[b]{0.45\linewidth}
+ \begin{center}
+ \includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf}
+ \end{center}
+ \vspace{-0.3cm}
+ \tiny
+ \textbf{Figure 1.8a:} Experiment 2a.
+ \end{minipage}
+ \begin{minipage}[b]{0.45\linewidth}
+ \begin{center}
+ \includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf}
+ \end{center}
+ \vspace{-0.3cm}
+ \tiny
+ \textbf{Figure 1.8b:} Experiment 2b.
+ \end{minipage}
\end{frame}
\begin{frame}\frametitle{Key Takeaways}
\begin{tcolorbox}[title=What is the societal context?]
-
+ Datacenter manageability is a top priority for the digital society.
+ Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}.
\end{tcolorbox}
\begin{tcolorbox}[title=What problem did we solve?]
-
+ DCDTs, still under development, lack crucial features such as predictive analytics to manage datacenters well.
+ The entire DCDT design space remains largely unexplored.
\end{tcolorbox}
\begin{tcolorbox}[title=How did we solve this problem?]
-
+ Our contributions are: a thorough literature survey with a system model, a DCDT reference architecture, and prototype-based experiments via a novel evaluation method.
\end{tcolorbox}
- \begin{tcolorbox}[colbacktitle=red!70!black, colback=red!20!white,title=What did we find?]
-
- \end{tcolorbox}
-
- \begin{tcolorbox}[title=What do we see in future work?]
-
+ \begin{tcolorbox}[title=What did we find?]
+ \emph{Sunfish} can reliably detect unexpected failures based on discrete-event predictions, and can serve as a foundation for additional research and future work.
\end{tcolorbox}
-
+ % Mandatory to mention here the future work that you see happening.
+ % Not enough space for another tcolorbox.
\end{frame}
-\setcounter{framenumber}{4}
-\setbeamertemplate{footline}[page number]{
+\setcounter{framenumber}{3}
+\setbeamertemplate{footline}[page number]{}
+
% Unfortunately this must remain here.
\setbeamercolor{frametitle}{fg=Brown,bg=Brown!20}
@@ -213,28 +247,33 @@
\usebeamerfont{frametitle}\insertframetitle\hfill
\end{beamercolorbox}
}
-
\begin{frame}[allowframebreaks]\frametitle{Extra Slides: References}
\tiny
\bibliographystyle{is-plain}
\bibliography{main.bib}
\end{frame}
-\begin{frame}\frametitle{Extra Slides: Societal Impact}
- \begin{tcolorbox}[title=Why is this research important today?]
- Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}.
- Already the rapid
- expansion of datacenters has increased the presence of service failures across all cloud
- services~\cite{DBLP:conf/acsos/TalluriOVTI21}.
- We need to act now.
+\begin{frame}\frametitle{Extra Slides: Technical Setup}
+ \begin{tcolorbox}[title=What is the simulation workload?]
+ The compute workload is BitBrainsSmall.
+ The failure traces include user reports from Gmail, WhatsApp, Facebook and Twitter.
+ For predictions we use \texttt{prefabs}~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23}.
\end{tcolorbox}
- \begin{center}
- \includegraphics[height=10em]{images/manageability.pdf}
- \end{center}
- \tiny
- \textbf{Figure E.1:} Horizontally: the most important research areas in computer science in Netherlands.
- Vertically: qualities we should ensure across all research areas with the most outstanding impact on society.
- Datacenter manageability is a top-priority~\cite{DBLP:journals/corr/IosupKLVG22}.
+ \begin{tcolorbox}[title=What is the experiment environment?] A commodity laptop: Framework Laptop 13 with 32GB of DDR5 RAM and an AMD Ryzen 7840U processor, running Arch Linux with the Linux 7.0.13-arch1-1 kernel.
+
+ \end{tcolorbox}
+
+
+ \begin{tcolorbox}[title=How did we adjust OpenDC (Physical Twin)?]
+ We use a SURF~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23} datacenter topology with 277 hosts.
+ We wrote a custom Kotlin \texttt{ComputeMonitor} to export live metrics into Kafka, and a custom Kotlin \texttt{HTTPClient} to talk to the digital twin.
+ We add a new scheduling mechanism, the \texttt{SmartScheduler}.
+
+ \end{tcolorbox}
+ \begin{tcolorbox}[title=Which metrics do we measure?]
+ Timestamps, host names, uptime, downtime, CPU utilization \emph{etc.}
+ \end{tcolorbox}
+
\end{frame}
@@ -267,6 +306,9 @@
\tiny \textbf{Figure E.3:} Real-time control that is tightly-coupled with the IT equipment is a prerequisite for timely predictions within seconds/minutes~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
\end{frame}
+
+
+
% Computational Fluid Dynamics (CFD) have high computation overhead, unsuitable for real-time simulation of a dynamic datacenter.
%Moreover oftentimes a poorly configured CFD model can lead to high error rates~\cite{DBLP:conf/sensys/WangZD0TCWZ20}.
%Data-driven Machine Learning performs poorly by the cases not covered in the training data.