summaryrefslogtreecommitdiff
path: root/main.tex
diff options
context:
space:
mode:
Diffstat (limited to 'main.tex')
-rw-r--r--main.tex130
1 files changed, 83 insertions, 47 deletions
diff --git a/main.tex b/main.tex
index 1f0cf0d..444bd07 100644
--- a/main.tex
+++ b/main.tex
@@ -6,8 +6,7 @@
\begin{frame}\frametitle{Motivation}
\begin{tcolorbox}[title=Context]
- 21\textsuperscript{st} century datacenters (DC) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21} and
- modern computational needs of AI drive managers to diversify datacenters even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
+ 21\textsuperscript{st} century datacenters (DC) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21} and modern computational needs of AI drive managers to diversify them even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
As a result, datacenters become extremely complex and hard to operate, with millions of CPUs, GPUs, etc.
\end{tcolorbox}
\begin{center}
@@ -23,16 +22,14 @@
\begin{tcolorbox}[title=DCDTs lack predictive analytics]
We need Datacenter Digital Twins (DCDT) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
However, DCDTs are still actively developed and lack crucial features such as predictive analytics~\cite{DBLP:usdoe/report/AP26894} to \emph{e.g.,} prevent unexpected failures.
- With predictive analysis (\emph{e.g.,} simulation) DCDT's could save millions of lost \$USD~\cite{DBLP:conf/acsos/TalluriOVTI21}.
\end{tcolorbox}
\begin{center}
\includegraphics[width=0.9\linewidth]{images/predictive_analytics.pdf}
\end{center}
\tiny
- \textbf{Figure 1.2:} Where does our work fit within the field of datacenter digital twinning?
- There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin.
- \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis itself within \myCircledGreen{E}.}
+ \textbf{Figure 1.2:} Datacenter Digital Twin Diagram. There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin.
+ \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis framework (including simple storage capabilities) within \myCircledGreen{E}.}
\end{frame}
\begin{frame}\frametitle{Research Questions}
@@ -56,13 +53,13 @@
\begin{frame}\frametitle{\textbf{RQ1}: Literature Review I}
\begin{tcolorbox}[title=Main Finding I]
- The literature on DCDTs is scarce.
+ There is little literature on DCDTs.
Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}).
Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}.
Most lack predictive modelling of DC operations.
\end{tcolorbox}
\vspace{-0.1cm}
- \input{images/table.tex}
+ \input{sources/table.tex}
% Research on DTs for datacenters have been separate, siloed efforts focused on either datacenter cooling, network performance, power consumption or visualization efforts.
% CFD usually means Navier-Stokes equations.
% CFD models take ages to compute.
@@ -70,13 +67,12 @@
\begin{frame}\frametitle{\textbf{RQ1}: Literature Review II}
% Mandatory: split the figure into 2: top and bottom, and that way you can fill in the entire slide nicely.
-
\begin{tcolorbox}[title=A holistic DCDT system model]
- We propose a generic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce a concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment.
+ We propose a holistic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce the concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment.
\end{tcolorbox}
\begin{center}
\vspace{-0.1cm}
- \includegraphics[width=0.8\textwidth]{images/system_model2.pdf}
+ \includegraphics[width=0.8\textwidth]{images/system_model.pdf}
\end{center}
% The reason why the cooling system is in the graph is because of the fact that 40\% of total energy consumed in DCs comes from cooling~\cite{DBLP:conf/noms/ZhangZLZWC22}.
% It has come to the point where datacenters are being built in the Pan-Arctic region, such as Finland, Russia, Sweden, etc., with Iceland leading in number of DCs https://www.datacentermap.com/iceland/
@@ -85,6 +81,7 @@
\tiny
\textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC.
+ \emph{Note:} Federation is not included explicitly but is covered by the model.
% Consider splitting the figure into 2 a.k.a. top and bottom.
% By the AIAA definition, the DT mimics the structure and behaviour.
% Data Lake -> Data Storage
@@ -102,6 +99,7 @@
\vspace{-0.15cm}
\tiny
\textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture.
+ We call the system \emph{Sunfish}.
The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19} over several iterations in the past months.
\vspace{0.2cm}
\end{minipage}
@@ -112,8 +110,8 @@
\end{center}
\vspace{-0.2cm}
\tiny
- \textbf{Figure 1.5:} The prototype -- \emph{Sunfish}, and its components based on \textbf{Figure 1.4}.
- The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}.
+ \textbf{Figure 1.5:} The prototype and its components based on the architecture.
+ The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache as advised in~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}.
\vspace{0.1cm}
\end{minipage}
@@ -142,7 +140,7 @@
\begin{minipage}[b]{0.45\linewidth}
\vspace{-0.2cm}
\begin{center}
- \includegraphics[width=1.2\linewidth]{images/predictive_analyticsv3.pdf}
+ \includegraphics[width=1.2\linewidth]{images/novel_eval_method.pdf}
\end{center}
\tiny
\vspace{-0.2cm}
@@ -169,69 +167,86 @@
\hspace{-0.2cm}
\begin{minipage}[b]{0.45\linewidth}
\begin{center}
- \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_152341.pdf}
+ \includegraphics[width=1.1\textwidth]{images/red_yellow_alarms.pdf}
\end{center}
\vspace{-0.3cm}
\tiny
- \textbf{Figure 1.7:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures.
+ \textbf{Figure 1.7a:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures.
We use a threshold based on predictions done by the simulator and a statistical distribution.
\end{minipage}
\hspace{0.6cm}
\begin{minipage}[b]{0.45\linewidth}
\begin{center}
- \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_161052.pdf}
+ \includegraphics[width=1.1\textwidth]{images/failure_detecton_rate.pdf}
\end{center}
\vspace{-0.3cm}
\tiny
- \textbf{Figure 1.8:} Experiment 1b. The mean failure detection rate is around 15\%. Even though this seems low, if we look at \textbf{Fig. 1.9} (see Extra Slides), this simply means around 15\% of failures are unexpected.
+ \textbf{Figure 1.7b:} Experiment 1b. The mean failure detection rate is around 12\%. Even though this seems low, if we look at \textbf{Fig. E.1} (see Extra Slides), this simply means around 12\% of failures are unexpected.
\end{minipage}
% Explain what the axes are in the figure caption.
% Talk about the experimental setup in the figure.
% Give more reliable results than just numbers -- do statistical testing, i.e., standard deviation, confidence intervals.
\end{frame}
-%\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results II}
-% \begin{tcolorbox}[title=Evaluation]
-% Predictive analytics is core to digital twinning. We evaluate our system against the requirements (extra slides) by predicting an optimal scheduling policy.
-% During runtime, we make dynamic adjustments to the physical twin, if the scheduling results differ.
-% \end{tcolorbox}
-% \hspace{0.2cm}
-% \begin{minipage}[b]{0.32\linewidth}
-% \begin{center}
-% \includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf}
-% \end{center}
-% \vspace{-0.3cm}
-% \tiny
-% \textbf{Figure 1.9:} Experiment 1
-% \end{minipage}
-%\end{frame}
+\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results II} % Let's say we have some knowledge about the kind of workload we are going to run, e.g., Skype video calls.
+ % We can then estimate, based on previous Skype node failures and one of the statistical distributions, when failures are likely to happen.
+ % During the experiment, we unfortunately do not know which distribution the failures will follow, so we constantly check to see which one fits best, and dynamically adjust the scheduling policy based on that.
+ %---%
+ % Step 1: We know we are going to soon run a workload coming in from Skype. Let's try to predict the failure pattern we might encounter.
+ % Run the OpenDC simulator 5 times to estimate the possible failure patterns. Save the results inside the Digital Twin.
+ % Step 2: Run the Digital Twin. Each time a new metric comes in, update the similarity score of each possible distribution.
+ % If the distribution with the similarity score that is the highest is about to match timestamps with the running workload AND according to the distribution we are going to experience failures in hosts A,B,C,D, % We decide to stop scheduling tasks on hosts A,B,C and D (we send a message to the running datacenter).
+ \begin{tcolorbox}[title=Main Finding III]
+ Predicting failures in advance is really difficult. \emph{Sunfish} is capable of dynamic adjustments to the physical twin at runtime, and can slightly lower the number of failed tasks.
+ \end{tcolorbox}
+ \hspace{-0.2cm}
+ \begin{minipage}[b]{0.45\linewidth}
+ \begin{center}
+ \includegraphics[width=1.1\textwidth]{images/failure_likelihood.pdf}
+ \end{center}
+ \vspace{-0.3cm}
+ \tiny
+ \textbf{Figure 1.8a:} Experiment 2a. The figure shows which failure distribution is the most likely to be the true failure distribution while the simulation is running.
+ This figure shows the difficulty of predictive analytics.
+ \end{minipage}
+ \hspace{0.5cm}
+ \begin{minipage}[b]{0.45\linewidth}
+ \vspace{-0.1cm}
+ \begin{center}
+ \includegraphics[width=1.1\textwidth]{images/conceptual_experiment.pdf}
+ \end{center}
+ \vspace{-0.3cm}
+ \tiny
+ \textbf{Figure 1.8b:} Experiment 2b. With perfect precognition (\emph{i.e.,} knowing on which day, what failures might happen) we could lower the mean number of failures.
+ This experiment is a proof of concept (results are indicative only).
+ \end{minipage}
+\end{frame}
\begin{frame}\frametitle{Key Takeaways}
- \begin{tcolorbox}[title=What is the societal context?]
+ \begin{tcolorbox}[title=Societal Context]
Datacenter manageability is a top priority for the digital society.
Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}.
\end{tcolorbox}
- \begin{tcolorbox}[title=What problem did we solve?]
+ \begin{tcolorbox}[title=Problem Statement]
DCDTs, still under development, lack crucial features such as predictive analytics to manage datacenters well.
The entire DCDT design space remains largely unexplored.
\end{tcolorbox}
- \begin{tcolorbox}[title=How did we solve this problem?]
- Our contributions are: a thorough literature survey with a system model, a DCDT reference architecture, and prototype-based experiments via a novel evaluation method.
+ \begin{tcolorbox}[title=Contributions]
+ (1) A thorough literature survey with a system model, (2) a DCDT reference architecture, and (3) prototype-based experiments via a novel evaluation method.
\end{tcolorbox}
- \begin{tcolorbox}[title=What did we find?]
- \emph{Sunfish} can reliably detect unexpected failures based on discrete-event predictions, and can serve as a foundation for additional research and future work.
+ \begin{tcolorbox}[title=Main Findings]
+ \emph{Sunfish} can reliably detect unexpected failures based on discrete-event predictions, and can serve as a foundation for research and future work in predictive analytics.
\end{tcolorbox}
% Mandatory to mention here the future work that you see happening.
% Not enough space for another tcolorbox.
\end{frame}
-\setcounter{framenumber}{3}
+\setcounter{framenumber}{4}
\setbeamertemplate{footline}[page number]{}
-
% Unfortunately this must remain here.
\setbeamercolor{frametitle}{fg=Brown,bg=Brown!20}
\setbeamertemplate{frametitle}{
@@ -246,11 +261,11 @@
\bibliography{main.bib}
\end{frame}
-\begin{frame}\frametitle{Technical Setup }
+\begin{frame}\frametitle{Extra Slides: Technical Setup }
\begin{tcolorbox}[title=What is the simulation workload?]
The compute workload is BitBrainsSmall.
- The failure traces include user reports from Gmail, WhatsApp and Twitter.
-
+ The failure traces include Gmail, WhatsApp, Facebook and Twitter.
+ For predictions we use different statistical distributions~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23}.
\end{tcolorbox}
\begin{tcolorbox}[title=What is the experiment environment?] A commodity laptop: Framework Laptop 13, with 32GB of DDR5 RAM and an AMD Ryzen 7840U processor and an ArchLinux OS with Linux 7.0.13-arch1-1 kernel.
@@ -269,6 +284,18 @@
\end{frame}
+\begin{frame}\frametitle{Extra Slides: Experiment 1}
+ \begin{tcolorbox}[title=Clarification]
+ In experiment 1 we are able to single out severe failures, \emph{i.e.,} those that bring down more than a threshold number $\tau$ of hosts.
+ $\tau$ is determined using predictions based on a potential distribution of failures, modeled with ${\sim}\,\mathcal{N}(1.5, 1.5)$.
+ \end{tcolorbox}
+
+ \begin{center}
+ \includegraphics[width=0.58\linewidth]{images/alarms_vs_failures.pdf}
+ \end{center}
+ \tiny
+ \textbf{Figure E.1:} The comparison between failures experienced and alarms raised.
+\end{frame}
\begin{frame}\frametitle{Extra Slides: Why Digital Twinning?}
\begin{tcolorbox}[title=Definition]
@@ -299,11 +326,20 @@
\tiny \textbf{Figure E.3:} Real-time control that is tightly-coupled with the IT equipment is a prerequisite for timely predictions within seconds/minutes~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
\end{frame}
-
-
+\begin{frame}\frametitle{Extra Slides: Experiment 2}
+ \begin{tcolorbox}[title=Statistical Distributions]
+ Different failure distributions were used in order to predict the true failure distribution.
+ \textbf{Table E.1} summarizes the distributions.
+ \end{tcolorbox}
+ \begin{center}
+ \includegraphics[width=\linewidth]{images/failure_models_table.png}
+ \end{center}
+ \tiny
+ \textbf{Table E.1:} Different failure models used throughout this project.
+ All failure models come from Javadi \etal\ (for a more thorough overview see~\cite{DBLP:journals/jpdc/JavadiKIE13}).
+\end{frame}
% Computational Fluid Dynamics (CFD) have high computation overhead, unsuitable for real-time simulation of a dynamic datacenter.
%Moreover oftentimes a poorly configured CFD model can lead to high error rates~\cite{DBLP:conf/sensys/WangZD0TCWZ20}.
%Data-driven Machine Learning performs poorly by the cases not covered in the training data.
-
\end{document}