From 604201a150c62285ad5421d1cdd2928668bf0bee Mon Sep 17 00:00:00 2001 From: mjkwiatkowski Date: Thu, 2 Jul 2026 19:02:27 +0200 Subject: feat: last changes --- indent.log | 2 +- main.tex | 302 +++++++++++++++++++++++++++++++------------------------- style/style.tex | 2 +- 3 files changed, 168 insertions(+), 138 deletions(-) diff --git a/indent.log b/indent.log index 2b112d6..92ad483 100644 --- a/indent.log +++ b/indent.log @@ -1,6 +1,6 @@ INFO: latexindent version 3.24.7, 2025-08-15, a script to indent .tex files latexindent lives here: /usr/share/texmf-dist/scripts/latexindent/ - Sun Jun 28 19:02:38 2026 + Sun Jun 28 19:57:30 2026 Reading input from STDIN INFO: Processing switches: INFO: Directory for backup files and log file indent.log: diff --git a/main.tex b/main.tex index 444bd07..8908ab0 100644 --- a/main.tex +++ b/main.tex @@ -1,119 +1,140 @@ -\documentclass[12pt, handout]{beamer} +\documentclass[12pt]{beamer} \input{style/style.tex} \begin{document} \frame{\titlepage \centering \footnotesize Online slideshow: \url{mjkw.pl/vu/bsc}} -\begin{frame}\frametitle{Motivation} - \begin{tcolorbox}[title=Context] - 21\textsuperscript{st} century datacenters (DC) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21} and modern computational needs of AI drive managers to diversify them even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. - In result datacenters become extremely complex and hard to operate with millions of CPU's, GPU's etc. - \end{tcolorbox} - \begin{center} - \includegraphics[width=\linewidth]{images/datacenter_complexity.png} - \end{center} - \tiny - \textbf{Figure 1.1:} Society depends on datacenters to keep running, and therefore we cannot afford to let these systems break down or experience significant performance-related issues. - With millions of servers in the largest datacenters, real-time management becomes very difficult. - Left to right: a Google datacenter, server racks, Ada Lovelace AD102 GPU architecture. 
+\begin{frame}[t]\frametitle{Motivation} + \only<1-2>{ + \vspace{-0.2cm} + \begin{tcolorbox}[title=Context] + 21\textsuperscript{st} century datacenters (DC) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21} and modern computational needs of AI drive managers to diversify them even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. + As a result, datacenters become extremely complex and hard to operate with millions of CPUs, GPUs, etc. + \end{tcolorbox} + } + \only<2>{\begin{center} + \includegraphics[width=\linewidth]{images/datacenter_complexity.png} + \end{center} + \tiny + \textbf{Figure 1.1:} Society depends on datacenters to keep running, and therefore we cannot afford to let these systems break down or experience significant performance-related issues. + With millions of servers in the largest datacenters, real-time management becomes very difficult. + Left to right: a Google datacenter, server racks, Ada Lovelace AD102 GPU architecture. + } \end{frame} -\begin{frame}\frametitle{Problem Statement} - \begin{tcolorbox}[title=DCDT's lack predictive analytics] - We need Datacenter Digital Twins (DCDT) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. - However, DCDT's are still actively developed and lack crucial features such as predictive analytics~\cite{DBLP:usdoe/report/AP26894} to \emph{e.g.,} prevent unexpected failures. - \end{tcolorbox} - - \begin{center} - \includegraphics[width=0.9\linewidth]{images/predictive_analytics.pdf} - \end{center} - \tiny - \textbf{Figure 1.2:} Datacenter Digital Twin Diagram. There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin.
- \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis framework (including simple storage capabilities) within \myCircledGreen{E}.} +\begin{frame}[t]\frametitle{Problem Statement} + \only<1-2>{ + \vspace{-0.2cm} + \begin{tcolorbox}[title=DCDT's lack predictive analytics] + We need Datacenter Digital Twins (DCDT) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. + However, DCDT's are still actively developed and lack crucial features such as predictive analytics~\cite{DBLP:usdoe/report/AP26894} to \emph{e.g.,} prevent unexpected failures. + \end{tcolorbox} + } + \only<2>{ + \begin{center} + \includegraphics[width=0.9\linewidth]{images/predictive_analytics.pdf} + \end{center} + \tiny + \textbf{Figure 1.2:} Datacenter Digital Twin Diagram. There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin. + \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis framework (including simple storage capabilities) within \myCircledGreen{E}.} + } \end{frame} -\begin{frame}\frametitle{Research Questions} - \begin{tcolorbox}[title=Main Research Question, colbacktitle=red!70!black,colback=red!20!white] - How to enable predictive analytics for datacenters through digital twinning? 
- \end{tcolorbox} +\begin{frame}[t]\frametitle{Research Questions} + \only<1-4>{ + \vspace{-0.2cm} + \begin{tcolorbox}[title=Main Research Question, colbacktitle=red!70!black,colback=red!20!white] + How to enable predictive analytics for datacenters through digital twinning? + \end{tcolorbox}} - \begin{tcolorbox}[title=Research Question 1] - How to asses the current state-of-the-art of digital twinning for datacenters? - \end{tcolorbox} + \only<2-4>{\begin{tcolorbox}[title=Research Question 1] + How to assess the current state-of-the-art of digital twinning for datacenters? + \end{tcolorbox}} - \begin{tcolorbox}[title=Research Question 2] - How to design a reference architecture for a predictive datacenter digital twin using discrete-event simulation? - \end{tcolorbox} + \only<3-4>{\begin{tcolorbox}[title=Research Question 2] + How to design a reference architecture for a predictive datacenter digital twin using discrete-event simulation? + \end{tcolorbox} + } - \begin{tcolorbox}[title=Research Question 3] - % no "and validate?" - How to validate and evaluate a datacenter digital twin architecture in relation to system requirements? - \end{tcolorbox} + \only<4>{\begin{tcolorbox}[title=Research Question 3] + % no "and validate?" + How to validate and evaluate a datacenter digital twin architecture in relation to system requirements? + \end{tcolorbox} + } \end{frame} -\begin{frame}\frametitle{\textbf{RQ1}: Literature Review I} - \begin{tcolorbox}[title=Main Finding I] - There is little literature on DCDTs. - Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}). - Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}. - Most lack predictive modelling of DC operations.
- \end{tcolorbox} - \vspace{-0.1cm} - \input{sources/table.tex} +\begin{frame}[t]\frametitle{\textbf{RQ1}: Literature Review I} + \only<1-2>{ + \vspace{-0.2cm} + \begin{tcolorbox}[title=Main Finding I] + There is little literature on DCDTs. + Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}). + Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}. + Most lack predictive modelling of DC operations. + \end{tcolorbox}} + \only<2>{ + \vspace{-0.2cm} + \input{sources/table.tex} + + } % Research on DTs for datacenters have been separate, siloed efforts focused on either datacenter cooling, network performance, power consumption or visualization efforts. % CFD usually means Navier-Stokes equations. % CFD models take ages to compute. \end{frame} -\begin{frame}\frametitle{\textbf{RQ1}: Literature Review II} +\begin{frame}[t]\frametitle{\textbf{RQ1}: Literature Review II} % Mandatory: split the figure into 2: top and bottom, and that way you can fill in the entire slide nicely. - \begin{tcolorbox}[title=A holistic DCDT system model] - We propose a holistic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce a concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment. - \end{tcolorbox} - \begin{center} - \vspace{-0.1cm} - \includegraphics[width=0.8\textwidth]{images/system_model.pdf} - \end{center} - % The reason why the cooling system is in the graph is because of the fact that 40\% of total energy consumed in DCs comes from cooling~\cite{DBLP:conf/noms/ZhangZLZWC22}. - % It has come to the point where datacenters are being build in the Pan-Arctic region, such as Finland,Russia,Sweden etc. 
with Iceland leading in number of DCs https://www.datacentermap.com/iceland/ - % The SmarDC digital twin is purely to get more training data for AI models. - % Not really a digital twin per se. - - \tiny - \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC. - \emph{Note:} Federation is not included explicitly but is covered by the model. - % Consider splitting the figure into 2 a.k.a. top and bottom. - % By the AIAA definition, the DT mimicks the structure and behaviour. - % Data Lake -> Data Storage - % Use cases of DT's found by Brewer et al.: augmented reality, forensic analysis and diagnostics, predictive modelling, failure detection, operational optimization, ``what-if''' scenarios and virtual prototyping. + \only<1-2>{\vspace{-0.2cm} + \begin{tcolorbox}[title=A holistic DCDT system model] + We propose a holistic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce a concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment. + \end{tcolorbox} + } + \only<2>{\begin{center} + \vspace{-0.1cm} + \includegraphics[width=0.8\textwidth]{images/system_model.pdf} + \end{center} + % The reason why the cooling system is in the graph is because of the fact that 40\% of total energy consumed in DCs comes from cooling~\cite{DBLP:conf/noms/ZhangZLZWC22}. + % It has come to the point where datacenters are being build in the Pan-Arctic region, such as Finland,Russia,Sweden etc. with Iceland leading in number of DCs https://www.datacentermap.com/iceland/ + % The SmarDC digital twin is purely to get more training data for AI models. + % Not really a digital twin per se. 
+ \tiny + \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC. + \emph{Note:} Federation is not included explicitly but is covered by the model. + % Consider splitting the figure into 2 a.k.a. top and bottom. + % By the AIAA definition, the DT mimicks the structure and behaviour. + % Data Lake -> Data Storage + % Use cases of DT's found by Brewer et al.: augmented reality, forensic analysis and diagnostics, predictive modelling, failure detection, operational optimization, ``what-if''' scenarios and virtual prototyping. + } \end{frame} \begin{frame}\frametitle{\textbf{RQ2}: Reference Architecture} % Make Kafka logos clearly defined --> add a legend with icons? - \hspace{-0.3cm} - \begin{minipage}[b]{0.45\linewidth} - \begin{center} - % Change to Datacenter (Physical Twin) - \includegraphics[width=1.15\textwidth]{images/ref_architecture.pdf} - \end{center} - \vspace{-0.15cm} - \tiny - \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. - We call the system \emph{Sunfish}. - The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19} over several iterations in the past months. - \vspace{0.2cm} - \end{minipage} - \hspace{0.6cm} - \begin{minipage}[b]{0.45\linewidth} - \begin{center} - \includegraphics[width=1.17\linewidth]{images/implementation.png} - \end{center} - \vspace{-0.2cm} - \tiny - \textbf{Figure 1.5:} The prototype and its components based on the architecture. - The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache as advised in ~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}. 
- \vspace{0.1cm} - \end{minipage} + \only<1-2>{\hspace{-0.3cm} + \begin{minipage}[b]{0.45\linewidth} + \begin{center} + % Change to Datacenter (Physical Twin) + \includegraphics[width=1.15\textwidth]{images/ref_architecture.pdf} + \end{center} + \vspace{-0.15cm} + \tiny + \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. + We call the system \emph{Sunfish}. + The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19} over several iterations in the past months. + \vspace{0.2cm} + \end{minipage} + \hspace{0.6cm} + } + \only<2>{\begin{minipage}[b]{0.45\linewidth} + \begin{center} + \includegraphics[width=1.17\linewidth]{images/implementation.png} + \end{center} + \vspace{-0.2cm} + \tiny + \textbf{Figure 1.5:} The prototype and its components based on the architecture. + The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache as advised in~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}. + \vspace{0.1cm} + \end{minipage} + } % We decided to use discrete-event simulation, as opposed to computational fluid dynamics because of the high overheads of development time needed for CFD. % CFD simply takes too long to run, making it unfeasible for real-time analytics and simulation. @@ -162,27 +183,31 @@ % Failures that are more intensive are worse than failures with long duration. \begin{tcolorbox}[title=Main Finding II] We posit digital twinning can be used for failure detection to the benefit of DC operators. - We replicate an experiment from DyTwin~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24} designed by Milojicic \etal to show our system can reliably detect \emph{unexpected} host failures. + We replicate an experiment from DyTwin~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24} designed by Milojicic \etal to show \emph{Sunfish} can reliably detect \emph{unexpected} host failures.
\end{tcolorbox} - \hspace{-0.2cm} - \begin{minipage}[b]{0.45\linewidth} - \begin{center} - \includegraphics[width=1.1\textwidth]{images/red_yellow_alarms.pdf} - \end{center} - \vspace{-0.3cm} - \tiny - \textbf{Figure 1.7a:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures. - We use a threshold based on predictions done by the simulator and a statistical distribution. - \end{minipage} - \hspace{0.6cm} - \begin{minipage}[b]{0.45\linewidth} - \begin{center} - \includegraphics[width=1.1\textwidth]{images/failure_detecton_rate.pdf} - \end{center} - \vspace{-0.3cm} - \tiny - \textbf{Figure 1.7b:} Experiment 1b. The mean failure detection rate is around 12\%. Even though this seems low, if we look at \textbf{Fig. E.1} (see Extra Slides), this simply means around 12\% of failures are unexpected. - \end{minipage} + \only<1-2>{ + \hspace{-0.2cm} + \begin{minipage}[b]{0.45\linewidth} + \vspace{0.1cm} + \begin{center} + \includegraphics[width=1.1\textwidth]{images/red_yellow_alarms.pdf} + \end{center} + \vspace{-0.3cm} + \tiny + \textbf{Figure 1.7a:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures. + We use a threshold based on predictions done by the simulator and a statistical distribution. + \end{minipage} + \hspace{0.6cm} + } + \only<2>{\begin{minipage}[b]{0.45\linewidth} + \begin{center} + \includegraphics[width=1.1\textwidth]{images/failure_detecton_rate.pdf} + \end{center} + \vspace{-0.3cm} + \tiny + \textbf{Figure 1.7b:} Experiment 1b. The mean failure detection rate is around 12\%. Even though this seems low, if we look at \textbf{Fig. E.1} (see Extra Slides), this simply means around 12\% of failures are unexpected. + \end{minipage} + } % Explain what the axis are in the figure caption. % Talk about the experimental setup in the figure. 
% Give more reliable results than just numbers -- do statistical testing, i.e., standard deviation, confidence intervals. @@ -199,33 +224,38 @@ \begin{tcolorbox}[title=Main Finding III] Predicting failures in advance is really difficult. \emph{Sunfish} is capable of dynamic adjustments to the physical twin at runtime, and can slightly lower the number of failed tasks. \end{tcolorbox} - \hspace{-0.2cm} - \begin{minipage}[b]{0.45\linewidth} - \begin{center} - \includegraphics[width=1.1\textwidth]{images/failure_likelihood.pdf} - \end{center} - \vspace{-0.3cm} - \tiny - \textbf{Figure 1.8a:} Experiment 2a. The figure shows which failure distribution is the most likely to be the true failure distribution while the simulation is running. - This figure shows the difficulty of predictive analytics. - \end{minipage} - \hspace{0.5cm} - \begin{minipage}[b]{0.45\linewidth} - \vspace{-0.1cm} - \begin{center} - \includegraphics[width=1.1\textwidth]{images/conceptual_experiment.pdf} - \end{center} - \vspace{-0.3cm} - \tiny - \textbf{Figure 1.8b:} Experiment 2b. With perfect precognition (\emph{i.e.,} knowing on which day, what failures might happen) we could lower the mean number of failures. - This experiment is a proof of concept (results are indication-only). - \end{minipage} + \only<1-2>{ + \hspace{-0.2cm} + \begin{minipage}[b]{0.45\linewidth} + \vspace{0.2cm} + \begin{center} + \includegraphics[width=1.1\textwidth]{images/failure_likelihood.pdf} + \end{center} + \vspace{-0.3cm} + \tiny + \textbf{Figure 1.8a:} Experiment 2a. The figure shows which failure distribution is the most likely to be the true failure distribution while the simulation is running. + This figure shows the difficulty of predictive analytics. 
+ \end{minipage} + \hspace{0.5cm} + } + \only<2>{\begin{minipage}[b]{0.45\linewidth} + \vspace{-0.1cm} + \begin{center} + \includegraphics[width=1.1\textwidth]{images/conceptual_experiment.pdf} + \end{center} + \vspace{-0.3cm} + \tiny + \textbf{Figure 1.8b:} Experiment 2b. With perfect precognition (\emph{i.e.,} knowing on which day, what failures might happen) we could lower the mean number of failures. + This experiment is a proof of concept (results are indication-only). + \end{minipage} + } \end{frame} \begin{frame}\frametitle{Key Takeaways} + \vspace{-0.2cm} \begin{tcolorbox}[title=Societal Context] Datacenter manageability is a top-priority for the digital society. - Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}. + Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}. \end{tcolorbox} \begin{tcolorbox}[title=Problem Statement] @@ -244,7 +274,7 @@ % Not enough space for another tcolorbox. \end{frame} -\setcounter{framenumber}{4} +\setcounter{framenumber}{2} \setbeamertemplate{footline}[page number]{} % Unfortunately this must remain here. @@ -272,7 +302,7 @@ \end{tcolorbox} - \begin{tcolorbox}[title=How did we adjust OpenDC (Physical Twin)?] + \begin{tcolorbox}[title=How did we adjust OpenDC (Physical Twin)?] We use a SURF~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23} datacenter topology with 277 hosts. We wrote a custom Kotlin \texttt{ComputeMonitor} to export live-metrics into Kafka, and a custom Kotlin \texttt{HTTPClient} to talk to the digital twin. We add a new scheduling mechanism, the \texttt{SmartScheduler}.
diff --git a/style/style.tex b/style/style.tex index 9f2647c..1020b8f 100644 --- a/style/style.tex +++ b/style/style.tex @@ -1,6 +1,6 @@ \usetheme{Rochester} \usepackage[dvipsnames]{xcolor} -\usepackage{helvet, textpos, stix, caption, booktabs, array, lipsum, fontawesome5, circledsteps, url, inconsolata, amsmath, amssymb, xspace} +\usepackage{helvet, textpos, stix, caption, booktabs, array, lipsum, fontawesome5, circledsteps, url, inconsolata, amsmath, amssymb, xspace, animate, multimedia} \newcommand{\etal}{\emph{et~al.}\xspace} -- cgit v1.2.3