\documentclass[12pt, handout]{beamer} \input{style/style.tex} \begin{document} \frame{\titlepage \centering \footnotesize Online slideshow: \url{mjkw.pl/vu/bsc}} \begin{frame}\frametitle{Motivation} \begin{tcolorbox}[title=Context] 21\textsuperscript{st}-century datacenters (DCs) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21}, and the modern computational needs of AI drive managers to diversify datacenters even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. As a result, datacenters become extremely complex and hard to operate, with millions of CPUs, GPUs, etc. \end{tcolorbox} \begin{center} \includegraphics[width=\linewidth]{images/datacenter_complexity.png} \end{center} \tiny \textbf{Figure 1.1:} Society depends on datacenters to keep running, and therefore we cannot afford to let these systems break down or experience significant performance-related issues. With millions of servers in the largest datacenters, real-time management becomes very difficult. Left to right: a Google datacenter, server racks, Ada Lovelace AD102 GPU architecture. \end{frame} \begin{frame}\frametitle{Problem Statement} \begin{tcolorbox}[title=DCDTs lack predictive analytics] We need Datacenter Digital Twins (DCDTs) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. However, DCDTs are still under active development and lack crucial features such as predictive analytics~\cite{DBLP:usdoe/report/AP26894} to, \emph{e.g.,} prevent unexpected failures. With predictive analysis (\emph{e.g.,} simulation), DCDTs could save millions of US\$ in losses~\cite{DBLP:conf/acsos/TalluriOVTI21}. \end{tcolorbox} \begin{center} \includegraphics[width=0.9\linewidth]{images/predictive_analytics.pdf} \end{center} \tiny \textbf{Figure 1.2:} Where does our work fit within the field of datacenter digital twinning?
There are 5 core elements to any Digital Twin: \myCircled{A} the Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, and \myCircled{E} the features necessary to any Digital Twin. \textcolor{Green}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis itself within \myCircledGreen{E}.} \end{frame} \begin{frame}\frametitle{Research Questions} \begin{tcolorbox}[title=Main Research Question, colbacktitle=red!70!black,colback=red!20!white] How to enable predictive analytics for datacenters through digital twinning? \end{tcolorbox} \begin{tcolorbox}[title=Research Question 1] How to assess the current state of the art of digital twinning for datacenters? \end{tcolorbox} \begin{tcolorbox}[title=Research Question 2] How to design a reference architecture for a predictive datacenter digital twin using discrete-event simulation? \end{tcolorbox} \begin{tcolorbox}[title=Research Question 3] % no "and validate?" How to validate and evaluate a datacenter digital twin architecture in relation to system requirements? \end{tcolorbox} \end{frame} \begin{frame}\frametitle{\textbf{RQ1}: Literature Review I} \begin{tcolorbox}[title=Main Finding I] The literature on DCDTs is scarce. Some systems barely qualify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}). Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}. Most lack predictive modelling of DC operations.
\end{tcolorbox} \vspace{-0.1cm} \input{images/table.tex} % Research on DTs for datacenters has consisted of separate, siloed efforts focused on either datacenter cooling, network performance, power consumption or visualization. % CFD usually means Navier-Stokes equations. % CFD models take ages to compute. \end{frame} \begin{frame}\frametitle{\textbf{RQ1}: Literature Review II} % Mandatory: split the figure into 2: top and bottom, and that way you can fill in the entire slide nicely. \begin{tcolorbox}[title=A holistic DCDT system model] We propose a generic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. Within this model (see \textbf{Fig. 1.3}) we introduce the concept of the \emph{Digital Thread}: a bridge between the DCDT and the physical DC equipment. \end{tcolorbox} \begin{center} \vspace{-0.1cm} \includegraphics[width=0.8\textwidth]{images/system_model2.pdf} \end{center} % The cooling system is in the graph because 40\% of total energy consumed in DCs comes from cooling~\cite{DBLP:conf/noms/ZhangZLZWC22}. % It has come to the point where datacenters are being built in the Pan-Arctic region, such as Finland, Russia, Sweden, etc., with Iceland leading in number of DCs https://www.datacentermap.com/iceland/ % The SmarDC digital twin is purely to get more training data for AI models. % Not really a digital twin per se. \tiny \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC. % Consider splitting the figure into 2 a.k.a. top and bottom. % By the AIAA definition, the DT mimics the structure and behaviour.
% Data Lake -> Data Storage % Use cases of DTs found by Brewer et al.: augmented reality, forensic analysis and diagnostics, predictive modelling, failure detection, operational optimization, ``what-if'' scenarios and virtual prototyping. \end{frame} \begin{frame}\frametitle{\textbf{RQ2}: Reference Architecture} % Make Kafka logos clearly defined --> add a legend with icons? \hspace{-0.3cm} \begin{minipage}[b]{0.45\linewidth} \begin{center} % Change to Datacenter (Physical Twin) \includegraphics[width=1.15\textwidth]{images/ref_architecture.pdf} \end{center} \vspace{-0.15cm} \tiny \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19}. \vspace{0.2cm} \end{minipage} \hspace{0.6cm} \begin{minipage}[b]{0.45\linewidth} \begin{center} \includegraphics[width=1.17\linewidth]{images/implementation.png} \end{center} \vspace{-0.2cm} \tiny \textbf{Figure 1.5:} The prototype components based on \textbf{Figure 1.4}. The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}. \vspace{0.1cm} \end{minipage} % We decided to use discrete-event simulation, as opposed to computational fluid dynamics, because of the high development-time overhead needed for CFD. % CFD simply takes too long to run, making it unfeasible for real-time analytics and simulation. % Citing ExaDigit: [CFD] they are also more computationally expensive, generally making real-time operation unfeasible. % Consider adding this minipage directly to the ``draw.io'' diagram \end{frame} % You should drop \hfill completely, or replace it with a very minimal \hspace.
\begin{frame}\frametitle{\textbf{RQ3}: Experimental Setup I} \hspace{-0.3cm} \begin{minipage}[b]{0.45\linewidth} \begin{tcolorbox}[title=Problem, colbacktitle=red!70!black,colback=red!20!white] We cannot just go and test digital twins on large systems, because we do not have large systems at hand. Moreover, real-world experimentation is costly and unsustainable in the long run~\cite{DBLP:conf/ccgrid/MastenbroekAJLB21}. \end{tcolorbox} \vspace{0.5cm} \begin{tcolorbox}[title=Solution, colbacktitle=Green!70!black, colback=Green!20!white] \scriptsize The way we test our reference architecture prototype is by using multiple simulators. We use an additional OpenDC process to play the role of a real datacenter. \end{tcolorbox} \vspace{1cm} \end{minipage} \hspace{0.25cm} \begin{minipage}[b]{0.45\linewidth} \vspace{-0.2cm} \begin{center} \includegraphics[width=1.2\linewidth]{images/predictive_analyticsv3.pdf} \end{center} \tiny \vspace{-0.2cm} \textbf{Figure 1.6:} The experimental setup. Answering \textbf{RQ3} we provide a novel way to evaluate datacenter digital twins through discrete-event simulation. \end{minipage} \end{frame} \begin{frame}\frametitle{\textbf{RQ3}: Experimental Results I} % You have some model, and this can be based on multiple traces. %Get insight from CINECA --> you get a probability of certain hosts failing. % Anomaly detection --> CINECA, how good their detection is? %If you incorporate that? If you can make the case that because of our new digital twin we can incorporate such models, anomaly/failure detection, from CINECA. %If we had that in, we can reach these kinds of gains. % @Mateusz there is really not a possibility to incorporate CINECA's models, so to address Dante's feedback, I created this experiment. % If a single host crashes for the entire workload, that's not really that bad. % If a lot of hosts suddenly crash but for a really short time, that's terrible. % Failures that are more intensive are worse than failures with long duration.
\begin{tcolorbox}[title=Main Finding II] We posit that digital twinning can be used for failure detection to the benefit of DC operators. We replicate an experiment from DyTwin~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24} designed by Milojicic \etal to show our system can reliably detect \emph{unexpected} host failures. \end{tcolorbox} \hspace{-0.2cm} \begin{minipage}[b]{0.45\linewidth} \begin{center} \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_152341.pdf} \end{center} \vspace{-0.3cm} \tiny \textbf{Figure 1.7:} Experiment 1a. In this experiment we use red and yellow alarms to notify datacenter operators of unexpected failures. We use a threshold based on predictions made by the simulator. \end{minipage} \hspace{0.6cm} \begin{minipage}[b]{0.45\linewidth} \begin{center} \includegraphics[width=1.1\textwidth]{images/25_Jun_2026_161052.pdf} \end{center} \vspace{-0.3cm} \tiny \textbf{Figure 1.8:} Experiment 1b. The mean failure detection rate is around 15\%. Even though this seems low, if we look at \textbf{Fig. 1.9} (see extra slides), this simply means around 15\% of failures are unexpected. \end{minipage} % Explain what the axes are in the figure caption. % Talk about the experimental setup in the figure. % Give more reliable results than just numbers -- do statistical testing, i.e., standard deviation, confidence intervals. \end{frame} %\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results II} % \begin{tcolorbox}[title=Evaluation] % Predictive analytics is core to digital twinning. We evaluate our system against the requirements (extra slides) by predicting an optimal scheduling policy. % During runtime, we make dynamic adjustments to the physical twin, if the scheduling results differ.
% \end{tcolorbox} % \hspace{0.2cm} % \begin{minipage}[b]{0.32\linewidth} % \begin{center} % \includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf} % \end{center} % \vspace{-0.3cm} % \tiny % \textbf{Figure 1.9:} Experiment 1 % \end{minipage} %\end{frame} \begin{frame}\frametitle{Key Takeaways} \begin{tcolorbox}[title=What is the societal context?] Datacenter manageability is a top priority for the digital society. Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}. \end{tcolorbox} \begin{tcolorbox}[title=What problem did we solve?] DCDTs, still under development, lack crucial features such as predictive analytics to manage datacenters well. The entire DCDT design space remains largely unexplored. \end{tcolorbox} \begin{tcolorbox}[title=How did we solve this problem?] Our contributions are: a thorough literature survey with a system model, a DCDT reference architecture, and prototype-based experiments via a novel evaluation method. \end{tcolorbox} \begin{tcolorbox}[title=What did we find?] \emph{Sunfish} can reliably detect unexpected failures based on discrete-event predictions, and can serve as a foundation for additional research and future work. \end{tcolorbox} % Mandatory to mention here the future work that you see happening. % Not enough space for another tcolorbox. \end{frame} \setcounter{framenumber}{3} \setbeamertemplate{footline}[page number]{} % Unfortunately this must remain here.
\setbeamercolor{frametitle}{fg=Brown,bg=Brown!20} \setbeamertemplate{frametitle}{ \vspace*{-0.1cm} \begin{beamercolorbox}[wd=\paperwidth, ht=0.75cm, dp=0.3cm,leftskip=10pt, rightskip=10pt]{frametitle} \usebeamerfont{frametitle}\insertframetitle\hfill \end{beamercolorbox} } \begin{frame}[allowframebreaks]\frametitle{Extra Slides: References} \tiny \bibliographystyle{is-plain} \bibliography{main.bib} \end{frame} \begin{frame}\frametitle{Extra Slides: Technical Setup} \begin{tcolorbox}[title=What is the simulation workload?] The compute workload is BitBrainsSmall. The failure traces include user reports from Gmail, WhatsApp and Twitter. \end{tcolorbox} \begin{tcolorbox}[title=What is the experiment environment?] A commodity laptop: Framework Laptop 13, with 32\,GB of DDR5 RAM, an AMD Ryzen 7840U processor, and Arch Linux with the Linux 7.0.13-arch1-1 kernel. \end{tcolorbox} \begin{tcolorbox}[title=How did we adjust OpenDC (Physical Twin)?] We use a SURF~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23} datacenter topology with 277 hosts. We wrote a custom Kotlin \texttt{ComputeMonitor} to export live metrics into Kafka, and a custom Kotlin \texttt{HTTPClient} to talk to the digital twin. We add a new scheduling mechanism, the \texttt{SmartScheduler}. \end{tcolorbox} \begin{tcolorbox}[title=Which metrics do we measure?] Timestamps, host names, uptime, downtime, CPU utilization, \emph{etc.} \end{tcolorbox} \end{frame} \begin{frame}\frametitle{Extra Slides: Why Digital Twinning?} \begin{tcolorbox}[title=Definition] A DCDT mirrors the structure, context and behaviour of a datacenter~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. The prerequisite to any digital twin is good monitoring and sensing capabilities in the physical entity. Datacenters meet this requirement easily because they already connect hundreds of monitoring sensors.
\end{tcolorbox} \begin{center} \includegraphics[height=10em]{images/dt_timeline.pdf} \end{center} \tiny \textbf{Figure E.2:} Due to insufficient technological foundations, little work is available on DTs between 2003 and 2018, and it is only with the rapid growth of cloud computing, Internet-of-Things and Big Data analytics that DTs have reemerged~\cite{DBLP:conf/cirp/TAO2018169}. That is why nobody used digital twins to mirror datacenters earlier. \end{frame} \begin{frame}\frametitle{Extra Slides: Why not pure simulation?} \begin{tcolorbox}[title=Predicting job failures] Preventing failure-caused outages in advance can avoid huge operational costs, as over 20\% of all reported outages amount to more than 1 million US\$~\cite{DBLP:report/AnnualOutageAnalysis2025}. Only a constant bi-directional interaction (digital twin $\iff$ physical entity) can achieve this. \end{tcolorbox} \begin{center} \includegraphics[height=10em]{images/digital_twin_ms.pdf} \end{center} \tiny \textbf{Figure E.3:} Real-time control that is tightly coupled with the IT equipment is a prerequisite for timely predictions within seconds/minutes~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. \end{frame} % Computational Fluid Dynamics (CFD) has a high computational overhead, making it unsuitable for real-time simulation of a dynamic datacenter. %Moreover, a poorly configured CFD model can often lead to high error rates~\cite{DBLP:conf/sensys/WangZD0TCWZ20}. %Data-driven Machine Learning performs poorly on cases not covered in the training data. \end{document}