summaryrefslogtreecommitdiff
path: root/main.tex
blob: a6194ca7be38397ac0915d2142a0102df0704d5a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
\documentclass[12pt, handout]{beamer}
\input{style/style.tex}
\begin{document}

\frame{\titlepage \centering \footnotesize Online slideshow: \url{mjkw.pl/vu/bsc}}

\begin{frame}\frametitle{Motivation}
	\begin{tcolorbox}[title=Context]
		21\textsuperscript{st} century datacenters (DC) are mostly heterogeneous~\cite{DBLP:conf/date/MilojicicFDR21} and
		modern computational needs of AI drive managers to diversify datacenters even more~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
		As a result, datacenters become extremely complex and hard to operate, with millions of CPUs, GPUs, etc.
	\end{tcolorbox}
	\begin{center}
		\includegraphics[width=\linewidth]{images/datacenter_complexity.pdf}
	\end{center}
	\tiny
	\textbf{Figure 1.1:} Society depends on datacenters to keep running, and therefore we cannot afford to let these systems break down or experience significant performance-related issues.
	With millions of servers in the largest datacenters, real-time management becomes very difficult.
	Left to right: a Google datacenter, server racks, Ada Lovelace AD102 GPU architecture.
\end{frame}


\begin{frame}\frametitle{Problem Statement}
	\begin{tcolorbox}[title=DCDTs lack predictive analytics]
		We need Datacenter Digital Twins (DCDT) to be better able to detect and solve issues in critical ICT infrastructure~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
		However, DCDTs are still under active development and lack crucial features such as predictive analytics~\cite{DBLP:usdoe/report/AP26894} to, \emph{e.g.,} prevent unexpected failures.
		With predictive analysis (\emph{e.g.,} simulation), DCDTs could save millions of US\$ in losses~\cite{DBLP:conf/acsos/TalluriOVTI21}.
	\end{tcolorbox}

	\begin{center}
		\includegraphics[width=0.9\linewidth]{images/predictive_analytics.pdf}
	\end{center}
	\tiny
	\textbf{Figure 1.2:} Where does our work fit within the field of datacenter digital twinning?
	There are 5 core elements to any Digital Twin: \myCircled{A} The Digital $\rightarrow$ Physical Twin link, \myCircled{B} the Physical Twin (\emph{e.g.,} the datacenter), \myCircled{C} the Physical $\rightarrow$ Digital Twin link, \myCircled{D} the Digital Twin, \myCircled{E} the features necessary to any Digital Twin.
	\textcolor{ForestGreen}{\faHighlighter~Highlighted areas are the contributions from this thesis, which include the autonomous actions resulting from predictive insights \myCircledGreen{A} and the predictive analysis itself within \myCircledGreen{E}.}
\end{frame}

\begin{frame}\frametitle{Research Questions}
	\begin{tcolorbox}[title=Main Research Question, colbacktitle=red!70!black,colback=red!20!white]
		How to enable predictive analytics for datacenters through digital twinning?
	\end{tcolorbox}

	\begin{tcolorbox}[title=Research Question 1]
		How to assess the current state of the art of digital twinning for datacenters?
	\end{tcolorbox}

	\begin{tcolorbox}[title=Research Question 2]
		How to design a reference architecture for a predictive datacenter digital twin using discrete-event simulation?
	\end{tcolorbox}

	\begin{tcolorbox}[title=Research Question 3]
		How to evaluate and validate a datacenter digital twin architecture in relation to system requirements?
	\end{tcolorbox}
\end{frame}


\begin{frame}\frametitle{\textbf{RQ1}: Literature Review I}
	\begin{tcolorbox}[title=Results]
		The literature on DCDTs is scarce.
		Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}).
		Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}.
		Most lack crucial predictive DC behaviour modelling.
	\end{tcolorbox}
	\input{images/table.tex}
	% Research on DTs for datacenters has been separate, siloed efforts focused on either datacenter cooling, network performance, power consumption or visualization efforts.
	% CFD usually means Navier-Stokes equations.
	% CFD models take ages to compute.
\end{frame}

\begin{frame}\frametitle{\textbf{RQ1}: Literature Review II}
	% Mandatory: split the figure into 2: top and bottom, and that way you can fill in the entire slide nicely.

	\begin{tcolorbox}[title=A holistic DCDT system model]
		We propose a generic model of datacenter digital twinning that can be mapped to each system from \textbf{Table 1.1}. To answer \textbf{RQ2}, we design a ref.\ arch.\ for the \emph{Operations Model}.
		We introduce the \emph{Digital Thread}: a bridge between software and reality.
	\end{tcolorbox}
	\begin{center}
		\vspace{-0.1cm}
		\includegraphics[width=0.8\textwidth]{images/system_model2.pdf}
	\end{center}
	% The reason why the cooling system is in the graph is because of the fact that 40\% of total energy consumed in DCs comes from cooling~\cite{DBLP:conf/noms/ZhangZLZWC22}.
	% It has come to the point where datacenters are being built in the Pan-Arctic region, such as Finland, Russia, Sweden etc., with Iceland leading in number of DCs: https://www.datacentermap.com/iceland/
	% The SmarDC digital twin is purely to get more training data for AI models.
	% Not really a digital twin per se.

	\tiny
	\textbf{Figure 1.3:} To answer \textbf{RQ1}, we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC.
	% Consider splitting the figure into 2 a.k.a. top and bottom.
	% By the AIAA definition, the DT mimics the structure and behaviour.
	% Data Lake -> Data Storage
	% Use cases of DTs found by Brewer et al.: augmented reality, forensic analysis and diagnostics, predictive modelling, failure detection, operational optimization, ``what-if'' scenarios and virtual prototyping.
\end{frame}

\begin{frame}\frametitle{\textbf{RQ2}: Reference Architecture}
	\begin{minipage}[b]{0.45\linewidth}
		\begin{tcolorbox}[title=Use cases]

		\end{tcolorbox}
		\vspace{1cm}
	\end{minipage}
	\begin{minipage}[b]{0.45\linewidth}
		\begin{center}
			\includegraphics[width=1.25\textwidth]{images/ref_architecture.pdf}
		\end{center}
		\vspace{-0.2cm}
		\tiny
		\textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. 	\end{minipage}
	% We decided to use discrete-event simulation, as opposed to computational fluid dynamics because of the high overheads of development time needed for CFD.
	% CFD simply takes too long to run, making it unfeasible for real-time analytics and simulation.
	% Citing ExaDigit: [CFD] they are also more computationally expensive, generally making real-time operation unfeasible.
	% Consider adding this minipage directly to the ``draw.io'' diagram
\end{frame}
% You should skip \hfill completely or in favour of \hspace very minimally.
\begin{frame}\frametitle{\textbf{RQ3}: Experimental Setup}
	% 			The software stack of \emph{Sunfish} includes state-of-the-art software.
	%The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache, as advised in~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}.

	\begin{minipage}[b]{0.45\linewidth}
		\begin{tcolorbox}[title=Setup Recipe]
			\scriptsize
			\textbf{Step 1.} Ensure Redis and PostgreSQL servers are up and alive.\newline

			\textbf{Step 2.} Run a Confluent Kafka setup: Kafka Connect, Schema Registry and a Kafka server.\newline

			\textbf{Step 3.} Start the Python HTTP Server, and the Python Redis Monitor.\newline

			\textbf{Step 4.} Run the (modified) OpenDC (physical twin) with example experiment.\newline

			\textbf{Step 5.} \emph{Sunfish} will automatically start a second OpenDC instance, and start the data analysis.
		\end{tcolorbox}
		\vspace{0.5cm}
	\end{minipage}
	\hspace{0.35cm}
	\begin{minipage}[b]{0.45\linewidth}
		\vspace{-0.2cm}
		\begin{center}
			\includegraphics[width=1.2\linewidth]{images/predictive_analyticsv2.pdf}
		\end{center}
		\tiny
		\vspace{-0.4cm}
		\textbf{Figure 1.5:} We can't just go and test digital twins on large systems as large systems often aren't at hand.
		Answering \textbf{RQ3}, we provide a novel way to evaluate datacenter digital twins through discrete-event simulation instead.
	\end{minipage}
\end{frame}

\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results I}
	% You have some model, and this can be based on multiple traces.
	%Get insight from CINECA --> you get a probability of certain hosts failing.
	% Anomaly detection --> CINECA, how good their detection is?
	%If you incorporate that? If you can make the case that because of our new digital twin we can incorporate such models, anomaly/failure detection, from CINECA.
	%If we had that in, we can reach these kinds of gains.
	% @Mateusz there is really not a possibility to incorporate CINECA's models, so to address Dante's feedback, I created this experiment.

	\begin{tcolorbox}[title=Failure Detection: Main Finding I]
		On average, \emph{Sunfish} can detect 14.5\% of unexpected failures in the physical twin.
		We show that digital twinning \emph{can} be used for failure detection.

	\end{tcolorbox}
	\begin{minipage}[b]{0.45\linewidth}
		\begin{center}
			\includegraphics[width=1.1\textwidth]{images/23_Jun_2026_102028.pdf}
		\end{center}
		\vspace{-0.3cm}
		\tiny
		\textbf{Figure 1.6:} Experiment 1 Setup: The Digital Twin estimates the failures based on the normal distribution $\mathcal{N}(\mu, \sigma^2)$ with $\mu = 1.5$ and $\sigma = 0.5$.
		``Real'' OpenDC failures come from WhatsApp user reports.
	\end{minipage}
	% Explain what the axes are in the figure caption.
	% Talk about the experimental setup in the figure.
	% Give more reliable results than just numbers -- do statistical testing, i.e., standard deviation, confidence intervals.
\end{frame}

\begin{frame}\frametitle{\textbf{RQ3}: Experimental Results II}
	\begin{tcolorbox}[title=Scheduling Optimization: Main Finding II]
		Here explain what did you find.
	\end{tcolorbox}

\end{frame}

\begin{frame}\frametitle{Key Takeaways}
	\begin{tcolorbox}[title=What is the societal context?]

	\end{tcolorbox}

	\begin{tcolorbox}[title=What problem did we solve?]

	\end{tcolorbox}

	\begin{tcolorbox}[title=How did we solve this problem?]

	\end{tcolorbox}

	\begin{tcolorbox}[colbacktitle=red!70!black, colback=red!20!white,title=What did we find?]

	\end{tcolorbox}

	\begin{tcolorbox}[title=What do we see in future work?]

	\end{tcolorbox}

\end{frame}

\setcounter{framenumber}{4}
\setbeamertemplate{footline}[page number]

% Unfortunately this must remain here.
\setbeamercolor{frametitle}{fg=Brown,bg=Brown!20}
\setbeamertemplate{frametitle}{
	\vspace*{-0.1cm}
	\begin{beamercolorbox}[wd=\paperwidth, ht=0.75cm, dp=0.3cm,leftskip=10pt, rightskip=10pt]{frametitle}
		\usebeamerfont{frametitle}\insertframetitle\hfill
	\end{beamercolorbox}
}

\begin{frame}[allowframebreaks]\frametitle{Extra Slides: References}
	\tiny
	\bibliographystyle{is-plain}
	\bibliography{main}
\end{frame}

\begin{frame}\frametitle{Extra Slides: Societal Impact}
	\begin{tcolorbox}[title=Why is this research important today?]
		Over 3 million jobs in the Netherlands directly depend on cloud services, which are hosted in datacenters~\cite{DBLP:journals/corr/IosupKLVG22}.
		The rapid
		expansion of datacenters has already increased the prevalence of service failures across all cloud
		services~\cite{DBLP:conf/acsos/TalluriOVTI21}.
		We need to act now.
	\end{tcolorbox}
	\begin{center}
		\includegraphics[height=10em]{images/manageability.pdf}
	\end{center}
	\tiny
	\textbf{Figure E.1:} Horizontally: the most important research areas in computer science in the Netherlands.
	Vertically: qualities we should ensure across all research areas with the most outstanding impact on society.
	Datacenter manageability is a top priority~\cite{DBLP:journals/corr/IosupKLVG22}.
\end{frame}


\begin{frame}\frametitle{Extra Slides: Why Digital Twinning?}
	\begin{tcolorbox}[title=Definition]
		A DCDT mirrors the structure, context and behaviour of a datacenter~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. The prerequisite to any digital twin is good monitoring and sensing capabilities in the physical entity.
		Datacenters meet this requirement easily because they already connect hundreds of monitoring sensors. 	\end{tcolorbox}

	\begin{center}
		\includegraphics[height=10em]{images/dt_timeline.pdf}
	\end{center}
	\tiny

	\textbf{Figure E.2:} Due to insufficient technological foundations,
	little work is available on DTs between 2003 and 2018, and it is only with the rapid growth
	of cloud computing, Internet-of-Things and Big Data analytics that DTs have reemerged~\cite{DBLP:conf/cirp/TAO2018169}.
	That is why nobody used digital twins to mirror datacenters earlier.
\end{frame}

\begin{frame}\frametitle{Extra Slides: Why not pure simulation?}
	\begin{tcolorbox}[title=Predicting job failures]
		Preventing failure-caused outages in advance
		can reduce huge operational costs, as over 20\% of all reported outages amount to more than 1 million
		US\$~\cite{DBLP:report/AnnualOutageAnalysis2025}.
		Only a constant bi-directional interaction (digital twin $\iff$ physical entity) can achieve this.
	\end{tcolorbox}
	\begin{center}
		\includegraphics[height=10em]{images/digital_twin_ms.pdf}
	\end{center}
	\tiny \textbf{Figure E.3:} Real-time control that is tightly coupled with the IT equipment is a prerequisite for timely predictions within seconds/minutes~\cite{DBLP:journals/computer/AthavaleBBMMPS24}.
\end{frame}

% Computational Fluid Dynamics (CFD) has high computation overhead, making it unsuitable for real-time simulation of a dynamic datacenter.
% Moreover, a poorly configured CFD model can often lead to high error rates~\cite{DBLP:conf/sensys/WangZD0TCWZ20}.
% Data-driven Machine Learning performs poorly on cases not covered by the training data.

\end{document}