diff options
| author | mjkwiatkowski <mati.rewa@gmail.com> | 2026-06-28 19:03:19 +0200 |
|---|---|---|
| committer | mjkwiatkowski <mati.rewa@gmail.com> | 2026-06-28 19:03:19 +0200 |
| commit | 90fe47fe94b455bbe02b3b017dd993af2ff24af1 (patch) | |
| tree | 717d92540ec92776f482fc4dc3202a09183b44ad | |
| parent | ee72786e62f74062204f8830aa965131a5c0686d (diff) | |
| -rw-r--r-- | images/alarms_vs_failures.pdf | bin | 0 -> 15420 bytes | |||
| -rw-r--r-- | images/failure_models_table.png | bin | 0 -> 110177 bytes | |||
| -rw-r--r-- | indent.log | 2 | ||||
| -rw-r--r-- | main.bib | 17 | ||||
| -rw-r--r-- | main.tex | 39 |
5 files changed, 49 insertions, 9 deletions
diff --git a/images/alarms_vs_failures.pdf b/images/alarms_vs_failures.pdf Binary files differ new file mode 100644 index 0000000..9987232 --- /dev/null +++ b/images/alarms_vs_failures.pdf diff --git a/images/failure_models_table.png b/images/failure_models_table.png Binary files differ new file mode 100644 index 0000000..5ad3a85 --- /dev/null +++ b/images/failure_models_table.png @@ -1,6 +1,6 @@ INFO: latexindent version 3.24.7, 2025-08-15, a script to indent .tex files latexindent lives here: /usr/share/texmf-dist/scripts/latexindent/ - Sun Jun 28 09:25:22 2026 + Sun Jun 28 19:02:38 2026 Reading input from STDIN INFO: Processing switches: INFO: Directory for backup files and log file indent.log: @@ -711,3 +711,20 @@ biburl = {https://dblp.org/rec/journals/fgcs/VersluisCGLPCUI23.bib}, bibsource = {dblp computer science bibliography, https://dblp.org}, } + +@article{DBLP:journals/jpdc/JavadiKIE13, + author = {Bahman Javadi and Derrick Kondo and Alexandru Iosup and Dick + H. J. Epema}, + title = {The Failure Trace Archive: Enabling the comparison of failure + measurements and models of distributed systems}, + journal = {J. Parallel Distributed Comput.}, + volume = {73}, + number = {8}, + pages = {1208--1223}, + year = {2013}, + url = {https://doi.org/10.1016/j.jpdc.2013.04.002}, + doi = {10.1016/J.JPDC.2013.04.002}, + timestamp = {Sat, 22 Feb 2020 19:36:34 +0100}, + biburl = {https://dblp.org/rec/journals/jpdc/JavadiKIE13.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org}, +} @@ -53,7 +53,7 @@ \begin{frame}\frametitle{\textbf{RQ1}: Literature Review I} \begin{tcolorbox}[title=Main Finding I] - The literature on DCDTs is sparse. + There is little literature on DCDTs. Some systems barely classify as DTs (\emph{e.g.,} Kalibre~\cite{DBLP:conf/sensys/WangZD0TCWZ20}, ChatTwin~\cite{DBLP:conf/sensys/LiW0Z0T23}). Existing deployments specialize in \textcolor{Red}{Cooling and Heat Modelling}, together with \textcolor{Red}{3D visualizations}.
Most lack predictive modelling of DC operations. @@ -81,6 +81,7 @@ \tiny \textbf{Figure 1.3:} To answer \textbf{RQ1} we designed a generic datacenter digital twin system model based on a comprehensive literature review and findings from \textbf{Table 1.1}. The \emph{Infrastructure Model} simulates the structure of the DC and the \emph{Operations Model} simulates the behaviour of the DC. + \emph{Note:} Federation is not included explicitly but is covered by the model. % Consider splitting the figure into 2 a.k.a. top and bottom. % By the AIAA definition, the DT mimics the structure and behaviour. % Data Lake -> Data Storage @@ -98,6 +99,7 @@ \vspace{-0.15cm} \tiny \textbf{Figure 1.4:} The predictive datacenter digital twin reference architecture. + We call the system \emph{Sunfish}. The architecture was designed with the \emph{AtLarge Design Process}~\cite{DBLP:conf/icdcs/IosupVTETBFMT19} over several iterations in the past months. \vspace{0.2cm} \end{minipage} @@ -108,8 +110,8 @@ \end{center} \vspace{-0.2cm} \tiny - \textbf{Figure 1.5:} The prototype -- \emph{Sunfish}, and its components based on \textbf{Figure 1.4}. - The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}. + \textbf{Figure 1.5:} The prototype and its components based on the architecture. + The time-series data flows first to the \texttt{Grafana} dashboard, \texttt{PostgreSQL} database and \texttt{Redis} cache as advised in~\cite{DBLP:conf/sc/TaheriBPRHDEWPM24}. \vspace{0.1cm} \end{minipage} @@ -262,8 +264,8 @@ \begin{frame}\frametitle{Extra Slides: Technical Setup } \begin{tcolorbox}[title=What is the simulation workload?] The compute workload is BitBrainsSmall. - The failure traces include user reports from Gmail, WhatsApp, Facebook and Twitter. - For predictions we use \texttt{prefabs}~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23}.
+ The failure traces include Gmail, WhatsApp, Facebook and Twitter. + For predictions we use different statistical distributions~\cite{DBLP:journals/fgcs/VersluisCGLPCUI23}. \end{tcolorbox} \begin{tcolorbox}[title=What is the experiment environment?] A commodity laptop: Framework Laptop 13, with 32GB of DDR5 RAM and an AMD Ryzen 7840U processor and an ArchLinux OS with Linux 7.0.13-arch1-1 kernel. @@ -282,6 +284,18 @@ \end{frame} +\begin{frame}\frametitle{Extra Slides: Experiment 1} + \begin{tcolorbox}[title=Clarification] + In experiment 1 we are able to single out severe failures, \emph{i.e.,} failures that take down more than some threshold of $\tau$ hosts. + $\tau$ is determined using predictions based on a potential distribution of failures, modeled with $\sim N(1.5, 1.5)$. + \end{tcolorbox} + + \begin{center} + \includegraphics[width=0.58\linewidth]{images/alarms_vs_failures.pdf} + \end{center} + \tiny + \textbf{Figure E.1:} The comparison between failures experienced and alarms raised. +\end{frame} \begin{frame}\frametitle{Extra Slides: Why Digital Twinning?} \begin{tcolorbox}[title=Definition] @@ -312,11 +326,20 @@ \tiny \textbf{Figure E.3:} Real-time control that is tightly-coupled with the IT equipment is a prerequisite for timely predictions within seconds/minutes~\cite{DBLP:journals/computer/AthavaleBBMMPS24}. \end{frame} - - +\begin{frame}\frametitle{Extra Slides: Experiment 2} + \begin{tcolorbox}[title=Statistical Distributions] + Different failure distributions were used in order to predict the true failure distribution. + \textbf{Table E.1} summarizes the distributions. + \end{tcolorbox} + \begin{center} + \includegraphics[width=\linewidth]{images/failure_models_table.png} + \end{center} + \tiny + \textbf{Table E.1:} Different failure models used throughout this project.
+ All failure models come from Javadi \etal (for a more thorough overview see~\cite{DBLP:journals/jpdc/JavadiKIE13}). +\end{frame} % Computational Fluid Dynamics (CFD) has high computation overhead, unsuitable for real-time simulation of a dynamic datacenter. %Moreover oftentimes a poorly configured CFD model can lead to high error rates~\cite{DBLP:conf/sensys/WangZD0TCWZ20}. %Data-driven Machine Learning performs poorly on cases not covered in the training data. - \end{document} |
