\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,latexsym}
\usepackage{fancyhdr}
\usepackage[margin=2.50cm]{geometry}
\usepackage[compact]{titlesec}
\usepackage{hyperref}
\usepackage[T1]{fontenc}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{paralist}
\usepackage{listings}
\usepackage{enumitem}
\hypersetup{colorlinks=true, linkcolor=blue, urlcolor=cyan}
\lstset{
moredelim=[is][\color{green}]{§}{§},
moredelim=[is][\color{red}]{?}{?},
moredelim=[is][\color{cyan}]{|}{|}
}
% Title infos
\title{HPC Formulas}
\author{Steffen Vogel}
\date{\today}
% headline
\lhead{HPC Formulas}
\chead{Steffen Vogel}
\rhead{\today}
\pagestyle{fancy}
\setlength{\parindent}{0cm}
\begin{document}
\section{Amdahl \& Gustafson}
\begin{tabular}{ l l | c c }
& & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
& & \textit{strong scaling} & \textit{weak scaling} \\
\hline
\textbf{Speedup} & \( S_p(N) = \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \xrightarrow{N \rightarrow \infty} \frac{1}{s} \) & \( Np + s \xrightarrow{N \rightarrow \infty} \infty \) \\
\textbf{Efficiency} & \( \varepsilon_p(N) = \frac{S_p(N)}{N} \) & \( \frac{1}{s (N-1) + 1} \xrightarrow{N \rightarrow \infty} 0 \) & \( \frac{1 - p}{N} + p \xrightarrow{N \rightarrow \infty} p \) \\
\end{tabular}
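
A quick sanity check of both laws, assuming a serial fraction $s = 0.1$ (i.e. $p = 0.9$) and $N = 100$ workers:
\[
S_{\text{Amdahl}} = \frac{1}{0.1 + \frac{0.9}{100}} \approx 9.2 \leq \frac{1}{s} = 10,
\qquad
S_{\text{Gustafson}} = 100 \cdot 0.9 + 0.1 = 90.1
\]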
\section{Moore's Law}
\begin{quote}
\# of transistors / cost-effective IC doubles every $N$ months, with $12 \leq N \leq 24$
\end{quote}
\section{Energy Efficiency}
\begin{tabular}{ p{7cm} l }
Power-voltage-frequency & \( P \sim V^2 \cdot f \) or \( P \sim f^3 \) \\
& \\
energy efficiency & = \( \frac{\text{energy}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption} \cdot \text{time}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption}}{\text{Flop}/s} \) \\
energy & = \( \text{power consumption} \cdot \text{time} \) \\
& \\
Physical limit for efficiency & 2.87e-21 Joule/Bit \( \approx \ln(2) \cdot k \cdot T \) \\
& 1.84e-19 Joule/Flop \\
Green TOP500 \#1 & 2.22e-10 Joule/Flop \\
& \\
Power wall & expected to be around 20 to 25 MWatt \\
\end{tabular}
\section{Pipelining}
\begin{tabular}{ p{7cm} l }
Stages & $m$ \\
Operations & $N$ \\
Without pipeline & \( T_{seq} = m \cdot N \) \\
With pipeline & \( T_{pipe} = N + m - 1 \) \\
Speedup & \(S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
\end{tabular}
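
For example, an assumed pipeline with $m = 5$ stages processing $N = 100$ operations:
\[
T_{seq} = 5 \cdot 100 = 500, \qquad
T_{pipe} = 100 + 5 - 1 = 104, \qquad
S_{pipe} = \frac{500}{104} \approx 4.8 \xrightarrow{N \rightarrow \infty} 5
\]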
\section{Memory Access}
\begin{tabular}{ p{7cm} l }
Hit rate & $\beta$ \\
Miss rate & $(1 - \beta)$ \\
Access time& $T_m$ (Memory), $T_c$ (Cache) \\
Average access time & \( T_{avg} = \beta T_c + (1 - \beta) T_m \) \\
Performance gain & \( G( \theta, \beta ) = \frac{T_m}{T_{avg}} = \frac{\theta}{\beta + \theta(1 - \beta)}, \theta = \frac{T_m}{T_c} \) \\
\end{tabular}
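
A worked example, assuming a hit rate of $\beta = 0.9$ and a memory/cache speed ratio of $\theta = 10$:
\[
T_{avg} = 0.9\,T_c + 0.1 \cdot 10\,T_c = 1.9\,T_c, \qquad
G = \frac{10}{0.9 + 10 \cdot 0.1} \approx 5.3
\]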
\section{Networks}
\subsection{Transmission \& Bandwidth}
\begin{tabular}{ p{7cm} l }
Transmission time & \( T = T_L + \frac{N}{B} \) \\
Effective bandwidth & \( B_{eff} = \frac{N}{T} = \frac{N}{T_L + \frac{N}{B}} \) \\
\end{tabular}
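
An illustrative example with assumed values $T_L = 1\,\mu s$ and $B = 10$~GB/s: a small message of
$N = 1$~kB reaches only \( B_{eff} = \frac{1\,\text{kB}}{1\,\mu s + 0.1\,\mu s} \approx 0.9 \)~GB/s
(latency-dominated), while $N = 1$~MB reaches \( \frac{1\,\text{MB}}{1\,\mu s + 100\,\mu s} \approx 9.9 \)~GB/s.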
\subsection{Topologies}
\begin{tabular}{l | c c c c }
\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
\hline
Bus & $1$ & $1$ & $1$ & $B$ \\
Ring & $2$ & $2$ & $\lfloor \frac{N}{2} \rfloor$ & $2B$ \\
Fully connected & $N-1$ & $N-1$ & $1$ & $B \frac{N^2}{4}$ \\
Sw. Fat Tree & 1 w/o redundancy & depends on design & $2 \times$ hierarchy height & depends on design \\
Mesh & $2d$ & $d$ & $\sum_{i=1}^d (N_i - 1)$ & $B ( \prod_{i=1}^{d-1} N_i )$ \\
Torus & $2d$ & $2d$ & $\sum_{i=1}^d \lfloor \frac{N_i}{2} \rfloor$ & $2B ( \prod_{i=1}^{d-1} N_i )$ \\
Hypercube & $d$ & $d$ & $d$ & $B2^{d-1}$ \\
\end{tabular}
\section{Balance, Lightspeed}
\begin{tabular}{ p{7cm} l }
Code balance & \( B_c = \frac{\text{data transfers}}{\text{arithmetic ops}} = \frac{[Words]}{[Flops]} \) \\
Machine balance & \( B_m = \frac{b_s}{P_{max}} \) \\
Relative lightspeed & \( l = \min(1, \frac{B_m}{B_c} ) \) \\
Absolute lightspeed performance & \( P = P_{max} \cdot l \) \\
\end{tabular}
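
A worked example for the vector triad \( A(i) = B(i) + C(i) \cdot D(i) \): three loads and one store
versus two flops give a code balance of \( B_c = \frac{4}{2} = 2 \)~W/F. On an assumed machine with
\( B_m = 0.1 \)~W/F the relative lightspeed is \( l = \min(1, \frac{0.1}{2}) = 0.05 \), i.e. at most
5\,\% of \( P_{max} \) can be reached.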
\section{Programming APIs}
Argument directions: \lstinline$§input§$, \lstinline$?output?$, \lstinline$|inout|$ \\
Common arguments: communicator = \lstinline$cm$
\subsection{MPI}
All MPI routines return an integer error code!
\subsubsection{Initialization}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Init(§argc§, §argv§)$ & Has to be called before any other MPI\_* function. \\
\lstinline$MPI_Finalize()$ & Like \lstinline$MPI_Init()$ this must only be called once. \\
\lstinline$MPI_Initialized(?flag?)$ & Check if MPI runtime was already initialized. \\
\lstinline$MPI_Finalized(?flag?)$ & Check if MPI runtime was already shut down. \\
\end{tabular}
\subsubsection{Point-to-point data transfer}
\begin{tabular}{ l | l l l }
\textbf{Mode / Op} & \textbf{Blocking} & \textbf{Non-blocking} & \textbf{Completes after...} \\
\hline
Standard & \lstinline$MPI_Send$($\otimes$) & \lstinline$MPI_Isend$($\otimes, \ddag$) & the message has been safely stored away. \\
Synchronous & \lstinline$MPI_Ssend$($\otimes$) & \lstinline$MPI_Issend$($\otimes, \ddag$) & the receive was called. \\
Buffered & \lstinline$MPI_Bsend$($\otimes$) & \lstinline$MPI_Ibsend$($\otimes, \ddag$) & the message was stored away in a buffer. \\
Ready-mode & \lstinline$MPI_Rsend$($\otimes$) & \lstinline$MPI_Irsend$($\otimes, \ddag$) & only if the receive was called before send. \\
\hline
Standard & \lstinline$MPI_Recv$($\diamond, \dag$) & \lstinline$MPI_Irecv$($\diamond, \ddag$) & a message was received. \\
Probing & \lstinline$MPI_Probe$() & \lstinline$MPI_Iprobe$ & immediately. \\
Waiting & \lstinline$MPI_Wait$($\ddag, \dag$) & \lstinline$MPI_Test$($\ddag, $\lstinline$?flag?$$, \dag$) & message available. \\
\end{tabular}
\textbf{Signatures}: \\
{ \footnotesize
\hspace{2cm} $\otimes$ \lstinline$(§buffer§, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\diamond$ \lstinline$(?buffer?, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\dag$ \lstinline$(?status?)$ $\ddag$ \lstinline$(?request?)$
}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Get_count(§stat§, §type§, ?count?)$ & Get the number of received elements by \lstinline$MPI_Recv()$. \\
\lstinline$MPI_Sendrecv(§sbuf§, §scnt§, §stype§, §dst§, §stag§,$ & Simultaneous, deadlock-free send and receive. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §src§, §rtag§, §cm§, ?stat?)$ & \\
\lstinline$MPI_Sendrecv_replace(...)$ & Like \lstinline$MPI_Sendrecv()$ but using the same buffer. \\
\end{tabular}
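
A minimal point-to-point sketch in C (two ranks are assumed; the value and tag are illustrative):
\begin{lstlisting}
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int rank, value = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        value = 42;                                    /* data to send */
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Status stat;
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &stat);
        printf("rank 1 received %d\n", value);
    }

    MPI_Finalize();
    return 0;
}
\end{lstlisting}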
\subsubsection{Collective data transfer}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Bcast(|buf|, §cnt§, §type§, §root§, §cm§)$ & Broadcasts data from root to all other. \\
\lstinline$MPI_Scatter(§sbuf§, §scnt§, §stype§,$ & Scatters chunks of data from root to all processes. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Gather(§sbuf§, §scnt§, §stype§,$ & Gathers chunks of data from all processes at root. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Allgather(...)$ & Like \lstinline$MPI_Gather()$, but the result is available on all processes. \\
\lstinline$MPI_Alltoall(...)$ & Every process sends a distinct chunk to every other process. \\
\lstinline$MPI_Reduce(...)$ & Combines data from all processes with an operation (e.g. \lstinline$MPI_SUM$) at root. \\
\lstinline$MPI_Allreduce(...)$ & Like \lstinline$MPI_Reduce()$, but the result is available on all processes. \\
\end{tabular}
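
A sketch of a global sum with \lstinline$MPI_Reduce()$ (the local contribution and root rank are illustrative):
\begin{lstlisting}
#include <mpi.h>

int main(int argc, char **argv) {
    int rank;
    double local, global;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    local = 1.0 * rank;   /* some local partial result */
    MPI_Reduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    /* global holds the sum on rank 0 only; MPI_Allreduce() would
       make the result available on every rank. */

    MPI_Finalize();
    return 0;
}
\end{lstlisting}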
\subsubsection{Synchronisation}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Barrier(§cm§)$ & Waits for all processes. \\
\end{tabular}
\subsubsection{Communicator handling}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Comm_dup(...)$ & Duplicates a communicator (same group, new context). \\
\lstinline$MPI_Comm_split(...)$ & Splits a communicator into sub-communicators by color/key. \\
\lstinline$MPI_Comm_compare(...)$ & Compares two communicators. \\
\lstinline$MPI_Comm_free(...)$ & Frees a communicator. \\
\lstinline$MPI_Comm_rank(§cm§, ?rank?)$ & Get rank/id of current process in this communicator. \\
\lstinline$MPI_Comm_size(§cm§, ?size?)$ & Get total number of processes in this communicator. \\
\end{tabular}
\subsubsection{Virtual topologies}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Cart_create(...)$ & Creates a new Cartesian cm from a given old one.\\
\lstinline$MPI_Cart_rank(...)$ & Translates Cartesian coordinates into a rank.\\
\lstinline$MPI_Cart_shift(...)$ & Returns source/destination ranks for a shift along one dimension.\\
\lstinline$MPI_Cart_sub(...)$ & Creates sub-grids of lower dimension.\\
\lstinline$MPI_Cart_coords(...)$ & Translates a rank into Cartesian coordinates.\\
\lstinline$MPI_Dims_create(...)$ & Suggests a balanced distribution of processes over dimensions.\\
\end{tabular}
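
A sketch of a periodic 2D process grid built from the routines above (dimensionality and periodicity are illustrative):
\begin{lstlisting}
#include <mpi.h>

int main(int argc, char **argv) {
    int rank, size, left, right;
    int dims[2] = {0, 0}, periods[2] = {1, 1}, coords[2];
    MPI_Comm cart;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Dims_create(size, 2, dims);             /* balanced 2D factorization */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);
    MPI_Cart_coords(cart, rank, 2, coords);     /* my grid coordinates */
    MPI_Cart_shift(cart, 0, 1, &left, &right);  /* neighbours along dim 0 */

    MPI_Finalize();
    return 0;
}
\end{lstlisting}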
\subsubsection{Derived Datatypes}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Type_contiguous(...)$ & New type from a contiguous block of elements.\\
\lstinline$MPI_Type_vector(...)$ & New type from equally spaced (strided) blocks.\\
\lstinline$MPI_Type_commit(...)$ & Must be called before a derived type can be used.\\
\lstinline$MPI_Type_free(...)$ & Releases a derived type.\\
\end{tabular}
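
A sketch that sends one column of a row-major matrix using a strided vector type (the $4 \times 8$ layout is illustrative):
\begin{lstlisting}
#include <mpi.h>

void send_column(double A[4][8], int col, int dst, MPI_Comm cm) {
    MPI_Datatype column;
    /* 4 blocks of 1 double each, stride 8 doubles = one matrix column */
    MPI_Type_vector(4, 1, 8, MPI_DOUBLE, &column);
    MPI_Type_commit(&column);
    MPI_Send(&A[0][col], 1, column, dst, 0, cm);
    MPI_Type_free(&column);
}
\end{lstlisting}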
\subsubsection{Timekeeping}
\begin{tabular}{ p{8cm} l }
\lstinline$double MPI_Wtime()$ & Measure local wall-clock time. \\
\lstinline$double MPI_Wtick()$ & Obtain the precision of the MPI timer. \\
\end{tabular}
\subsection{CUDA}
\subsubsection{General Memory Operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaMalloc(&p, n*sizeof(float));$ & Allocate memory of n floats on GPU, save pointer in p\\
\lstinline$cudaFree(p);$& Free memory on GPU, which p is pointing to\\
\lstinline$cudaMallocHost(&p, n*sizeof(float));$ & Allocate pinned memory of n floats on Host, save pointer in p\\
\lstinline$cudaFreeHost(p);$& Free pinned Host memory, which p is pointing to\\
\lstinline$cudaMemcpy(d, h, n * sizeof(float),DIR);$ & memcopy Host => GPU, if DIR = cudaMemcpyHostToDevice \\
\lstinline$cudaMemcpy(h, d, n * sizeof(float),DIR);$& GPU => Host if DIR = cudaMemcpyDeviceToHost\\
\lstinline$func<<<blockNum, threadNum>>>();$ & Call CUDA Kernel code $func$, with given \# of blocks / threads\\
\end{tabular}
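
A host-side sketch of the usual allocate/copy/compute/copy-back pattern (array size and kernel name are illustrative; the kernel launch is only indicated as a comment):
\begin{lstlisting}
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void) {
    int n = 1 << 20;
    size_t bytes = n * sizeof(float);
    float *h = (float *) malloc(bytes);           /* host buffer */
    float *d;

    cudaMalloc((void **) &d, bytes);              /* device buffer */
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    /* scale<<<(n + 255) / 256, 256>>>(d, n);        kernel launch goes here */
    cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d);
    free(h);
    return 0;
}
\end{lstlisting}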
\subsubsection{Shared Memory}
\begin{tabular}{ p{7cm} l }
\lstinline$__shared__ double s[N][M];$ & Static shared memory shared between the threads of a block\\
\lstinline$extern __shared__ double s[];$ & Dynamic shared memory (size set at kernel launch)\\
\lstinline$__syncthreads();$ & Synchronize threads after shared memory usage \\
\end{tabular}
\subsubsection{Asynchronous operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaStream_t st1,st2;$ & Defines streams. Instructions within one stream are executed in order.\\
\lstinline$cudaStreamCreate(&st1)$& Initialize stream\\
\lstinline$cudaMemcpyAsync(dst,src,size,dir,st1);$& Async memcpy in stream st1 (non-blocking)\\
\lstinline$cudaStreamSynchronize(st1);$ & Host waits for stream\\
\lstinline$cudaDeviceSynchronize();$ & Host waits for everything \\
\end{tabular}
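
A sketch that overlaps two independent host-to-device copies using streams (pinned host memory is assumed, since truly asynchronous copies require it):
\begin{lstlisting}
#include <cuda_runtime.h>

void copy_in_two_streams(float *d1, float *d2, size_t bytes) {
    float *h1, *h2;
    cudaStream_t st1, st2;

    cudaMallocHost((void **) &h1, bytes);   /* pinned host buffers */
    cudaMallocHost((void **) &h2, bytes);
    cudaStreamCreate(&st1);
    cudaStreamCreate(&st2);

    cudaMemcpyAsync(d1, h1, bytes, cudaMemcpyHostToDevice, st1);
    cudaMemcpyAsync(d2, h2, bytes, cudaMemcpyHostToDevice, st2);
    /* kernels launched into st1/st2 could overlap with these copies */

    cudaStreamSynchronize(st1);             /* host waits for each stream */
    cudaStreamSynchronize(st2);
    cudaStreamDestroy(st1);
    cudaStreamDestroy(st2);
    cudaFreeHost(h1);
    cudaFreeHost(h2);
}
\end{lstlisting}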
\subsection{OpenMP}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$#pragma omp parallel$ & Forms a team of threads and starts parallel execution. \\
\lstinline$#pragma omp for$ & Split iterations of following for loop across threads. \\
\lstinline$#pragma omp sections$ & Non-iterative worksharing of different sections. \\
\lstinline$#pragma omp section$ & \\
\lstinline$#pragma omp single$ & Region shall only be executed by a single thread. \\
\lstinline$#pragma omp critical$ & Region is executed by all threads, but only
by one thread at a time. \\
\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current
team are guaranteed to complete at barrier exit. \\
\lstinline$#pragma omp taskwait$ & Encountering task suspends until all (direct) child tasks are complete. \\
\lstinline$#pragma omp task$ & Defines an explicit task. \\
\lstinline$#pragma omp target [data]$ & Offloads a region (or maps data) to a target device, e.g. an accelerator. \\
\lstinline$#pragma omp distribute$ & Distributes loop iterations across the teams of a league. \\
\lstinline$#pragma omp teams$ & Creates a league of thread teams (used for offloading). \\
\end{tabular}
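
A sketch of the task directives above, using the classic recursive Fibonacci example (the recursion itself is illustrative):
\begin{lstlisting}
#include <omp.h>

long fib(long n) {
    long a, b;
    if (n < 2)
        return n;
    #pragma omp task shared(a)
    a = fib(n - 1);                 /* child task */
    #pragma omp task shared(b)
    b = fib(n - 2);                 /* child task */
    #pragma omp taskwait            /* wait for both children */
    return a + b;
}

long fib_parallel(long n) {
    long result;
    #pragma omp parallel
    #pragma omp single              /* one thread creates the root task */
    result = fib(n);
    return result;
}
\end{lstlisting}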
\subsubsection{Clauses}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$num_threads(cnt)$ & Number of threads for a parallel region. \\
\lstinline$default(shared,none)$ & Set the default scope. \\
\lstinline$[first,last]private(list)$ & Thread-private scope. \\
\lstinline$shared(list)$ & Shared scope among thread team. \\
\lstinline$proc_bind(master,close,spread)$ & Binds threads to OpenMP places (affinity policy). \\
\lstinline$nowait, untied$ & \lstinline$nowait$: drop the implicit barrier; \lstinline$untied$: task may be resumed by any thread. \\
\lstinline$reduction(op:list)$ & Do reduction after parallel block. \\
\lstinline$schedule(kind,chunk_size)$ & Loop scheduling policy (static, dynamic, guided, auto) and chunk size. \\
\lstinline$if(condition)$ & Only fork/enter thread/parallel region if \texttt{condition} is true. \\
\end{tabular}
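
A worksharing sketch combining several of the clauses above (the dot product is illustrative):
\begin{lstlisting}
#include <omp.h>

double dot(const double *x, const double *y, int n) {
    double sum = 0.0;
    int i;
    #pragma omp parallel for default(none) shared(x, y, n) private(i) \
            reduction(+:sum) schedule(static)
    for (i = 0; i < n; i++)
        sum += x[i] * y[i];
    return sum;
}
\end{lstlisting}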
\subsubsection{Environment variables}
\begin{tabular}{ p{7cm} l }
\lstinline$OMP_NUM_THREADS$ & Default number of threads for parallel regions. \\
\lstinline$OMP_PLACES$ & Defines the places (threads, cores, sockets) used for thread binding. \\
\lstinline$KMP_AFFINITY$ & Intel-specific thread affinity/pinning control. \\
\end{tabular}
\subsubsection{Runtime API}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$omp_{set,get}_num_threads(n)$ & Get/set the number of threads used for the encountering region. \\
\lstinline$omp_get_thread_num()$ & Returns the ID of the current thread (cf. \lstinline$MPI_Comm_rank()$).
\end{tabular}
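
A minimal sketch of the runtime calls above:
\begin{lstlisting}
#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(4);                  /* request 4 threads */
    #pragma omp parallel
    {
        int id = omp_get_thread_num();       /* my thread ID */
        int nt = omp_get_num_threads();      /* threads in this team */
        printf("thread %d of %d\n", id, nt);
    }
    return 0;
}
\end{lstlisting}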
\subsection{OpenACC}
\begin{tabular}{ p{7cm} l }
\lstinline$#pragma acc parallel [loop, d-clauses]$ & Starts parallel execution on the accelerator. \\
\lstinline$#pragma acc kernels [loop, d-clauses]$ & Lets the compiler generate kernels from the region. \\
\lstinline$#pragma acc loop [gang, reduction(op:list)]$ & Distributes loop iterations (gang/worker/vector). \\
\lstinline$#pragma acc wait$ & Waits for asynchronous operations to complete. \\
\lstinline$#pragma acc data [d-clauses]$ & Defines a structured data region on the device. \\
\lstinline$#pragma acc update {host,device}(data)$ & Copies data between host and device inside a data region. \\
\lstinline$#pragma acc {enter,exit} data [d-clauses]$ & Unstructured begin/end of a data lifetime. \\
\lstinline$#pragma acc routine (name)$ & Compiles a function for execution on the device. \\
\end{tabular}
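
A SAXPY-like sketch with the OpenACC directives above (the data clauses and loop are illustrative):
\begin{lstlisting}
/* y = a*x + y offloaded with OpenACC */
void saxpy_acc(int n, float a, const float *x, float *y) {
    int i;
    #pragma acc data copyin(x[0:n]) copy(y[0:n])
    {
        #pragma acc parallel loop
        for (i = 0; i < n; i++)
            y[i] = a * x[i] + y[i];
    }
}
\end{lstlisting}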
\section{LSF}
See \texttt{man bsub}.
\section{Common problems and operations}
\begin{tabular}{ p{7cm} l }
Single-precision real Alpha X Plus Y & \( \vec{y} = \alpha \cdot \vec{x} + \vec{y} \) \\
(S/DAXPY, FMA, Triad) & \\
Sparse matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix multiplication & \( \mathbf{A} = \mathbf{B} \cdot \mathbf{C} \) \\
Dense matrix diagonalization & \( \mathbf{P}^{-1} \mathbf{A} \mathbf{P} = \text{diag}(\lambda_1, ..., \lambda_n) \) \\
Scalar product & \( s = \vec{x} \cdot \vec{y} \) \\
Vector addition & \( \vec{y} = \vec{x} + \vec{z} \) \\
Matrix addition & \( \mathbf{A} = \mathbf{B} + \mathbf{C} \) \\
Matrix transpose & \( \mathbf{A} = \mathbf{B}' \) \\
\end{tabular}
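
Two of these kernels as plain C loops (array lengths are illustrative):
\begin{lstlisting}
/* SAXPY: y = a*x + y */
void saxpy(int n, float a, const float *x, float *y) {
    for (int i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}

/* Scalar product: s = x . y */
double dot(int n, const double *x, const double *y) {
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += x[i] * y[i];
    return s;
}
\end{lstlisting}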
\newpage
\section{What have you learned?}
\subsection{Why supercomputers?}
\begin{description}[style=nextline]
\item[What is a supercomputer?] A supercomputer is a computer
at the frontline of current processing capacity, particularly speed of calculation.
\item[What are supercomputers used for?]
\begin{itemize}
\item Scientific computing \& simulation
\item Weapon research
\item Economics (Oil, minerals)
\item Medicine
\item Meteorology
\end{itemize}
\item[What are recent supercomputers?] See TOP500 list of world's fastest systems:
\begin{enumerate}
\item Tianhe-2 (33 PFlops/s)
\item Titan (17 PFlops/s)
\end{enumerate}
\item[What can we read from the performance development as measured in the
TOP500?] The TOP500 ranks systems by their maximal achieved LINPACK performance $R_{max}$
in Flop/s (double precision), measured with a standardized benchmark (HPL/LINPACK). \\
Future trends are predictable (exascale computing, architectures).\\
At the moment, it takes approximately 11 years to increase performance by 3
orders of magnitude (ExaFlop/s projected for 2019).
\item[What does Moore's Law tell you? Is it still valid?]
"The number of transistors per chip doubles around every one to two years." \\
The slope is now flattening slowly, but the law mostly still holds.
\item[Why do we have multi-core architectures today?]
Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: Decrease the frequency, increase the die size by putting more cores on a single chip, and exploit parallelism.
\end{description}
\newpage
\subsection{Energy Efficiency}
\begin{description}[style=nextline]
\item[What is the power wall?] A power consumption of 20 to 25~MW is
considered an important barrier for HPC centres.\\
The power wall gives us an upper limit on power consumption.
\item[What is the problem?]
To meet the exa-scale goal in 2019, the current trend of energy efficiency isn't sufficient.
Power consumption will become the dominating cost factor for
operating clusters in the future, with energy costs surpassing investment costs.
\begin{description}[style=nextline]
\item[Aspects of energy efficiency] Energy efficiency is continuously
improving. To meet the exa-scale goal, it has to be pushed further, e.g. by using
new architectures (accelerators).
At the moment, we have a performance increase of 2 orders of magnitude
every 7 years while energy efficiency ($\frac{\text{power consumption}}{\text{Flop/s}}$)
decreases by only 1 order of magnitude in the same time.
This means power consumption is increasing 1 order of magnitude every 7 years.
\item[Physical descriptions \& limits]
Landauer's principle: "Changing one bit of information needs at
least an amount of energy of $ln(2) \cdot k \cdot T$". \\
The current top Green500 systems are about 10 orders of magnitude away from this limitation.
\end{description}
\item[How can we benchmark energy efficiency?] Measure only the consumption of a
single rack (submodule) and extrapolate. Average over different benchmarks.
\begin{description}[style=nextline]
\item[Advantages \& disadvantages of Green500, SPEC Power]
High variations in idle and during benchmarks and between nodes.
Power extrapolation may not be allowed.
The Green500 assumes that the computational workload during the Linpack
benchmark is well-balanced across all units and that all units are
identical and have the same power consumption for the same workload.
Those assumptions are however very dangerous and often wrong. Idle and
Linpack power consumption can vary from node to node as well as over time.
\end{description}
\item[What are the architectural aspects of energy efficiency?]
Current von Neumann architectures weren't designed with energy
efficiency in mind:
OoO execution, cache coherency and prefetchers are inefficient. \\
Desktop/mobile power saving methods are applied to HPC (C- and
P-states). C0 is the operating state; everything above is an idle state.
With increasing numbers, more and more parts are shut down to conserve
energy (at the cost of an increased wakeup time). P-states are all
operational states, but they switch between supported frequencies \&
voltages to save energy in case maximal performance is not needed
$\Longrightarrow$ DVFS.\\
There are many parameters that affect energy efficiency: number of
cores, clock frequency, prefetcher, SMT, branch prediction, GPU clock
frequency, cache size, speed of interconnects. It is important to find
sweet spots for those parameters, not only for whole applications but
also for each application phase.
\begin{description}[style=nextline]
\item[What is dynamic frequency and voltage scaling?]
Reduce voltage/frequency during idle times to save energy.
Implemented by processor P-states.
\end{description}
\item[What are the future plans to climb the power wall?]
\begin{itemize}
\item Optimize current architectures
\item Power-efficient accelerators
\item Reduce idle power to $\approx 0 W$
\item Load-balancing by DVS/DFS.
\end{itemize}
\begin{description}[style=nextline]
\item[What about GPUs?] GPUs are a lot more efficient because more
transistors are used for computation instead of controlling/management tasks.
They already use faster GDDR5 RAM. But PCIe will become a bottleneck when CPUs get stacked RAM.
\item[What about stacked memory?] Is a must-have to reach the exa-scale goal.
Shorter paths allow higher bandwidths and lower voltages (maintainability? extend
the caching hierarchy by SSDs?).
\end{description}
\end{description}
\newpage
\subsection{Modern Processors}
\begin{description}[style=nextline]
\item[What is an ISA?] The Instruction Set Architecture is the part of an
architecture that is visible to the application programmer / compiler.
It contains e.g. opcode, operands, registers, addressing modes\dots
\begin{description}[style=nextline]
\item[What are the differences in ISA's?]
\begin{itemize}
\item Instructions, CISC or RISC
\item Memory addressing modes (effective or relative)
\item Type of internal storage, e.g. stack, accumulator or
general-purpose registers (dominant today)
\item Endianness
\item Type and size of operands
\end{itemize}
\item[What are the main differences between RISC and CISC?]
\begin{description}
\item[CISC] Instructions may involve complex operations and are variable in length.
\item[RISC] Similar encoding for all instructions, fixed length.
Strict distinction between data loading/storing and manipulation.
\end{description}
\item[What do we have today?] The x86 architectures dominant in HPC are CISC
based (but instructions get translated to internal RISC micro-instructions).
\end{description}
\item[What is the basis of today's processors?]
They mostly decode CISC instructions to microcode ops (hybrid CISC/RISC).
Modern processors are still based on the Von Neumann architecture,
although they usually consist of two or more cores.
In addition, ALU, control unit, memory controller etc. are all
integrated on the chip.
\begin{description}[style=nextline]
\item[What are the corresponding bottlenecks?]
The main bottlenecks today are memory access speed and sequential
architecture (SISD)
\item[How can we overcome these?]
Pipelining, ILP (superscalarity), SIMD - Single Instruction
Multiple Data, Prefetching.
\end{description}
\item[What is ILP?]
Instruction Level Parallelism is the potential overlap of instruction
execution caused by superscalarity, OoO and pipelines.
\begin{description}[style=nextline]
\item[What is its influence on today's computers? over time?]
Today's CPUs use hardware-implemented schedulers and dependency
resolvers for microcode operations $\Longrightarrow$ very complex
control logic.
\end{description}
\item[Pipelining]
\begin{description}[style=nextline]
\item[How does it work?]
Pipelining splits the execution of an instruction into multiple,
preferably equally long, 'stages' which are orthogonal to each other. \\
This allows reusing the execution unit of one stage for the next
instruction before the first instruction has retired.
\item[How to compute possible speedup/throughput?] See formulas.
\item[Problems?]
\begin{itemize}
\item Pipeline stalls caused by phases of different lengths
\item Hazards caused by dependent data, instructions, branch-misses...
\item Imbalanced CISC instruction decoding/fetching. Instructions
are not of equal length and complexity.
\item Aliasing...
\end{itemize}
\end{description}
\item[Superscalarity]
"Execute multiple instructions per cycle."
\begin{description}[style=nextline]
\item[What is a superscalar processor?]
A superscalar processor consists of multiple FP pipelines,
instruction fetchers and decoders.
It can thereby execute multiple instructions per cycle, which allows
higher pipeline utilization and SMT. \\
This is a kind of ILP.
\end{description}
\item[SIMD] Single-Instruction-Multiple-Data
\begin{description}[style=nextline]
\item[How does it work?]
Execute a single instruction which initiates multiple integer or floating
point operations on wide "registers"/"vectors".
\item[What are the recent vector register widths?]
SSE has 128 bits, AVX has 256 bits, the Xeon Phi even 512 bits.
\item[What to think about if code vectorization is desired?]
\begin{itemize}
\item Check for automatic loop-unrolling by compilers
\item Profiling is recommended to eliminate memory bottlenecks (alignment!)
\item Avoid high-level C++ abstractions which could prevent optimization!
\end{itemize}
\end{description}
\item[Why were caches introduced?]
To close the increasing gap between memory bandwidth and processor
performance by adding several layers of caches with increasing bandwidth.
\begin{description}[style=nextline]
\item[What is a cache?]
It's a high speed memory with low capacity which is usually organized
in multiple layers. Commonly the caches are integrated into the CPU
die for high bandwidth.
\item[What performance characteristics does it have?]
High speed: usually allows multiple transfers per cycle
\item[When does it make sense to use a cache?]
If computation performance is limited by memory bandwidth
("memory-bound problems").
\item[What are cache hits/misses?]
Cache hits occur when requested items are currently held in
the cache. Otherwise a cache miss happens.
\item[What is the principle of locality?]
An item that was referenced will soon be referenced again (temporal).
Items that are close to a recently referenced item are likely to
be referenced in near future (spatial).
\end{description}
\item[Which cache types do exist \& what are their differences?]
\begin{itemize}
\item Data cache => stores data only
\item Instruction cache => stores instructions only
\item Unified cache => stores both
\item Trace/Micro-OP cache => stores decoded instructions
\end{itemize}
\item[Why is cache coherence important?]
Memory contents have to be kept consistent across multiple cores.
\begin{description}[style=nextline]
\item[How is the coherence achieved?]
Two main strategies: directory based (status of physical memory is
kept in one place only) and snooping (caches are connected via a
broadcast medium and monitor that medium).
By using cache invalidation protocols like MESI. \\
Write-back/write-through strategies.
Write-through: information is written to the cache line as well as
the corresponding location in main memory.
Write-back: information is only written to the cache and only
transferred to main memory if the cache line is evicted.
\end{description}
\item[When is prefetching used?]
Prefetching is used to continuously fetch data from the main memory
$\Longrightarrow$ keep the bus busy.
Cache misses usually trigger prefetches of the neighboring cache line.
\end{description}
\newpage
\subsection{Basic optimization techniques for serial code}
\begin{description}[style=nextline]
\item[What is a hot spot?] A part of the code where a dominant fraction of the execution time is spent.
\item[What kind of trigger mechanisms do exist?] Event-driven and sample-driven.
\begin{description}[style=nextline]
\item[What is the difference between event-driven and sample-driven triggers?] \textbf{Event}-driven
triggers count every occurrence accurately by using instrumentation or PMCs => potentially high overhead.
\textbf{Sample}-driven triggers save the state/PC at periodic intervals => constant overhead.
\item[What is instrumentation?] Instrumentation is used to trigger events during
runtime by putting additional instructions in an existing program.
This may be done by libraries, emulators, the compiler or the programmer itself.
\end{description}
\item[What kind of recording mechanisms do exist?] Profiling and Tracing.
\begin{description}[style=nextline]
\item[What is the difference between profiling and tracing?] Profiling summarizes information about
the runtime spent per routine/line/block. Tracing collects call-paths
with timing information and is therefore much more detailed.
\end{description}
\item[What can hardware performance counters measure?] PMCs are MSRs/hw-counters to measure processor
events like cache/branch-misses, loads/stores and much more.
\item[What are basic single-core optimizations?]
\begin{itemize}
\item Do-less work
\item Avoid expensive operations
\item Shrink the working set
\item Eliminate common subexpressions
\item Avoid branches
\item Use SIMD instruction sets
\item Support the compiler; use flags wisely
\item Mind C++ pitfalls (see below)
\end{itemize}
\begin{description}[style=nextline]
\item[What are simple loop optimizations?] Break out of the loop as early as possible.
Jam (fuse) neighbouring loops.
\item[How to avoid expensive operations?] Use precalculated tables for fast lookup of results.
\item[What to consider with data types?] Alignment is important. Take care when using SIMD.
\item[How can temporary variables increase performance?] They can be kept in registers and
therefore reduce memory transactions (see the sketch after this list).
\item[Can vectorization help for memory-bound programs?] No.
\item[What is the role of the compiler?] Register usage, inlining, general optimizations
\item[What are problems with C++ codes?] Avoid frequent (de)allocation,
use static variables whenever possible, avoid using too much abstraction.
\end{description}
\end{description}
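
The sketch referenced above: hoisting an expensive, loop-invariant subexpression into a temporary variable (function and variable names are illustrative):
\begin{lstlisting}
#include <math.h>

/* Naive: recomputes the invariant, expensive expression every iteration. */
void scale_naive(double *y, const double *x, double a, double b, int n) {
    for (int i = 0; i < n; i++)
        y[i] = x[i] * exp(a) / sqrt(b);
}

/* Optimized: the temporary can be kept in a register. */
void scale_opt(double *y, const double *x, double a, double b, int n) {
    const double factor = exp(a) / sqrt(b);   /* hoisted out of the loop */
    for (int i = 0; i < n; i++)
        y[i] = x[i] * factor;
}
\end{lstlisting}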
\newpage
\subsection{Data access optimization}
\begin{description}[style=nextline]
\item[What can be modelled with the balance metric?] Ratio of memory bandwidth [Words/s] to peak
performance [Flops/s].
\begin{description}[style=nextline]
\item[How are lightspeed, machine and code balance defined?] See formulas above.
\item[How can we get values for lightspeed, machine and code balance?] Count \# of
load/stores and arithmetic operations per loop iteration.
\item[What are the limitations of this model?] It assumes peak performance (saturated pipelines,
superscalarity), ignores latency, and assumes that data transfer and calculation overlap perfectly.
\end{description}
\item[How can algorithms be classified depending on the number of arithmetic operations and data transfers?] By
the scaling behaviour of memory transactions and arithmetic operations vs. problem size: $O(N), O(N^2), ...$.
\begin{description}[style=nextline]
\item[Which algorithm types have potential for optimizations?] Memory-bound ones usually not so much.
Take care of compute-bound problems: \( \frac{O(N^3)}{O(N^2)} \).
\item[Which optimizations should be applied?] Loop unrolling, jamming and blocking.
\end{description}
\item[Sparse matrix-vector multiply] Number of non-zero entries grows linearly with number of matrix rows.
\begin{description}[style=nextline]
\item[How can sparse matrices be stored and why?] Compressed row storage (CRS): store only
non-zero entries by using additional index- and row-pointer- arrays.
\item[How is the sparse matrix-vector multiply computed and optimized?] Working-set shrinking by
using CRS, loop unrolling, etc. (see the sketch below).
\end{description}
\end{description}
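
The CRS sketch referenced above (array names follow the usual convention: \lstinline$val$ holds the non-zeros, \lstinline$col$ their column indices, \lstinline$row_ptr$ the start of each row):
\begin{lstlisting}
/* y = A*x with A stored in compressed row storage (CRS) */
void spmv_crs(int nrows, const int *row_ptr, const int *col,
              const double *val, const double *x, double *y) {
    for (int i = 0; i < nrows; i++) {
        double sum = 0.0;
        for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++)
            sum += val[j] * x[col[j]];   /* indirect access to x */
        y[i] = sum;
    }
}
\end{lstlisting}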
\newpage
\subsection{Parallel computers}
\begin{description}[style=nextline]
\item[Which taxonomy for parallel computing does exist?] Flynn's taxonomy: SIMD, MIMD, (SISD, MISD)
\item[What can limit scalability?] Serial parts: synchronization and communication
\item[What does Amdahl's Law (strong scaling) say?] For a fixed data set, speedup $S_p$ converges
with increasing number of workers towards $\frac{1}{s}$, where $s$ denotes the
fraction of serial code.
\item[What does Gustafson's Law (weak scaling) say?] For an arbitrarily large data set,
speedup $S_p$ grows linearly with the number $N$ of parallel workers: $N p + s$ with $p = 1 - s$. \\
Does not model the reality!
\item[What is an advantage of multicore processors?] Higher performance is feasible with lower
clock frequencies and therefore lower energy consumption.
\begin{description}[style=nextline]
\item[Why do we have multicore processors?] Increasing the clock rate would require
enormous cooling efforts which aren't economical.
\end{description}
\item[Which advantages does SMT have?] It improves the exploitation of ILP and increases pipeline throughput.
\item[What is a shared-memory computer?] Multiple cores are using the same main memory and address space.
\begin{description}[style=nextline]
\item[What is the difference between UMA and ccNUMA?] The flat model of uniform memory access (UMA)
has equal costs for memory accesses across all cores.
Their memory is connected to a shared chipset.
Non-uniform memory access (NUMA) costs are dependent on the path between the memory and
the socket which initiates the transfer.
Memory is connected directly to sockets and/or distributed across multiple
boards which are connected by custom interconnects (mostly proprietary: QPI, HT).
\end{description}
\item[What is a distributed-memory computer?] Unlike (N)UMA systems,
so called "No Remote Memory Access" (NORMA) systems don't have a common address space.
Their nodes are coupled via networks and use message passing to communicate.
\begin{description}[style=nextline]
\item[What do hybrid systems look like?] They combine shared and distributed memory aspects
in one cluster. Multiple shared memory nodes are coupled to a
distributed system => multiple islands of cache coherence!
\end{description}
\end{description}
\subsubsection{Networks}
\begin{description}[style=nextline]
\item[Which network performance metrics do exist?]
\begin{itemize}
\item Bisection (band)width $b$: \# of edges which have to be removed to split the network
into two (equal sized) halves.
\item Diameter $dm$: Max distance between two nodes/processors.
\item Edge connectivity $e$: Min \# of edges that must be removed to break the network
into two disconnected parts.
\item Node connectivity $\delta$: Max \# of edges in the graph per node (max degree).
\end{itemize}
\begin{description}[style=nextline]
\item[What can the PingPong benchmark model (latency, effective bandwidth)?] \textbf{Small}
messages have larger overhead and therefore lower effective bandwidth (latency dominates). \\
\textbf{Large} messages have less overhead and better effective bandwidth (latency plays no role).
\item[What does the LogP model do?] Generic model for scaling behaviour of connections between
parallel systems. It includes:
\begin{description}
\item[L] Upper boundary on latency for one transfer
\item[o] Overhead: time that a processor is involved with the transfer
\item[g] Gap: minimum time interval between two consecutive transfers.
\item[P] The number of processors
\end{description}
\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
\end{description}
\item[What are relevant network topologies in HPC?] See table above.
\begin{description}[style=nextline]
\item[What are the bisection bandwidth, diameter, node \& edge connectivity of networks such as
Ring, Bus, Switched/Fat Tree, Meshes, Hypercubes, Tori, Hybrids?] See tables above.
\end{description}
\end{description}
\newpage
\subsection{Parallelization and optimization strategies}
\begin{description}[style=nextline]
\item[Which types of parallelism do exist?]
\begin{itemize}
\item Instruction/Processor level parallelism: Superscalarity, SIMD, SMT?
\item Node/chip level: several cores run in parallel on the same memory
\item System level: several nodes run in parallel, communication over a network
\end{itemize}
\begin{description}[style=nextline]
\item[What overhead do the different types have \& how can it be reduced?]
Overhead increases from instruction to system level parallelism.
Overhead includes Amdahl's Law implications as well as the additional time required for implementation!
Use compiler optimization and existing patterns to reduce implementation overhead.
\item[What is a task dependency graph \& its critical path and average concurrency?] It's a DAG
which shows data dependencies and the execution order of different parts of the code.
\begin{description}
\item[Critical Path] Length of the longest path (sum of weights)
\item[Avg Concurrency] Ratio of total amount of work to critical path length.
\end{description}
\end{description}
\item[Which parallel design spaces \& patterns are defined by Mattson?]
\begin{itemize}
\item Finding Concurrency: Dependency Analysis \& Decomposition.
\item Algorithm Structure: Organize by tasks, data decomposition, or flow of data.
\item Supporting Structures: Program \& Data Structures
\item Implementation Mechanisms: UE Management, Synchronization, Communication.
\end{itemize}
\begin{description}[style=nextline]
\item[What is geometric decomposition \& adaptive mesh refinement (AMR)?] Problem/data dependent
adaption of grid resolution. Increasing accuracy in important regions.
Split the problem space geometrically and distribute these parts to UEs.
\item[What does load imbalance mean? Which types do exist? How to get a load balanced application?] In
a team of threads/UEs, one takes longer because its working set takes longer to
solve than the others'. These threads are called "speeders" \& "laggers". \\
There are geometric and graph-based load balancing methods, which can be static or dynamic.
\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join?
Name a typical example.]
All these are supporting structures to parallelize algorithms.
SPMD and Loop Parallelism execute the same code on multiple UEs.
Master/Worker, Fork/Join offer a greater flexibility in this regard.
\item[What does NUMA affinity mean? How can memory placement and binding affect performance?]
For optimal performance, data has to be closely located to the
cores which are using it. Modern OSes use "affinity on first touch"
which allocates memory on the same socket where the task is executed.
\end{description}
\item[Which are typical approaches in a Molecular Dynamics application?]
\begin{itemize}
\item Particle Decomposition
\item Force Decomposition
\item Domain Decomposition
\end{itemize}
\end{description}
\newpage
\subsection{Distributed-memory programming with MPI}
\begin{description}[style=nextline]
\item[Basic ideas of MPI] Don't rely on cache coherence.
Use messages to communicate between workers/UEs => SPMD.
\begin{description}[style=nextline]
\item[What is SPMD and how is it implemented by MPI?]
All workers execute the same program.
Program flow is directed by so called "ranks" which are unique per process.
\item[How MPI abstracts the communication?] MPI provides a runtime API with a large set of
functions for point-to-point and collective operations.
It also has abstraction mechanisms for the underlying network topology.
There's a set of MPI types which are portable between architectures / programming languages.
\end{description}
\item[Point-to-point operations]
\begin{description}[style=nextline]
\item[How to transfer data between processes in the form of messages?]
Use \lstinline$MPI_Send()$, \lstinline$MPI_Recv()$ functions
(or their variants) for point-to-point communication.
See API reference above.
\item[How to prevent deadlocks and overlap communication and computation?]
Use non-blocking \lstinline$MPI_Isend()$, \lstinline$MPI_Irecv()$
in conjunction with the MPI request methods \lstinline$MPI_Wait()$
and \lstinline$MPI_Test()$. \\
Alternatively a deadlock-free \lstinline$MPI_Sendrecv()$ is available
which combines sending and receiving in one function call (see the sketch below).
\end{description}
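
The sketch referenced above: overlapping communication and computation with non-blocking calls (neighbour ranks, tags and buffer sizes are illustrative):
\begin{lstlisting}
#include <mpi.h>

/* Exchange halos with two neighbours while computing on the interior. */
void halo_exchange(double *recv_l, double *recv_r,
                   double *send_l, double *send_r,
                   int n, int left, int right, MPI_Comm cm) {
    MPI_Request req[4];

    MPI_Irecv(recv_l, n, MPI_DOUBLE, left,  0, cm, &req[0]);
    MPI_Irecv(recv_r, n, MPI_DOUBLE, right, 1, cm, &req[1]);
    MPI_Isend(send_l, n, MPI_DOUBLE, left,  1, cm, &req[2]);
    MPI_Isend(send_r, n, MPI_DOUBLE, right, 0, cm, &req[3]);

    /* ... compute on interior points here to hide the communication ... */

    MPI_Waitall(4, req, MPI_STATUSES_IGNORE);
}
\end{lstlisting}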
\item[Collective operations]
\begin{description}[style=nextline]
\item[How to scatter and gather data and perform operations on distributed data?] Use the
\lstinline$MPI_Scatter()$ and \lstinline$MPI_Gather()$ functions or their variable-chunk variants
(\lstinline$MPI_Scatterv()$, \lstinline$MPI_Gatherv()$).
\end{description}
\item[Virtual topologies]
\begin{description}[style=nextline]
\item[How to distribute processes over a regular grid?]
Use the \lstinline$MPI_Cart_*()$ functions to create a new MPI communicator.
There are functions to translate MPI ranks to coordinates,
as well as to split grids and get adjacent ranks.
\end{description}
\item[Derived datatypes]
\begin{description}[style=nextline]
\item[How to combine MPI datatypes into more complex entities?] Use the
\lstinline$MPI_Type_contiguous()$ and \lstinline$MPI_Type_vector()$ functions.
Type signature must be the same between processes,
gaps \& alignment are allowed to differ (architectural properties).
\end{description}
\end{description}
\newpage
\subsection{Shared-memory programming with OpenMP}
\begin{description}[style=nextline]
\item[Basic principle of OpenMP] Extend an existing application by
using \texttt{\#pragma} directives.
Worksharing is used to distribute loops and sections across multiple threads
on a shared-memory machine.
\begin{description}[style=nextline]
\item[Execution model] OpenMP programs start with a single master thread.
Worker threads are spawned/joined by the master
in parallel regions => Fork/Join concept.
\item[Parallel region + worksharing constructs] OpenMP handles the thread management work.
\end{description}
\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.
\begin{description}[style=nextline]
\item[Data sharing clauses] See API reference above.
\end{description}
\item[Synchronization]
\begin{description}[style=nextline]
\item[Critical section] Use \lstinline$#pragma omp critical$ directive.
\item[Reduction clause] Use \lstinline$reduction(operation:var-list)$ clause.
\item[Team and Task-Barriers] Use \lstinline$#pragma omp barrier$ and \lstinline$#pragma omp taskwait$ directives.
\end{description}
\item[Runtime library]
\begin{description}[style=nextline]
\item[Important functions] See API reference above.
\end{description}
\end{description}
\newpage
\subsection{Hybrid programming (MPI + OpenMP)}
\begin{description}[style=nextline]
\item[Hybrid programming basics] \hfill
\begin{description}[style=nextline]
\item[Why we need hybrid programs] \hfill
\item[Hybrid programming models] \hfill
\end{description}
\item[Threading modes of MPI] \hfill
\begin{description}[style=nextline]
\item[What levels of thread support MPI provides] \lstinline$MPI_THREAD_SINGLE$, \lstinline$MPI_THREAD_FUNNELED$,
\lstinline$MPI_THREAD_SERIALIZED$ and \lstinline$MPI_THREAD_MULTIPLE$, requested via \lstinline$MPI_Init_thread()$
(see the sketch below).
\item[Potential troubles with multithreaded MPI programs] \hfill
\end{description}
\item[Addressing multiple threads within MPI processes] \hfill
\begin{description}[style=nextline]
\item[How to work around the flat addressing space of MPI] \hfill
\item[Using multiple communicators in a hybrid context] \hfill
\end{description}
\item[Running hybrid programs on the RWTH cluster] \hfill
\begin{description}[style=nextline]
\item[How to properly instruct LSF to run your hybrid job] \hfill
\end{description}
\end{description}
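
A minimal hybrid sketch showing how the thread support level is requested (\lstinline$MPI_THREAD_FUNNELED$, i.e. only the master thread calls MPI, is assumed here):
\begin{lstlisting}
#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int provided, rank;

    /* Request that only the master thread will make MPI calls. */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    if (provided < MPI_THREAD_FUNNELED)
        MPI_Abort(MPI_COMM_WORLD, 1);   /* runtime offers too little support */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    #pragma omp parallel
    {
        /* OpenMP threads compute; only the master would call MPI here. */
        printf("rank %d, thread %d\n", rank, omp_get_thread_num());
    }

    MPI_Finalize();
    return 0;
}
\end{lstlisting}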
\newpage
\subsection{Heterogeneous architectures (GPUs, Xeon Phis)}
\subsubsection{GPGPU's}
\begin{description}[style=nextline]
\item[What does a GPU look like?]
A GPGPU is a manycore architecture optimized for high data-parallel throughput.
It consists of multiple streaming multiprocessors (SMs), each of which consists of many cores.
\begin{description}[style=nextline]
\item[Why do GPUs deliver a good performance per Watt ratio?] GPGPUs have small caches,
many low frequency cores and little control logic.
Memory is close to the processing unit.
\item[What is the difference to CPUs?] There are more transistors dedicated to computation
instead of controlling. A CPU has large caches with multiple levels,
supports OoO execution, sophisticated control logic like speculative
execution and hw-scheduling of microops.
\item[What does the memory hierarchy look like?] Each SM has separate shared memory,
caches, registers and texture storage.
All SMs have access to a common LLC and the global memory of the GPU.
This global memory is separate from the host memory.
There's no cache coherence for GPU caches!
Communication with the host has to be done by DMA transfers.
\item[How can the logical programming hierarchy be mapped to the execution model?]
The execution model is host-directed.
Portions of the code, so called "kernels" are offloaded to the GPU.
These kernels are executed in a highly parallel fashion by an array of threads.
\end{description}
\item[Which models can be used to program a GPU?] There are special GPU programming APIs
like CUDA and more general directive based models like OpenACC/MP.
\begin{description}[style=nextline]
\item[How to handle offloading of regions?]
The programmer has to manually find regions which can be parallelized.
These "kernels" are compiled to special GPU code and called by the host.
\item[How to handle data management?] Data transfer has to be done explicitly.
\item[What are the main differences?] \hfill
\end{description}
\item[Which impact does the PCIe have?] GPUs usually have very fast memory.
The PCIe bus isn't fast enough to saturate this memory bandwidth => bottleneck.
\item[What is branch divergence?] 32 threads are grouped into "warps".
These warps share a common program counter => branches are serialized inside warps.
\begin{description}[style=nextline]
\item[Which performance impact does it have?]
Branch divergence causes threads that did not take a branch to idle.
It therefore leads to performance degradation =>
branch only with sets of whole warps.
\end{description}
\item[What can be a good launch configuration and why?] \hfill
\begin{description}[style=nextline]
\item[How can the launch configuration be specified?] \hfill
\end{description}
\item[What can be done to saturate the bus?] Maximize PCIe throughput by transferring
less data and batching smaller transfers into larger ones.
\begin{description}[style=nextline]
\item[What is coalescing?] \hfill
\item[How can it be achieved?] \hfill
\item[What impact does AoS and SoA have?] \hfill
\item[What is the difference between caching and non-caching loads?] \hfill
\end{description}
\item[What is shared memory?] CUDA SM/blocks have dedicated shared memory.
This can be used for low-latency inter-thread communication.
\item[How to avoid CPU/ GPU idling?] Use asynchronous data transfers and CUDA streams
to enqueue multiple kernels for execution.
\end{description}
\subsubsection{Xeon Phi}
\begin{description}[style=nextline]
\item[What does a Xeon Phi look like?]
The Xeon Phi consists of 60 cores with 4-fold SMT,
has a clock frequency of 1090~MHz
and a peak performance of about 1 TFlop/s.
\begin{description}[style=nextline]
\item[What does the memory hierarchy look like?]
Each core has its own L1/L2 caches; the cores are connected by a ring bus.
There's a common memory and I/O interface which is also connected to the ring bus
and used to access the global GDDR5 RAM.
Caches are kept coherent. Data transfer to the host has to be done via DMA.
There's no common address space with the host.
\item[How many threads/ vector-widths are available?]
\begin{description}
\item[Threads] 60 cores * 4 threads = 240 threads
\item[Vector-width] AVX-512 has 512 bits per vector
(extension of AVX2, MIC only).
\end{description}
\end{description}
\item[Which programming concepts do exist?] OpenMP/CL/ACC, MPI and POSIX threads.
The Phi is a slightly modified x86 architecture and can be programmed
like a normal x86 system. Intel offers proprietary language offloading extensions (LEO)
for their own C compiler.
\begin{description}[style=nextline]
\item[How can OpenMP (4.0) be used?] The recent OpenMP spec adds new
directives (\texttt{target} and \texttt{teams}) which can be used
to offload parts of the code (see the sketch below).
This concept has been adopted from the OpenACC
standard, which can be considered a testbed for OpenMP.
\end{description}
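
The sketch referenced above: OpenMP~4.0 offloading with \texttt{target} and \texttt{teams} (the mapping clauses and loop are illustrative):
\begin{lstlisting}
/* y = a*x + y offloaded to a target device with OpenMP 4.0 */
void saxpy_target(int n, float a, const float *x, float *y) {
    int i;
    #pragma omp target map(to: x[0:n]) map(tofrom: y[0:n])
    #pragma omp teams distribute parallel for
    for (i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}
\end{lstlisting}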
\item[Which optimization strategies should be applied?]
Massive SIMD vectorization, check data alignment (SoA, AoS), avoid aliasing.
\begin{description}[style=nextline]
\item[Which impact can the PCIe have?]
PCIe is a bottleneck for host to Phi data transfer.
Adds latency for communication => see GPGPU.
\item[Which impact does vectorization have?] Huge impact for the Phi.
It has support for FMA-3 with large SIMD vector registers.
\end{description}
\end{description}
\end{document}