\documentclass[11pt]{article}
\usepackage{amsmath,amssymb,latexsym}
\usepackage{fancyhdr}
\usepackage[margin=2.50cm]{geometry}
\usepackage[compact]{titlesec}
\usepackage{hyperref}
\usepackage[T1]{fontenc}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{paralist}
\usepackage{listings}
\usepackage{enumitem}
\hypersetup{colorlinks=true, linkcolor=blue, urlcolor=cyan}
\lstset{
moredelim=[is][\color{green}]{§}{§},
moredelim=[is][\color{red}]{?}{?},
moredelim=[is][\color{cyan}]{|}{|}
}
% Title infos
\title{HPC Formulas}
\author{Steffen Vogel}
\date{\today}
% headline
\lhead{HPC Formulas}
\chead{Steffen Vogel}
\rhead{\today}
\pagestyle{fancy}
\setlength{\parindent}{0cm}
\begin{document}
\section{Amdahl \& Gustafson}
\begin{tabular}{ l l | c c }
& & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
& & \textit{strong scaling} & \textit{weak scaling} \\
\hline
\textbf{Speedup} & \( S_p(N) = \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \xrightarrow{N \rightarrow \infty} \frac{1}{s} \) & \( Np + s \xrightarrow{N \rightarrow \infty} \infty \) \\
\textbf{Efficiency} & \( \varepsilon_p(N) = \frac{S_p(N)}{N} \) & \( \frac{1}{s (N-1) + 1} \xrightarrow{N \rightarrow \infty} 0 \) & \( \frac{1 - p}{N} + p \xrightarrow{N \rightarrow \infty} p \) \\
\end{tabular}
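
A quick sanity check of both laws, assuming a serial fraction $s = 0.1$ (i.e. $p = 0.9$) and $N = 100$ workers:
\[
S_{\text{Amdahl}} = \frac{1}{0.1 + \frac{0.9}{100}} \approx 9.2 \leq \frac{1}{s} = 10,
\qquad
S_{\text{Gustafson}} = 100 \cdot 0.9 + 0.1 = 90.1
\]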
\section{Moore's Law}
\begin{quote}
\# of transistors / cost-effective IC doubles every $N$ months, with $12 \leq N \leq 24$
\end{quote}
\section{Energy Efficiency}
\begin{tabular}{ p{7cm} l }
Power-voltage-frequency & \( P \sim V^2 \cdot f \) or \( P \sim f^3 \) \\
& \\
energy efficiency & = \( \frac{\text{energy}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption} \cdot \text{time}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption}}{\text{Flop}/s} \) \\
energy & = \( \text{power consumption} \cdot \text{time} \) \\
& \\
Physical limit for efficiency & 2.87e-21 Joule/Bit \( \approx \ln(2) \cdot k \cdot T \) \\
& 1.84e-19 Joule/Flop \\
Green TOP500 \#1 & 2.22e-10 Joule/Flop \\
& \\
Power wall & expected to be around 20 to 25 MWatt \\
\end{tabular}
\section{Pipelining}
\begin{tabular}{ p{7cm} l }
Stages & $m$ \\
Operations & $N$ \\
Without pipeline & \( T_{seq} = m \cdot N \) \\
With pipeline & \( T_{pipe} = N + m - 1 \) \\
Speedup & \(S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
\end{tabular}
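
For example, an assumed pipeline with $m = 5$ stages processing $N = 100$ operations:
\[
T_{seq} = 5 \cdot 100 = 500, \qquad
T_{pipe} = 100 + 5 - 1 = 104, \qquad
S_{pipe} = \frac{500}{104} \approx 4.8 \xrightarrow{N \rightarrow \infty} 5
\]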
\section{Memory Access}
\begin{tabular}{ p{7cm} l }
Hit rate & $\beta$ \\
Miss rate & $(1 - \beta)$ \\
Access time& $T_m$ (Memory), $T_c$ (Cache) \\
Average access time & \( T_{avg} = \beta T_c + (1 - \beta) T_m \) \\
Performance gain & \( G( \theta, \beta ) = \frac{T_m}{T_{avg}} = \frac{\theta}{\beta + \theta(1 - \beta)}, \theta = \frac{T_m}{T_c} \) \\
\end{tabular}
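
A worked example, assuming a hit rate of $\beta = 0.9$ and a memory/cache speed ratio of $\theta = 10$:
\[
T_{avg} = 0.9\,T_c + 0.1 \cdot 10\,T_c = 1.9\,T_c, \qquad
G = \frac{10}{0.9 + 10 \cdot 0.1} \approx 5.3
\]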
\section{Networks}
\subsection{Transmission \& Bandwidth}
\begin{tabular}{ p{7cm} l }
Transmission time & \( T = T_L + \frac{N}{B} \) \\
Effective bandwidth & \( B_{eff} = \frac{N}{T} = \frac{N}{T_L + \frac{N}{B}} \) \\
\end{tabular}
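
An illustrative example with assumed values $T_L = 1\,\mu s$ and $B = 10$~GB/s: a small message of
$N = 1$~kB reaches only \( B_{eff} = \frac{1\,\text{kB}}{1\,\mu s + 0.1\,\mu s} \approx 0.9 \)~GB/s
(latency-dominated), while $N = 1$~MB reaches \( \frac{1\,\text{MB}}{1\,\mu s + 100\,\mu s} \approx 9.9 \)~GB/s.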
\subsection{Topologies}
\begin{tabular}{l | c c c c }
\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
\hline
Bus & $1$ & $1$ & $1$ & $B$ \\
Ring & $2$ & $2$ & $\lfloor \frac{N}{2} \rfloor$ & $2B$ \\
Fully connected & $N-1$ & $N-1$ & $1$ & $B \frac{N^2}{4}$ \\
Sw. Fat Tree & 1 w/o redundancy & depends on design & $2 \times$ hierarchy height & depends on design \\
Mesh & $2d$ & $d$ & $\sum_{i=1}^d (N_i - 1)$ & $B ( \prod_{i=1}^{d-1} N_i )$ \\
Torus & $2d$ & $2d$ & $\sum_{i=1}^d \lfloor \frac{N_i}{2} \rfloor$ & $2B ( \prod_{i=1}^{d-1} N_i )$ \\
Hypercube & $d$ & $d$ & $d$ & $B2^{d-1}$ \\
\end{tabular}
\section{Balance, Lightspeed}
\begin{tabular}{ p{7cm} l }
Code balance & \( B_c = \frac{\text{data transfers}}{\text{arithmetic ops}} = \frac{[Words]}{[Flops]} \) \\
Machine balance & \( B_m = \frac{b_s}{P_{max}} \) \\
Relative lightspeed & \( l = \min(1, \frac{B_m}{B_c} ) \) \\
Absolute lightspeed performance & \( P = P_{max} \cdot l \) \\
\end{tabular}
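
A worked example for the vector triad \( A(i) = B(i) + C(i) \cdot D(i) \): three loads and one store
versus two flops give a code balance of \( B_c = \frac{4}{2} = 2 \)~W/F. On an assumed machine with
\( B_m = 0.1 \)~W/F the relative lightspeed is \( l = \min(1, \frac{0.1}{2}) = 0.05 \), i.e. at most
5\,\% of \( P_{max} \) can be reached.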
\section{Programming APIs}
Argument directions: \lstinline$§input§$, \lstinline$?output?$, \lstinline$|inout|$ \\
Common arguments: communicator = \lstinline$cm$
\subsection{MPI}
All MPI routines return an integer error code!
\subsubsection{Initialization}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Init(§argc§, §argv§)$ & Has to be called before any other MPI\_* function. \\
\lstinline$MPI_Finalize()$ & Like \lstinline$MPI_Init()$ this must only be called once. \\
\lstinline$MPI_Initialized(?flag?)$ & Check if MPI runtime was already initialized. \\
\lstinline$MPI_Finalized(?flag?)$ & Check if MPI runtime was already shut down. \\
\end{tabular}
\subsubsection{Point-to-point data transfer}
\begin{tabular}{ l | l l l }
\textbf{Mode / Op} & \textbf{Blocking} & \textbf{Non-blocking} & \textbf{Completes after...} \\
\hline
Standard & \lstinline$MPI_Send$($\otimes$) & \lstinline$MPI_Isend$($\otimes, \ddag$) & the message has been safely stored away. \\
Synchronous & \lstinline$MPI_Ssend$($\otimes$) & \lstinline$MPI_Issend$($\otimes, \ddag$) & the receive was called. \\
Buffered & \lstinline$MPI_Bsend$($\otimes$) & \lstinline$MPI_Ibsend$($\otimes, \ddag$) & the message was stored away in a buffer. \\
Ready-mode & \lstinline$MPI_Rsend$($\otimes$) & \lstinline$MPI_Irsend$($\otimes, \ddag$) & only if the receive was called before send. \\
\hline
Standard & \lstinline$MPI_Recv$($\diamond, \dag$) & \lstinline$MPI_Irecv$($\diamond, \ddag$) & a message was received. \\
Probing & \lstinline$MPI_Probe$() & \lstinline$MPI_Iprobe$ & immediately. \\
Waiting & \lstinline$MPI_Wait$($\ddag, \dag$) & \lstinline$MPI_Test$($\ddag, $\lstinline$?flag?$$, \dag$) & message available. \\
\end{tabular}
\textbf{Signatures}: \\
{ \footnotesize
\hspace{2cm} $\otimes$ \lstinline$(§buffer§, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\diamond$ \lstinline$(?buffer?, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\dag$ \lstinline$(?status?)$ $\ddag$ \lstinline$(?request?)$
}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Get_count(§stat§, §type§, ?count?)$ & Get the number of received elements by \lstinline$MPI_Recv()$. \\
\lstinline$MPI_Sendrecv(§sbuf§, §scnt§, §stype§, §dst§, §stag§,$ & Simultaneous, deadlock-free send and receive. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §src§, §rtag§, §cm§, ?stat?)$ & \\
\lstinline$MPI_Sendrecv_replace(...)$ & Like \lstinline$MPI_Sendrecv()$ but using the same buffer. \\
\end{tabular}
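
A minimal point-to-point sketch in C (two ranks are assumed; the value and tag are illustrative):
\begin{lstlisting}
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int rank, value = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0) {
        value = 42;                                    /* data to send */
        MPI_Send(&value, 1, MPI_INT, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Status stat;
        MPI_Recv(&value, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &stat);
        printf("rank 1 received %d\n", value);
    }

    MPI_Finalize();
    return 0;
}
\end{lstlisting}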
\subsubsection{Collective data transfer}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Bcast(|buf|, §cnt§, §type§, §root§, §cm§)$ & Broadcasts data from root to all other. \\
\lstinline$MPI_Scatter(§sbuf§, §scnt§, §stype§,$ & Scatters chunks of data from root to all processes. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Gather(§sbuf§, §scnt§, §stype§,$ & Gathers chunks of data from all processes at root. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Allgather(...)$ & Like \lstinline$MPI_Gather()$, but the result is available on all processes. \\
\lstinline$MPI_Alltoall(...)$ & Every process sends a distinct chunk to every other process. \\
\lstinline$MPI_Reduce(...)$ & Combines data from all processes with an operation (e.g. \lstinline$MPI_SUM$) at root. \\
\lstinline$MPI_Allreduce(...)$ & Like \lstinline$MPI_Reduce()$, but the result is available on all processes. \\
\end{tabular}
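
A sketch of a global sum with \lstinline$MPI_Reduce()$ (the local contribution and root rank are illustrative):
\begin{lstlisting}
#include <mpi.h>

int main(int argc, char **argv) {
    int rank;
    double local, global;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    local = 1.0 * rank;   /* some local partial result */
    MPI_Reduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    /* global holds the sum on rank 0 only; MPI_Allreduce() would
       make the result available on every rank. */

    MPI_Finalize();
    return 0;
}
\end{lstlisting}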
\subsubsection{Synchronisation}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Barrier(§cm§)$ & Waits for all processes. \\
\end{tabular}
\subsubsection{Communicator handling}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Comm_dup(...)$ & Duplicates a communicator (same group, new context). \\
\lstinline$MPI_Comm_split(...)$ & Splits a communicator into sub-communicators by color/key. \\
\lstinline$MPI_Comm_compare(...)$ & Compares two communicators. \\
\lstinline$MPI_Comm_free(...)$ & Frees a communicator. \\
\lstinline$MPI_Comm_rank(§cm§, ?rank?)$ & Get rank/id of current process in this communicator. \\
\lstinline$MPI_Comm_size(§cm§, ?size?)$ & Get total number of processes in this communicator. \\
\end{tabular}
\subsubsection{Virtual topologies}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Cart_create(...)$ & Creates a new Cartesian cm from a given old one.\\
\lstinline$MPI_Cart_rank(...)$ & Translates Cartesian coordinates into a rank.\\
\lstinline$MPI_Cart_shift(...)$ & Returns source/destination ranks for a shift along one dimension.\\
\lstinline$MPI_Cart_sub(...)$ & Creates sub-grids of lower dimension.\\
\lstinline$MPI_Cart_coords(...)$ & Translates a rank into Cartesian coordinates.\\
\lstinline$MPI_Dims_create(...)$ & Suggests a balanced distribution of processes over dimensions.\\
\end{tabular}
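
A sketch of a periodic 2D process grid built from the routines above (dimensionality and periodicity are illustrative):
\begin{lstlisting}
#include <mpi.h>

int main(int argc, char **argv) {
    int rank, size, left, right;
    int dims[2] = {0, 0}, periods[2] = {1, 1}, coords[2];
    MPI_Comm cart;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    MPI_Dims_create(size, 2, dims);             /* balanced 2D factorization */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);
    MPI_Cart_coords(cart, rank, 2, coords);     /* my grid coordinates */
    MPI_Cart_shift(cart, 0, 1, &left, &right);  /* neighbours along dim 0 */

    MPI_Finalize();
    return 0;
}
\end{lstlisting}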
\subsubsection{Derived Datatypes}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Type_contiguous(...)$ & New type from a contiguous block of elements.\\
\lstinline$MPI_Type_vector(...)$ & New type from equally spaced (strided) blocks.\\
\lstinline$MPI_Type_commit(...)$ & Must be called before a derived type can be used.\\
\lstinline$MPI_Type_free(...)$ & Releases a derived type.\\
\end{tabular}
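
A sketch that sends one column of a row-major matrix using a strided vector type (the $4 \times 8$ layout is illustrative):
\begin{lstlisting}
#include <mpi.h>

void send_column(double A[4][8], int col, int dst, MPI_Comm cm) {
    MPI_Datatype column;
    /* 4 blocks of 1 double each, stride 8 doubles = one matrix column */
    MPI_Type_vector(4, 1, 8, MPI_DOUBLE, &column);
    MPI_Type_commit(&column);
    MPI_Send(&A[0][col], 1, column, dst, 0, cm);
    MPI_Type_free(&column);
}
\end{lstlisting}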
\subsubsection{Timekeeping}
\begin{tabular}{ p{8cm} l }
\lstinline$double MPI_Wtime()$ & Measure local wall-clock time. \\
\lstinline$double MPI_Wtick()$ & Obtain the precision of the MPI timer. \\
\end{tabular}
\subsection{CUDA}
\subsubsection{General Memory Operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaMalloc(&p, n*sizeof(float));$ & Allocate memory of n floats on GPU, save pointer in p\\
\lstinline$cudaFree(p);$& Free memory on GPU, which p is pointing to\\
\lstinline$cudaMallocHost(&p, n*sizeof(float));$ & Allocate pinned memory of n floats on Host, save pointer in p\\
\lstinline$cudaFreeHost(p);$& Free pinned Host memory, which p is pointing to\\
\lstinline$cudaMemcpy(d, h, n * sizeof(float),DIR);$ & memcopy Host => GPU, if DIR = cudaMemcpyHostToDevice \\
\lstinline$cudaMemcpy(h, d, n * sizeof(float),DIR);$& GPU => Host if DIR = cudaMemcpyDeviceToHost\\
\lstinline$func<<<blockNum, threadNum>>>();$ & Call CUDA Kernel code $func$, with given \# of blocks / threads\\
\end{tabular}
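
A host-side sketch of the usual allocate/copy/compute/copy-back pattern (array size and kernel name are illustrative; the kernel launch is only indicated as a comment):
\begin{lstlisting}
#include <cuda_runtime.h>
#include <stdlib.h>

int main(void) {
    int n = 1 << 20;
    size_t bytes = n * sizeof(float);
    float *h = (float *) malloc(bytes);           /* host buffer */
    float *d;

    cudaMalloc((void **) &d, bytes);              /* device buffer */
    cudaMemcpy(d, h, bytes, cudaMemcpyHostToDevice);
    /* scale<<<(n + 255) / 256, 256>>>(d, n);        kernel launch goes here */
    cudaMemcpy(h, d, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d);
    free(h);
    return 0;
}
\end{lstlisting}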
\subsubsection{Shared Memory}
\begin{tabular}{ p{7cm} l }
\lstinline$__shared__ double s[N][M];$ & Static shared memory shared between the threads of a block\\
\lstinline$extern __shared__ double s[];$ & Dynamic shared memory (size set at kernel launch)\\
\lstinline$__syncthreads();$ & Synchronize threads after shared memory usage \\
\end{tabular}
\subsubsection{Asynchronous operations}
\begin{tabular}{ p{7cm} l }
\lstinline$cudaStream_t st1,st2;$ & Defines streams. Instructions within one stream are executed in order.\\
\lstinline$cudaStreamCreate(&st1)$& Initialize stream\\
\lstinline$cudaMemcpyAsync(dst,src,size,dir,st1);$& Async memcpy in stream st1 (non-blocking)\\
\lstinline$cudaStreamSynchronize(st1);$ & Host waits for stream\\
\lstinline$cudaDeviceSynchronize();$ & Host waits for everything \\
\end{tabular}
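
A sketch that overlaps two independent host-to-device copies using streams (pinned host memory is assumed, since truly asynchronous copies require it):
\begin{lstlisting}
#include <cuda_runtime.h>

void copy_in_two_streams(float *d1, float *d2, size_t bytes) {
    float *h1, *h2;
    cudaStream_t st1, st2;

    cudaMallocHost((void **) &h1, bytes);   /* pinned host buffers */
    cudaMallocHost((void **) &h2, bytes);
    cudaStreamCreate(&st1);
    cudaStreamCreate(&st2);

    cudaMemcpyAsync(d1, h1, bytes, cudaMemcpyHostToDevice, st1);
    cudaMemcpyAsync(d2, h2, bytes, cudaMemcpyHostToDevice, st2);
    /* kernels launched into st1/st2 could overlap with these copies */

    cudaStreamSynchronize(st1);             /* host waits for each stream */
    cudaStreamSynchronize(st2);
    cudaStreamDestroy(st1);
    cudaStreamDestroy(st2);
    cudaFreeHost(h1);
    cudaFreeHost(h2);
}
\end{lstlisting}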
\subsection{OpenMP}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$#pragma omp parallel$ & Forms a team of threads and starts parallel execution. \\
\lstinline$#pragma omp for$ & Split iterations of following for loop across threads. \\
\lstinline$#pragma omp sections$ & Non-iterative worksharing of different sections. \\
\lstinline$#pragma omp section$ & \\
\lstinline$#pragma omp single$ & Region shall only be executed by a single thread. \\
\lstinline$#pragma omp critical$ & Region is executed by all threads, but only
by one thread at a time. \\
\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current
team are guaranteed to complete at barrier exit. \\
\lstinline$#pragma omp taskwait$ & Encountering task suspends until all (direct) child tasks are complete. \\
\lstinline$#pragma omp task$ & Defines an explicit task. \\
\lstinline$#pragma omp target [data]$ & Offloads a region (or maps data) to a target device, e.g. an accelerator. \\
\lstinline$#pragma omp distribute$ & Distributes loop iterations across the teams of a league. \\
\lstinline$#pragma omp teams$ & Creates a league of thread teams (used for offloading). \\
\end{tabular}
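
A sketch of the task directives above, using the classic recursive Fibonacci example (the recursion itself is illustrative):
\begin{lstlisting}
#include <omp.h>

long fib(long n) {
    long a, b;
    if (n < 2)
        return n;
    #pragma omp task shared(a)
    a = fib(n - 1);                 /* child task */
    #pragma omp task shared(b)
    b = fib(n - 2);                 /* child task */
    #pragma omp taskwait            /* wait for both children */
    return a + b;
}

long fib_parallel(long n) {
    long result;
    #pragma omp parallel
    #pragma omp single              /* one thread creates the root task */
    result = fib(n);
    return result;
}
\end{lstlisting}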
\subsubsection{Clauses}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$num_threads(cnt)$ & Number of threads for a parallel region. \\
\lstinline$default(shared,none)$ & Set the default scope. \\
\lstinline$[first,last]private(list)$ & Thread-private scope. \\
\lstinline$shared(list)$ & Shared scope among thread team. \\
\lstinline$proc_bind(master,close,spread)$ & Binds threads to OpenMP places (affinity policy). \\
\lstinline$nowait, untied$ & \lstinline$nowait$: drop the implicit barrier; \lstinline$untied$: task may be resumed by any thread. \\
\lstinline$reduction(op:list)$ & Do reduction after parallel block. \\
\lstinline$schedule(kind,chunk_size)$ & Loop scheduling policy (static, dynamic, guided, auto) and chunk size. \\
\lstinline$if(condition)$ & Only fork/enter thread/parallel region if \texttt{condition} is true. \\
\end{tabular}
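
A worksharing sketch combining several of the clauses above (the dot product is illustrative):
\begin{lstlisting}
#include <omp.h>

double dot(const double *x, const double *y, int n) {
    double sum = 0.0;
    int i;
    #pragma omp parallel for default(none) shared(x, y, n) private(i) \
            reduction(+:sum) schedule(static)
    for (i = 0; i < n; i++)
        sum += x[i] * y[i];
    return sum;
}
\end{lstlisting}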
\subsubsection{Environment variables}
\begin{tabular}{ p{7cm} l }
\lstinline$OMP_NUM_THREADS$ & Default number of threads for parallel regions. \\
\lstinline$OMP_PLACES$ & Defines the places (threads, cores, sockets) used for thread binding. \\
\lstinline$KMP_AFFINITY$ & Intel-specific thread affinity/pinning control. \\
\end{tabular}
\subsubsection{Runtime API}
\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$omp_{set,get}_num_threads(n)$ & Get/set the number of threads used for the encountering region. \\
\lstinline$omp_get_thread_num()$ & Returns the ID of the current thread (cf. \lstinline$MPI_Comm_rank()$).
\end{tabular}
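
A minimal sketch of the runtime calls above:
\begin{lstlisting}
#include <omp.h>
#include <stdio.h>

int main(void) {
    omp_set_num_threads(4);                  /* request 4 threads */
    #pragma omp parallel
    {
        int id = omp_get_thread_num();       /* my thread ID */
        int nt = omp_get_num_threads();      /* threads in this team */
        printf("thread %d of %d\n", id, nt);
    }
    return 0;
}
\end{lstlisting}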
\subsection{OpenACC}
\begin{tabular}{ p{7cm} l }
\lstinline$#pragma acc parallel [loop, d-clauses]$ & Starts parallel execution on the accelerator. \\
\lstinline$#pragma acc kernels [loop, d-clauses]$ & Lets the compiler generate kernels from the region. \\
\lstinline$#pragma acc loop [gang, reduction(op:list)]$ & Distributes loop iterations (gang/worker/vector). \\
\lstinline$#pragma acc wait$ & Waits for asynchronous operations to complete. \\
\lstinline$#pragma acc data [d-clauses]$ & Defines a structured data region on the device. \\
\lstinline$#pragma acc update {host,device}(data)$ & Copies data between host and device inside a data region. \\
\lstinline$#pragma acc {enter,exit} data [d-clauses]$ & Unstructured begin/end of a data lifetime. \\
\lstinline$#pragma acc routine (name)$ & Compiles a function for execution on the device. \\
\end{tabular}
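
A SAXPY-like sketch with the OpenACC directives above (the data clauses and loop are illustrative):
\begin{lstlisting}
/* y = a*x + y offloaded with OpenACC */
void saxpy_acc(int n, float a, const float *x, float *y) {
    int i;
    #pragma acc data copyin(x[0:n]) copy(y[0:n])
    {
        #pragma acc parallel loop
        for (i = 0; i < n; i++)
            y[i] = a * x[i] + y[i];
    }
}
\end{lstlisting}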
\section{LSF}
See \texttt{man bsub}.
\section{Common problems and operations}
\begin{tabular}{ p{7cm} l }
Single-precision real Alpha X Plus Y & \( \vec{y} = \alpha \cdot \vec{x} + \vec{y} \) \\
(S/DAXPY, FMA, Triad) & \\
Sparse matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix multiplication & \( \mathbf{A} = \mathbf{B} \cdot \mathbf{C} \) \\
Dense matrix diagonalization & \( \mathbf{P}^{-1} \mathbf{A} \mathbf{P} = \text{diag}(\lambda_1, ..., \lambda_n) \) \\
Scalar product & \( s = \vec{x} \cdot \vec{y} \) \\
Vector addition & \( \vec{y} = \vec{x} + \vec{z} \) \\
Matrix addition & \( \mathbf{A} = \mathbf{B} + \mathbf{C} \) \\
Matrix transpose & \( \mathbf{A} = \mathbf{B}' \) \\
\end{tabular}
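
Two of these kernels as plain C loops (array lengths are illustrative):
\begin{lstlisting}
/* SAXPY: y = a*x + y */
void saxpy(int n, float a, const float *x, float *y) {
    for (int i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}

/* Scalar product: s = x . y */
double dot(int n, const double *x, const double *y) {
    double s = 0.0;
    for (int i = 0; i < n; i++)
        s += x[i] * y[i];
    return s;
}
\end{lstlisting}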
\newpage
\section{What have you learned?}
\subsection{Why supercomputers?}
\begin{description}[style=nextline]
\item[What is a supercomputer?] A supercomputer is a computer
at the frontline of current processing capacity, particularly speed of calculation.
\item[What are supercomputers used for?]
\begin{itemize}
\item Scientific computing \& simulation
\item Weapon research
\item Economics (Oil, minerals)
\item Medicine
\item Meteorology
\end{itemize}
\item[What are recent supercomputers?] See TOP500 list of world's fastest systems:
\begin{enumerate}
\item Tianhe-2 (33 PFlops/s)
\item Titan (17 PFlops/s)
\end{enumerate}
\item[What can we read from the performance development as measured in the
TOP500?] The TOP500 ranks systems by their maximal achieved LINPACK performance $R_{max}$
in Flop/s (double precision), measured with a standardized benchmark (HPL/LINPACK). \\
Future trends are predictable (exascale computing, architectures).\\
At the moment, it takes approximately 11 years to increase performance by 3
orders of magnitude (ExaFlop/s projected for 2019).
\item[What does Moore's Law tell you? Is it still valid?]
"The number of transistors per chip doubles around every one to two years." \\
The slope is now flattening slowly, but the law mostly still holds.
\item[Why do we have multi-core architectures today?]
Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: Decrease the frequency, increase the die size by putting more cores on a single chip, and exploit parallelism.
\end{description}
\newpage
\subsection{Energy Efficiency}
\begin{description}[style=nextline]
\item[What is the power wall?] A power consumption of 20 to 25~MW is
considered an important barrier for HPC centres.\\
The power wall gives us an upper limit on power consumption.
\item[What is the problem?]
To meet the exa-scale goal in 2019, the current trend of energy efficiency isn't sufficient.
Power consumption will become the dominating cost factor for
operating clusters in the future, with energy costs surpassing investment costs.
\begin{description}[style=nextline]
\item[Aspects of energy efficiency] Energy efficiency is continuously
improving. To meet the exa-scale goal, it has to be pushed further, e.g. by using
new architectures (accelerators).
At the moment, we have a performance increase of 2 orders of magnitude
every 7 years while energy efficiency ($\frac{\text{power consumption}}{\text{Flop/s}}$)
decreases by only 1 order of magnitude in the same time.
This means power consumption is increasing 1 order of magnitude every 7 years.
\item[Physical descriptions \& limits]
Landauer's principle: "Changing one bit of information needs at
least an amount of energy of $ln(2) \cdot k \cdot T$". \\
The current top Green500 systems are about 10 orders of magnitude away from this limitation.
\end{description}
\item[How can we benchmark energy efficiency?] Measure only the consumption of a
single rack (submodule) and extrapolate. Average over different benchmarks.
\begin{description}[style=nextline]
\item[Advantages \& disadvantages of Green500, SPEC Power]
High variations in idle and during benchmarks and between nodes.
Power extrapolation may not be allowed.
The Green500 assumes that the computational workload during the Linpack
benchmark is well-balanced across all units and that all units are
identical and have the same power consumption for the same workload.
Those assumptions are however very dangerous and often wrong. Idle and
Linpack power consumption can vary from node to node as well as over time.
\end{description}
\item[What are the architectural aspects of energy efficiency?]
Current von Neumann architectures weren't designed with energy
efficiency in mind:
OoO execution, cache coherency and prefetchers are inefficient. \\
Desktop/mobile power saving methods are applied to HPC (C- and
P-states). C0 is the operating state; everything above is an idle state.
With increasing numbers, more and more parts are shut down to conserve
energy (at the cost of an increased wakeup time). P-states are all
operational states, but they switch between supported frequencies \&
voltages to save energy in case maximal performance is not needed
$\Longrightarrow$ DVFS.\\
There are many parameters that affect energy efficiency: number of
cores, clock frequency, prefetcher, SMT, branch prediction, GPU clock
frequency, cache size, speed of interconnects. It is important to find
sweet spots for those parameters, not only for whole applications but
also for each application phase.
\begin{description}[style=nextline]
\item[What is dynamic frequency and voltage scaling?]
Reduce voltage/frequency during idle times to save energy.
Implemented by processor P-states.
\end{description}
\item[What are the future plans to climb the power wall?]
\begin{itemize}
\item Optimize current architectures
\item Power-efficient accelerators
\item Reduce idle power to $\approx 0 W$
\item Load-balancing by DVS/DFS.
\end{itemize}
\begin{description}[style=nextline]
\item[What about GPUs?] GPUs are a lot more efficient because more
transistors are used for computation instead of controlling/management tasks.
They already use faster GDDR5 RAM. But PCIe will become a bottleneck when CPUs get stacked RAM.
\item[What about stacked memory?] Is a must-have to reach the exa-scale goal.
Shorter paths allow higher bandwidths and lower voltages (maintainability? extend
the caching hierarchy by SSDs?).
\end{description}
\end{description}
\newpage
\subsection{Modern Processors}
\begin{description}[style=nextline]
\item[What is an ISA?] The Instruction Set Architecture is the part of an
architecture that is visible to the application programmer / compiler.
It contains e.g. opcode, operands, registers, addressing modes\dots
\begin{description}[style=nextline]
\item[What are the differences in ISA's?]
\begin{itemize}
\item Instructions, CISC or RISC
\item Memory addressing modes (effective or relative)
\item Type of internal storage, e.g. stack, accumulator or
general-purpose registers (dominant today)
\item Endianness
\item Type and size of operands
\end{itemize}
\item[What are the main differences between RISC and CISC?]
\begin{description}
\item[CISC] Instructions may involve complex operations and are variable in length.
\item[RISC] Similar encoding for all instructions, fixed length.
Strict distinction between data loading/storing and manipulation.
\end{description}
\item[What do we have today?] The x86 architectures dominant in HPC are CISC
based (but instructions get translated to internal RISC micro-instructions).
\end{description}
\item[What is the basis of today's processors?]
They mostly decode CISC instructions to microcode ops (hybrid CISC/RISC).
Modern processors are still based on the Von Neumann architecture,
although they usually consist of two or more cores.
In addition, ALU, control unit, memory controller etc. are all
integrated on the chip.
\begin{description}[style=nextline]
\item[What are the corresponding bottlenecks?]
The main bottlenecks today are memory access speed and sequential
architecture (SISD)
\item[How can we overcome these?]
Pipelining, ILP (superscalarity), SIMD - Single Instruction
Multiple Data, Prefetching.
\end{description}
\item[What is ILP?]
Instruction Level Parallelism is the potential overlap of instruction
execution caused by superscalarity, OoO and pipelines.
\begin{description}[style=nextline]
\item[What is its influence on today's computers? over time?]
Today's CPUs use hardware-implemented schedulers and dependency
resolvers for microcode operations $\Longrightarrow$ very complex
control logic.
\end{description}
\item[Pipelining]
\begin{description}[style=nextline]
\item[How does it work?]
Pipelining splits the execution of an instruction into multiple,
preferably equally long, 'stages' which are orthogonal to each other. \\
This allows reusing the execution unit of one stage for the next
instruction before the first instruction has retired.
\item[How to compute possible speedup/throughput?] See formulas.
\item[Problems?]
\begin{itemize}
\item Pipeline stalls caused by phases of different lengths
\item Hazards caused by dependent data, instructions, branch-misses...
\item Imbalanced CISC instruction decoding/fetching. Instructions
are not of equal length and complexity.
\item Aliasing...
\end{itemize}
\end{description}
\item[Superscalarity]
"Execute multiple instructions per cycle."
\begin{description}[style=nextline]
\item[What is a superscalar processor?]
A superscalar processor consists of multiple FP pipelines,
instruction fetchers and decoders.
It can thereby execute multiple instructions per cycle, which allows
higher pipeline utilization and SMT. \\
This is a kind of ILP.
\end{description}
\item[SIMD] Single-Instruction-Multiple-Data
\begin{description}[style=nextline]
\item[How does it work?]
Execute a single instruction which initiates multiple integer or floating
point operations on wide "registers"/"vectors".
\item[What are the recent vector register widths?]
SSE has 128 bits, AVX has 256 bits, the Xeon Phi even 512 bits.
\item[What to think about if code vectorization is desired?]
\begin{itemize}
\item Check for automatic loop-unrolling by compilers
\item Profiling is recommended to eliminate memory bottlenecks (alignment!)
\item Avoid high-level C++ abstractions which could prevent optimization!
\end{itemize}
\end{description}
\item[Why were caches introduced?]
To close the increasing gap between memory bandwidth and processor
performance by adding several layers of caches with increasing bandwidth.
\begin{description}[style=nextline]
\item[What is a cache?]
It's a high speed memory with low capacity which is usually organized
in multiple layers. Commonly the caches are integrated into the CPU
die for high bandwidth.
\item[What performance characteristics does it have?]
High speed: usually allows multiple transfers per cycle
\item[When does it make sense to use a cache?]
If computation performance is limited by memory bandwidth
("memory-bound problems").
\item[What are cache hits/misses?]
Cache hits occur when requested items are currently held in
the cache. Otherwise a cache miss happens.
\item[What is the principle of locality?]
An item that was referenced will soon be referenced again (temporal).
Items that are close to a recently referenced item are likely to
be referenced in near future (spatial).
\end{description}
\item[Which cache types do exist \& what are their differences?]
\begin{itemize}
\item Data cache => stores data only
\item Instruction cache => stores instructions only
\item Unified cache => stores both
\item Trace/Micro-OP cache => stores decoded instructions
\end{itemize}
\item[Why is cache coherence important?]
Memory contents have to be kept consistent across multiple cores.
\begin{description}[style=nextline]
\item[How is the coherence achieved?]
Two main strategies: directory based (status of physical memory is
kept in one place only) and snooping (caches are connected via a
broadcast medium and monitor that medium).
By using cache invalidation protocols like MESI. \\
Write-back/write-through strategies.
Write-through: information is written to the cache line as well as
the corresponding location in main memory.
Write-back: information is only written to the cache and only
transferred to main memory if the cache line is evicted.
\end{description}
\item[When is prefetching used?]
Prefetching is used to continuously fetch data from the main memory
$\Longrightarrow$ keep the bus busy.
Cache misses usually trigger prefetches of the neighboring cache line.
\end{description}
\newpage
\subsection{Basic optimization techniques for serial code}
\begin{description}[style=nextline]
\item[What is a hot spot?] A part of the code where a dominant fraction of the execution time is spent.
\item[What kind of trigger mechanisms do exist?] Event-driven and sample-driven.
\begin{description}[style=nextline]
\item[What is the difference between event-driven and sample-driven triggers?] \textbf{Event}-driven
triggers count every occurrence accurately by using instrumentation or PMCs => potentially high overhead.
\textbf{Sample}-driven triggers save the state/PC at periodic intervals => constant overhead.
\item[What is instrumentation?] Instrumentation is used to trigger events during
runtime by putting additional instructions in an existing program.
This may be done by libraries, emulators, the compiler or the programmer itself.
\end{description}
\item[What kind of recording mechanisms do exist?] Profiling and Tracing.
\begin{description}[style=nextline]
\item[What is the difference between profiling and tracing?] Profiling summarizes information about
the runtime spent per routine/line/block. Tracing collects call-paths
with timing information and is therefore much more detailed.
\end{description}
\item[What can hardware performance counters measure?] PMCs are MSRs/hw-counters to measure processor
events like cache/branch-misses, loads/stores and much more.
\item[What are basic single-core optimizations?]
\begin{itemize}
\item Do-less work
\item Avoid expensive operations
\item Shrink the working set
\item Eliminate common subexpressions
\item Avoid branches
\item Use SIMD instruction sets
\item Support the compiler; use flags wisely
\item Mind C++ pitfalls (see below)
\end{itemize}
\begin{description}[style=nextline]
\item[What are simple loop optimizations?] Break out of the loop as early as possible.
Jam (fuse) neighbouring loops.
\item[How to avoid expensive operations?] Use precalculated tables for fast lookup of results.
\item[What to consider with data types?] Alignment is important. Take care when using SIMD.
\item[How can temporary variables increase performance?] They can be kept in registers and
therefore reduce memory transactions (see the sketch after this list).
\item[Can vectorization help for memory-bound programs?] No.
\item[What is the role of the compiler?] Register usage, inlining, general optimizations
\item[What are problems with C++ codes?] Avoid frequent (de)allocation,
use static variables whenever possible, avoid using too much abstraction.
\end{description}
\end{description}
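
The sketch referenced above: hoisting an expensive, loop-invariant subexpression into a temporary variable (function and variable names are illustrative):
\begin{lstlisting}
#include <math.h>

/* Naive: recomputes the invariant, expensive expression every iteration. */
void scale_naive(double *y, const double *x, double a, double b, int n) {
    for (int i = 0; i < n; i++)
        y[i] = x[i] * exp(a) / sqrt(b);
}

/* Optimized: the temporary can be kept in a register. */
void scale_opt(double *y, const double *x, double a, double b, int n) {
    const double factor = exp(a) / sqrt(b);   /* hoisted out of the loop */
    for (int i = 0; i < n; i++)
        y[i] = x[i] * factor;
}
\end{lstlisting}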
\newpage
\subsection{Data access optimization}
\begin{description}[style=nextline]
\item[What can be modelled with the balance metric?] Ratio of memory bandwidth [Words/s] to peak
performance [Flops/s].
\begin{description}[style=nextline]
\item[How are lightspeed, machine and code balance defined?] See formulas above.
\item[How can we get values for lightspeed, machine and code balance?] Count \# of
load/stores and arithmetic operations per loop iteration.
\item[What are the limitations of this model?] It assumes peak performance (saturated pipelines,
superscalarity), ignores latency, and assumes that data transfer and calculation overlap perfectly.
\end{description}
\item[How can algorithms be classified depending on the number of arithmetic operations and data transfers?] By
the scaling behaviour of memory transactions and arithmetic operations vs. problem size: $O(N), O(N^2), ...$.
\begin{description}[style=nextline]
\item[Which algorithm types have potential for optimizations?] Memory-bound ones usually not so much.
Take care of compute-bound problems: \( \frac{O(N^3)}{O(N^2)} \).
\item[Which optimizations should be applied?] Loop unrolling, jamming and blocking.
\end{description}
\item[Sparse matrix-vector multiply] Number of non-zero entries grows linearly with number of matrix rows.
\begin{description}[style=nextline]
\item[How can sparse matrices be stored and why?] Compressed row storage (CRS): store only
non-zero entries by using additional index- and row-pointer- arrays.
\item[How is the sparse matrix-vector multiply computed and optimized?] Working-set shrinking by
using CRS, loop unrolling, etc. (see the sketch below).
\end{description}
\end{description}
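
The CRS sketch referenced above (array names follow the usual convention: \lstinline$val$ holds the non-zeros, \lstinline$col$ their column indices, \lstinline$row_ptr$ the start of each row):
\begin{lstlisting}
/* y = A*x with A stored in compressed row storage (CRS) */
void spmv_crs(int nrows, const int *row_ptr, const int *col,
              const double *val, const double *x, double *y) {
    for (int i = 0; i < nrows; i++) {
        double sum = 0.0;
        for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++)
            sum += val[j] * x[col[j]];   /* indirect access to x */
        y[i] = sum;
    }
}
\end{lstlisting}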
\newpage
\subsection{Parallel computers}
\begin{description}[style=nextline]
\item[Which taxonomy for parallel computing does exist?] Flynn's taxonomy: SIMD, MIMD, (SISD, MISD)
\item[What can limit scalability?] Serial parts: synchronization and communication
\item[What does Amdahl's Law (strong scaling) say?] For a fixed data set, speedup $S_p$ converges
with increasing number of workers towards $\frac{1}{s}$, where $s$ denotes the
fraction of serial code.
\item[What does Gustafson's Law (weak scaling) say?] For an arbitrarily large data set,
speedup $S_p$ grows linearly with the number $N$ of parallel workers: $N p + s$ with $p = 1 - s$. \\
Does not model the reality!
\item[What is an advantage of multicore processors?] Higher performance is feasible with lower
clock frequencies and therefore lower energy consumption.
\begin{description}[style=nextline]
\item[Why do we have multicore processors?] Increasing the clock rate would require
enormous cooling efforts which aren't economical.
\end{description}
\item[Which advantages does SMT have?] It improves the exploitation of ILP and increases pipeline throughput.
\item[What is a shared-memory computer?] Multiple cores are using the same main memory and address space.
\begin{description}[style=nextline]
\item[What is the difference between UMA and ccNUMA?] The flat model of uniform memory access (UMA)
has equal costs for memory accesses across all cores.
Their memory is connected to a shared chipset.
Non-uniform memory access (NUMA) costs are dependent on the path between the memory and
the socket which initiates the transfer.
Memory is connected directly to sockets and/or distributed across multiple
boards which are connected by custom interconnects (mostly proprietary: QPI, HT).
\end{description}
\item[What is a distributed-memory computer?] Unlike (N)UMA systems,
so called "No Remote Memory Access" (NORMA) systems don't have a common address space.
Their nodes are coupled via networks and use message passing to communicate.
\begin{description}[style=nextline]
\item[What do hybrid systems look like?] They combine shared and distributed memory aspects
in one cluster. Multiple shared memory nodes are coupled to a
distributed system => multiple islands of cache coherence!
\end{description}
\end{description}
\subsubsection{Networks}
\begin{description}[style=nextline]
\item[Which network performance metrics do exist?]
\begin{itemize}
\item Bisection (band)width $b$: \# of edges which have to be removed to split the network
into two (equal sized) halves.
\item Diameter $dm$: Max distance between two nodes/processors.
\item Edge connectivity $e$: Min \# of edges that must be removed to break the network
into two disconnected parts.
\item Node connectivity $\delta$: Max \# of edges in the graph per node (max degree).
\end{itemize}
\begin{description}[style=nextline]
\item[What can the PingPong benchmark model (latency, effective bandwidth)?] \textbf{Small}
messages have larger overhead and therefore lower effective bandwidth (latency dominates). \\
\textbf{Large} messages have less overhead and better effective bandwidth (latency plays no role).
\item[What does the LogP model do?] Generic model for scaling behaviour of connections between
parallel systems. It includes:
\begin{description}
\item[L] Upper boundary on latency for one transfer
\item[o] Overhead: time that a processor is involved with the transfer
\item[g] Gap: minimum time interval between two consecutive transfers.
\item[P] The number of processors
\end{description}
\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
\end{description}
\item[What are relevant network topologies in HPC?] See table above.
\begin{description}[style=nextline]
\item[What are the bisection bandwidth, diameter, node \& edge connectivity of networks such as
Ring, Bus, Switched/Fat Tree, Meshes, Hypercubes, Tori, Hybrids?] See tables above.
\end{description}
\end{description}
\newpage
\subsection{Parallelization and optimization strategies}
\begin{description}[style=nextline]
\item[Which types of parallelism do exist?]
\begin{itemize}
\item Instruction/Processor level parallelism: Superscalarity, SIMD, SMT?
\item Node/chip level: several cores run in parallel on the same memory
\item System level: several nodes run in parallel, communication over a network
\end{itemize}
\begin{description}[style=nextline]
\item[What overhead do the different types have \& how can it be reduced?]
Overhead increases from instruction to system level parallelism.
Overhead includes Amdahl's Law implications as well as the additional time required for implementation!
Use compiler optimization and existing patterns to reduce implementation overhead.
\item[What is a task dependency graph \& its critical path and average concurrency?] It's a DAG
which shows data dependencies and the execution order of different parts of the code.
\begin{description}
\item[Critical Path] Length of the longest path (sum of weights)
\item[Avg Concurrency] Ratio of total amount of work to critical path length.
\end{description}
\end{description}
\item[Which parallel design spaces \& patterns are defined by Mattson?]
\begin{itemize}
\item Finding Concurrency: Dependency Analysis \& Decomposition.
\item Algorithm Structure: Organize by tasks, data decomposition, or flow of data.
\item Supporting Structures: Program \& Data Structures
\item Implementation Mechanisms: UE Management, Synchronization, Communication.
\end{itemize}
\begin{description}[style=nextline]
\item[What is geometric decomposition \& adaptive mesh refinement (AMR)?] Problem/data dependent
adaption of grid resolution. Increasing accuracy in important regions.
Split the problem space geometrically and distribute these parts to UEs.
\item[What does load imbalance mean? Which types do exist? How to get a load balanced application?] In
a team of threads/UEs, one takes longer because its working set takes longer to
solve than the others'. These threads are called "speeders" \& "laggers". \\
There are geometric and graph-based load balancing methods, which can be static or dynamic.
\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join?
Name a typical example.]
All these are supporting structures to parallelize algorithms.
SPMD and Loop Parallelism execute the same code on multiple UEs.
Master/Worker, Fork/Join offer a greater flexibility in this regard.
\item[What does NUMA affinity mean? How can memory placement and binding affect performance?]
For optimal performance, data has to be closely located to the
cores which are using it. Modern OSes use "affinity on first touch"
which allocates memory on the same socket where the task is executed.
\end{description}
\item[Which are typical approaches in a Molecular Dynamics application?]
\begin{itemize}
\item Particle Decomposition
\item Force Decomposition
\item Domain Decomposition
\end{itemize}
\end{description}
\newpage
\subsection{Distributed-memory programming with MPI}
\begin{description}[style=nextline]
\item[Basic ideas of MPI] Don't rely on cache coherence.
Use messages to communicate between workers/UEs => SPMD.
\begin{description}[style=nextline]
\item[What is SPMD and how is it implemented by MPI?]
All workers execute the same program.
Program flow is directed by so called "ranks" which are unique per process.
\item[How MPI abstracts the communication?] MPI provides a runtime API with a large set of
functions for point-to-point and collective operations.
It also has abstraction mechanisms for the underlying network topology.
There's a set of MPI types which are portable between architectures / programming languages.
\end{description}
\item[Point-to-point operations]
\begin{description}[style=nextline]
\item[How to transfer data between processes in the form of messages?]
Use \lstinline$MPI_Send()$, \lstinline$MPI_Recv()$ functions
(or their variants) for point-to-point communication.
See API reference above.
\item[How to prevent deadlocks and overlap communication and computation?]
Use non-blocking \lstinline$MPI_Isend()$, \lstinline$MPI_Irecv()$
in conjunction with the MPI request methods \lstinline$MPI_Wait()$
and \lstinline$MPI_Test()$. \\
Alternatively a deadlock-free \lstinline$MPI_Sendrecv()$ is available
which combines sending and receiving in one function call (see the sketch below).
\end{description}
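
The sketch referenced above: overlapping communication and computation with non-blocking calls (neighbour ranks, tags and buffer sizes are illustrative):
\begin{lstlisting}
#include <mpi.h>

/* Exchange halos with two neighbours while computing on the interior. */
void halo_exchange(double *recv_l, double *recv_r,
                   double *send_l, double *send_r,
                   int n, int left, int right, MPI_Comm cm) {
    MPI_Request req[4];

    MPI_Irecv(recv_l, n, MPI_DOUBLE, left,  0, cm, &req[0]);
    MPI_Irecv(recv_r, n, MPI_DOUBLE, right, 1, cm, &req[1]);
    MPI_Isend(send_l, n, MPI_DOUBLE, left,  1, cm, &req[2]);
    MPI_Isend(send_r, n, MPI_DOUBLE, right, 0, cm, &req[3]);

    /* ... compute on interior points here to hide the communication ... */

    MPI_Waitall(4, req, MPI_STATUSES_IGNORE);
}
\end{lstlisting}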
\item[Collective operations]
\begin{description}[style=nextline]
\item[How to scatter and gather data and perform operations on distributed data?] Use the
\lstinline$MPI_Scatter()$ and \lstinline$MPI_Gather()$ functions or their variable-chunk variants
(\lstinline$MPI_Scatterv()$, \lstinline$MPI_Gatherv()$).
\end{description}
\item[Virtual topologies]
\begin{description}[style=nextline]
\item[How to distribute processes over a regular grid?]
Use the \lstinline$MPI_Cart_*()$ functions to create a new MPI communicator.
There are functions to translate MPI ranks to coordinates,
as well as to split grids and get adjacent ranks.
\end{description}
\item[Derived datatypes]
\begin{description}[style=nextline]
\item[How to combine MPI datatypes into more complex entities?] Use the
\lstinline$MPI_Type_contiguous()$ and \lstinline$MPI_Type_vector()$ functions.
Type signature must be the same between processes,
gaps \& alignment are allowed to differ (architectural properties).
\end{description}
\end{description}
\newpage
\subsection{Shared-memory programming with OpenMP}
\begin{description}[style=nextline]
\item[Basic principle of OpenMP] Extend an existing application by
using \texttt{\#pragma} directives.
Worksharing is used to distribute loops and sections across multiple threads
on a shared-memory machine.
\begin{description}[style=nextline]
\item[Execution model] OpenMP programs start with a single master thread.
Worker threads are spawned/joined by the master
in parallel regions => Fork/Join concept.
\item[Parallel region + worksharing constructs] OpenMP handles the thread management work.
\end{description}
\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.
\begin{description}[style=nextline]
\item[Data sharing clauses] See API reference above.
\end{description}
\item[Synchronization]
\begin{description}[style=nextline]
\item[Critical section] Use \lstinline$#pragma omp critical$ directive.
\item[Reduction clause] Use \lstinline$reduction(operation:var-list)$ clause.
\item[Team and Task-Barriers] Use \lstinline$#pragma omp barrier$ and \lstinline$#pragma omp taskwait$ directives.
\end{description}
\item[Runtime library]
\begin{description}[style=nextline]
\item[Important functions] See API reference above.
\end{description}
\end{description}
\newpage
\subsection{Hybrid programming (MPI + OpenMP)}
\begin{description}[style=nextline]
\item[Hybrid programming basics] \hfill
\begin{description}[style=nextline]
\item[Why we need hybrid programs] \hfill
\item[Hybrid programming models] \hfill
\end{description}
\item[Threading modes of MPI] \hfill
\begin{description}[style=nextline]
\item[What levels of thread support MPI provides] \lstinline$MPI_THREAD_SINGLE$, \lstinline$MPI_THREAD_FUNNELED$,
\lstinline$MPI_THREAD_SERIALIZED$ and \lstinline$MPI_THREAD_MULTIPLE$, requested via \lstinline$MPI_Init_thread()$
(see the sketch below).
\item[Potential troubles with multithreaded MPI programs] \hfill
\end{description}
\item[Addressing multiple threads within MPI processes] \hfill
\begin{description}[style=nextline]
\item[How to work around the flat addressing space of MPI] \hfill
\item[Using multiple communicators in a hybrid context] \hfill
\end{description}
\item[Running hybrid programs on the RWTH cluster] \hfill
\begin{description}[style=nextline]
\item[How to properly instruct LSF to run your hybrid job] \hfill
\end{description}
\end{description}
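
A minimal hybrid sketch showing how the thread support level is requested (\lstinline$MPI_THREAD_FUNNELED$, i.e. only the master thread calls MPI, is assumed here):
\begin{lstlisting}
#include <mpi.h>
#include <omp.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int provided, rank;

    /* Request that only the master thread will make MPI calls. */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
    if (provided < MPI_THREAD_FUNNELED)
        MPI_Abort(MPI_COMM_WORLD, 1);   /* runtime offers too little support */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    #pragma omp parallel
    {
        /* OpenMP threads compute; only the master would call MPI here. */
        printf("rank %d, thread %d\n", rank, omp_get_thread_num());
    }

    MPI_Finalize();
    return 0;
}
\end{lstlisting}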
\newpage
\subsection{Heterogeneous architectures (GPUs, Xeon Phis)}
\subsubsection{GPGPU's}
\begin{description}[style=nextline]
\item[What does a GPU look like?]
A GPGPU is a manycore architecture optimized for high data-parallel throughput.
It consists of multiple streaming multiprocessors (SMs), each of which consists of many cores.
\begin{description}[style=nextline]
\item[Why do GPUs deliver a good performance per Watt ratio?] GPGPUs have small caches,
many low frequency cores and little control logic.
Memory is close to the processing unit.
\item[What is the difference to CPUs?] There are more transistors dedicated to computation
instead of controlling. A CPU has large caches with multiple levels,
supports OoO execution, sophisticated control logic like speculative
execution and hw-scheduling of microops.
\item[What does the memory hierarchy look like?] Each SM has separate shared memory,
caches, registers and texture storage.
All SMs have access to a common LLC and the global memory of the GPU.
This global memory is separate from the host memory.
There's no cache coherence for GPU caches!
Communication with the host has to be done by DMA transfers.
\item[How can the logical programming hierarchy be mapped to the execution model?]
The execution model is host-directed.
Portions of the code, so called "kernels" are offloaded to the GPU.
These kernels are executed in a highly parallel fashion by an array of threads.
\end{description}
\item[Which models can be used to program a GPU?] There are special GPU programming APIs
like CUDA and more general directive based models like OpenACC/MP.
\begin{description}[style=nextline]
\item[How to handle offloading of regions?]
The programmer has to manually find regions which can be parallelized.
These "kernels" are compiled to special GPU code and called by the host.
\item[How to handle data management?] Data transfer has to be done explicitly.
\item[What are the main differences?] \hfill
\end{description}
\item[Which impact does the PCIe have?] GPUs usually have very fast memory.
The PCIe bus isn't fast enough to saturate this memory bandwidth => bottleneck.
\item[What is branch divergence?] 32 threads are grouped into "warps".
These warps share a common program counter => branches are serialized inside warps.
\begin{description}[style=nextline]
\item[Which performance impact does it have?]
Branch divergence causes threads that did not take a branch to idle.
It therefore leads to performance degradation =>
branch only with sets of whole warps.
\end{description}
\item[What can be a good launch configuration and why?] \hfill
\begin{description}[style=nextline]
\item[How can the launch configuration be specified?] \hfill
\end{description}
\item[What can be done to saturate the bus?] Maximize PCIe throughput by transferring
less data and batching smaller transfers into larger ones.
\begin{description}[style=nextline]
\item[What is coalescing?] \hfill
\item[How can it be achieved?] \hfill
\item[What impact does AoS and SoA have?] \hfill
\item[What is the difference between caching and non-caching loads?] \hfill
\end{description}
\item[What is shared memory?] CUDA SM/blocks have dedicated shared memory.
This can be used for low-latency inter-thread communication.
\item[How to avoid CPU/ GPU idling?] Use asynchronous data transfers and CUDA streams
to enqueue multiple kernels for execution.
\end{description}
\subsubsection{Xeon Phi}
\begin{description}[style=nextline]
\item[What does a Xeon Phi look like?]
The Xeon Phi consists of 60 cores with 4-fold SMT,
has a clock frequency of 1090~MHz
and a peak performance of about 1 TFlop/s.
\begin{description}[style=nextline]
\item[What does the memory hierarchy look like?]
Each core has its own L1/L2 caches; the cores are connected by a ring bus.
There's a common memory and I/O interface which is also connected to the ring bus
and used to access the global GDDR5 RAM.
Caches are kept coherent. Data transfer to the host has to be done via DMA.
There's no common address space with the host.
\item[How many threads/ vector-widths are available?]
\begin{description}
\item[Threads] 60 cores * 4 threads = 240 threads
\item[Vector-width] AVX-512 has 512 bits per vector
(extension of AVX2, MIC only).
\end{description}
\end{description}
\item[Which programming concepts do exist?] OpenMP/CL/ACC, MPI and POSIX threads.
The Phi is a slightly modified x86 architecture and can be programmed
like a normal x86 system. Intel offers proprietary language offloading extensions (LEO)
for their own C compiler.
\begin{description}[style=nextline]
\item[How can OpenMP (4.0) be used?] The recent OpenMP spec adds new
directives (\texttt{target} and \texttt{teams}) which can be used
to offload parts of the code (see the sketch below).
This concept has been adopted from the OpenACC
standard, which can be considered a testbed for OpenMP.
\end{description}
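
The sketch referenced above: OpenMP~4.0 offloading with \texttt{target} and \texttt{teams} (the mapping clauses and loop are illustrative):
\begin{lstlisting}
/* y = a*x + y offloaded to a target device with OpenMP 4.0 */
void saxpy_target(int n, float a, const float *x, float *y) {
    int i;
    #pragma omp target map(to: x[0:n]) map(tofrom: y[0:n])
    #pragma omp teams distribute parallel for
    for (i = 0; i < n; i++)
        y[i] = a * x[i] + y[i];
}
\end{lstlisting}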
\item[Which optimization strategies should be applied?]
Massive SIMD vectorization, check data alignment (SoA, AoS), avoid aliasing.
\begin{description}[style=nextline]
\item[Which impact can the PCIe have?]
PCIe is a bottleneck for host to Phi data transfer.
Adds latency for communication => see GPGPU.
\item[Which impact does vectorization have?] Huge impact for the Phi.
It has support for FMA-3 with large SIMD vector registers.
\end{description}
\end{description}
\end{document}