\documentclass[11pt]{article}

\usepackage{amsmath,amssymb,latexsym}
\usepackage{fancyhdr}
\usepackage[margin=2.50cm]{geometry}
\usepackage[compact]{titlesec}
\usepackage{hyperref}
\usepackage[T1]{fontenc}
\usepackage{color}
\usepackage{multirow}
\usepackage{array}
\usepackage{paralist}
\usepackage{listings}
\usepackage{enumitem}

\hypersetup{colorlinks=true, linkcolor=blue, urlcolor=cyan}

\lstset{
moredelim=[is][\color{green}]{§}{§},
moredelim=[is][\color{red}]{?}{?},
moredelim=[is][\color{cyan}]{|}{|}
}

% Title info
\title{HPC Formulas}
\author{Steffen Vogel}
\date{\today}

% Headline
\lhead{HPC Formulas}
\chead{Steffen Vogel}
\rhead{\today}

\pagestyle{fancy}
\setlength{\parindent}{0cm}

\begin{document}
\section{Amdahl \& Gustafson}

\begin{tabular}{ l l | c c }
& & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
& & \textit{strong scaling} & \textit{weak scaling} \\
\hline
\textbf{Speedup} & \( S_p(N) = \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \xrightarrow{N \rightarrow \infty} \frac{1}{s} \) & \( Np + s \xrightarrow{N \rightarrow \infty} \infty \) \\
\textbf{Efficiency} & \( \varepsilon_p(N) = \frac{S_p(N)}{N} \) & \( \frac{1}{s (N-1) + 1} \xrightarrow{N \rightarrow \infty} 0 \) & \( \frac{1 - p}{N} + p \xrightarrow{N \rightarrow \infty} p \) \\
\end{tabular}
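
Quick plausibility check with illustrative numbers (not from the lecture): for a serial fraction $s = 0.1$ ($p = 0.9$) and $N = 16$ workers,
\[
S_{\text{Amdahl}} = \frac{1}{0.1 + \frac{0.9}{16}} = 6.4, \qquad
S_{\text{Gustafson}} = 16 \cdot 0.9 + 0.1 = 14.5,
\]
and under Amdahl's assumptions the speedup can never exceed $\frac{1}{s} = 10$, however many workers are added.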

\section{Moore's Law}

\begin{quote}
\# of transistors / cost-effective IC doubles every $N$ months, with $12 \leq N \leq 24$
\end{quote}

\section{Energy Efficiency}

\begin{tabular}{ p{7cm} l }
Power-voltage-frequency & \( P \sim V^2 \cdot f \) or \( P \sim f^3 \) \\
& \\
Energy efficiency & = \( \frac{\text{energy}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption} \cdot \text{time}}{\text{Flop}} \) \\
& = \( \frac{\text{power consumption}}{\text{Flop}/s} \) \\
Energy & = \( \text{power consumption} \cdot \text{time} \) \\
& \\
Physical limit for efficiency & \( 2.87 \cdot 10^{-21} \) Joule/Bit \( \approx \ln(2) \cdot k \cdot T \) \\
& \( 1.84 \cdot 10^{-19} \) Joule/Flop \\
Green500 \#1 & \( 2.22 \cdot 10^{-10} \) Joule/Flop \\
& \\
Power wall & expected to be around 20 to 25\,MW \\
\end{tabular}
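
To put the power wall into perspective (illustrative calculation): an exascale system running at the 20\,MW limit would need an efficiency of
\[
\frac{2 \cdot 10^{7}\,\text{W}}{10^{18}\,\text{Flop/s}} = 2 \cdot 10^{-11}\,\text{Joule/Flop},
\]
i.e.\ roughly an order of magnitude better than the Green500 leader quoted above.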

\section{Pipelining}

\begin{tabular}{ p{7cm} l }
Stages & $m$ \\
Operations & $N$ \\
Without pipeline & \( T_{seq} = m \cdot N \) \\
With pipeline & \( T_{pipe} = N + m - 1 \) \\
Speedup & \( S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
\end{tabular}
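
Worked example (illustrative numbers): a 5-stage pipeline processing $N = 100$ operations needs $T_{pipe} = 100 + 5 - 1 = 104$ cycles instead of $T_{seq} = 500$, i.e.\ a speedup of
\[
S_{pipe} = \frac{5}{1 + \frac{4}{100}} \approx 4.8,
\]
close to the ideal value $m = 5$.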

\section{Memory Access}

\begin{tabular}{ p{7cm} l }
Hit rate & $\beta$ \\
Miss rate & $(1 - \beta)$ \\
Access time & $T_m$ (Memory), $T_c$ (Cache) \\
Average access time & \( T_{avg} = \beta T_c + (1 - \beta) T_m \) \\
Performance gain & \( G( \theta, \beta ) = \frac{T_m}{T_{avg}} = \frac{\theta}{\beta + \theta(1 - \beta)}, \theta = \frac{T_m}{T_c} \) \\
\end{tabular}
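
Worked example (illustrative numbers): with a cache ten times faster than main memory ($\theta = 10$) and a hit rate of $\beta = 0.9$,
\[
T_{avg} = 0.9\,T_c + 0.1 \cdot 10\,T_c = 1.9\,T_c, \qquad
G = \frac{10}{0.9 + 10 \cdot 0.1} \approx 5.3,
\]
so memory accesses are on average about $5.3\times$ faster than without the cache.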

\section{Networks}

\subsection{Transmission \& Bandwidth}

\begin{tabular}{ p{7cm} l }
Transmission time & \( T = T_L + \frac{N}{B} \) \\
Effective bandwidth & \( B_{eff} = \frac{N}{T} = \frac{N}{T_L + \frac{N}{B}} \) \\
\end{tabular}
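
Worked example (illustrative numbers): with a latency of $T_L = 1\,\mu s$ and a raw bandwidth of $B = 10$\,GB/s, a $10$\,KB message takes $T = 1\,\mu s + 1\,\mu s = 2\,\mu s$, so $B_{eff} = \frac{10\,\text{KB}}{2\,\mu s} = 5$\,GB/s. Only for $N \gg B \cdot T_L$ does $B_{eff}$ approach $B$.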

\subsection{Topologies}

\begin{tabular}{l | c c c c }
\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
\hline
Bus & $1$ & $1$ & $1$ & $B$ \\
Ring & $2$ & $2$ & $\lfloor \frac{N}{2} \rfloor$ & $2B$ \\
Fully connected & $N-1$ & $N-1$ & $1$ & $B \frac{N^2}{4}$ \\
Sw. Fat Tree & 1 w/o redundancy & depends on design & $2 \cdot$ hierarchy height & depends on design \\
Mesh & $2d$ & $d$ & $\sum_{i=1}^d (N_i - 1)$ & $B ( \prod_{i=1}^{d-1} N_i )$ \\
Torus & $2d$ & $2d$ & $\sum_{i=1}^d \lfloor \frac{N_i}{2} \rfloor$ & $2B ( \prod_{i=1}^{d-1} N_i )$ \\
Hypercube & $d$ & $d$ & $d$ & $B 2^{d-1}$ \\
\end{tabular}

\section{Balance, Lightspeed}

\begin{tabular}{ p{7cm} l }
Code balance & \( B_c = \frac{\text{data transfers}}{\text{arithmetic ops}} = \frac{[\text{Words}]}{[\text{Flops}]} \) \\
Machine balance & \( B_m = \frac{b_s}{P_{max}} \) \\
Relative lightspeed & \( l = \min(1, \frac{B_m}{B_c} ) \) \\
Absolute lightspeed performance & \( P = P_{max} \cdot l \) \\
\end{tabular}
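
Illustrative example (assumed numbers): a DAXPY-like loop $y_i = a \cdot x_i + y_i$ performs 2 Flops per iteration and transfers 3 words (load $x_i$, load $y_i$, store $y_i$), so $B_c = 1.5$ W/F. On a machine with $B_m = 0.1$ W/F the relative lightspeed is $l = \min(1, 0.1/1.5) \approx 0.07$, i.e.\ the loop is memory-bound and cannot exceed about $7\,\%$ of $P_{max}$.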

\section{Programming APIs}

Argument directions: \lstinline$§input§$, \lstinline$?output?$, \lstinline$|inout|$ \\
Common abbreviations: \lstinline$cm$ = communicator

\subsection{MPI}

All MPI routines return an integer error code!

\subsubsection{Initialization}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Init(§argc§, §argv§)$ & Must be called before any other MPI\_* function. \\
\lstinline$MPI_Finalize()$ & Like \lstinline$MPI_Init()$ this must only be called once. \\
\lstinline$MPI_Initialized(?flag?)$ & Check if the MPI runtime was already initialized. \\
\lstinline$MPI_Finalized(?flag?)$ & Check if the MPI runtime was already shut down. \\
\end{tabular}
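
A minimal SPMD skeleton combining the calls above with the communicator queries listed further below (a sketch, not from the lecture slides):

\begin{lstlisting}
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv) {
    int rank, size;
    MPI_Init(&argc, &argv);                 /* before any other MPI call */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* id of this process */
    MPI_Comm_size(MPI_COMM_WORLD, &size);   /* number of processes */
    printf("Hello from rank %d of %d\n", rank, size);
    MPI_Finalize();                         /* exactly once, at the end */
    return 0;
}
\end{lstlisting}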

\subsubsection{Point-to-point data transfer}

\begin{tabular}{ l | l l l }
\textbf{Mode / Op} & \textbf{Blocking} & \textbf{Non-blocking} & \textbf{Completes after...} \\
\hline
Standard & \lstinline$MPI_Send$($\otimes$) & \lstinline$MPI_Isend$($\otimes, \ddag$) & the message has been safely stored away. \\
Synchronous & \lstinline$MPI_Ssend$($\otimes$) & \lstinline$MPI_Issend$($\otimes, \ddag$) & the matching receive was called. \\
Buffered & \lstinline$MPI_Bsend$($\otimes$) & \lstinline$MPI_Ibsend$($\otimes, \ddag$) & the message was stored away in a buffer. \\
Ready-mode & \lstinline$MPI_Rsend$($\otimes$) & \lstinline$MPI_Irsend$($\otimes, \ddag$) & only correct if the receive was called before the send. \\
\hline
Standard & \lstinline$MPI_Recv$($\diamond, \dag$) & \lstinline$MPI_Irecv$($\diamond, \ddag$) & a message was received. \\
Probing & \lstinline$MPI_Probe$() & \lstinline$MPI_Iprobe$() & a matching message is available (\lstinline$MPI_Iprobe$: immediately). \\
Waiting & \lstinline$MPI_Wait$($\ddag, \dag$) & \lstinline$MPI_Test$($\ddag, $\lstinline$?flag?$$, \dag$) & the request has completed (\lstinline$MPI_Test$: immediately). \\
\end{tabular}

\textbf{Signatures}: \\
{ \footnotesize
\hspace{2cm} $\otimes$ \lstinline$(§buffer§, §count§, §type§, §dst§, §tag§, §cm§)$, \\
\hspace{2cm} $\diamond$ \lstinline$(?buffer?, §count§, §type§, §src§, §tag§, §cm§)$, \\
\hspace{2cm} $\dag$ \lstinline$(?status?)$ $\ddag$ \lstinline$(?request?)$
}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Get_count(§stat§, §type§, ?count?)$ & Get the number of elements received by \lstinline$MPI_Recv()$. \\

\lstinline$MPI_Sendrecv(§sbuf§, §scnt§, §stype§, §dst§, §stag§,$ & Simultaneous, deadlock-free send and receive. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §src§, §rtag§, §cm§, ?stat?)$ & \\

\lstinline$MPI_Sendrecv_replace(...)$ & Like \lstinline$MPI_Sendrecv()$ but using the same buffer. \\
\end{tabular}
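
A minimal blocking send/receive between two ranks (a sketch; assumes the skeleton above and at least 2 processes):

\begin{lstlisting}
double buf[100];
MPI_Status status;

if (rank == 0) {
    /* standard-mode blocking send to rank 1, tag 42 */
    MPI_Send(buf, 100, MPI_DOUBLE, 1, 42, MPI_COMM_WORLD);
} else if (rank == 1) {
    /* blocking receive from rank 0; element count retrieved via MPI_Get_count */
    int count;
    MPI_Recv(buf, 100, MPI_DOUBLE, 0, 42, MPI_COMM_WORLD, &status);
    MPI_Get_count(&status, MPI_DOUBLE, &count);
}
\end{lstlisting}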

\subsubsection{Collective data transfer}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Bcast(|buf|, §cnt§, §type§, §root§, §cm§)$ & Broadcasts data from root to all other processes. \\
\lstinline$MPI_Scatter(§sbuf§, §scnt§, §stype§,$ & Scatters chunks of data from root to all processes. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Gather(§sbuf§, §scnt§, §stype§,$ & Gathers chunks of data from all processes at root. \\
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Allgather(...)$ & Like \lstinline$MPI_Gather()$, but all processes receive the result. \\
\lstinline$MPI_Alltoall(...)$ & Every process sends a distinct chunk to every other process. \\
\lstinline$MPI_Reduce(...)$ & Combines data from all processes with an operation (e.g. \lstinline$MPI_SUM$) at root. \\
\lstinline$MPI_Allreduce(...)$ & Like \lstinline$MPI_Reduce()$, but the result is available on all processes. \\
\end{tabular}
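
Short collective example (sketch; \lstinline$partial_sum()$ is a hypothetical placeholder for per-rank work): broadcast a problem size and reduce partial sums to rank 0.

\begin{lstlisting}
int n = 0;
if (rank == 0)
    n = 1000;                     /* only root knows the value initially */
MPI_Bcast(&n, 1, MPI_INT, 0, MPI_COMM_WORLD);

double local  = partial_sum(n, rank, size);  /* hypothetical helper */
double global = 0.0;
MPI_Reduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
\end{lstlisting}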

\subsubsection{Synchronisation}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Barrier(§cm§)$ & Waits for all processes. \\
\end{tabular}

\subsubsection{Communicator handling}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Comm_dup(...)$ & Duplicates a communicator. \\
\lstinline$MPI_Comm_split(...)$ & Splits a communicator into disjoint sub-communicators (by color/key). \\
\lstinline$MPI_Comm_compare(...)$ & Compares two communicators. \\
\lstinline$MPI_Comm_free(...)$ & Frees a communicator. \\
\lstinline$MPI_Comm_rank(§cm§, ?rank?)$ & Get rank/id of the current process in this communicator. \\
\lstinline$MPI_Comm_size(§cm§, ?size?)$ & Get total number of processes in this communicator. \\
\end{tabular}

\subsubsection{Virtual topologies}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Cart_create(...)$ & Creates a new Cartesian communicator from a given one. \\
\lstinline$MPI_Cart_rank(...)$ & Translates Cartesian coordinates into a rank. \\
\lstinline$MPI_Cart_shift(...)$ & Returns source/destination ranks for a shift along one dimension. \\
\lstinline$MPI_Cart_sub(...)$ & Creates lower-dimensional sub-grids. \\
\lstinline$MPI_Cart_coords(...)$ & Translates a rank into Cartesian coordinates. \\
\lstinline$MPI_Dims_create(...)$ & Suggests a balanced number of processes per dimension. \\
\end{tabular}

\subsubsection{Derived Datatypes}

\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Type_contiguous(...)$ & Builds a type from contiguous copies of an existing type. \\
\lstinline$MPI_Type_vector(...)$ & Builds a strided type (blocks of elements with a constant stride). \\
\lstinline$MPI_Type_commit(...)$ & Commits a derived type before it can be used in communication. \\
\lstinline$MPI_Type_free(...)$ & Frees a derived type. \\
\end{tabular}
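
Example (sketch, assuming a row-major $n \times n$ array \lstinline$A$): sending one matrix column with a strided vector type.

\begin{lstlisting}
MPI_Datatype col;
/* n blocks of 1 double each, stride of n elements = one column of A */
MPI_Type_vector(n, 1, n, MPI_DOUBLE, &col);
MPI_Type_commit(&col);
MPI_Send(&A[0][2], 1, col, 1, 0, MPI_COMM_WORLD);  /* send the 3rd column to rank 1 */
MPI_Type_free(&col);
\end{lstlisting}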

\subsubsection{Timekeeping}

\begin{tabular}{ p{8cm} l }
\lstinline$double MPI_Wtime()$ & Measure local wall-clock time. \\
\lstinline$double MPI_Wtick()$ & Obtain the resolution of the MPI timer. \\
\end{tabular}

\subsection{CUDA}

\subsubsection{General Memory Operations}

\begin{tabular}{ p{7cm} l }
\lstinline$cudaMalloc(&p, n*sizeof(float));$ & Allocate memory for n floats on the GPU, save pointer in p \\
\lstinline$cudaFree(p);$ & Free GPU memory that p points to \\
\lstinline$cudaMallocHost(&p, n*sizeof(float));$ & Allocate pinned memory for n floats on the host, save pointer in p \\
\lstinline$cudaFreeHost(p);$ & Free pinned host memory that p points to \\
\lstinline$cudaMemcpy(d, h, n * sizeof(float), DIR);$ & memcopy Host => GPU, if DIR = cudaMemcpyHostToDevice \\
\lstinline$cudaMemcpy(h, d, n * sizeof(float), DIR);$ & GPU => Host, if DIR = cudaMemcpyDeviceToHost \\
\lstinline$func<<<blockNum, threadNum>>>();$ & Call CUDA kernel \lstinline$func$ with the given \# of blocks / threads per block \\
\end{tabular}
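
A minimal kernel plus host-side driver combining the calls above (a sketch with error checking omitted; \lstinline$hx$ and \lstinline$hy$ are assumed host arrays of length n):

\begin{lstlisting}
__global__ void saxpy(int n, float a, const float *x, float *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   /* global thread index */
    if (i < n)
        y[i] = a * x[i] + y[i];
}

/* host code */
float *dx, *dy;
cudaMalloc(&dx, n * sizeof(float));
cudaMalloc(&dy, n * sizeof(float));
cudaMemcpy(dx, hx, n * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(dy, hy, n * sizeof(float), cudaMemcpyHostToDevice);

saxpy<<<(n + 255) / 256, 256>>>(n, 2.0f, dx, dy);    /* 256 threads per block */

cudaMemcpy(hy, dy, n * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dx);
cudaFree(dy);
\end{lstlisting}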

\subsubsection{Shared Memory}

\begin{tabular}{ p{7cm} l }
\lstinline$__shared__ double s[X][Y];$ & Statically allocated shared memory within a thread block \\
\lstinline$extern __shared__ double s[];$ & Dynamically sized shared memory (size given at kernel launch) \\
\lstinline$__syncthreads();$ & Synchronize the threads of a block after shared memory usage \\
\end{tabular}
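
Small example (sketch): reversing a block-sized tile via static shared memory; launched e.g.\ as \lstinline$reverse<<<1, 256>>>(d);$.

\begin{lstlisting}
#define TILE 256

__global__ void reverse(float *d) {
    __shared__ float s[TILE];     /* one tile per thread block */
    int t = threadIdx.x;
    s[t] = d[t];
    __syncthreads();              /* all writes visible before reading */
    d[t] = s[TILE - 1 - t];
}
\end{lstlisting}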

\subsubsection{Asynchronous operations}

\begin{tabular}{ p{7cm} l }
\lstinline$cudaStream_t st1, st2;$ & Defines streams. Instructions of one stream are executed in order. \\
\lstinline$cudaStreamCreate(&st1);$ & Initialize a stream \\
\lstinline$cudaMemcpyAsync(dst, src, size, dir, st1);$ & Async memcpy in stream st1 (non-blocking) \\
\lstinline$cudaStreamSynchronize(st1);$ & Host waits for the stream \\
\lstinline$cudaDeviceSynchronize();$ & Host waits for everything \\
\end{tabular}
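
Sketch of overlapping transfers and kernels in two streams (assumes pinned host buffers \lstinline$h_a$/\lstinline$h_b$, device buffers \lstinline$d_a$/\lstinline$d_b$ and a kernel \lstinline$work$):

\begin{lstlisting}
cudaStream_t s1, s2;
cudaStreamCreate(&s1);
cudaStreamCreate(&s2);

/* copies and kernels issued to different streams may overlap */
cudaMemcpyAsync(d_a, h_a, bytes, cudaMemcpyHostToDevice, s1);
cudaMemcpyAsync(d_b, h_b, bytes, cudaMemcpyHostToDevice, s2);
work<<<blocks, threads, 0, s1>>>(d_a);
work<<<blocks, threads, 0, s2>>>(d_b);

cudaStreamSynchronize(s1);
cudaStreamSynchronize(s2);
\end{lstlisting}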

\subsection{OpenMP}

\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$#pragma omp parallel$ & Forms a team of threads and starts parallel execution. \\
\lstinline$#pragma omp for$ & Splits the iterations of the following for loop across threads. \\
\lstinline$#pragma omp sections$ & Non-iterative worksharing of different sections. \\
\lstinline$#pragma omp section$ & One block within a \lstinline$sections$ construct. \\
\lstinline$#pragma omp single$ & Region shall only be executed by a single thread. \\
\lstinline$#pragma omp critical$ & Region is executed by all threads, but only by one thread at a time. \\
\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current team are guaranteed to complete at barrier exit. \\
\lstinline$#pragma omp taskwait$ & Encountering task suspends until all (direct) child tasks are complete. \\
\lstinline$#pragma omp task$ & Defines an explicit task. \\
\lstinline$#pragma omp target [data]$ & Offloads execution (and/or maps data) to a target device. \\
\lstinline$#pragma omp distribute$ & Distributes loop iterations across teams. \\
\lstinline$#pragma omp teams$ & Creates a league of thread teams (used together with \lstinline$target$). \\
\end{tabular}
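
A typical worksharing example using the directives above (sketch; the \lstinline$reduction$ clause is listed in the clauses table below):

\begin{lstlisting}
#include <omp.h>

double dot = 0.0;
#pragma omp parallel for reduction(+:dot)
for (int i = 0; i < n; i++) {
    y[i] = a * x[i] + y[i];       /* iterations are split across the team */
    dot += x[i] * y[i];           /* per-thread partial sums, combined at the end */
}
\end{lstlisting}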

\subsubsection{Clauses}

\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$num_threads(cnt)$ & Number of threads for a parallel region. \\
\lstinline$default(shared|none)$ & Set the default scope. \\
\lstinline$[first|last]private(list)$ & Thread-private scope (optionally initialized from / copied back to the original variable). \\
\lstinline$shared(list)$ & Shared scope among the thread team. \\
\lstinline$proc_bind(master|close|spread)$ & Bind threads to OpenMP places. \\
\lstinline$nowait, untied$ & Skip the implicit barrier / allow a task to resume on another thread. \\
\lstinline$reduction(op:list)$ & Do a reduction after the parallel block. \\
\lstinline$schedule(kind,chunk_size)$ & Loop scheduling: static, dynamic, guided, auto or runtime. \\
\lstinline$if(condition)$ & Only fork/enter the thread/parallel region if \texttt{condition} is true. \\
\end{tabular}
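
Clause usage sketch (illustrative; \lstinline$work()$ is a hypothetical per-iteration function): dynamic scheduling for imbalanced iterations plus a per-thread firstprivate copy.

\begin{lstlisting}
int offset = 10;                   /* copied into each thread by firstprivate */
#pragma omp parallel for schedule(dynamic, 4) firstprivate(offset) num_threads(8)
for (int i = 0; i < n; i++)
    work(i + offset);              /* hypothetical function */
\end{lstlisting}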

\subsubsection{Environment variables}

\begin{tabular}{ p{7cm} l }
\lstinline$OMP_NUM_THREADS$ & Default number of threads. \\
\lstinline$OMP_PLACES$ & Defines the places used for thread pinning (e.g. cores, sockets). \\
\lstinline$KMP_AFFINITY$ & Intel-specific thread affinity/pinning control. \\
\end{tabular}

\subsubsection{Runtime API}

\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$omp_{set,get}_num_threads(n)$ & Get/set the number of threads used for the encountering region. \\
\lstinline$omp_get_thread_num()$ & Returns the id of the calling thread (cf. \lstinline$MPI_Comm_rank()$). \\
\end{tabular}

\subsection{OpenACC}

\begin{tabular}{ p{7cm} p{9cm} }
\lstinline$#pragma acc parallel [loop, d-clauses]$ & Starts explicitly parallel execution on the accelerator. \\
\lstinline$#pragma acc kernels [loop, d-clauses]$ & Lets the compiler identify and offload parallelizable kernels. \\
\lstinline$#pragma acc loop [gang, reduction(op:list)]$ & Describes how loop iterations are distributed (gang/worker/vector). \\
\lstinline$#pragma acc wait$ & Waits for asynchronous operations to complete. \\
\lstinline$#pragma acc data [d-clauses]$ & Defines a structured data region on the device. \\
\lstinline$#pragma acc update {host,device}(data)$ & Copies data between host and device within a data region. \\
\lstinline$#pragma acc {enter,exit} data [d-clauses]$ & Begins/ends an unstructured data lifetime on the device. \\
\lstinline$#pragma acc routine (name)$ & Compiles a routine for the device so it can be called from offloaded code. \\
\end{tabular}
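
Offloading sketch with data clauses (illustrative, not from the slides):

\begin{lstlisting}
/* copyin: host to device only; copy: to the device and back */
#pragma acc parallel loop copyin(x[0:n]) copy(y[0:n])
for (int i = 0; i < n; i++)
    y[i] = a * x[i] + y[i];
\end{lstlisting}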

\section{LSF}

See \texttt{man bsub}.

\section{Common problems and operations}

\begin{tabular}{ p{7cm} l }
Single-precision real Alpha X Plus Y & \( \vec{y} = \alpha \cdot \vec{x} + \vec{y} \) \\
(S/DAXPY, FMA, Triad) & \\
Sparse matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix vector multiply & \( \vec{y} = \mathbf{A} \cdot \vec{x} \) \\
Dense matrix multiplication & \( \mathbf{A} = \mathbf{B} \cdot \mathbf{C} \) \\
Dense matrix diagonalization & \( \mathbf{P}^{-1} \mathbf{A} \mathbf{P} = \mathrm{diag}(\lambda_1, ..., \lambda_n) \) \\
Scalar product & \( s = \vec{x} \cdot \vec{y} \) \\
Vector addition & \( \vec{y} = \vec{x} + \vec{z} \) \\
Matrix addition & \( \mathbf{A} = \mathbf{B} + \mathbf{C} \) \\
Matrix transpose & \( \mathbf{A} = \mathbf{B}' \) \\
\end{tabular}

\newpage
\section{What you've learned}

\subsection{Why supercomputers?}

\begin{description}[style=nextline]
\item[What is a supercomputer?] A supercomputer is a computer
at the frontline of current processing capacity, particularly speed of calculation.
\item[What are supercomputers used for?]
\begin{itemize}
\item Scientific computing \& simulation
\item Weapon research
\item Economics (oil, minerals)
\item Medicine
\item Meteorology
\end{itemize}
\item[What are recent supercomputers?] See the TOP500 list of the world's fastest systems:
\begin{enumerate}
\item Tianhe-2 (33 PFlop/s)
\item Titan (17 PFlop/s)
\end{enumerate}
\item[What can we read from the performance development as measured in the TOP500?]
The TOP500 ranks systems by their maximal achieved performance $R_{max}$
in Flop/s (double precision), measured by a standardized benchmark (LINPACK). \\
Future trends are predictable (exascale computing, architectures). \\
At the moment, it takes approximately 11 years to increase performance by 3
orders of magnitude (ExaFlop/s projected for 2019).

\item[What does Moore's Law tell you? Is it still valid?]
"The number of transistors per chip doubles around every one to two years." \\
The slope is now declining slowly, but it is mostly still valid.

\item[Why do we have multi-core architectures today?]
Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: decrease the frequency and increase the die size by putting more cores on a single chip, then exploit parallelism.

\end{description}

\newpage
\subsection{Energy Efficiency}

\begin{description}[style=nextline]
\item[What is the power wall?] 20 to 25 megawatts of power consumption is
considered an important barrier for HPC centres. \\
The power wall gives us an upper limit on power consumption.

\item[What is the problem?]
To meet the exascale goal in 2019, the current trend of energy efficiency isn't sufficient.
Power consumption will become the dominating cost factor for
operating clusters in the future, with energy costs surpassing investment costs.

\begin{description}[style=nextline]
\item[Aspects of energy efficiency] Energy efficiency is continuously
improving. To meet the exascale goal, more focus has to be put on it, e.g. by using
new architectures (accelerators).
At the moment, we have a performance increase of 2 orders of magnitude
every 7 years, while energy efficiency ($\frac{\text{power consumption}}{\text{Flop/s}}$)
decreases (improves) by only 1 order of magnitude in the same time.
This means power consumption is increasing by 1 order of magnitude every 7 years.

\item[Physical descriptions \& limits]
Landauer's principle: "Changing one bit of information needs at
least an amount of energy of $\ln(2) \cdot k \cdot T$". \\
The current top Green500 systems are about 10 orders of magnitude away from this limit.
\end{description}

\item[How can we benchmark energy efficiency?] Measure only the consumption of a
single rack (submodule) and extrapolate. Average over different benchmarks.

\begin{description}[style=nextline]
\item[Advantages \& disadvantages of Green500, SPEC Power]
There are high variations in power draw, both in idle and during benchmarks, and between nodes.
Power extrapolation may not be allowed.
The Green500 assumes that the computational workload during the Linpack
benchmark is well-balanced across all units and that all units are
identical and have the same power consumption for the same workload.
Those assumptions are however very dangerous and often wrong. Idle and
Linpack power consumption can vary from node to node as well as over time.
\end{description}

\item[What are the architectural aspects of energy efficiency?]
Current von Neumann architectures weren't designed with energy
efficiency in mind:
OoO execution, cache coherency and prefetchers are inefficient. \\
Desktop/mobile power saving methods are applied to HPC (C- and
P-states). C0 is the operating state; everything above is an idle state.
With increasing C-state numbers, more and more parts are shut down to conserve
energy (at the cost of an increased wakeup time). P-states are all
operational states, but they switch between supported frequencies \&
voltages to save energy in case maximal performance is not needed
$\Longrightarrow$ DVFS. \\
There are many parameters that affect energy efficiency: number of
cores, clock frequency, prefetcher, SMT, branch prediction, GPU clock
frequency, cache size, speed of interconnects. It is important to find
sweet spots for those parameters, not only for whole applications but
also for each application phase.

\begin{description}[style=nextline]
\item[What is dynamic frequency and voltage scaling?]
Reduce voltage/frequency during idle times to save energy.
Implemented by processor P-states.
\end{description}

\item[What are the future plans to climb the power wall?]
\begin{itemize}
\item Optimize current architectures
\item Power-efficient accelerators
\item Reduce idle power to $\approx 0$\,W
\item Load balancing by DVS/DFS
\end{itemize}

\begin{description}[style=nextline]
\item[What about GPUs?] GPUs are a lot more efficient because more
transistors are used for computation instead of controlling/management tasks.
They already use faster GDDR5 RAM. But PCIe will become a bottleneck when CPUs get stacked RAM.

\item[What about stacked memory?] It is a must-have to reach the exascale goal.
Shorter paths allow higher bandwidths and lower voltages (maintainability? extend
the caching hierarchy by SSDs?).
\end{description}
\end{description}

\newpage
\subsection{Modern Processors}

\begin{description}[style=nextline]
\item[What is an ISA?] The Instruction Set Architecture is the part of an
architecture that is visible to the application programmer / compiler.
It contains e.g. opcodes, operands, registers, addressing modes\dots

\begin{description}[style=nextline]
\item[What are the differences between ISAs?]
\begin{itemize}
\item Instructions, CISC or RISC
\item Memory addressing modes (effective or relative)
\item Type of internal storage, e.g. stack, accumulator or
general-purpose registers (dominant today)
\item Endianness
\item Type and size of operands
\end{itemize}

\item[What are the main differences between RISC and CISC?]
\begin{description}
\item[CISC] Instructions may involve complex operations and are variable in length.
\item[RISC] Similar encoding for all instructions, fixed length.
Strict distinction between data loading/storing and manipulation.
\end{description}

\item[What do we have today?] The x86 architectures dominant in HPC are CISC
based (but instructions get translated to internal RISC-like micro-instructions).
\end{description}

\item[What is the basis of today's processors?]
They mostly decode CISC instructions into microcode ops (hybrid CISC/RISC).
Modern processors are still based on the von Neumann architecture,
albeit they usually consist of two or more cores.
In addition, ALU, control unit, memory controller etc. are all
integrated on the die.

\begin{description}[style=nextline]
\item[What are the corresponding bottlenecks?]
The main bottlenecks today are memory access speed and the sequential
architecture (SISD).
\item[How can we overcome these?]
Pipelining, ILP (superscalarity), SIMD (Single Instruction
Multiple Data), prefetching.
\end{description}

\item[What is ILP?]
Instruction Level Parallelism is the potential overlap of instruction
execution caused by superscalarity, OoO execution and pipelines.

\begin{description}[style=nextline]
\item[What is its influence on today's computers? Over time?]
Today's CPUs use hardware-implemented schedulers and dependency
resolvers for microcode operations $\Longrightarrow$ very complex
control logic.
\end{description}

\item[Pipelining]
\begin{description}[style=nextline]
\item[How does it work?]
Pipelining splits the execution of an instruction into multiple,
preferably equally long, 'stages' which are orthogonal to each other. \\
This allows reusing the execution unit of one stage for the next
instruction before the first instruction has retired.

\item[How to compute possible speedup/throughput?] See formulas.

\item[Problems?]
\begin{itemize}
\item Pipeline stalls caused by stages of different lengths
\item Hazards caused by dependent data, instructions, branch misses...
\item Imbalanced CISC instruction decoding/fetching. Instructions
are not of equal length and complexity.
\item Aliasing...
\end{itemize}
\end{description}

\item[Superscalarity]
"Execute multiple instructions per cycle."

\begin{description}[style=nextline]
\item[What is a superscalar processor?]
A superscalar processor consists of multiple FP pipelines,
instruction fetchers and decoders.
It can thereby execute multiple instructions per cycle, which allows
a higher pipeline utilization and SMT. \\
This is a kind of ILP.
\end{description}

\item[SIMD] Single-Instruction-Multiple-Data
\begin{description}[style=nextline]
\item[How does it work?]
Execute a single instruction which initiates multiple integer or floating
point operations on wide "registers"/"vectors".

\item[What are the recent vector register widths?]
SSE has 128 bits, AVX has 256 bits, the Xeon Phi even 512 bits.

\item[What to think about if code vectorization is desired?]
\begin{itemize}
\item Check for automatic loop unrolling by compilers
\item Profiling is recommended to eliminate memory bottlenecks (alignment!)
\item Avoid high-level C++ abstractions which could prevent optimization!
\end{itemize}
\end{description}

\item[Why were caches introduced?]
To close the increasing gap between memory bandwidth and processor
performance by adding several layers of caches with increasing bandwidth.

\begin{description}[style=nextline]
\item[What is a cache?]
It's a high-speed memory with low capacity which is usually organized
in multiple layers. Commonly the caches are integrated into the CPU
die for high bandwidth.

\item[What performance characteristics does it have?]
High speed: usually allows multiple transfers per cycle.

\item[When does it make sense to use a cache?]
If computation performance is limited by memory bandwidth
("memory-bound problems").

\item[What are cache hits/misses?]
Cache hits occur when requested items are currently held in
the cache. Otherwise a cache miss happens.

\item[What is the principle of locality?]
An item that was referenced will soon be referenced again (temporal).
Items that are close to a recently referenced item are likely to
be referenced in the near future (spatial).
\end{description}

\item[Which cache types do exist \& what are their differences?]
\begin{itemize}
\item Data cache => stores data only
\item Instruction cache => stores instructions only
\item Unified cache => stores both
\item Trace/Micro-OP cache => stores decoded instructions
\end{itemize}

\item[Why is cache coherence important?]
Memory contents have to be kept consistent across multiple cores.

\begin{description}[style=nextline]
\item[How is the coherence achieved?]
Two main strategies: directory based (the status of physical memory is
kept in one place only) and snooping (caches are connected via a
broadcast medium and monitor that medium).
By using cache invalidation protocols like MESI. \\
Write-back/write-through strategies:
Write-through: information is written to the cache line as well as
to the corresponding location in main memory.
Write-back: information is only written to the cache and only
transferred to main memory if the cache line is evicted.
\end{description}

\item[When is prefetching used?]
Prefetching is used to continuously fetch data from main memory
$\Longrightarrow$ keep the bus busy.
Cache misses usually trigger prefetches of the neighboring cache line.

\end{description}

\newpage
\subsection{Basic optimization techniques for serial code}

\begin{description}[style=nextline]
\item[What is a hot spot?] A part of the code where a dominant fraction of the execution time is spent.

\item[What kind of trigger mechanisms do exist?] Event-driven and sample-driven.

\begin{description}[style=nextline]
\item[What is the difference between event-driven and sample-driven triggers?] \textbf{Event}-driven
triggers count every occurrence accurately by using instrumentation or PMCs => potentially high overhead. \\
\textbf{Sample}-driven triggers save the state/PC at periodic times => constant overhead.

\item[What is instrumentation?] Instrumentation is used to trigger events during
runtime by putting additional instructions into an existing program.
This may be done by libraries, emulators, the compiler or the programmer.
\end{description}

\item[What kind of recording mechanisms do exist?] Profiling and tracing.

\begin{description}[style=nextline]
\item[What is the difference between profiling and tracing?] Profiling summarizes information about
the runtime spent per routine/line/block. Tracing collects call paths
with timing information and is therefore much more detailed.
\end{description}

\item[What can hardware performance counters measure?] PMCs are MSRs/hardware counters that measure processor
events like cache/branch misses, loads/stores and much more.

\item[What are basic single-core optimizations?]
\begin{itemize}
\item Do less work
\item Avoid expensive operations
\item Shrink the working set
\item Eliminate common subexpressions
\item Avoid branches
\item Use SIMD instruction sets
\item Support the compiler; use flags wisely
\item Avoid C++ pitfalls
\end{itemize}

\begin{description}[style=nextline]
\item[What are simple loop optimizations?] Break out of the loop as soon as possible.
Jam neighbouring loops.

\item[How to avoid expensive operations?] Use precalculated tables for fast lookup of results.

\item[What to consider with data types?] Alignment is important. Take care when using SIMD.

\item[How can temporary variables increase performance?] They can be kept in registers and
therefore reduce memory transactions.

\item[Can vectorization help for memory-bound programs?] No.

\item[What is the role of the compiler?] Register usage, inlining, general optimizations.

\item[What are problems with C++ codes?] Avoid frequent (de)allocation,
use static variables whenever possible, avoid using too much abstraction.

\end{description}
\end{description}

\newpage
\subsection{Data access optimization}

\begin{description}[style=nextline]
\item[What can be modelled with the balance metric?] The ratio of memory bandwidth [Words/s] to peak
performance [Flops/s].

\begin{description}[style=nextline]
\item[How are lightspeed, machine and code balance defined?] See formulas.

\item[How can we get values for lightspeed, machine and code balance?] Count the \# of
loads/stores and arithmetic operations per loop iteration.

\item[What are the limitations of this model?] It assumes peak performance (saturated pipelines,
superscalarity), ignores latency and assumes that data transfer and calculation overlap perfectly.
\end{description}

\item[How can algorithms be classified depending on the number of arithmetic operations and data transfers?] By
the scaling behaviour of memory transactions and arithmetic operations vs. problem size: $O(N), O(N^2), ...$.

\begin{description}[style=nextline]
\item[Which algorithm types have potential for optimizations?] Memory-bound ones usually not so much.
Take care of compute-bound problems: \( \frac{O(N^3)}{O(N^2)} \).

\item[Which optimizations should be applied?] Loop unrolling, jamming and blocking.
\end{description}

\item[Sparse matrix-vector multiply] The number of non-zero entries grows linearly with the number of matrix rows.

\begin{description}[style=nextline]
\item[How can sparse matrices be stored and why?] Compressed row storage (CRS): store only
the non-zero entries by using additional index and row-pointer arrays.

\item[How is the sparse matrix-vector multiply computed and optimized?] Working-set shrinking by
using CRS, loop unrolling, etc.
\end{description}
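
A CRS sketch of $\vec{y} = \mathbf{A}\vec{x}$ (array names are illustrative): \lstinline$val$ holds the non-zeros, \lstinline$col$ their column indices and \lstinline$row_ptr[i]$ the start of row $i$ within \lstinline$val$.

\begin{lstlisting}
for (int i = 0; i < n_rows; i++) {
    double tmp = 0.0;                           /* temporary kept in a register */
    for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++)
        tmp += val[j] * x[col[j]];              /* indirect access to x */
    y[i] = tmp;
}
\end{lstlisting}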
\end{description}

\newpage
\subsection{Parallel computers}

\begin{description}[style=nextline]
\item[Which taxonomy for parallel computing does exist?] Flynn's: SIMD, MIMD, (SISD, MISD)

\item[What can limit scalability?] Serial parts: synchronization and communication

\item[What does Amdahl's Law (strong scaling) say?] For a fixed data set, the speedup $S_p$ converges
with an increasing number of workers towards $\frac{1}{s}$, where $s$ denotes the
fraction of serial code.

\item[What does Gustafson's Law (weak scaling) say?] For an arbitrarily large data set,
the speedup $S_p$ grows linearly with the number $N$ of parallel workers: $N p + s$ with $p = 1 - s$. \\
Does not fully model reality!

\item[What is an advantage of multicore processors?] Higher performance is feasible with lower
clock frequencies and therefore lower energy consumption.

\begin{description}[style=nextline]
\item[Why do we have multicore processors?] Increasing the clock rate would require
enormous cooling efforts which aren't economical.
\end{description}

\item[Which advantages does SMT have?] It improves the exploitation of ILP and increases pipelined throughput.
\item[What is a shared-memory computer?] Multiple cores are using the same main memory and address space.

\begin{description}[style=nextline]
\item[What is the difference between UMA and ccNUMA?] The flat model of uniform memory access (UMA)
has equal costs for memory accesses across all cores.
The memory is connected to a shared chipset.
Non-uniform memory access (NUMA) costs depend on the path between the memory and
the socket which initiates the transfer.
Memory is connected directly to the sockets and/or distributed across multiple
boards which are connected by custom interconnects (mostly proprietary: QPI, HT).
\end{description}

\item[What is a distributed-memory computer?] Unlike (N)UMA systems,
so-called "No Remote Memory Access" (NORMA) systems don't have a common address space.
Their nodes are coupled via networks and use message passing to communicate.

\begin{description}[style=nextline]
\item[How do hybrid systems look like?] They combine shared and distributed memory aspects
in one cluster. Multiple shared-memory nodes are coupled into a
distributed system => multiple islands of cache coherence!
\end{description}
\end{description}

\subsubsection{Networks}

\begin{description}[style=nextline]
\item[Which network performance metrics do exist?]
\begin{itemize}
\item Bisection (band)width $b$: \# of edges which have to be removed to split the network
into two (equal sized) halves.
\item Diameter $dm$: max distance between two nodes/processors.
\item Edge connectivity $e$: min \# of edges that must be removed to break the network
into two disconnected parts.
\item Node connectivity $\delta$: max \# of edges in the graph per node (max degree).
\end{itemize}

\begin{description}[style=nextline]
\item[What can the PingPong benchmark model (latency, effective bandwidth)?] \textbf{Small}
messages have a larger overhead and therefore lower effective bandwidth (latency dominates). \\
\textbf{Large} messages have less overhead and better effective bandwidth (latency plays no role).

\item[What does the LogP model do?] Generic model for the scaling behaviour of connections between
parallel systems. It includes:
\begin{description}
\item[L] Upper bound on the latency for one transfer
\item[o] Overhead: time that a processor is involved with the transfer
\item[g] Gap: minimum time interval between two consecutive transfers
\item[P] The number of processors
\end{description}

\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
\end{description}

\item[What are relevant network topologies in HPC?] See table above.

\begin{description}[style=nextline]
\item[What are the bisection bandwidth, diameter, node \& edge connectivity of networks such as
Ring, Bus, Switched/Fat Tree, Meshes, Hypercubes, Tori, Hybrids?] See tables above.
\end{description}
\end{description}

\newpage
\subsection{Parallelization and optimization strategies}

\begin{description}[style=nextline]
\item[Which types of parallelism do exist?]
\begin{itemize}
\item Instruction/processor level parallelism: superscalarity, SIMD, SMT
\item Node/chip level: several cores run in parallel on the same memory
\item System level: several nodes run in parallel, communication over a network
\end{itemize}

\begin{description}[style=nextline]
\item[What overhead do the different types have \& how can it be reduced?]
Overhead increases from instruction to system level parallelism.
Overhead includes Amdahl's Law implications as well as the additional time required for implementation!
Use compiler optimization and existing patterns to reduce implementation overhead.

\item[What is a task dependency graph \& its critical path and average concurrency?] It's a DAG
which shows dependencies on data and the execution order of different parts of the code.
\begin{description}
\item[Critical path] Length of the longest path (sum of weights)
\item[Avg concurrency] Ratio of the total amount of work to the critical path length
\end{description}
\end{description}

\item[Which parallel design spaces \& patterns are defined by Mattson?]
\begin{itemize}
\item Finding Concurrency: dependency analysis \& decomposition
\item Algorithm Structure: organize by tasks, data decomposition or flow of data
\item Supporting Structures: program \& data structures
\item Implementation Mechanisms: UE management, synchronization, communication
\end{itemize}

\begin{description}[style=nextline]
\item[What is geometric decomposition \& adaptive mesh refinement (AMR)?] Problem/data dependent
adaption of the grid resolution, increasing accuracy in important regions.
Split the problem space geometrically and distribute these parts to UEs.

\item[What does load imbalance mean? Which types do exist? How to get a load balanced application?] In
a team of threads/UEs, one takes longer because its working set takes longer to
solve than the others. These threads are called "speeders" \& "laggers". \\
There are geometric and graph-based load balancing methods which can be static or dynamic.

\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join?
Name a typical example.]
All these are supporting structures to parallelize algorithms.
SPMD and Loop Parallelism execute the same code on multiple UEs.
Master/Worker and Fork/Join offer a greater flexibility in this regard.

\item[What does NUMA affinity mean? How can memory placement and binding affect performance?]
For optimal performance, data has to be located close to the
cores which are using it. Modern OSes use "affinity on first touch",
which allocates memory on the same socket where the task is executed.
\end{description}

\item[Which are typical approaches in a Molecular Dynamics application?]
\begin{itemize}
\item Particle decomposition
\item Force decomposition
\item Domain decomposition
\end{itemize}
\end{description}

\newpage
\subsection{Distributed-memory programming with MPI}

\begin{description}[style=nextline]
\item[Basic ideas of MPI] Don't rely on cache coherence.
Use messages to communicate between workers/UEs => SPMD.

\begin{description}[style=nextline]
\item[What is SPMD and how is it implemented by MPI?]
All workers execute the same program.
The program flow is directed by so-called "ranks" which are unique per process.

\item[How does MPI abstract the communication?] MPI provides a runtime API with a large set of
functions for point-to-point and collective operations.
It also has abstraction mechanisms for the underlying network topology.
There's a set of MPI types which are portable between architectures / programming languages.
\end{description}

\item[Point-to-point operations]
\begin{description}[style=nextline]
\item[How to transfer data between processes in the form of messages?]
Use the \lstinline$MPI_Send()$, \lstinline$MPI_Recv()$ functions
(or their variants) for point-to-point communication.
See the API reference above.
\item[How to prevent deadlocks and overlap communication and computation?]
Use non-blocking \lstinline$MPI_Isend()$, \lstinline$MPI_Irecv()$
in conjunction with the MPI request methods \lstinline$MPI_Wait()$
and \lstinline$MPI_Test()$. \\
Alternatively a deadlock-free \lstinline$MPI_Sendrecv()$ is available
which combines sending and receiving in one function call.
\end{description}

\item[Collective operations]
\begin{description}[style=nextline]
\item[How to scatter and gather data and perform operations on distributed data?] Use
\lstinline$MPI_Scatter()$ and \lstinline$MPI_Gather()$ or their variable-chunk variants.
\end{description}

\item[Virtual topologies]
\begin{description}[style=nextline]
\item[How to distribute processes over a regular grid?]
Use the \lstinline$MPI_Cart_*()$ functions to create a new MPI communicator.
There are functions to translate MPI ranks to coordinates
as well as to split grids and get adjacent ranks.
\end{description}

\item[Derived datatypes]
\begin{description}[style=nextline]
\item[How to combine MPI datatypes into more complex entities?] Use the
\lstinline$MPI_Type_contiguous()$ and \lstinline$MPI_Type_vector()$ functions.
The type signature must be the same between processes;
gaps \& alignment are allowed to differ (architectural properties).
\end{description}
\end{description}

\newpage
\subsection{Shared-memory programming with OpenMP}

\begin{description}[style=nextline]
\item[Basic principle of OpenMP] Extend an existing application by
using \texttt{\#pragma} directives.
Worksharing is used to distribute loops and sections across multiple threads
on a shared-memory machine.

\begin{description}[style=nextline]
\item[Execution model] OpenMP programs start with a single master thread.
Worker threads are spawned/joined by the master
in parallel regions => Fork/Join concept.
\item[Parallel region + worksharing constructs] OpenMP handles the thread management work.
\end{description}

\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.
\begin{description}[style=nextline]
\item[Data sharing clauses] See API reference above.
\end{description}

\item[Synchronization]
\begin{description}[style=nextline]
\item[Critical section] Use the \lstinline$#pragma omp critical$ directive.
\item[Reduction clause] Use the \lstinline$reduction(operation:var-list)$ clause.
\item[Team and task barriers] Use the \lstinline$#pragma omp barrier$ and \lstinline$#pragma omp taskwait$ directives.
\end{description}

\item[Runtime library]
\begin{description}[style=nextline]
\item[Important functions] See API reference above.
\end{description}

\end{description}

\newpage
\subsection{Hybrid programming (MPI + OpenMP)}

\begin{description}[style=nextline]
\item[Hybrid programming basics] \hfill
\begin{description}[style=nextline]
\item[Why we need hybrid programs] \hfill
\item[Hybrid programming models] \hfill
\end{description}

\item[Threading modes of MPI] \hfill
\begin{description}[style=nextline]
\item[What levels of thread support MPI provides] \hfill
\item[Potential troubles with multithreaded MPI programs] \hfill
\end{description}

\item[Addressing multiple threads within MPI processes] \hfill
\begin{description}[style=nextline]
\item[How to work around the flat addressing space of MPI] \hfill
\item[Using multiple communicators in a hybrid context] \hfill
\end{description}

\item[Running hybrid programs on the RWTH cluster] \hfill
\begin{description}[style=nextline]
\item[How to properly instruct LSF to run your hybrid job] \hfill
\end{description}
\end{description}

\newpage
\subsection{Heterogeneous architectures (GPUs, Xeon Phis)}

\subsubsection{GPGPUs}

\begin{description}[style=nextline]
\item[What does a GPU look like?]
A GPGPU is a manycore architecture optimized for high data-parallel throughput.
It consists of multiple streaming multiprocessors (SMs), each of which consists of many cores.
\begin{description}[style=nextline]
\item[Why do GPUs deliver a good performance per Watt ratio?] GPGPUs have small caches,
many low-frequency cores and little control logic.
Memory is close to the processing units.
\item[What is the difference to CPUs?] There are more transistors dedicated to computation
instead of controlling. A CPU has large caches with multiple levels,
supports OoO execution and sophisticated control logic like speculative
execution and hardware scheduling of micro-ops.
\item[How does the memory hierarchy look like?] Each SM has separate shared memory,
caches, registers and texture storage.
All SMs have access to a common LLC and the global memory of the GPU.
This global memory is separate from the host memory.
There's no cache coherence for GPU caches!
Communication with the host has to be done by DMA transfers.
\item[How can the logical programming hierarchy be mapped to the execution model?]
The execution model is host-directed.
Portions of the code, so-called "kernels", are offloaded to the GPU.
These kernels are executed in a highly parallel fashion by an array of threads.
\end{description}

\item[Which models can be used to program a GPU?] There are special GPU programming APIs
like CUDA and more general directive-based models like OpenACC/OpenMP.
\begin{description}[style=nextline]
\item[How to handle offloading of regions?]
The programmer has to manually find regions which can be parallelized.
These "kernels" are compiled to special GPU code and called by the host.
\item[How to handle data management?] Data transfers have to be done explicitly.

\item[What are the main differences?] \hfill
\end{description}

\item[Which impact does the PCIe have?] Usually GPUs have very fast memory.
The PCIe bus isn't fast enough to saturate this memory bandwidth => bottleneck.

\item[What is branch divergence?] 32 threads are grouped into "warps".
These warps share a common program counter => branches are serialized inside warps.
\begin{description}[style=nextline]
\item[Which performance impact does it have?]
Branch divergence causes threads which didn't take a branch to idle.
It therefore leads to performance degradation =>
branch only at the granularity of whole warps.
\end{description}

\item[What can be a good launch configuration and why?] \hfill
\begin{description}[style=nextline]
\item[How can the launch configuration be specified?] \hfill
\end{description}

\item[What can be done to saturate the bus?] Maximize PCIe throughput by transferring
less data and batching smaller transfers into larger ones.
\begin{description}[style=nextline]
\item[What is coalescing?] \hfill
\item[How can it be achieved?] \hfill
\item[What impact does AoS and SoA have?] \hfill
\item[What is the difference between caching and non-caching loads?] \hfill
\end{description}

\item[What is shared memory?] CUDA SMs/blocks have dedicated shared memory.
This can be used for low-latency inter-thread communication.

\item[How to avoid CPU/GPU idling?] Use asynchronous data transfers and CUDA streams
to enqueue multiple kernels for execution.
\end{description}

\subsubsection{Xeon Phi}

\begin{description}[style=nextline]
\item[What does a Xeon Phi look like?]
The Xeon Phi consists of 60 cores with 4-fold SMT,
has a clock frequency of 1090~MHz
and a peak performance of about 1 TFlop/s.
\begin{description}[style=nextline]
\item[How does the memory hierarchy look like?]
Each core has its own L1/L2 caches; the cores are connected by a ring bus.
There's a common memory and I/O interface which is also connected to the ring bus
and used to access the global GDDR5 RAM.
Caches are kept coherent. Data transfer to the host has to be done via DMA.
There's no common address space with the host.

\item[How many threads / which vector widths are available?]
\begin{description}
\item[Threads] 60 cores * 4 threads = 240 threads
\item[Vector width] 512 bits per vector register (AVX-512, MIC only)
\end{description}
\end{description}

\item[Which programming concepts do exist?] OpenMP/CL/ACC, MPI and POSIX threads.
The Phi is a slightly modified x86 architecture and can be programmed
like a normal x86 system. Intel offers proprietary language extensions for offloading (LEO)
in their own C compiler.
\begin{description}[style=nextline]
\item[How can OpenMP (4.0) be used?] The recent OpenMP spec adds new
directives (\texttt{target} and \texttt{teams}) which can be used
to offload parts of the code.
This concept has been adopted from the OpenACC
standard, which can be considered a testbed for OpenMP.
\end{description}

\item[Which optimization strategies should be applied?]
Massive SIMD vectorization, check data alignment (SoA, AoS), avoid aliasing.
\begin{description}[style=nextline]
\item[Which impact can the PCIe have?]
PCIe is a bottleneck for host-to-Phi data transfers.
It adds latency for communication => see GPGPU.
\item[Which impact does vectorization have?] Huge impact for the Phi.
It has support for FMA3 with large SIMD vector registers.
\end{description}
\end{description}
\end{document}