Merge branch 'master' of github.com:stv0g/rwth-misc
commit 5e9c9083dc
1 changed file with 74 additions and 74 deletions
@ -37,13 +37,13 @@
\section{Amdahl \& Gustafson}
\begin{tabular}{ l | c c c }
\begin{tabular}{ l l | c c }
& \textbf{General} & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
& & \textit{strong scaling} & \textit{weak scaling} \\
& & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
& & \textit{strong scaling} & \textit{weak scaling} \\
\hline
Speedup \( S_p(N) \) & \( \frac{T(1)}{T(N)} \) & \( \frac{1}{S + \frac{1 - S}{N}} \) & \( Np + s \) \\
Efficency \( \varepsilon_p(N) \) & \( \frac{S_p(N)}{N} = \frac{T(1)}{N \cdot T(N)} \) & \( \frac{1}{s(N-1) + 1} \) & \( \frac{1 - p}{N} \) \\
\textbf{Speedup} & \( S_p(N) = \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \xrightarrow{N \rightarrow \infty} \frac{1}{s} \) & \( Np + s \xrightarrow{N \rightarrow \infty} \infty \) \\
\textbf{Efficiency} & \( \varepsilon_p(N) = \frac{S_p(N)}{N} \) & \( \frac{1}{s (N-1) + 1} \xrightarrow{N \rightarrow \infty} 0 \) & \( \frac{1 - p}{N} + p \xrightarrow{N \rightarrow \infty} p \) \\
\end{tabular}
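Not part of the original sheet: a minimal numeric sketch of both laws for a serial fraction \( s \) and parallel fraction \( p = 1 - s \); the helper names are made up for illustration.
\begin{lstlisting}[language=C]
#include <stdio.h>

/* Illustrative helpers (not from the sheet): speedup with N processors
 * for serial fraction s (Amdahl, fixed problem size) and for a problem
 * size scaled with N (Gustafson). */
static double amdahl(double s, int N)    { return 1.0 / (s + (1.0 - s) / N); }
static double gustafson(double s, int N) { return N * (1.0 - s) + s; }

int main(void)
{
	double s = 0.05; /* assumed 5 % serial fraction */
	for (int N = 1; N <= 1024; N *= 4)
		printf("N=%4d  Amdahl=%6.2f  Gustafson=%7.2f\n",
		       N, amdahl(s, N), gustafson(s, N));
	return 0; /* Amdahl saturates at 1/s = 20, Gustafson keeps growing */
}
\end{lstlisting}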
\section{Moore's Law}
@ -75,7 +75,7 @@
Stages & $m$ \\
Operations & $N$ \\
Without pipeline & \( T_{seq} = m \cdot N \) \\
With pipeline & \( T_{pipe} = N + m − 1 \) \\
With pipeline & \( T_{pipe} = N + m - 1 \) \\
Speedup & \(S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
\end{tabular}
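A small worked example (not in the original sheet): for \( m = 5 \) pipeline stages and \( N = 100 \) operations,
\[
T_{seq} = 5 \cdot 100 = 500, \qquad
T_{pipe} = 100 + 5 - 1 = 104, \qquad
S_{pipe} = \frac{500}{104} \approx 4.8 \xrightarrow{N \rightarrow \infty} m = 5.
\]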
@ -104,14 +104,13 @@
\begin{tabular}{l | c c c c }
\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
\hline
Bus & 1 & 1 & 1 & B \\
Ring & 2 & 2 & \( \lfloor \frac{N}{2} \rfloor \) & 2B \\
Fully connected & \( \frac{N(N-1)}{2} \) & N-1 & 1 & \( \frac{N^2}{4} \) \\
Bus & $1$ & $1$ & $1$ & $B$ \\
Ring & $2$ & $2$ & $\lfloor \frac{N}{2} \rfloor$ & $2B$ \\
Fully connected & $N-1$ & $N-1$ & $1$ & $\frac{N^2}{4}$ \\
Sw. Fat Tree & 1 w/o redundancy & depends on design & $2 \cdot$ hierarchy height & depends on design \\
Mesh & 2d & d & \( \sum_{i=1}^d (N_i - 1) \) & \( B ( \prod_{i=1}^{d-1} N_i ) \) \\
Torus & 2d & 2d & \( \sum_{i=1}^d \lfloor \frac{N}{2} \rfloor \) & \( 2B ( \prod_{i=1}^{d-1} N_i ) \) \\
Hypercube & d & d & d & \( B2^{d-1} \) \\
Mesh & $2d$ & $d$ & $\sum_{i=1}^d (N_i - 1)$ & $B ( \prod_{i=1}^{d-1} N_i )$ \\
Torus & $2d$ & $2d$ & $\sum_{i=1}^d \lfloor \frac{N}{2} \rfloor$ & $2B ( \prod_{i=1}^{d-1} N_i )$ \\
Hypercube & $d$ & $d$ & $d$ & $B2^{d-1}$ \\
\end{tabular}
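A small worked example (not in the original sheet): a hypercube with \( d = 10 \), i.e.\ \( N = 2^{10} = 1024 \) nodes, has degree and diameter \( 10 \) and bisection bandwidth \( B \cdot 2^{9} = 512B \); a \( 32 \times 32 \) torus with the same node count has degree \( 4 \) but diameter \( \lfloor \frac{32}{2} \rfloor + \lfloor \frac{32}{2} \rfloor = 32 \).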
\section{Balance, Lightspeed}
@ -183,7 +182,7 @@ All MPI routines return an integer error code!
\lstinline$ ?rbuf?, §rcnt§, §rtype§, §root§, §cm§)$ & \\
\lstinline$MPI_Allgather(...)$ & \\
\lstinline$MPI_Alltoall()$ & \\
\lstinline$MPI_Redude()$ & \\
\lstinline$MPI_Reduce()$ & \\
\lstinline$MPI_Allreduce(...)$ & \\
\end{tabular}
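Not from the original sheet: a minimal sketch of the reduction calls listed above (one \lstinline$int$ per rank, combined with \lstinline$MPI_SUM$).
\begin{lstlisting}[language=C]
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int rank, size, local, sum;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	local = rank + 1;

	/* Result ends up on the root (rank 0) only. */
	MPI_Reduce(&local, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

	/* Same reduction, but every rank receives the result. */
	MPI_Allreduce(&local, &sum, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

	if (rank == 0)
		printf("sum = %d (expected %d)\n", sum, size * (size + 1) / 2);

	MPI_Finalize();
	return 0;
}
\end{lstlisting}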
@ -204,7 +203,7 @@ All MPI routines return an integer error code!
\subsubsection{Virtual topologies}
\begin{tabular}{ p{8cm} l }
\lstinline$MPI_Cart_create(...)$ & \\
\lstinline$MPI_Cart_create(...)$ & Creates a new Cartesian communicator from the given old one. \\
\lstinline$MPI_Cart_rank(...)$ & \\
\lstinline$MPI_Cart_shift(...)$ & \\
\lstinline$MPI_Cart_sub(...)$ & \\
\lstinline$MPI_Cart_sub(...)$ & \\
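Not from the original sheet: a minimal sketch of the Cartesian-topology calls above, assuming a 2D periodic grid and neighbour lookup along dimension 0.
\begin{lstlisting}[language=C]
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
	int size, rank, dims[2] = {0, 0}, periods[2] = {1, 1};
	int coords[2], left, right;
	MPI_Comm cart;

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);

	MPI_Dims_create(size, 2, dims);            /* factor size into a 2D grid */
	MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 1, &cart);

	MPI_Comm_rank(cart, &rank);                /* ranks may be reordered */
	MPI_Cart_coords(cart, rank, 2, coords);
	MPI_Cart_shift(cart, 0, 1, &left, &right); /* neighbours in dim 0 */

	printf("rank %d at (%d,%d): left %d, right %d\n",
	       rank, coords[0], coords[1], left, right);

	MPI_Comm_free(&cart);
	MPI_Finalize();
	return 0;
}
\end{lstlisting}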
@ -266,8 +265,10 @@ All MPI routines return an integer error code!
\lstinline$#pragma omp sections$ & Non-iterative worksharing of different sections. \\
\lstinline$#pragma omp section$ & \\
\lstinline$#pragma omp single$ & Region shall only be executed by a single thread. \\
\lstinline$#pragma omp critical$ & Region is executed by all threads but only by a single simultaneously. \\
\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current team are guaranteed to complete at barrier exit. \\
\lstinline$#pragma omp critical$ & Region is executed by all threads but only
by a single thread at a time. \\
\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current
team are guaranteed to complete at barrier exit. \\
\lstinline$#pragma omp taskwait$ & The encountering task suspends until all (direct) child tasks are complete. \\
\lstinline$#pragma omp task$ & Defines an explicit task. \\
\lstinline$#pragma omp target [data]$ & \\
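Not from the original sheet: a minimal sketch combining the worksharing and tasking directives listed above.
\begin{lstlisting}[language=C]
#include <omp.h>
#include <stdio.h>

int main(void)
{
	#pragma omp parallel
	{
		#pragma omp sections /* non-iterative worksharing */
		{
			#pragma omp section
			printf("section A on thread %d\n", omp_get_thread_num());
			#pragma omp section
			printf("section B on thread %d\n", omp_get_thread_num());
		}

		#pragma omp single /* one thread creates the explicit tasks */
		{
			for (int i = 0; i < 4; i++) {
				#pragma omp task firstprivate(i)
				printf("task %d on thread %d\n", i, omp_get_thread_num());
			}
			#pragma omp taskwait /* wait for the child tasks */
		}
	} /* implicit barrier at the end of the parallel region */
	return 0;
}
\end{lstlisting}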
@ -359,16 +360,19 @@ See \texttt{man bsub}.
\item Titan (17 PFlops/s)
\end{enumerate}
\item[What can we read from the performance development as measured in the
TOP500?] The TOP500 lists by peak performance $R_{max}$ in Flops/s (double) measured by a standardized benchmark (LIN/LAPACK). \\
Future trends are predictable (exascale computing, architectures).\\
At the moment, it takes approximately 11 years to increase performance by 3
orders of magnitude (ExaFlop/s projected for 2019).
TOP500?] The TOP500 ranks systems by the maximal achieved performance $R_{max}$
in Flop/s (double precision) measured by a standardized benchmark (LINPACK). \\
Future trends are predictable (exascale computing, architectures).\\
At the moment, it takes approximately 11 years to increase performance by 3
orders of magnitude (ExaFlop/s projected for 2019).

\item[What does Moore's Law tell you? Is it still valid?] "The number of transistors per chip doubles around every one to two years." \\
Slope is now declining slowly. But mostly still valid.
\item[What does Moore's Law tell you? Is it still valid?]
"The number of transistors per chip doubles around every one to two years." \\
The slope is now declining slowly, but it mostly still holds.

\item[Why do we have multi-core architectures today?] Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: Decrease frequency; increase die-size by putting more cores on a single chip and exploit parallelism.
\item[Why do we have multi-core architectures today?]
Clock frequencies can't be pushed further due to limited cooling capabilities. \\
Solution: Decrease frequency; increase die size by putting more cores on a single chip and exploit parallelism.

\end{description}
@ -798,6 +802,7 @@ See \texttt{man bsub}.
\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
\end{description}

\item[What are relevant network topologies in HPC?] See table above.

\begin{description}[style=nextline]
@ -849,7 +854,8 @@ See \texttt{man bsub}.
solve than the others. These threads are called "speeders" \& "laggers". \\
There are geometric and graph-based load balancing methods, which are either static or dynamic.

\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join? Name a typical example.]
\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join?
Name a typical example.]
All these are supporting structures to parallelize algorithms.
SPMD and Loop Parallelism execute the same code on multiple UEs.
Master/Worker and Fork/Join offer greater flexibility in this regard.
@ -940,27 +946,22 @@ See \texttt{man bsub}.
\end{description}

\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.

\begin{description}[style=nextline]
\item[Data sharing clauses] \hfill
\item[Data sharing clauses] See API reference above.
\end{description}
\item[Synchronization] \hfill
\item[Synchronization]
\begin{description}[style=nextline]
\item[Critical section] \hfill
\item[Reduction clause] \hfill
\item[Team and Task-Barriers] \hfill
\end{description}
\item[Runtime library] \hfill
\item[Critical section] Use the \lstinline$#pragma omp critical$ directive.
\item[Reduction clause] Use the \lstinline$reduction(operation:var-list)$ clause.
\item[Team and Task-Barriers] Use the \lstinline$#pragma omp barrier$ and \lstinline$#pragma omp taskwait$ directives; see the sketch below.
\end{description}
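Not from the original sheet: a minimal sketch of the three synchronization mechanisms above (reduction clause, critical section, team barrier).
\begin{lstlisting}[language=C]
#include <omp.h>
#include <stdio.h>

int main(void)
{
	const int n = 1000;
	long sum = 0;
	int last = -1;

	/* reduction: each thread sums into a private copy,
	 * the copies are combined at the end of the loop. */
	#pragma omp parallel for reduction(+:sum)
	for (int i = 1; i <= n; i++)
		sum += i;

	#pragma omp parallel
	{
		/* critical: only one thread at a time executes the region */
		#pragma omp critical
		last = omp_get_thread_num();

		#pragma omp barrier /* all threads of the team wait here */
	}

	printf("sum = %ld (expected %d), last writer: %d\n",
	       sum, n * (n + 1) / 2, last);
	return 0;
}
\end{lstlisting}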
\item[Runtime library]
\begin{description}[style=nextline]
\item[Important functions] \hfill
\item[Important functions] See API reference above.
\end{description}

\end{description}

\newpage
@ -968,34 +969,26 @@ See \texttt{man bsub}.
\begin{description}[style=nextline]
\item[Hybrid programming basics] \hfill
\begin{description}[style=nextline]
\item[Why we need hybrid programs] \hfill
\item[Hybrid programming models] \hfill
\item[Hybrid programming models] \hfill
\end{description}
\item[Threading modes of MPI] \hfill
\begin{description}[style=nextline]
\begin{description}[style=nextline]
\item[What levels of thread support MPI provides] \hfill
\item[Potential troubles with multithreaded MPI programs] \hfill
\item[Potential troubles with multithreaded MPI programs] \hfill
\end{description}
\item[Addressing multiple threads within MPI processes] \hfill
\item[Addressing multiple threads within MPI processes] \hfill
\begin{description}[style=nextline]
\item[How to work around the flat addressing space of MPI] \hfill
\item[Using multiple communicators in a hybrid context] \hfill
\end{description}
\item[Running hybrid programs on the RWTH cluster] \hfill
\begin{description}[style=nextline]
\item[How to properly instruct LSF to run your hybrid job] \hfill
\end{description}
\end{description}
@ -1008,24 +1001,20 @@ See \texttt{man bsub}.
\item[What does a GPU look like?]
A GPGPU is a manycore architecture optimized for high data-parallel throughput.
It consists of multiple streaming multiprocessors (SM), each of which consists of many cores.

\begin{description}[style=nextline]
\item[Why do GPUs deliver a good performance per Watt ratio?] GPGPUs have small caches,
many low-frequency cores and little control logic.
Memory is close to the processing unit.

\item[What is the difference to CPUs?] There are more transistors dedicated to computation
instead of control. A CPU has large caches with multiple levels,
supports OoO execution, sophisticated control logic like speculative
execution and hw-scheduling of microops.

\item[What does the memory hierarchy look like?] Each SM has its own shared memory,
caches, registers and texture storage.
All SMs have access to a common LLC and the global memory of the GPU.
This global memory is separate from the host.
There's no cache coherence for GPU caches!
Communication with the host has to be done by DMA transfers.

\item[How can the logical programming hierarchy be mapped to the execution model?]
The execution model is host-directed.
Portions of the code, so-called "kernels", are offloaded to the GPU.
@ -1034,23 +1023,20 @@ See \texttt{man bsub}.
\item[Which models can be used to program a GPU?] There are special GPU programming APIs
like CUDA and more general directive-based models like OpenACC/OpenMP.

\begin{description}[style=nextline]
\item[How to handle offloading of regions?] Programmer has to find regions
which can be parallelized.
\item[How to handle offloading of regions?]
The programmer has to manually find regions which can be parallelized.
These "kernels" are compiled to special GPU code and called by the host.

\item[How to handle data management?] Data transfer has to be done explicitly.

\item[What are the main differences?] \hfill

\end{description}

\item[Which impact does the PCIe have?] Usually GPUs have very fast memory.
The PCIe bus isn't fast enough to saturate this memory bandwidth => bottleneck.

\item[What is branch divergence?] 32 threads are grouped into "warps".
These warps share a common program counter => branches are serialized inside warps.

\begin{description}[style=nextline]
\item[Which performance impact does it have?]
Branch divergence causes threads that did not take a branch to idle.
@ -1065,7 +1051,6 @@ See \texttt{man bsub}.
\item[What can be done to saturate the bus?] Maximize PCIe throughput by transferring
less data and batching smaller transfers into larger ones.
\begin{description}[style=nextline]
\item[What is coalescing?] \hfill
\item[How can it be achieved?] \hfill
@ -1083,28 +1068,43 @@ See \texttt{man bsub}.
\subsubsection{Xeon Phi}
\begin{description}[style=nextline]
\item[How does a Xeon Phi look like?] The Xeon Phi consists of 60 cores with 4-times SMT.
\item[What does a Xeon Phi look like?]
The Xeon Phi consists of 60 cores with 4-fold SMT,
has a clock frequency of 1090~MHz
and a peak performance of about 1 TFlop/s.
\begin{description}[style=nextline]
\item[How does the memory hierarchy look like?] All cores are connected
to caches and dedicated memory by a ring bus. Caches a kept coherent.
Data transfer to the host has to be done via DMA.
Theres no common address space.
\item[What does the memory hierarchy look like?]
All cores are connected to their own L1/L2 caches by a ring bus.
There's a common memory and I/O interface which is also connected to the ring bus
and used to access the global GDDR5 RAM.
Caches are kept coherent. Data transfer to the host has to be done via DMA.
There's no common address space with the host.

\item[How many threads / vector-widths are available?]
\begin{description}
\item[Threads] 60 cores * 4 threads = 240 threads
\item[Vector-width] AVX-512 has 512 bits per vector (extension of AVX2, MIC only)
\item[Vector-width] AVX-512 has 512 bits per vector
(extension of AVX2, MIC only).
\end{description}
\end{description}

\item[Which programming concepts do exist?] OpenCL, OpenMP, MPI, OpenACC?
\item[Which programming concepts do exist?] OpenMP/CL/ACC, MPI and POSIX threads.
The Phi is a slightly modified x86 architecture and can be programmed
like a normal x86 system. Intel offers proprietary language offloading extensions (LEO)
for their own C compiler.
\begin{description}[style=nextline]
\item[How can OpenMP (4.0) be used?] \hfill
\item[How can OpenMP (4.0) be used?] The recent OpenMP spec adds new
directives (\texttt{target} and \texttt{teams}) which can be used
to offload parts of the code.
This concept has been adopted from the OpenACC
standard, which can be considered a testbed for OpenMP; see the sketch below.
\end{description}
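Not from the original sheet: a minimal OpenMP 4.0 offloading sketch using the \texttt{target} and \texttt{teams} directives mentioned above; the vector-scaling example and all values are made up for illustration.
\begin{lstlisting}[language=C]
#include <stdio.h>

#define N 1000000

int main(void)
{
	static float x[N];

	for (int i = 0; i < N; i++)
		x[i] = (float)i;

	/* Offload the loop to the device; map x to and from device memory. */
	#pragma omp target map(tofrom: x[0:N])
	#pragma omp teams distribute parallel for
	for (int i = 0; i < N; i++)
		x[i] *= 2.0f;

	printf("x[42] = %f\n", x[42]); /* expected 84.0 */
	return 0;
}
\end{lstlisting}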
\item[Which optimization strategies should be applied?]
Massive SIMD vectorization, check data alignment and layout (SoA vs. AoS), avoid aliasing.
\begin{description}[style=nextline]
\item[Which impact can the PCIe have?] PCIe is a bottleneck for host to Phi data transfer.
\item[Which impact can the PCIe have?]
PCIe is a bottleneck for host-to-Phi data transfer.
It adds latency for communication => see GPGPU.
\item[Which impact does vectorization have?] Huge impact for the Phi.
It has support for FMA-3 with large SIMD vector registers.