From 89ed65020a936b15fd6bc62a644831cfe19d0b8d Mon Sep 17 00:00:00 2001
From: Peter de Lange <peter.de.lange@rwth-aachen.de>
Date: Thu, 14 Aug 2014 10:15:00 +0200
Subject: [PATCH 1/7] small additions

---
 hpc_summary.tex | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index a34d3fe..5326c17 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -183,7 +183,7 @@ All MPI routines return an integer error code!
 	\lstinline$     §rbuf§, §rcnt§, §rtype§, §root§, §cm§)$ & \\
 	\lstinline$MPI_Allgather(...)$ & \\
 	\lstinline$MPI_Alltoall(...)$ & \\
-	\lstinline$MPI_Redude()$ & \\
+	\lstinline$MPI_Reduce()$ & \\
 	\lstinline$MPI_Allreduce(...)$ & \\
 \end{tabular}
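+
+	A minimal sketch (illustrative, not from the lecture) of one of these
+	collectives in use, summing one value per rank into the root:
+\begin{lstlisting}
+#include <stdio.h>
+#include <mpi.h>
+
+int main(int argc, char **argv) {
+	int rank, local, sum;
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	local = rank + 1;	/* each rank contributes one value */
+	/* combine all local values with MPI_SUM into rank 0 */
+	MPI_Reduce(&local, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
+	if (rank == 0)
+		printf("sum = %d\n", sum);
+	MPI_Finalize();
+	return 0;
+}
+\end{lstlisting}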
 
@@ -204,7 +204,7 @@ All MPI routines return an integer error code!
 
 \subsubsection{Virtual topologies}
 \begin{tabular}{ p{8cm} l }
-	\lstinline$MPI_Cart_create(...)$ & \\
+	\lstinline$MPI_Cart_create(...)$ & Creates a new Cartesian cm from a given old one. \\
 	\lstinline$MPI_Cart_rank(...)$ & \\
 	\lstinline$MPI_Cart_shift(...)$ & \\
 	\lstinline$MPI_Cart_sub(...)$ & \\

From 8794406c241159161ec7f8311d69201ca3986af5 Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 10:41:50 +0200
Subject: [PATCH 2/7] whitespace cleanup (I've really messed this up on my
 netbook o_O)

---
 hpc_summary.tex | 67 +++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index ffaf5b1..846df8d 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -266,8 +266,10 @@ All MPI routines return an integer error code!
 	\lstinline$#pragma omp sections$ & Non-iterative worksharing of different sections. \\
 	\lstinline$#pragma omp section$ & \\
 	\lstinline$#pragma omp single$ & Region shall only be executed by a single thread. \\
-	\lstinline$#pragma omp critical$ & Region is executed by all threads but only by a single simultaneously. \\
-	\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current team are guaranteed to complete at barrier exit. \\
+	\lstinline$#pragma omp critical$ & Region is executed by all threads, but only
+		by a single one at a time. \\
+	\lstinline$#pragma omp barrier$ & All tasks created by any thread of the current
+		team are guaranteed to complete at barrier exit. \\
 	\lstinline$#pragma omp taskwait$ & Encountering task suspends until all (direct) child tasks are complete. \\
 	\lstinline$#pragma omp task$ & Defines an explicit task. \\
 	\lstinline$#pragma omp target [data]$ & \\
@@ -359,16 +361,19 @@ See \texttt{man bsub}.
 			\item Titan (17 PFlops/s)
 		\end{enumerate}
 	\item[What can we read from the performance development as measured in the
-        TOP500?] The TOP500 lists by peak performance $R_{max}$ in Flops/s (double) measured by a standardized benchmark (LIN/LAPACK). \\
-	Future trends are predictable (exascale computing, architectures).\\
-    	At the moment, it takes approximately 11 years to increase performance by 3
-    	orders of magnitude (ExaFlop/s projected for 2019).
+		TOP500?] The TOP500 ranks systems by the measured performance $R_{max}$
+		in Flop/s (double precision), obtained with a standardized benchmark (LINPACK). \\
+		Future trends are predictable (exascale computing, architectures).\\
+		At the moment, it takes approximately 11 years to increase performance by 3
+		orders of magnitude (ExaFlop/s projected for 2019).
 	
-	\item[What does Moore's Law tell you? Is it still valid?] "The number of transistors per chip doubles around every one to two years." \\
-	Slope is now declining slowly. But mostly still valid.
+	\item[What does Moore's Law tell you? Is it still valid?]
+		``The number of transistors per chip doubles around every one to two years.'' \\
+		The slope is now declining slowly, but the law still mostly holds.
 	
-	\item[Why do we have multi-core architectures today?] Clock frequencies can't be pushed further due to limited cooling capabilities. \\
-	Solution: Decrease frequency; increase die-size by putting more cores on a single chip and exploit parallelism.
+	\item[Why do we have multi-core architectures today?]
+		Clock frequencies can't be pushed further due to limited cooling capabilities. \\
+		Solution: decrease the frequency, put more cores on a single chip and exploit parallelism.
 	
 \end{description}
 
@@ -798,6 +803,7 @@ See \texttt{man bsub}.
 
 		\item[What are the definitions for bisection bandwidth, diameter \& edge connectivity?] See above.
 	\end{description}
+
 	\item[What are relevant network topologies in HPC?] See table above.
  
 	\begin{description}[style=nextline]
@@ -849,7 +855,8 @@ See \texttt{man bsub}.
 			solve than the others. These threads are called ``speeders'' \& ``laggers''. \\
 			There are geometric and graph-based load balancing methods, which can be applied statically or dynamically.
  
-		\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join? Name a typical example.]
+		\item[What are differences between SPMD, Master/Worker, Loop Parallelism and Fork/Join?
+			Name a typical example.]
 			All these are supporting structures to parallelize algorithms.
 			SPMD and Loop Parallelism execute the same code on multiple UEs.
 			Master/Worker and Fork/Join offer greater flexibility in this regard.
@@ -940,13 +947,12 @@ See \texttt{man bsub}.
 	\end{description}
 
 	\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.
-
 	\begin{description}[style=nextline]
 		\item[Data sharing clauses] \hfill
 
 	\end{description}
-	\item[Synchronization] \hfill
 
+	\item[Synchronization]
 	\begin{description}[style=nextline]
 		\item[Critical section] \hfill
 
@@ -961,6 +967,7 @@ See \texttt{man bsub}.
 		\item[Important functions] \hfill
 
 	\end{description}
+
 \end{description}
 
 \newpage
@@ -968,34 +975,26 @@ See \texttt{man bsub}.
 
 \begin{description}[style=nextline]
 	\item[Hybrid programming basics] \hfill
-
 	\begin{description}[style=nextline]
 		\item[Why we need hybrid programs] \hfill
-
-		\item[Hybrid programming models] \hfill
- 	
+		\item[Hybrid programming models] \hfill
 	\end{description}
+
 	\item[Threading modes of MPI] \hfill
- 
-	\begin{description}[style=nextline]
+	\begin{description}[style=nextline]
 		\item[What levels of thread support MPI provides] \hfill
-
-		\item[Potential troubles with multithreaded MPI programs] \hfill
- 
+		\item[Potential troubles with multithreaded MPI programs] \hfill
 	\end{description}
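+
+	An illustrative sketch (not from the slides) of the thread support levels:
+	a hybrid program requests a level at initialization and has to check
+	what the MPI library actually provides:
+\begin{lstlisting}
+#include <stdio.h>
+#include <mpi.h>
+
+int main(int argc, char **argv) {
+	int provided;
+	/* MPI_THREAD_MULTIPLE: any thread may call MPI concurrently */
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
+	if (provided < MPI_THREAD_MULTIPLE)
+		printf("got only thread level %d\n", provided);
+	MPI_Finalize();
+	return 0;
+}
+\end{lstlisting}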
-	\item[Addressing multiple threads within MPI processes] \hfill
 
+	\item[Addressing multiple threads within MPI processes] \hfill
 	\begin{description}[style=nextline]
 		\item[How to work around the flat addressing space of MPI] \hfill
-
 		\item[Using multiple communicators in a hybrid context] \hfill
- 
 	\end{description}
+
 	\item[Running hybrid programs on the RWTH cluster] \hfill
-
 	\begin{description}[style=nextline]
 		\item[How to properly instruct LSF to run your hybrid job] \hfill
- 	
 	\end{description}
 \end{description}	
 
@@ -1008,24 +1007,20 @@ See \texttt{man bsub}.
 	\item[What does a GPU look like?]
 		A GPGPU is a manycore architecture optimized for high data-parallel throughput.
 		It consists of multiple streaming multiprocessors (SMs), each of which contains many cores.
-
 	\begin{description}[style=nextline]
 		\item[Why do GPUs deliver a good performance per Watt ratio?] GPGPUs have small caches,
 			many low frequency cores and little control logic.
 			Memory is close to the processing unit.
-
 		\item[What is the difference to CPUs?] There are more transistors dedicated to computation
 			instead of control. A CPU has large caches with multiple levels and
 			sophisticated control logic, supporting OoO execution, speculative
 			execution and hw-scheduling of micro-ops.
- 
 		\item[What does the memory hierarchy look like?] Each SM has its own shared memory,
 			caches, registers and texture storage.
 			All SMs have access to a common LLC and the global memory of the GPU.
 			This global memory is separate from the host's.
 			There's no cache coherence for GPU caches!
 			Communication with the host has to be done by DMA transfers.
- 
 		\item[How can the logical programming hierarchy be mapped to the execution model?]
 			The execution model is host-directed.
 			Portions of the code, so-called ``kernels'', are offloaded to the GPU.
@@ -1034,23 +1029,20 @@ See \texttt{man bsub}.
 
 	\item[Which models can be used to program a GPU?] There are special GPU programming APIs
 		like CUDA and more general directive-based models like OpenACC/MP.
-
 	\begin{description}[style=nextline]
-		\item[How to handle offloading of regions?] Programmer has to find regions
-			which can be parallelized.
+		\item[How to handle offloading of regions?]
+			The programmer has to manually find regions which can be parallelized.
 			These "kernels" are compiled to special GPU code and called by the host.
-
 		\item[How to handle data management?] Data transfer has to be done explicitly.
  	
 		\item[What are the main differences?] \hfill
- 
 	\end{description}
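+
+	A minimal sketch (illustrative) of the directive-based approach (OpenACC):
+	the compiler derives the kernel and the explicit data transfers from the pragmas:
+\begin{lstlisting}
+/* requires an OpenACC-capable compiler */
+void saxpy(int n, float a, const float *x, float *y) {
+	/* offload the loop; copyin/copy express the data movement */
+	#pragma acc parallel loop copyin(x[0:n]) copy(y[0:n])
+	for (int i = 0; i < n; i++)
+		y[i] = a * x[i] + y[i];
+}
+\end{lstlisting}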
+
 	\item[Which impact does the PCIe have?] Usually GPUs have very fast memory.
 		The PCIe bus isn't fast enough to saturate this memory bandwidth => bottleneck.
 
 	\item[What is branch divergence?] 32 threads are grouped into ``warps''.
 		These warps share a common program counter => branches are serialized inside warps.
-
 	\begin{description}[style=nextline]
 		\item[Which performance impact does it have?]
 			Branch divergence causes threads that did not take a branch to idle.
@@ -1065,7 +1057,6 @@ See \texttt{man bsub}.
 
 	\item[What can be done to saturate the bus?] Maximize PCIe throughput by transferring
 		less data and batching smaller transfers into larger ones.
-
 	\begin{description}[style=nextline]
 		\item[What is coalescing?] \hfill
 		\item[How can it be achieved?] \hfill

From 40b146b45fd8ddbb461d581cccc359dcc7bffea6 Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 10:42:08 +0200
Subject: [PATCH 3/7] some additional stuff for xeon phi

---
 hpc_summary.tex | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index 846df8d..903aee9 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -1074,28 +1074,43 @@ See \texttt{man bsub}.
 \subsubsection{Xeon Phi}
 
 \begin{description}[style=nextline]
-	\item[How does a Xeon Phi look like?] The Xeon Phi consists of 60 cores with 4-times SMT.
+	\item[What does a Xeon Phi look like?]
+		The Xeon Phi consists of 60 cores with 4-fold SMT,
+		has a clock frequency of 1090~MHz
+		and a peak performance of about 1~TFlop/s.
 	\begin{description}[style=nextline]
-		\item[How does the memory hierarchy look like?] All cores are connected
-			to caches and dedicated memory by a ring bus. Caches a kept coherent.
-			Data transfer to the host has to be done via DMA.
-			Theres no common address space.
+		\item[What does the memory hierarchy look like?]
+			All cores are connected to their own L1/L2 caches by a ring bus.
+			There's a common memory and I/O interface which is also connected to the
+			ring bus and used to access the global GDDR5 RAM.
+			Caches are kept coherent. Data transfer to the host has to be done via DMA.
+			There's no common address space with the host.
 
 		\item[How many threads/ vector-widths are available?]
 		\begin{description}
 			\item[Threads] 60 cores * 4 threads = 240 threads
-			\item[Vector-width] AVX-512 has 512 bits per vector (extension of AVX2, MIC only)
+			\item[Vector-width] AVX-512 has 512 bits per vector
+				(extension of AVX2, MIC only).
 		\end{description}
 	\end{description}
 
-	\item[Which programming concepts do exist?] OpenCL, OpenMP, MPI, OpenACC?
+	\item[Which programming concepts do exist?] OpenMP/CL/ACC, MPI and POSIX threads.
+		The Phi is a slightly modified x86 architecture and can be programmed
+		like a normal x86 system. Intel offers proprietary language extensions
+		for offloading (LEO) for its own C compiler.
 	\begin{description}[style=nextline]
-		\item[How can OpenMP (4.0) be used?] \hfill
+		\item[How can OpenMP (4.0) be used?] The recent OpenMP spec adds new
+			directives (\texttt{target} and \texttt{teams}) which can be used
+			to offload parts of the code.
+			This concept has been adopted from the OpenACC
+			standard, which can be considered a testbed for OpenMP.
 	\end{description}
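+
+	A minimal sketch (assuming an attached accelerator) of these directives:
+\begin{lstlisting}
+void scale(int n, float a, float *x) {
+	/* offload to the device; map() handles the data transfer */
+	#pragma omp target teams distribute parallel for map(tofrom: x[0:n])
+	for (int i = 0; i < n; i++)
+		x[i] *= a;
+}
+\end{lstlisting}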
 
 	\item[Which optimization strategies should be applied?]
+		Massive SIMD vectorization, check data alignment (SoA vs. AoS), avoid aliasing.
 	\begin{description}[style=nextline]
-		\item[Which impact can the PCIe have?] PCIe is a bottleneck for host to Phi data transfer.
+		\item[Which impact can the PCIe have?]
+			PCIe is a bottleneck for host to Phi data transfer.
 			Adds latency for communication => see GPGPU.
 		\item[Which impact does vectorization have?] Huge impact for the Phi.
 			It has support for FMA-3 with large SIMD vector registers.

From 6eb6a8b06eee848e49ae63c038d70489946336d1 Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 10:42:19 +0200
Subject: [PATCH 4/7] more on openmp

---
 hpc_summary.tex | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index 903aee9..e55364f 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -948,24 +948,19 @@ See \texttt{man bsub}.
 
 	\item[Scoping] There are OpenMP clauses that specify the scope (shared or private) of variables.
 	\begin{description}[style=nextline]
-		\item[Data sharing clauses] \hfill
-
+		\item[Data sharing clauses] See API reference above.
 	\end{description}
 
 	\item[Synchronization]
 	\begin{description}[style=nextline]
-		\item[Critical section] \hfill
-
-		\item[Reduction clause] \hfill
- 
-		\item[Team and Task-Barriers] \hfill
- 
-	\end{description} 
-	\item[Runtime library] \hfill
+		\item[Critical section] Use the \lstinline$#pragma omp critical$ directive.
+		\item[Reduction clause] Use the \lstinline$reduction(operation:var-list)$ clause.
+		\item[Team and Task-Barriers] Use the \lstinline$#pragma omp barrier$ and \lstinline$#pragma omp taskwait$ directives.
+	\end{description}
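+
+	A minimal sketch (illustrative) combining these synchronization constructs:
+\begin{lstlisting}
+#include <stdio.h>
+#include <omp.h>
+
+int main(void) {
+	int sum = 0;
+	/* sum is combined across the team by the reduction clause */
+	#pragma omp parallel for reduction(+:sum)
+	for (int i = 1; i <= 100; i++)
+		sum += i;
+	#pragma omp parallel
+	{
+		/* only one thread at a time enters the critical region */
+		#pragma omp critical
+		printf("thread %d: sum = %d\n", omp_get_thread_num(), sum);
+	} /* implicit barrier at the end of the parallel region */
+	return 0;
+}
+\end{lstlisting}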
 
+	\item[Runtime library]
 	\begin{description}[style=nextline]
-		\item[Important functions] \hfill
-
+		\item[Important functions] See API reference above.
 	\end{description}
 
 \end{description}

From 3b0180373d3935abf480900b912e7b32e2636b8a Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 14:31:32 +0200
Subject: [PATCH 5/7] added limit considerations to Amdahl's and Gustafson's
 law

---
 hpc_summary.tex | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index e55364f..a6d9c8b 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -37,13 +37,13 @@
 
 \section{Amdahl \& Gustafson}
 
-\begin{tabular}{ l | c c c }
+\begin{tabular}{ l l | c c }
 
-	& \textbf{General} & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
-	&	  & \textit{strong scaling}	& \textit{weak scaling} \\
+	&  & \textbf{Amdahl's Law} & \textbf{Gustafson's Law} \\
+	&	 & \textit{strong scaling}	& \textit{weak scaling} \\
 	\hline
-	Speedup \( S_p(N) \)  & \( \frac{T(1)}{T(N)} \) & \( \frac{1}{S + \frac{1 - S}{N}} \) & \( Np + s \) \\
-	Efficency \( \varepsilon_p(N) \) & \( \frac{S_p(N)}{N} = \frac{T(1)}{N \cdot T(N)} \) & \( \frac{1}{s(N-1) + 1} \) & \( \frac{1 - p}{N} \) \\
+	\textbf{Speedup} & \( S_p(N) = \frac{T(1)}{T(N)} \) & \( \frac{1}{s + \frac{1 - s}{N}} \xrightarrow{N \rightarrow \infty} \frac{1}{s} \) & \( Np + s \xrightarrow{N \rightarrow \infty} \infty \) \\
+	\textbf{Efficiency} & \( \varepsilon_p(N) = \frac{S_p(N)}{N} \) & \( \frac{1}{s (N-1) + 1} \xrightarrow{N \rightarrow \infty} 0 \) & \( \frac{1 - p}{N} + p \xrightarrow{N \rightarrow \infty} p \) \\
 \end{tabular}
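+
+A short worked example (illustrative numbers): with a serial fraction of
+$s = 0.05$, Amdahl's law bounds the speedup by $\frac{1}{s} = 20$ no matter how
+many processors are used, e.g. $S_{1000} = \frac{1}{0.05 + 0.95/1000} \approx 19.6$.
+Under Gustafson's law the same $s$ yields $S_{1000} = 1000 \cdot 0.95 + 0.05 = 950.05$,
+since the problem size grows with $N$.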
 
 \section{Moore's Law}

From 432abe0ab1583930dddbcdee8dcd940d353aa803 Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 14:31:48 +0200
Subject: [PATCH 6/7] fixed encoding of minus sign

---
 hpc_summary.tex | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index a6d9c8b..08e5845 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -75,7 +75,7 @@
 	Stages & $m$ \\
 	Operations & $N$ \\
 	Without pipeline & \( T_{seq} = m \cdot N \) \\
-	With pipeline & \( T_{pipe} = N + m − 1 \) \\
+	With pipeline & \( T_{pipe} = N + m - 1 \) \\
 	Speedup & \(S_{pipe} = \frac{m}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} m \) \\
 	Throughput (results/cycle) & \( \frac{N}{T_{pipe}} = \frac{1}{1 + \frac{m-1}{N}} \xrightarrow{N \rightarrow \infty} 1 \) \\
 \end{tabular}
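+
+A short worked example (illustrative numbers): a pipeline with $m = 5$ stages
+processing $N = 100$ operations needs $T_{pipe} = 100 + 5 - 1 = 104$ cycles
+instead of $T_{seq} = 5 \cdot 100 = 500$, i.e. a speedup of
+$S_{pipe} = \frac{500}{104} \approx 4.8$, close to the asymptotic limit $m = 5$.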

From 852f6af014f572b108e53307c20d81c5fb49d3c9 Mon Sep 17 00:00:00 2001
From: Steffen Vogel <post@steffenvogel.de>
Date: Thu, 14 Aug 2014 14:46:23 +0200
Subject: [PATCH 7/7] corrected node connectivity of fully meshed net

---
 hpc_summary.tex | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/hpc_summary.tex b/hpc_summary.tex
index f50d426..a8939b8 100644
--- a/hpc_summary.tex
+++ b/hpc_summary.tex
@@ -104,14 +104,13 @@
 \begin{tabular}{l | c c c c }
 	\textbf{Topology} & \textbf{Max degree} & \textbf{Edge connectivity} & \textbf{Diameter} & \textbf{Bisection BW} \\
 	\hline
-	Bus		& 1  & 1   & 1 & B \\
-	Ring		& 2  & 2   & \( \lfloor \frac{N}{2} \rfloor \) & 2B \\
-	Fully connected	& \( \frac{N(N-1)}{2} \) & N-1 & 1 & \( \frac{N^2}{4} \) \\ 
+	Bus		& $1$  & $1$ & $1$ & $B$ \\
+	Ring	& $2$  & $2$ & $\lfloor \frac{N}{2} \rfloor$ & $2B$ \\
+	Fully connected	& $N-1$ & $N-1$ & $1$ & $B \frac{N^2}{4}$ \\
 	Sw. Fat Tree	& 1 w/o redundancy & depends on design & $2 \times$ hierarchy height & depends on design \\
-	Mesh		& 2d & d   & \( \sum_{i=1}^d (N_i - 1) \) & \( B ( \prod_{i=1}^{d-1} N_i ) \) \\
-	Torus		& 2d & 2d  & \( \sum_{i=1}^d \lfloor \frac{N}{2} \rfloor  \) & \( 2B ( \prod_{i=1}^{d-1} N_i ) \) \\
-
-	Hypercube	& d  & d   & d & \( B2^{d-1} \) \\ 
+	Mesh	& $2d$ & $d$ & $\sum_{i=1}^d (N_i - 1)$ & $B ( \prod_{i=1}^{d-1} N_i )$ \\
+	Torus	& $2d$ & $2d$ & $\sum_{i=1}^d \lfloor \frac{N_i}{2} \rfloor$ & $2B ( \prod_{i=1}^{d-1} N_i )$ \\
+	Hypercube	& $d$ & $d$ & $d$ & $B2^{d-1}$ \\ 
 \end{tabular}
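+
+A short worked example (illustrative numbers): a hypercube with $d = 10$
+dimensions connects $N = 2^{10} = 1024$ nodes with a degree and diameter of
+only $10$ and a bisection bandwidth of $B \cdot 2^{9} = 512B$, i.e. $B \frac{N}{2}$.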
 
 \section{Balance, Lightspeed}