cs205-lecture-examples

Example codes used during Harvard CS205 lectures
git clone https://git.0xfab.ch/cs205-lecture-examples.git
Log | Files | Refs | README | LICENSE

commit 6c242dbac5429593c3e32d236372913eed5ea3d8
parent e151e67906899c205dde309dca6452b00bb78743
Author: Fabian Wermelinger <fab4100@posteo.net>
Date:   Thu, 15 Aug 2024 17:10:25 +0200

Add lecture codes

Diffstat:
Alecture02/compilation_phases/.gitignore | 4++++
Alecture02/compilation_phases/Makefile | 22++++++++++++++++++++++
Alecture02/compilation_phases/hello.c | 8++++++++
Alecture02/heap_and_stack/.gitignore | 2++
Alecture02/heap_and_stack/Makefile | 13+++++++++++++
Alecture02/heap_and_stack/heap_stack.c | 23+++++++++++++++++++++++
Alecture02/heap_and_stack/stackoverflow.c | 22++++++++++++++++++++++
Alecture03/multi_index_and_flat_index/.gitignore | 1+
Alecture03/multi_index_and_flat_index/Makefile | 8++++++++
Alecture03/multi_index_and_flat_index/multi_and_flat.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture04/counter/.gitignore | 1+
Alecture04/counter/Makefile | 8++++++++
Alecture04/counter/counter.cpp | 25+++++++++++++++++++++++++
Alecture05/address_sanitizer/openmp/.gitignore | 3+++
Alecture05/address_sanitizer/openmp/Makefile | 23+++++++++++++++++++++++
Alecture05/address_sanitizer/openmp/two_ompthreads.cpp | 28++++++++++++++++++++++++++++
Alecture05/address_sanitizer/pthreads/.gitignore | 3+++
Alecture05/address_sanitizer/pthreads/Makefile | 23+++++++++++++++++++++++
Alecture05/address_sanitizer/pthreads/two_pthreads.cpp | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture05/sequential_consistency/.gitignore | 2++
Alecture05/sequential_consistency/Makefile | 14++++++++++++++
Alecture05/sequential_consistency/README.md | 5+++++
Alecture05/sequential_consistency/scenario_1_3.cpp | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture05/sequential_consistency/scenario_2.cpp | 34++++++++++++++++++++++++++++++++++
Alecture06/omp_conditional/.gitignore | 2++
Alecture06/omp_conditional/Makefile | 14++++++++++++++
Alecture06/omp_conditional/omp_conditional.cpp | 12++++++++++++
Alecture06/omp_demo/.gitignore | 1+
Alecture06/omp_demo/Makefile | 9+++++++++
Alecture06/omp_demo/omp_demo.cpp | 19+++++++++++++++++++
Alecture07_08/omp_critical/.gitignore | 2++
Alecture07_08/omp_critical/Makefile | 14++++++++++++++
Alecture07_08/omp_critical/omp_critical.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/.gitignore | 5+++++
Alecture07_08/omp_data_attributes/Makefile | 29+++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_firstprivate.cpp | 24++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_lastprivate.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_private.cpp | 24++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_region.cpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_static.cpp | 23+++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_threadprivate.cpp | 33+++++++++++++++++++++++++++++++++
Alecture07_08/omp_default/.gitignore | 1+
Alecture07_08/omp_default/Makefile | 9+++++++++
Alecture07_08/omp_default/omp_default.cpp | 15+++++++++++++++
Alecture07_08/omp_device/.gitignore | 1+
Alecture07_08/omp_device/Makefile | 15+++++++++++++++
Alecture07_08/omp_device/omp_device.cpp | 19+++++++++++++++++++
Alecture07_08/omp_schedule/.gitignore | 1+
Alecture07_08/omp_schedule/Makefile | 9+++++++++
Alecture07_08/omp_schedule/omp_schedule.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_scoping/Makefile | 12++++++++++++
Alecture07_08/omp_scoping/main.c | 18++++++++++++++++++
Alecture07_08/omp_scoping/myfunc.c | 6++++++
Alecture07_08/omp_wtime/.gitignore | 1+
Alecture07_08/omp_wtime/Makefile | 9+++++++++
Alecture07_08/omp_wtime/omp_wtime.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/.gitignore | 4++++
Alecture09/thread_affinity/Makefile | 20++++++++++++++++++++
Alecture09/thread_affinity/affinity.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_close.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_master.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_spread.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/helper.h | 26++++++++++++++++++++++++++
Alecture09/thread_affinity/omp_places_cores.sh | 2++
Alecture09/thread_affinity/omp_places_sockets.sh | 2++
Alecture09/thread_affinity/omp_places_threads.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_close.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_false.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_master.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_spread.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_true.sh | 2++
Alecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf | 0
Alecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat | 8++++++++
Alecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat | 8++++++++
Alecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf | 0
Alecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat | 7+++++++
Alecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat | 7+++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Licence.txt | 202+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi | 13+++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/README.txt | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/arraybench.c | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/arraybench.h | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/common.c | 374+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/common.h | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/schedbench.c | 145+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/schedbench.h | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/syncbench.c | 253+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/syncbench.h | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/taskbench.c | 331+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/taskbench.h | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/plot_overhead.py | 35+++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/run_benchmark.sh | 21+++++++++++++++++++++
Alecture10/prefetch/Makefile | 13+++++++++++++
Alecture10/prefetch/loop_unroll_prefetch.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture11/hello_mpi/.gitignore | 1+
Alecture11/hello_mpi/Makefile | 9+++++++++
Alecture11/hello_mpi/hello_mpi.cpp | 17+++++++++++++++++
Alecture11/rank_size_proc/.gitignore | 1+
Alecture11/rank_size_proc/Makefile | 9+++++++++
Alecture11/rank_size_proc/rank_size_proc.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture11/thread_support/.gitignore | 1+
Alecture11/thread_support/Makefile | 9+++++++++
Alecture11/thread_support/thread_support.cpp | 27+++++++++++++++++++++++++++
Alecture12/endianness/.gitignore | 1+
Alecture12/endianness/Makefile | 9+++++++++
Alecture12/endianness/endianness.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_bsend/.gitignore | 1+
Alecture12/mpi_bsend/Makefile | 9+++++++++
Alecture12/mpi_bsend/mpi_bsend.cpp | 37+++++++++++++++++++++++++++++++++++++
Alecture12/mpi_gatherv_scatterv/.gitignore | 1+
Alecture12/mpi_gatherv_scatterv/Makefile | 9+++++++++
Alecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_get_count/.gitignore | 1+
Alecture12/mpi_get_count/Makefile | 9+++++++++
Alecture12/mpi_get_count/mpi_get_count.cpp | 33+++++++++++++++++++++++++++++++++
Alecture12/mpi_minloc/.gitignore | 1+
Alecture12/mpi_minloc/Makefile | 9+++++++++
Alecture12/mpi_minloc/mpi_minloc.cpp | 31+++++++++++++++++++++++++++++++
Alecture12/mpi_p2p/.gitignore | 1+
Alecture12/mpi_p2p/Makefile | 9+++++++++
Alecture12/mpi_p2p/mpi_p2p.cpp | 22++++++++++++++++++++++
Alecture12/mpi_pi/.gitignore | 3+++
Alecture12/mpi_pi/Makefile | 17+++++++++++++++++
Alecture12/mpi_pi/mpi_pi.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_pi/mpi_pi_reduction.cpp | 33+++++++++++++++++++++++++++++++++
Alecture12/mpi_pi/mpi_pi_reduction_inplace.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_rsend/.gitignore | 2++
Alecture12/mpi_rsend/Makefile | 20++++++++++++++++++++
Alecture12/mpi_rsend/mpi_irsend.cpp | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_rsend/mpi_rsend.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/.gitignore | 4++++
Alecture13/mpi_1D_diffusion/IO.h | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/Makefile | 22++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/blocking.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/nonblocking.cpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/plot.py | 19+++++++++++++++++++
Alecture13/mpi_cyclic_shift/.gitignore | 1+
Alecture13/mpi_cyclic_shift/Makefile | 14++++++++++++++
Alecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_cyclic_shift/mpi_sendrecv.cpp | 42++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_p2p_nonblocking/.gitignore | 1+
Alecture13/mpi_p2p_nonblocking/Makefile | 9+++++++++
Alecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp | 25+++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/.gitignore | 4++++
Alecture14/mpi_1D_diffusion_IO/IO.h | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/Makefile | 13+++++++++++++
Alecture14/mpi_1D_diffusion_IO/nonblocking.cpp | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/plot.py | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/.gitignore | 1+
Alecture14/parallel_hdf5/Makefile | 12++++++++++++
Alecture14/parallel_hdf5/data.h5 | 0
Alecture14/parallel_hdf5/data.xmf | 23+++++++++++++++++++++++
Alecture14/parallel_hdf5/hdf5IO.h | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/hdf5_in_python.py | 18++++++++++++++++++
Alecture14/parallel_hdf5/parallel_hdf5.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/submit.sh | 23+++++++++++++++++++++++
Alecture14/parallel_hdf5/xdmf_wrapper.h | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/asm/.gitignore | 1+
Alecture15/papi_examples_HW2/asm/Makefile | 12++++++++++++
Alecture15/papi_examples_HW2/asm/kernel.s | 19+++++++++++++++++++
Alecture15/papi_examples_HW2/asm/test.cpp | 61+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/.gitignore | 2++
Alecture15/papi_examples_HW2/daxpy/Makefile | 25+++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/daxpy.cpp | 106+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/dgemm/.gitignore | 2++
Alecture15/papi_examples_HW2/dgemm/Makefile | 29+++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/dgemm/dgemm.cpp | 139+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/dgemm/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/sgemv/.gitignore | 2++
Alecture15/papi_examples_HW2/sgemv/Makefile | 25+++++++++++++++++++++++++
Alecture15/papi_examples_HW2/sgemv/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/sgemv/sgemv.cpp | 104+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture16/allocator_test/.gitignore | 1+
Alecture16/allocator_test/Makefile | 9+++++++++
Alecture16/allocator_test/allocator_test.cpp | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture16/pointer_dereference/.gitignore | 1+
Alecture16/pointer_dereference/Makefile | 9+++++++++
Alecture16/pointer_dereference/pointer_dereference.cpp | 15+++++++++++++++
Alecture17/registers/.gitignore | 2++
Alecture17/registers/Makefile | 12++++++++++++
Alecture17/registers/registers.s | 11+++++++++++
Alecture18_19/aliasing/.gitignore | 1+
Alecture18_19/aliasing/Makefile | 12++++++++++++
Alecture18_19/aliasing/aliasing.c | 13+++++++++++++
Alecture18_19/aliasing/main.c | 15+++++++++++++++
Alecture18_19/align/.gitignore | 1+
Alecture18_19/align/Makefile | 7+++++++
Alecture18_19/align/align.c | 30++++++++++++++++++++++++++++++
Alecture18_19/autovec/Makefile | 24++++++++++++++++++++++++
Alecture18_19/autovec/f.c | 20++++++++++++++++++++
Alecture18_19/intrinsics/_mm_set_ps1/.gitignore | 3+++
Alecture18_19/intrinsics/_mm_set_ps1/Makefile | 15+++++++++++++++
Alecture18_19/intrinsics/_mm_set_ps1/main.c | 9+++++++++
Alecture18_19/intrinsics/_mm_set_ps1/ones.c | 6++++++
Alecture18_19/intrinsics/constants/set1_ps.c | 3+++
Alecture18_19/intrinsics/constants/set_ps.c | 5+++++
Alecture18_19/intrinsics/constants/set_ss.c | 3+++
Alecture18_19/intrinsics/constants/setr_ps.c | 5+++++
Alecture18_19/intrinsics/constants/setzero.c | 2++
Alecture18_19/intrinsics/load_store/aligned_load_store.c | 7+++++++
Alecture18_19/intrinsics/load_store/reverse_load_store.c | 2++
Alecture18_19/intrinsics/load_store/unaligned_load_store.c | 7+++++++
Alecture18_19/intrinsics/transpose/transpose.c | 15+++++++++++++++
Alecture18_19/saxpy/.gitignore | 2++
Alecture18_19/saxpy/Makefile | 17+++++++++++++++++
Alecture18_19/saxpy/main.c | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy.c | 7+++++++
Alecture18_19/saxpy/saxpy.s | 26++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy_SSE.c | 12++++++++++++
Alecture18_19/saxpy/saxpy_SSE.s | 30++++++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy_SSE_FMA.c | 11+++++++++++
Alecture18_19/sgemv/.gitignore | 1+
Alecture18_19/sgemv/Makefile | 9+++++++++
Alecture18_19/sgemv/sgemv.cpp | 22++++++++++++++++++++++
Alecture18_19/sgemv/sgemv.s | 73+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture20/ispc_data_layout/.gitignore | 2++
Alecture20/ispc_data_layout/Makefile | 17+++++++++++++++++
Alecture20/ispc_data_layout/kernels.ispc | 40++++++++++++++++++++++++++++++++++++++++
Alecture20/ispc_data_layout/main.c | 11+++++++++++
Alecture20/ispc_foreach/.gitignore | 2++
Alecture20/ispc_foreach/Makefile | 17+++++++++++++++++
Alecture20/ispc_foreach/kernels.ispc | 17+++++++++++++++++
Alecture20/ispc_foreach/main.c | 10++++++++++
Alecture20/ispc_pointers_arrays/.gitignore | 2++
Alecture20/ispc_pointers_arrays/Makefile | 17+++++++++++++++++
Alecture20/ispc_pointers_arrays/f.ispc | 35+++++++++++++++++++++++++++++++++++
Alecture20/ispc_pointers_arrays/main.c | 8++++++++
Alecture20/maximal_convergence/.gitignore | 2++
Alecture20/maximal_convergence/Makefile | 17+++++++++++++++++
Alecture20/maximal_convergence/f.ispc | 14++++++++++++++
Alecture20/maximal_convergence/main.cpp | 8++++++++
Alecture20/sse_vector/.gitignore | 1+
Alecture20/sse_vector/Makefile | 7+++++++
Alecture20/sse_vector/sse_vector.c | 8++++++++
Alecture21/cuda_vec_add/.gitignore | 2++
Alecture21/cuda_vec_add/Makefile | 7+++++++
Alecture21/cuda_vec_add/add_kernel_grid.cu | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
245 files changed, 6296 insertions(+), 0 deletions(-)

diff --git a/lecture02/compilation_phases/.gitignore b/lecture02/compilation_phases/.gitignore @@ -0,0 +1,4 @@ +hello +hello.i +hello.o +hello.s diff --git a/lecture02/compilation_phases/Makefile b/lecture02/compilation_phases/Makefile @@ -0,0 +1,22 @@ +CC = gcc +.PHONY: all clean + +all: hello hello.o hello.i hello.s dump + +hello: hello.c + $(CC) -o $@ $< + +hello.o: hello.c + $(CC) -c -o $@ $< + +hello.i: hello.c + $(CC) -E -o $@ $< + +hello.s: hello.c + $(CC) -S -o $@ $< + +dump: hello.o + objdump -d $< + +clean: + rm -f hello hello.o hello.i hello.s diff --git a/lecture02/compilation_phases/hello.c b/lecture02/compilation_phases/hello.c @@ -0,0 +1,8 @@ +#include <stdio.h> + +int main(void) +{ + // Comments are removed by the pre-processor + printf("hello, world!\n"); + return 0; +} diff --git a/lecture02/heap_and_stack/.gitignore b/lecture02/heap_and_stack/.gitignore @@ -0,0 +1,2 @@ +heap_stack +stackoverflow diff --git a/lecture02/heap_and_stack/Makefile b/lecture02/heap_and_stack/Makefile @@ -0,0 +1,13 @@ +CC = gcc +.PHONY: all clean + +all: heap_stack stackoverflow + +heap_stack: heap_stack.c + $(CC) -O2 -o $@ $< + +stackoverflow: stackoverflow.c + $(CC) -O2 -o $@ $< + +clean: + rm -f heap_stack stackoverflow diff --git a/lecture02/heap_and_stack/heap_stack.c b/lecture02/heap_and_stack/heap_stack.c @@ -0,0 +1,23 @@ +#include <stdio.h> +#include <stdlib.h> + +int main(void) +{ + // Allocated on stack when main function runs. This is an example of + // automatic memory allocation managed by the compiler using the stack. + char on_stack[1 << 10]; // allocates 1024 bytes on the stack + + // Allocated on the heap at runtime. 
This is an example of dynamic memory + // allocation managed by the operating system at runtime (this memory + // request involves the OS kernel) + char *pointer = (char *)malloc(1 << 10); // allocates 1024 bytes on the heap + + // Compare the memory addresses of the two allocations + // clang-format off + printf("Address of variable `on_stack` (on stack): %p\n", (void *)&on_stack); + printf("Address of variable `pointer` (on stack): %p\n", (void *)&pointer); + printf("Address where `pointer` points to (on heap): %p\n", (void *)pointer); + // clang-format on + + return 0; +} diff --git a/lecture02/heap_and_stack/stackoverflow.c b/lecture02/heap_and_stack/stackoverflow.c @@ -0,0 +1,22 @@ +#include <stdio.h> + +int main(void) +{ + // On Linux system the size of the stack can be checked (and configured) + // with + // ulimit -s + // On my system the stack size is 8192kB (8MB). If your program allocates + // more than 8MB on the stack your application will fail with a + // segmentation fault. The stack is used for automatic memory allocation + // when functions execute and should not be used for simulation data for + // example. Such large data must always be allocated on the heap. If you + // have a very deep recursion in your code that calls many function which + // all in total require more than 8MB (on my system) you will also run into + // a stack overflow. 
+ + // The following code is buggy and will crash due to stack overflow + char large_memory[8 * (1 << 20)]; // 8MB of memory on stack + printf("Memory access will fail: %d\n", large_memory[0]); // segfault + + return 0; +} diff --git a/lecture03/multi_index_and_flat_index/.gitignore b/lecture03/multi_index_and_flat_index/.gitignore @@ -0,0 +1 @@ +main diff --git a/lecture03/multi_index_and_flat_index/Makefile b/lecture03/multi_index_and_flat_index/Makefile @@ -0,0 +1,8 @@ +CXX = g++ +.PHONY: clean + +main: multi_and_flat.cpp + $(CXX) -O1 -o $@ $< + +clean: + rm -f main diff --git a/lecture03/multi_index_and_flat_index/multi_and_flat.cpp b/lecture03/multi_index_and_flat_index/multi_and_flat.cpp @@ -0,0 +1,45 @@ +#include <iostream> + +#define N 2 +#define M 4 + +void initialize(int *p) +{ + for (int i = 0; i < N * M; ++i) { + p[i] = i; // 0, 1, 2, 3, ... + } +} + +int main(void) +{ + int A[N][M]; + int *p = &A[0][0]; // pointer to first element in A + initialize(p); + + // multi-dimensional indexing + std::cout << "Multi-dimensional indexing:\n"; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << '\t' << A[i][j]; + } + std::cout << '\n'; + } + + // flat indexing + std::cout << "\nFlat indexing:\n"; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << '\t' << p[i * M + j]; // j is fastest moving index! 
+ } + std::cout << '\n'; + } + + // representation in memory + std::cout << "\nRow-major layout in memory:\n"; + for (int i = 0; i < N * M; ++i) { + std::cout << '\t' << p[i]; + } + std::cout << '\n'; + + return 0; +} diff --git a/lecture04/counter/.gitignore b/lecture04/counter/.gitignore @@ -0,0 +1 @@ +counter diff --git a/lecture04/counter/Makefile b/lecture04/counter/Makefile @@ -0,0 +1,8 @@ +CXX = g++ +.PHONY: clean + +counter: counter.cpp + $(CXX) -fopenmp -o $@ $< + +clean: + rm -f counter diff --git a/lecture04/counter/counter.cpp b/lecture04/counter/counter.cpp @@ -0,0 +1,25 @@ +#include <iostream> + +// DISCLAIMER: this code contains a race condition +class Counter +{ +public: + Counter() : count_(0) {} + + void increment() { ++count_; } + unsigned int get_count() const { return count_; } + +private: + unsigned int count_; +}; + +int main(void) +{ + Counter counter; +#pragma omp parallel for + for (int i = 0; i < 100; ++i) { + counter.increment(); + } + std::cout << counter.get_count() << '\n'; + return 0; +} diff --git a/lecture05/address_sanitizer/openmp/.gitignore b/lecture05/address_sanitizer/openmp/.gitignore @@ -0,0 +1,3 @@ +asm +main +sanitized diff --git a/lecture05/address_sanitizer/openmp/Makefile b/lecture05/address_sanitizer/openmp/Makefile @@ -0,0 +1,23 @@ +CXX = g++ +CXXFLAGS = -g -O0 -fopenmp +.PHONY: clean + +# Since GCC 4.8 (LLVM clang also supports this) you can use the +# -fsanitize=thread option to check if there are race conditions detected. But +# be aware the tool is not perfect but can be helpful when hunting bugs! 
See +# this link for more info: +# https://clang.llvm.org/docs/ThreadSanitizer.html + +all: main sanitized + +main: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +sanitized: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -fsanitize=thread -o $@ $< + +asm: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f main sanitized asm diff --git a/lecture05/address_sanitizer/openmp/two_ompthreads.cpp b/lecture05/address_sanitizer/openmp/two_ompthreads.cpp @@ -0,0 +1,28 @@ +#include <iostream> +#include <omp.h> + +// DISCLAIMER: this code contains a race condition. Beware of false positives +// from the thread sanitizer due to the OpenMP runtime, which TSan +// has a hard time to understand. See the pthreads version for the +// proof. +int main(void) +{ + // shared variables + int A = 0; + int B = 0; + +#pragma omp parallel num_threads(2) // use 2 threads + { + const int tid = omp_get_thread_num(); // my ID + if (0 == tid) { + A = 1; + const int x = B; // thread local variable + std::cout << "Thread 0: x = " << x << std::endl; + } else if (1 == tid) { + B = 1; + const int y = A; // thread local variable + std::cout << "Thread 1: y = " << y << std::endl; + } + } + return 0; +} diff --git a/lecture05/address_sanitizer/pthreads/.gitignore b/lecture05/address_sanitizer/pthreads/.gitignore @@ -0,0 +1,3 @@ +asm +main +sanitized diff --git a/lecture05/address_sanitizer/pthreads/Makefile b/lecture05/address_sanitizer/pthreads/Makefile @@ -0,0 +1,23 @@ +CXX = g++ +CXXFLAGS = -g -O0 -pthread +.PHONY: clean + +# Since GCC 4.8 (LLVM clang also supports this) you can use the +# -fsanitize=thread option to check if there are race conditions detected. But +# be aware the tool is not perfect but can be helpful when hunting bugs! 
See +# this link for more info: +# https://clang.llvm.org/docs/ThreadSanitizer.html + +all: main sanitized + +main: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +sanitized: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -fsanitize=thread -o $@ $< + +asm: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f main sanitized asm diff --git a/lecture05/address_sanitizer/pthreads/two_pthreads.cpp b/lecture05/address_sanitizer/pthreads/two_pthreads.cpp @@ -0,0 +1,63 @@ +#include <cassert> +#include <iostream> +#include <pthread.h> + +// DISCLAIMER: this code contains a race condition. + +// global barrier +pthread_barrier_t barrier; + +// shared variables +int A = 0; +int B = 0; + +// work done by the threads +void *worker(void *t) +{ + long tid = (long)t; + + if (0 == tid) { + A = 1; + // pthread_barrier_wait(&barrier); + const int x = B; // thread local variable + std::cout << "Thread 0: x = " << x << std::endl; + } else if (1 == tid) { + B = 1; + // pthread_barrier_wait(&barrier); + const int y = A; // thread local variable + std::cout << "Thread 1: y = " << y << std::endl; + } + + return nullptr; +} + +int main(void) +{ + // we use two threads + constexpr int nthreads = 2; + + // allocate thread array + pthread_t threads[nthreads]; + + // initialize barrier + int s = pthread_barrier_init(&barrier, nullptr, nthreads); + assert(s == 0); + + // fork + for (long i = 0; i < nthreads; ++i) { + s = pthread_create(&threads[i], nullptr, worker, (void *)i); + assert(s == 0); + } + + // join + for (int i = 0; i < nthreads; ++i) { + s = pthread_join(threads[i], nullptr); + assert(s == 0); + } + + // clean up + s = pthread_barrier_destroy(&barrier); + assert(s == 0); + + return 0; +} diff --git a/lecture05/sequential_consistency/.gitignore b/lecture05/sequential_consistency/.gitignore @@ -0,0 +1,2 @@ +scenario_1_3 +scenario_2 diff --git a/lecture05/sequential_consistency/Makefile b/lecture05/sequential_consistency/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -g -O1 
-fopenmp +.PHONY: clean + +all: scenario_1_3 scenario_2 + +scenario_1_3: scenario_1_3.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +scenario_2: scenario_2.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f scenario_1_3 scenario_2 diff --git a/lecture05/sequential_consistency/README.md b/lecture05/sequential_consistency/README.md @@ -0,0 +1,5 @@ +The code in this directory accommodates slide 12 of lecture 5. + +All examples assume two threads are running. The code (partial orders) the +threads execute involves read and write access to shared variables `A` and `B`. +The variables `x` and `y` are thread private. diff --git a/lecture05/sequential_consistency/scenario_1_3.cpp b/lecture05/sequential_consistency/scenario_1_3.cpp @@ -0,0 +1,51 @@ +#include <iostream> +#include <omp.h> +#include <random> +#include <unistd.h> + +// shared variables +int A = 0; +int B = 0; + +void task1() +{ + A = 1; + const int x = B; + std::cout << "Thread 0: x = " << x << std::endl; +} + +void task2() +{ + B = 1; + const int y = A; + std::cout << "Thread 1: y = " << y << std::endl; +} + +int main(void) +{ +#pragma omp parallel num_threads(2) // use 2 threads + { + // generate some randomness to simulate load imbalance + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution<int> uniform(10, 50); + usleep(uniform(rng)); + + const int tid = omp_get_thread_num(); // my ID + + // the following pragma necessarily serializes the code -> sequential + // consistency (synchronization primitive) +#pragma omp critical + { + // This code must be in a critical section because task1 and task2 + // read and write from/to shared memory. 
Depending on which thread + // acquires the lock first, you will either get scenario 1 or 3: + if (0 == tid) { + task1(); + } else { + task2(); + } + } + } + return 0; +} diff --git a/lecture05/sequential_consistency/scenario_2.cpp b/lecture05/sequential_consistency/scenario_2.cpp @@ -0,0 +1,34 @@ +#include <iostream> +#include <omp.h> + +// shared variables +int A = 0; +int B = 0; + +int main(void) +{ +#pragma omp parallel num_threads(2) // use 2 threads + { + const int tid = omp_get_thread_num(); // my ID + if (0 == tid) { + A = 1; + // the following pragma necessarily serializes the code -> + // sequential consistency (synchronization primitive) +#pragma omp barrier + const int x = B; + +#pragma omp critical // avoid garbled output (only for cosmetics) + std::cout << "Thread 0: x = " << x << std::endl; + } else { + B = 1; + // the following pragma necessarily serializes the code -> + // sequential consistency (synchronization primitive) +#pragma omp barrier + const int y = A; + +#pragma omp critical // avoid garbled output (only for cosmetics) + std::cout << "Thread 1: y = " << y << std::endl; + } + } + return 0; +} diff --git a/lecture06/omp_conditional/.gitignore b/lecture06/omp_conditional/.gitignore @@ -0,0 +1,2 @@ +with_omp +without_omp diff --git a/lecture06/omp_conditional/Makefile b/lecture06/omp_conditional/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -O1 +.PHONY: clean + +all: with_omp without_omp + +with_omp: omp_conditional.cpp + $(CXX) $(CXXFLAGS) -fopenmp -o $@ $< + +without_omp: omp_conditional.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f with_omp without_omp diff --git a/lecture06/omp_conditional/omp_conditional.cpp b/lecture06/omp_conditional/omp_conditional.cpp @@ -0,0 +1,12 @@ +#include <iostream> + +int main(void) +{ +#ifdef _OPENMP + std::cout << "Compiled with -fopenmp (OpenMP Standard: " << _OPENMP + << ")\n"; +#else + std::cout << "Compiled without -fopenmp\n"; +#endif /* _OPENMP */ + return 0; +} diff --git 
a/lecture06/omp_demo/.gitignore b/lecture06/omp_demo/.gitignore @@ -0,0 +1 @@ +demo diff --git a/lecture06/omp_demo/Makefile b/lecture06/omp_demo/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 +.PHONY: clean + +demo: omp_demo.cpp + $(CXX) $(CXXFLAGS) -fopenmp -o $@ $< + +clean: + rm -f demo diff --git a/lecture06/omp_demo/omp_demo.cpp b/lecture06/omp_demo/omp_demo.cpp @@ -0,0 +1,19 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + for (int i = 1; i <= 4; ++i) + { + std::cout << "Loop counter = " << i << ":\n"; +#pragma omp parallel num_threads(i) if (parallel: i > 2) + { + const int tid = omp_get_thread_num(); // get my ID + // the next line is a critical section! (std::cout is not + // thread-safe) +#pragma omp critical + std::cout << "\tHello from thread " << tid << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_critical/.gitignore b/lecture07_08/omp_critical/.gitignore @@ -0,0 +1,2 @@ +omp_critical +omp_critical_two diff --git a/lecture07_08/omp_critical/Makefile b/lecture07_08/omp_critical/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: omp_critical omp_critical_two + +omp_critical: omp_critical.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_critical_two: omp_critical.cpp + $(CXX) $(CXXFLAGS) -DTWO_CRITICAL -o $@ $< + +clean: + rm -f omp_critical omp_critical_two diff --git a/lecture07_08/omp_critical/omp_critical.cpp b/lecture07_08/omp_critical/omp_critical.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <omp.h> +#include <unistd.h> + +int main(void) +{ + int a = 0; + int b = 0; + // benchmark execution time of parallel region + const double t0 = omp_get_wtime(); +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#ifdef TWO_CRITICAL +#pragma omp critical(critical_a) +#else +#pragma omp critical +#endif /* TWO_CRITICAL */ + { + a += tid; + usleep(50000); + } +#ifdef TWO_CRITICAL +#pragma omp critical(critical_b) +#else +#pragma omp critical +#endif /* TWO_CRITICAL */ + { + // if 
the operation would involve a read or write from/to `a`, then + // this would be a race condition + b += tid; + usleep(50000); + } + } + const double dt = omp_get_wtime() - t0; + std::cout << "Elapsed time: " << dt << " seconds\n"; + return a + b; +} diff --git a/lecture07_08/omp_data_attributes/.gitignore b/lecture07_08/omp_data_attributes/.gitignore @@ -0,0 +1,5 @@ +omp_private +omp_region +omp_region.s +omp_static +omp_threadprivate diff --git a/lecture07_08/omp_data_attributes/Makefile b/lecture07_08/omp_data_attributes/Makefile @@ -0,0 +1,29 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: omp_private omp_firstprivate omp_lastprivate omp_static omp_threadprivate omp_region omp_region.s + +omp_private: omp_private.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_firstprivate: omp_firstprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_lastprivate: omp_lastprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_static: omp_static.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_threadprivate: omp_threadprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_region: omp_region.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_region.s: omp_region.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f omp_private omp_firstprivate omp_lastprivate omp_static omp_threadprivate omp_region omp_region.s diff --git a/lecture07_08/omp_data_attributes/omp_firstprivate.cpp b/lecture07_08/omp_data_attributes/omp_firstprivate.cpp @@ -0,0 +1,24 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + int shared = -1; // shared variable + +#pragma omp parallel firstprivate(shared) // create private copy (initialized!) + { + const int tid = omp_get_thread_num(); +#pragma omp single + { + shared = tid; + } // no need to wait -> `shared` is private -> no race condition below! 
+ +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\tvalue of shared (private copy) = " << shared + << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_lastprivate.cpp b/lecture07_08/omp_data_attributes/omp_lastprivate.cpp @@ -0,0 +1,41 @@ +#include <iostream> + +int main(void) +{ + int a = -1; +#pragma omp parallel + { +#pragma omp for lastprivate(a) // private copy of a (uninitialized!) + for (int i = 0; i < 100; ++i) { + a = i; // no race condition -> `a` is private copy in loop body + } +#pragma omp single + { + std::cout << "After consecutive iterations: a = " << a << std::endl; + a = -1; + } // implicit barrier required here -> otherwise race condition on `a` + // below! + +#pragma omp for lastprivate(a) // private copy of a (uninitialized!) + for (int i = 0; i < 100; ++i) { + // An optimizing compiler will remove this loop completely and let + // one thread write to the shared variable a = 66 because the given + // lastprivate(a) OpenMP clause will still apply even if the + // optimizer phase removes the loop (see for yourself what happens + // if you change it to private(a)). + // + // If the loop was not optimized away, it would be executed in + // parallel and the last element 99 in the iteration sequence would + // apply to the lastprivate(a) update. Because that thread will + // have never written to its private copy, it would result in + // undefined behavior because lastprivate(a) like private(a) do not + // initialize the private copies! It would be a bug. + if (66 == i) { + a = i; + } + } +#pragma omp single + std::cout << "After non-consecutive iterations: a = " << a << std::endl; + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_private.cpp b/lecture07_08/omp_data_attributes/omp_private.cpp @@ -0,0 +1,24 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + int shared = -1; // shared variable + +#pragma omp parallel private(shared) // create private copy (uninitialized!) 
+ { + const int tid = omp_get_thread_num(); +#pragma omp single nowait + { + shared = tid; + } // no need to wait -> `shared` is private -> no race condition below! + +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\tvalue of shared (private copy) = " << shared + << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_region.cpp b/lecture07_08/omp_data_attributes/omp_region.cpp @@ -0,0 +1,54 @@ +// vim: foldmethod=marker +#include <iostream> +#include <omp.h> + +void declare_static(void) +{ + const int tid = omp_get_thread_num(); // reference + + // declared_in_region is shared among threads who call this function. + // NOTE: the assignment here is not a race condition -> the compiler + // initializes this variable at compile time, not the threads at runtime! + static int declared_in_region = 1234567890; + // + // PROOF: see assembly code + // all threads print the initial value of declared_in_region {{{1 +#pragma omp single // write header line (only one needed) + { + std::cout << "Initial value of declared_in_region:\n"; + } // implicit barrier here +#pragma omp critical // all threads must write this + { + std::cout << "\ttid " << tid << ":\t" << declared_in_region << '\n'; + } // NO implicit barrier here! + // we must synchronize here, otherwise a thread might continue while others + // still need to print the std::cout line above. Test it by commenting out + // the barrier. +#pragma omp barrier + // 1}}} + + // DISCLAIMER: the next commented line would be a race condition! 
+ // declared_in_region = tid; + // + // PROOF: declared_in_region is shared +#pragma omp single + { + declared_in_region = tid; + } // others wait here +#pragma omp single nowait + { + std::cout << "I am tid " << tid + << " and `declared_in_region` has been written to by tid " + << declared_in_region << '\n'; + } +} + +int main(void) +{ +#pragma omp parallel + { + declare_static(); // all threads call this function + } + + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_static.cpp b/lecture07_08/omp_data_attributes/omp_static.cpp @@ -0,0 +1,23 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { + static int static_tid; // static storage duration -> shared + + const int tid = omp_get_thread_num(); +#pragma omp single + { + static_tid = tid; + } // implicit barrier required here -> no race condition below! + +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\treading static_tid = " << static_tid << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_threadprivate.cpp b/lecture07_08/omp_data_attributes/omp_threadprivate.cpp @@ -0,0 +1,33 @@ +#include <iostream> +#include <omp.h> + +// static (global) in compilation unit (shared by default) +static int global = -1; + +// predetermine `global` as thread private (has to be done before first use). +// Without this pragma global will be shared +#pragma omp threadprivate(global) + +void access_global(void) +{ + const int tid = omp_get_thread_num(); // reference + + // DISCLAIMER: this may be a race condition, depending on threadprivate + // above! 
+ global = tid; + +#pragma omp critical // all threads must write this + { + std::cout << "\ttid " << tid << ":\t" << global << '\n'; + } +} + +int main(void) +{ +#pragma omp parallel + { + access_global(); + } + + return 0; +} diff --git a/lecture07_08/omp_default/.gitignore b/lecture07_08/omp_default/.gitignore @@ -0,0 +1 @@ +omp_default diff --git a/lecture07_08/omp_default/Makefile b/lecture07_08/omp_default/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_default: omp_default.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_default diff --git a/lecture07_08/omp_default/omp_default.cpp b/lecture07_08/omp_default/omp_default.cpp @@ -0,0 +1,15 @@ +#include <omp.h> + +int main(void) +{ + int var; +#pragma omp parallel default(none) // shared(var) + { +#pragma omp critical + { + // var was not explicitly specified -> will not compile + var = omp_get_thread_num(); + } + } + return 0; +} diff --git a/lecture07_08/omp_device/.gitignore b/lecture07_08/omp_device/.gitignore @@ -0,0 +1 @@ +omp_device diff --git a/lecture07_08/omp_device/Makefile b/lecture07_08/omp_device/Makefile @@ -0,0 +1,15 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean run_dynamic run_static + +omp_device: omp_device.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +run_dynamic: omp_device + OMP_NUM_THREADS=8 OMP_DYNAMIC=true ./omp_device + +run_static: omp_device + OMP_NUM_THREADS=8 OMP_DYNAMIC=false ./omp_device + +clean: + rm -f omp_device diff --git a/lecture07_08/omp_device/omp_device.cpp b/lecture07_08/omp_device/omp_device.cpp @@ -0,0 +1,19 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { +#pragma omp single + { + const int nprocs = omp_get_num_procs(); + const int nthreads = omp_get_num_threads(); + const int maxthreads = omp_get_max_threads(); + std::cout << "Number of processors: " << nprocs << '\n'; + std::cout << "Number of threads in team: " << nthreads << '\n'; + std::cout << "Maximum number of threads: " << 
maxthreads << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_schedule/.gitignore b/lecture07_08/omp_schedule/.gitignore @@ -0,0 +1 @@ +omp_schedule diff --git a/lecture07_08/omp_schedule/Makefile b/lecture07_08/omp_schedule/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_schedule: omp_schedule.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_schedule diff --git a/lecture07_08/omp_schedule/omp_schedule.cpp b/lecture07_08/omp_schedule/omp_schedule.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + + omp_sched_t sched; + int chunk; + +#pragma omp single + { + // check the default before we change: + omp_get_schedule(&sched, &chunk); + std::cout << "Default: sched=" << sched << "; chunk=" << chunk + << " (tid=" << tid << ")\n"; + + // DISCLAIMER: this will only set the schedule for the calling + // thread! + // If the second arg is < 1: OpenMP uses the default chunk_size for + // the chosen scheduling policy, for omp_sched_auto the second arg + // is irrelevant: + omp_set_schedule(omp_sched_guided, 8); + } + + // see what you get with all threads + omp_get_schedule(&sched, &chunk); + +#pragma omp critical + { + std::cout << "Thread " << tid << ":\tsched=" << sched + << "; chunk=" << chunk << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_scoping/Makefile b/lecture07_08/omp_scoping/Makefile @@ -0,0 +1,12 @@ +CXX = gcc +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +main: main.c myfunc.o + $(CXX) $(CXXFLAGS) -o $@ $^ + +myfunc.o: myfunc.c + $(CXX) $(CXXFLAGS) -c -o $@ $< + +clean: + rm -f main myfunc.o diff --git a/lecture07_08/omp_scoping/main.c b/lecture07_08/omp_scoping/main.c @@ -0,0 +1,18 @@ +#include <omp.h> + +void myfunc(int A[], const int N, const int tid); + +int main(void) +{ + const int N = 1000; + int A[N]; + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + myfunc(A, N, tid); + } + 
// The #pragma omp for directive has no effect in dynamic extent + myfunc(A, N, 0); + return 0; +} diff --git a/lecture07_08/omp_scoping/myfunc.c b/lecture07_08/omp_scoping/myfunc.c @@ -0,0 +1,6 @@ +void myfunc(int A[], const int N, const int tid) +{ +#pragma omp for + for (int i = 0; i < N; ++i) + A[i] = tid; +} diff --git a/lecture07_08/omp_wtime/.gitignore b/lecture07_08/omp_wtime/.gitignore @@ -0,0 +1 @@ +omp_wtime diff --git a/lecture07_08/omp_wtime/Makefile b/lecture07_08/omp_wtime/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_wtime: omp_wtime.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_wtime diff --git a/lecture07_08/omp_wtime/omp_wtime.cpp b/lecture07_08/omp_wtime/omp_wtime.cpp @@ -0,0 +1,35 @@ +#include <iostream> +#include <omp.h> +#include <random> +#include <unistd.h> + +int main(void) +{ +#pragma omp parallel + { +#pragma omp single + { + const double resolution = omp_get_wtick(); + std::cout << "Clock resolution: " << std::scientific << resolution + << " sec\n"; + } + + const int tid = omp_get_thread_num(); + // the random number generator must be thread private -> race condition + // otherwise! 
+ std::mt19937 gen(tid); + std::uniform_int_distribution uniform(1, 500); + + const int mytime = uniform(gen) * 1000; // micro seconds + const double t0 = omp_get_wtime(); + usleep(mytime); + const double dt = omp_get_wtime() - t0; + +#pragma omp critical + { + std::cout << "Thread " << tid << ":\t" << std::scientific << dt + << " sec elapsed [expected: " << mytime * 1.0e-6 << "]\n"; + } + } + return 0; +} diff --git a/lecture09/thread_affinity/.gitignore b/lecture09/thread_affinity/.gitignore @@ -0,0 +1,4 @@ +affinity +affinity_close +affinity_spread +affinity_master diff --git a/lecture09/thread_affinity/Makefile b/lecture09/thread_affinity/Makefile @@ -0,0 +1,20 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: affinity affinity_close affinity_spread affinity_master + +affinity: affinity.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_close: affinity_close.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_spread: affinity_spread.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_master: affinity_master.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f affinity affinity_close affinity_spread affinity_master diff --git a/lecture09/thread_affinity/affinity.cpp b/lecture09/thread_affinity/affinity.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} 
diff --git a/lecture09/thread_affinity/affinity_close.cpp b/lecture09/thread_affinity/affinity_close.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(close) + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/affinity_master.cpp b/lecture09/thread_affinity/affinity_master.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(master) // use `primary` in OpenMP v5.1 + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/affinity_spread.cpp b/lecture09/thread_affinity/affinity_spread.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> 
+#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(spread) + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/helper.h b/lecture09/thread_affinity/helper.h @@ -0,0 +1,26 @@ +#ifndef HELPER_H_T9QL1OGR +#define HELPER_H_T9QL1OGR + +#include <string> +#include <unistd.h> + +// return cpu and numa node id +// https://stackoverflow.com/questions/16862620/numa-get-current-node-core +unsigned long tacc_rdtscp(int *core, int *node) +{ + unsigned long a,d,c; + __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); + *node= (c & 0xFFF000)>>12; + *core = c & 0xFFF; + return ((unsigned long)a) | (((unsigned long)d) << 32);; +} + +// get hostname of node +std::string gethostname(void) +{ + std::string hostname(1024, '\0'); + gethostname(&hostname.front(), hostname.size()); + return hostname; +} + +#endif /* HELPER_H_T9QL1OGR */ diff --git a/lecture09/thread_affinity/omp_places_cores.sh b/lecture09/thread_affinity/omp_places_cores.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='cores' ./affinity diff --git a/lecture09/thread_affinity/omp_places_sockets.sh b/lecture09/thread_affinity/omp_places_sockets.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='sockets' ./affinity diff --git a/lecture09/thread_affinity/omp_places_threads.sh b/lecture09/thread_affinity/omp_places_threads.sh 
@@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='threads' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_close.sh b/lecture09/thread_affinity/omp_proc_bind_close.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='close' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_false.sh b/lecture09/thread_affinity/omp_proc_bind_false.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='false' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_master.sh b/lecture09/thread_affinity/omp_proc_bind_master.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='master' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_spread.sh b/lecture09/thread_affinity/omp_proc_bind_spread.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='spread' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_true.sh b/lecture09/thread_affinity/omp_proc_bind_true.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='true' ./affinity diff --git a/lecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf b/lecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf Binary files differ. 
diff --git a/lecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat b/lecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat @@ -0,0 +1,8 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 1.402517e+00 2.040600e-02 9.638550e-01 1.060350e-01 1.407802e+00 3.316600e-02 9.672210e-01 6.728000e-03 9.601640e-01 6.021000e-03 1.316700e-02 2.729000e-03 2.073300e-02 1.648000e-03 8.487000e-03 1.494000e-03 2.950700e-02 2.240000e-04 1.467251e+00 3.183000e-02 +2 3.377395e+00 1.364400e-01 1.180594e+00 1.220300e-02 3.449368e+00 2.590500e-02 1.229767e+00 9.989000e-03 1.174797e+00 1.923300e-02 1.595980e-01 2.828300e-02 1.604950e-01 2.559000e-03 1.190669e+00 7.037000e-03 2.340520e-01 4.240000e-03 3.332934e+00 3.320900e-02 +4 3.472150e+00 1.092780e-01 1.297534e+00 1.140100e-01 3.849536e+00 3.932600e-02 1.285642e+00 2.517300e-02 1.520340e+00 1.487600e-02 5.914140e-01 1.249400e-02 5.639350e-01 8.597000e-03 9.850230e-01 5.318000e-03 5.995950e-01 3.523400e-02 3.844296e+00 1.692840e-01 +8 5.642615e+00 4.424500e-02 2.428380e+00 1.504480e-01 5.853155e+00 8.348600e-02 2.516674e+00 2.587440e-01 2.774199e+00 1.654420e-01 1.363838e+00 1.112150e-01 1.480740e+00 8.594900e-02 9.917380e-01 4.070400e-02 1.194033e+00 1.714040e-01 6.632784e+00 9.369850e-01 +16 9.253381e+00 6.832900e-02 4.036582e+00 1.559480e-01 9.907635e+00 8.686000e-02 4.085306e+00 2.851050e-01 6.297392e+00 6.432800e-02 5.481086e+00 3.757630e-01 5.427509e+00 5.264830e-01 1.048119e+00 3.806800e-02 2.159469e+00 3.160940e-01 1.361704e+01 9.204500e-02 +32 1.907739e+01 8.178810e-01 9.850260e+00 2.483310e-01 1.961147e+01 5.749560e-01 9.472037e+00 1.867770e-01 1.260011e+01 3.464590e-01 6.311141e+00 6.595200e-01 6.336537e+00 5.468470e-01 1.075643e+00 5.919800e-02 1.972749e+00 2.196170e-01 1.753772e+01 1.873420e-01 +64 3.966405e+01 1.517342e+00 2.037396e+01 3.263870e-01 3.982416e+01 2.570315e+00 2.378785e+01 3.674876e+00 3.555074e+01 7.258960e-01 1.290547e+01 9.898270e-01 1.279011e+01 
1.402299e+00 1.994510e+00 2.245406e+00 6.436716e+00 2.400394e+00 5.790044e+01 3.615972e+00 diff --git a/lecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat b/lecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat @@ -0,0 +1,8 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 1.331503e+00 3.412900e-02 9.543120e-01 3.804000e-03 1.319009e+00 1.155800e-02 9.673880e-01 5.830000e-03 9.758090e-01 1.115710e-01 1.657900e-02 1.376000e-03 2.218500e-02 9.500000e-04 8.604000e-03 1.165000e-03 3.166400e-02 1.290000e-04 1.380004e+00 9.710000e-03 +2 2.111130e+00 1.984500e-02 9.014150e-01 2.217300e-02 2.118879e+00 6.912300e-02 8.981230e-01 1.857600e-02 8.820540e-01 1.878000e-02 1.411180e-01 1.614200e-02 1.361210e-01 1.582300e-02 2.049800e-01 4.185200e-02 1.143560e-01 5.316100e-02 2.178722e+00 1.131700e-02 +4 3.377356e+00 1.873800e-02 1.189846e+00 1.187900e-02 3.727039e+00 8.104910e-01 1.532612e+00 2.901200e-02 1.922584e+00 1.012850e-01 6.452860e-01 3.303500e-02 5.906120e-01 4.363800e-02 6.596520e-01 3.989000e-03 7.609000e-01 1.503940e-01 4.120677e+00 5.567500e-02 +8 5.242987e+00 4.543800e-02 2.616701e+00 3.318600e-02 5.388962e+00 3.659300e-02 2.980622e+00 1.598151e+00 4.435692e+00 3.942300e-02 2.326891e+00 1.345630e-01 2.621963e+00 1.714290e-01 6.337460e-01 4.899000e-03 1.612638e+00 2.495420e-01 6.703456e+00 9.251780e-01 +16 1.062893e+01 9.372600e-02 4.779814e+00 3.467300e-02 1.108604e+01 8.040300e-02 4.514903e+00 3.296900e-02 8.394583e+00 1.162530e-01 3.032364e+00 2.738930e-01 3.503220e+00 2.002430e-01 6.613400e-01 1.308900e-02 1.895607e+00 1.709140e-01 1.271062e+01 9.121110e-01 +32 2.334865e+01 3.237670e-01 1.098605e+01 2.548060e-01 2.420802e+01 4.485240e-01 1.058177e+01 1.130700e-01 2.544649e+01 7.436380e-01 8.309794e+00 6.075940e-01 8.515407e+00 5.558380e-01 6.330950e-01 1.718000e-02 2.892609e+00 3.540830e-01 2.937155e+01 1.409414e+00 +64 4.689721e+01 2.414586e+00 2.405726e+01 1.071561e+00 4.803695e+01 9.345560e-01 
2.145390e+01 1.830180e+00 4.902376e+01 2.166267e+00 1.071435e+01 1.146510e+00 1.048317e+01 9.043630e-01 5.841290e-01 1.429300e-02 4.735033e+00 7.720940e-01 5.227324e+01 1.618774e+00 diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf b/lecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf Binary files differ. diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat b/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat @@ -0,0 +1,7 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 7.937410e-01 1.405980e-01 3.000770e-01 1.206040e-01 4.544730e-01 1.123700e-02 2.687350e-01 1.593800e-02 2.730820e-01 1.165900e-02 2.865200e-02 1.129000e-02 2.914300e-02 1.115500e-02 3.834300e-02 1.210000e-02 1.108800e-02 1.120000e-04 5.043380e-01 1.135200e-02 +2 2.296052e+00 3.159780e-01 5.839310e-01 8.841900e-02 2.338442e+00 4.084700e-02 5.603300e-01 2.168600e-02 5.850920e-01 2.377500e-02 2.069800e-02 1.844200e-02 1.026780e-01 2.057000e-02 3.690150e-01 2.261200e-02 5.144500e-02 7.918000e-03 2.410278e+00 2.044020e-01 +4 3.596719e+00 8.907000e-02 9.386340e-01 1.131030e-01 3.260242e+00 8.585100e-02 8.744930e-01 5.123000e-02 1.064314e+00 3.732200e-02 1.319100e-01 4.183900e-02 2.094040e-01 3.205800e-02 2.848370e-01 2.944000e-02 6.543800e-02 1.459200e-02 3.787471e+00 1.519180e-01 +8 4.852901e+00 3.355180e-01 1.610962e+00 1.007260e-01 4.687913e+00 1.203670e-01 1.537271e+00 4.187100e-02 2.084429e+00 6.085100e-02 1.180850e-01 2.578100e-02 2.970390e-01 2.665600e-02 4.483030e-01 2.467000e-02 1.066430e-01 2.261700e-02 6.218094e+00 2.258230e-01 +16 8.113731e+00 2.710340e-01 2.883032e+00 9.578900e-02 7.760440e+00 7.466270e-01 2.792584e+00 2.279100e-02 4.360566e+00 4.993000e-02 2.259970e-01 9.855000e-03 3.028450e-01 1.392300e-02 3.789050e-01 6.589200e-02 1.573170e-01 2.331600e-02 1.036501e+01 3.679590e-01 +32 1.608341e+01 1.229300e+00 6.486622e+00 1.396300e-01 1.380231e+01 1.676100e-01 
6.275401e+00 9.366000e-02 9.445811e+00 2.926130e-01 2.089660e-01 2.764600e-02 4.907780e-01 3.136800e-02 4.022040e-01 2.630400e-02 1.350580e-01 5.654100e-02 2.042983e+01 7.251660e-01 diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat b/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat @@ -0,0 +1,7 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 4.835700e-01 1.885300e-02 2.935020e-01 2.998700e-02 4.894620e-01 1.666000e-02 2.892380e-01 2.749000e-02 2.904410e-01 1.838200e-02 1.611700e-02 1.957300e-02 1.696700e-02 1.636500e-02 2.689700e-02 1.681900e-02 1.111900e-02 1.970000e-04 5.234470e-01 5.894000e-03 +2 2.186639e+00 1.256210e-01 7.102430e-01 8.755600e-02 2.391640e+00 1.740600e-02 6.741400e-01 2.361500e-02 6.415170e-01 9.107000e-03 7.414600e-02 7.116000e-03 1.516090e-01 1.201600e-02 3.238180e-01 8.829000e-03 4.892700e-02 7.032000e-03 2.390954e+00 1.887100e-01 +4 3.448366e+00 1.703340e-01 1.056021e+00 1.249040e-01 3.242441e+00 5.770700e-02 8.994180e-01 2.416800e-02 1.244440e+00 3.263000e-02 1.071660e-01 1.635700e-02 2.538550e-01 1.493900e-02 4.703550e-01 2.858200e-02 1.017960e-01 2.866100e-02 4.026791e+00 1.076080e-01 +8 4.858263e+00 2.654620e-01 1.616485e+00 7.317800e-02 4.621479e+00 1.826780e-01 1.574272e+00 3.862300e-02 2.098022e+00 4.825200e-02 2.062030e-01 1.756400e-02 2.892610e-01 1.873800e-02 3.959510e-01 1.332400e-02 1.123810e-01 2.337200e-02 6.513596e+00 6.708810e-01 +16 7.681152e+00 3.059600e-01 2.693141e+00 1.062240e-01 7.003080e+00 8.182500e-02 2.629225e+00 1.980100e-02 3.907993e+00 3.472200e-02 2.274840e-01 1.167300e-02 3.173700e-01 2.057600e-02 4.397950e-01 4.770000e-02 1.369940e-01 3.960400e-02 9.669193e+00 8.244130e-01 +32 1.507245e+01 8.887600e-01 5.700451e+00 2.852420e-01 1.394916e+01 9.880200e-02 5.628993e+00 3.959500e-02 8.543289e+00 8.844400e-02 2.853500e-01 1.297900e-02 4.435590e-01 2.863600e-02 4.402650e-01 2.675900e-02 1.696370e-01 2.608900e-02 
2.016809e+01 8.817090e-01 diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Licence.txt b/lecture10/openmp_overhead/openmpbench_C_v31/Licence.txt @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2011] [The University of Edinburgh] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile @@ -0,0 +1,69 @@ +include Makefile.defs + +# If CFLAGS_CRAY is empty set it to CFLAGS +ifeq ($(CFLAGS_CRAY),) +CFLAGS_CRAY = ${CFLAGS} +endif + +.c.o: + ${CC} ${CFLAGS} $(OMPFLAG) -c $*.c + +SYNCOBJS = syncbench.o common.o +SCHEDOBJS = schedbench.o common_sched.o +ARRAYOBJS = arraybench_$(IDA).o common.o +TASKOBJS = taskbench.o common.o +SCHEDFLAGS = -DSCHEDBENCH + +all: syncbench schedbench taskbench + $(MAKE) IDA=1 prog + $(MAKE) IDA=3 prog + $(MAKE) IDA=9 prog + $(MAKE) IDA=27 prog + $(MAKE) IDA=81 prog + $(MAKE) IDA=243 prog + $(MAKE) IDA=729 prog + $(MAKE) IDA=2187 prog + $(MAKE) IDA=6561 prog + $(MAKE) IDA=19683 prog + $(MAKE) IDA=59049 prog + +prog: arraybench_$(IDA) + +syncbench: $(SYNCOBJS) + $(CC) -o syncbench $(LDFLAGS) $(SYNCOBJS) $(CLOCKOBJS) $(LIBS) -lm + +# Rule to ensure the lower optimisation level is picked up for common.c +# with the Cray compiler +common.o: + ${CC} ${CFLAGS_CRAY} $(OMPFLAG) -c $*.c + +# Separate rule to build common_sched.o as we need to ensure the correct +# DEFAULT_DELAY_TIME is used. 
+common_sched.o: + ${CC} ${CFLAGS_CRAY} $(SCHEDFLAGS) $(OMPFLAG) -o common_sched.o -c common.c + +schedbench: $(SCHEDOBJS) + $(CC) -o schedbench $(LDFLAGS) $(SCHEDOBJS) $(CLOCKOBJS) $(LIBS) -lm + +# Multiple header files due to multiple array sizes, makes header file arraybench_*.h +arraybench_$(IDA).h: arraybench.h + $(CPP) -DIDA=$(IDA) $(OMPFLAG) -P arraybench.h -o $@ + +# Multiple object files due to multiple array sizes, makes object file arraybench_*.o +arraybench_$(IDA).o: arraybench_$(IDA).h arraybench.c + $(CC) $(CFLAGS) -DIDA=$(IDA) $(OMPFLAG) arraybench.c -c -o $@ + +# Multiple executables due to multiple array sizes, makes exe file arraybench_* +arraybench_$(IDA): $(ARRAYOBJS) $(CLOCKOBJS) arraybench.c + $(CC) $(LDFLAGS) $(ARRAYOBJS) $(CLOCKOBJS) $(LIBS) -lm -o $@ + +taskbench: $(TASKOBJS) + $(CC) -o taskbench $(LDFLAGS) $(OMPFLAG) $(TASKOBJS) $(CLOCKOBJS) $(LIBS) -lm + +clean: + -rm *.o syncbench schedbench arraybench_* taskbench + +clean-all: clean + -rm OpenMPBench.* *.all + + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC=gcc +CFLAGS = -O1 -lm -fopenmp +LDFLAGS = -O0 -lm -fopenmp +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC=cc +CFLAGS = -O1 -lm +LDFLAGS = -O0 -lm +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi @@ -0,0 
+1,12 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +# Nested tasks don't work on HECToR Phase 3 for the PGI compiler +# and thus are disabled. +OMPFLAG = -DOMPVER2 -DOMPVER3 -DDISABLE_NESTED_TASKS_TESTS + +CC =cc +CFLAGS = -fast -mp=nonuma -lm +LDFLAGS = -fast -mp=nonuma -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = /usr/local/bin/gcc-4.6 +CFLAGS = -fopenmp -O1 -lm +LDFLAGS = -fopenmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC = /home/h012/fiona/SolarisStudio12.2-linux-x86-tar-ML/solstudio12.2/bin/suncc +CFLAGS = -xopenmp -xO3 -lm +LDFLAGS = -xopenmp -xO3 -lm +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = gcc +CFLAGS = -fopenmp -O1 -lm +LDFLAGS = -fopenmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git 
a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = icc +CFLAGS = -openmp -O1 -lm +LDFLAGS = -openmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/README.txt b/lecture10/openmp_overhead/openmpbench_C_v31/README.txt @@ -0,0 +1,113 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +=============== + Licence +=============== +This software is released under the licence in Licence.txt + +=============== + Introduction +=============== +Overheads due to synchronisation, loop scheduling, array operations and +task scheduling are an important factor in determining the performance of +shared memory parallel programs. We have designed a set of microbenchmarks +to measure these classes of overhead for language constructs in OpenMP. + +=============== + Installation +=============== + 1. Unpack the tar file + + 2. Edit the Makefile.defs as follows: + * Set CC to the C compiler you wish to use (e.g. gcc pgcc icc xlc etc) + * Set CFLAGS to any required C compiler flags to enable processing of + OpenMP directives (e.g. -fopenmp -mp, -omp); standard optimisation is + also recommended (e.g. -O). + * Set LDFLAGS to any required C linker flags + * Set CPP to the local C-Preprocessor (e.g. /usr/local/bin/cpp) to + make the C compiler invoke cpp on .c and .h files + * To benchmark OpenMP 2.0 features can be invoked by setting the flag + OMPFLAG = -DOMPVER2 + * To benchmark OpenMP 2.0 & 3.0 features can be invoked by setting the flag + OMPFLAG = -DOMPVER2 -DOMPVER3 + * If neither of these flags are set then OpenMP 1.0 compatibility is + ensured. + +3. Type "make" to build all 4 benchmarks or "make benchmark" where benchmark + is one of syncbench, taskbench, schedbench. By default "make" will build + executables with array sizes ranging in powers of 3 from 1 to 59049. To + build the array benchmark with an array size of arraysize, use + "make IDA=arraysize prog" where arraysize is a positive integer. + + +Example Makefile.defs.* files are supplied for several machines and +compiler versions, e.g. 
+ Makefile.defs.hector.* - Cray XE6 + Makefile.defs.magny0.* - 48 core AMD Magny Cours machine + Makefile.defs.stokes.* - SGI Altix ICE 8200EX + + +=============== + Running +=============== + +1. Set OMP_NUM_THREADS to the number of OpenMP threads you want to run with, + e.g. export OMP_NUM_THREADS = 4 + OMP_NUM_THREADS should be less than or equal to the number of physical + cores available to you. + +2. Run the benchmark with: + ./benchmark + + The output will go to STDOUT and thus you will probably want to re-direct + this to a file. ./benchmark --help will give the usage options. + + +================= +Additional notes +================= + + 1. If you encounter problems with the value of innerreps becoming too + large (an error will be reported) try recompiling with a lower level of + optimisation, ideally with inlining turned off. + + 2. It is common to observe significant variability between the overhead + values obtained on different runs of the benchmark programs. Therefore, + it is advisable to run each benchmark, say, 10-20 times and average the + results obtained. + + 3. You should use whatever mechanisms are at your disposal to ensure that + threads have exclusive or almost exclusive access to processors. You + should rejects runs where the standard deviation or number of outliers is + large: this is a good indication that the benchmark did not have almost + exclusive access to processors. diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.c b/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.c @@ -0,0 +1,133 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. 
* +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "arraybench.h" + +double btest[IDA]; +double atest[IDA]; + +#pragma omp threadprivate (btest) + +int main(int argc, char **argv) { + + init(argc, argv); + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + char testName[32]; + + /* TEST PRIVATE */ + sprintf(testName, "PRIVATE %d", IDA); + benchmark(testName, &testprivnew); + + /* TEST FIRSTPRIVATE */ + sprintf(testName, "FIRSTPRIVATE %d", IDA); + benchmark(testName, &testfirstprivnew); + +#ifdef OMPVER2 + /* TEST COPYPRIVATE */ + sprintf(testName, "COPYPRIVATE %d", IDA); + benchmark(testName, &testcopyprivnew); +#endif + + /* TEST THREADPRIVATE - COPYIN */ + sprintf(testName, "COPYIN %d", IDA); + benchmark(testName, &testthrprivnew); + + finalise(); + + return EXIT_SUCCESS; + +} + +void refer() { + int j; + double a[1]; + for (j = 0; j < innerreps; j++) { + array_delay(delaylength, a); + } +} + +void testfirstprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel firstprivate(atest) + { + array_delay(delaylength, atest); + } + } +} + +void testprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel private(atest) + { + array_delay(delaylength, atest); + } + } +} + +#ifdef 
OMPVER2 +void testcopyprivnew() +{ + int j; + for (j=0; j<innerreps; j++) { +#pragma omp parallel private(atest) + { +#pragma omp single copyprivate(atest) + { + array_delay(delaylength, atest); + } + } + } +} + +#endif + +void testthrprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel copyin(btest) + { + array_delay(delaylength, btest); + } + } + +} diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.h b/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef ARRAYBENCH_H +#define ARRAYBENCH_H + +void refer(); + +void testfirstprivnew(); + +void testprivnew(); + +#ifdef OMPVER2 + +void testcopyprivnew(); + +#endif + +void testthrprivnew(); + +void stats(double*, double*); + +#endif //ARRAYBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/common.c b/lecture10/openmp_overhead/openmpbench_C_v31/common.c @@ -0,0 +1,374 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <omp.h> + +#include "common.h" + +#define CONF95 1.96 + +int nthreads = -1; // Number of OpenMP threads +int delaylength = -1; // The number of iterations to delay for +int outerreps = -1; // Outer repetitions +double delaytime = -1.0; // Length of time to delay for in microseconds +double targettesttime = 0.0; // The length of time in microseconds that the test + // should run for. +unsigned long innerreps; // Inner repetitions +double *times; // Array of doubles storing the benchmark times in microseconds +double referencetime; // The average reference time in microseconds to perform + // outerreps runs +double referencesd; // The standard deviation in the reference time in + // microseconds for outerreps runs. +double testtime; // The average test time in microseconds for + // outerreps runs +double testsd; // The standard deviation in the test time in + // microseconds for outerreps runs. 
+ +void usage(char *argv[]) { + printf("Usage: %s.x \n" + "\t--outer-repetitions <outer-repetitions> (default %d)\n" + "\t--test-time <target-test-time> (default %0.2f microseconds)\n" + "\t--delay-time <delay-time> (default %0.4f microseconds)\n" + "\t--delay-length <delay-length> " + "(default auto-generated based on processor speed)\n", + argv[0], + DEFAULT_OUTER_REPS, DEFAULT_TEST_TARGET_TIME, DEFAULT_DELAY_TIME); +} + +void parse_args(int argc, char *argv[]) { + // Parse the parameters + int arg; + for (arg = 1; arg < argc; arg++) { + if (strcmp(argv[arg], "--delay-time") == 0.0) { + delaytime = atof(argv[++arg]); + if (delaytime == 0.0) { + printf("Invalid float:--delay-time: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "--outer-repetitions") == 0) { + outerreps = atoi(argv[++arg]); + if (outerreps == 0) { + printf("Invalid integer:--outer-repetitions: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "--test-time") == 0) { + targettesttime = atof(argv[++arg]); + if (targettesttime == 0) { + printf("Invalid integer:--test-time: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "-h") == 0) { + usage(argv); + exit(EXIT_SUCCESS); + + } else { + printf("Invalid parameters: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + } +} + +int getdelaylengthfromtime(double delaytime) { + int i, reps; + double lapsedtime, starttime; // seconds + + reps = 1000; + lapsedtime = 0.0; + + delaytime = delaytime/1.0E6; // convert from microseconds to seconds + + // Note: delaytime is local to this function and thus the conversion + // does not propagate to the main code. + + // Here we want to use the delaytime in microseconds to find the + // delaylength in iterations. We start with delaylength=0 and + // increase until we get a large enough delaytime, return delaylength + // in iterations. 
+ + delaylength = 0; + delay(delaylength); + + while (lapsedtime < delaytime) { + delaylength = delaylength * 1.1 + 1; + starttime = getclock(); + for (i = 0; i < reps; i++) { + delay(delaylength); + } + lapsedtime = (getclock() - starttime) / (double) reps; + } + return delaylength; + +} + +unsigned long getinnerreps(void (*test)(void)) { + innerreps = 10L; // some initial value + double time = 0.0; + + while (time < targettesttime) { + double start = getclock(); + test(); + time = (getclock() - start) * 1.0e6; + innerreps *=2; + + // Test to stop code if compiler is optimising reference time expressions away + if (innerreps > (targettesttime*1.0e15)) { + printf("Compiler has optimised reference loop away, STOP! \n"); + printf("Try recompiling with lower optimisation level \n"); + exit(1); + } + } + return innerreps; +} + +void printheader(char *name) { + printf("\n"); + printf("--------------------------------------------------------\n"); + printf("Computing %s time using %lu reps\n", name, innerreps); +} + +void stats(double *mtp, double *sdp) { + + double meantime, totaltime, sumsq, mintime, maxtime, sd, cutoff; + + int i, nr; + + mintime = 1.0e10; + maxtime = 0.; + totaltime = 0.; + + for (i = 1; i <= outerreps; i++) { + mintime = (mintime < times[i]) ? mintime : times[i]; + maxtime = (maxtime > times[i]) ? maxtime : times[i]; + totaltime += times[i]; + } + + meantime = totaltime / outerreps; + sumsq = 0; + + for (i = 1; i <= outerreps; i++) { + sumsq += (times[i] - meantime) * (times[i] - meantime); + } + sd = sqrt(sumsq / (outerreps - 1)); + + cutoff = 3.0 * sd; + + nr = 0; + + for (i = 1; i <= outerreps; i++) { + if (fabs(times[i] - meantime) > cutoff) + nr++; + } + + printf("\n"); + printf("Sample_size Average Min Max S.D. 
Outliers\n"); + printf(" %d %f %f %f %f %d\n", + outerreps, meantime, mintime, maxtime, sd, nr); + printf("\n"); + + *mtp = meantime; + *sdp = sd; + +} + +void printfooter(char *name, double testtime, double testsd, + double referencetime, double refsd) { + printf("%s time = %f microseconds +/- %f\n", + name, testtime, CONF95*testsd); + printf("%s overhead = %f microseconds +/- %f\n", + name, testtime-referencetime, CONF95*(testsd+referencesd)); + +} + +void printreferencefooter(char *name, double referencetime, double referencesd) { + printf("%s time = %f microseconds +/- %f\n", + name, referencetime, CONF95 * referencesd); +} + +void init(int argc, char **argv) +{ +#pragma omp parallel + { +#pragma omp master + { + nthreads = omp_get_num_threads(); + } + + } + + parse_args(argc, argv); + + if (outerreps == -1) { + outerreps = DEFAULT_OUTER_REPS; + } + if (targettesttime == 0.0) { + targettesttime = DEFAULT_TEST_TARGET_TIME; + } + if (delaytime == -1.0) { + delaytime = DEFAULT_DELAY_TIME; + } + delaylength = getdelaylengthfromtime(delaytime); // Always need to compute delaylength in iterations + + times = malloc((outerreps+1) * sizeof(double)); + + printf("Running OpenMP benchmark version 3.0\n" + "\t%d thread(s)\n" + "\t%d outer repetitions\n" + "\t%0.2f test time (microseconds)\n" + "\t%d delay length (iterations) \n" + "\t%f delay time (microseconds)\n", + nthreads, + outerreps, targettesttime, + delaylength, delaytime); +} + +void finalise(void) { + free(times); + +} + +void initreference(char *name) { + printheader(name); + +} + +/* Calculate the reference time. 
*/ +void reference(char *name, void (*refer)(void)) { + int k; + double start; + + // Calculate the required number of innerreps + innerreps = getinnerreps(refer); + + initreference(name); + + for (k = 0; k <= outerreps; k++) { + start = getclock(); + refer(); + times[k] = (getclock() - start) * 1.0e6 / (double) innerreps; + } + + finalisereference(name); + +} + +void finalisereference(char *name) { + stats(&referencetime, &referencesd); + printreferencefooter(name, referencetime, referencesd); + +} + +void intitest(char *name) { + printheader(name); + +} + +void finalisetest(char *name) { + stats(&testtime, &testsd); + printfooter(name, testtime, testsd, referencetime, referencesd); + +} + +/* Function to run a microbenchmark test*/ +void benchmark(char *name, void (*test)(void)) +{ + int k; + double start; + + // Calculate the required number of innerreps + innerreps = getinnerreps(test); + + intitest(name); + + for (k=0; k<=outerreps; k++) { + start = getclock(); + test(); + times[k] = (getclock() - start) * 1.0e6 / (double) innerreps; + } + + finalisetest(name); + +} + +// For the Cray compiler on HECToR we need to turn off optimisation +// for the delay and array_delay functions. Other compilers should +// not be afffected. +#pragma _CRI noopt +void delay(int delaylength) { + + int i; + float a = 0.; + + for (i = 0; i < delaylength; i++) + a += i; + if (a < 0) + printf("%f \n", a); + +} + +void array_delay(int delaylength, double a[1]) { + + int i; + a[0] = 1.0; + for (i = 0; i < delaylength; i++) + a[0] += i; + if (a[0] < 0) + printf("%f \n", a[0]); + +} +// Re-enable optimisation for remainder of source. +#pragma _CRI opt + +double getclock() { + double time; + // Returns a value in seconds of the time elapsed from some arbitrary, + // but consistent point. 
+ double omp_get_wtime(void); + time = omp_get_wtime(); + return time; +} + +int returnfalse() { + return 0; + +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/common.h b/lecture10/openmp_overhead/openmpbench_C_v31/common.h @@ -0,0 +1,80 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#define DEFAULT_DELAY_LENGTH -1 // -1 means the delay length should be auto generated +#define DEFAULT_OUTER_REPS 20 // Outer repetitions +#define DEFAULT_TEST_TARGET_TIME 1000.0 // Test Target time in microseconds. 
+#ifdef SCHEDBENCH +#define DEFAULT_DELAY_TIME 15.0 // Default delaytime in microseconds for schedbench +#else +#define DEFAULT_DELAY_TIME 0.10 // Default delaytime in microseconds +#endif + +extern int nthreads; // Number of OpenMP threads +extern int delaylength; // The number of iterations to delay for +extern int outerreps; // Outer repetitions +extern unsigned long innerreps; // Inner repetitions +extern double delaytime; // Delay time in microseconds +extern double targettesttime; // The length of time in microseconds the test + // should run for +extern double *times; // Array to store results in + +void init(int argc, char **argv); + +void initreference(char *name); + +void finalisereference(char *name); + +void intitest(char *name); + +void finalisetest(char *name); + +double getclock(); + +void delay(int delaylength); + +void array_delay(int delaylength, double a[1]); + +int getdelaylengthfromtime(double delaytime); + +int returnfalse(void); + +void finalise(void); + +void benchmark(char *name, void (*test)(void)); + +void reference(char *name, void (*refer)(void)); + +#endif //COMMON_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.c @@ -0,0 +1,145 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. 
* +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "schedbench.h" + +int cksz, itersperthr = 128; +char testName[32]; + +int main(int argc, char **argv) { + + init(argc, argv); + + /* GENERATE REFERENCE TIME */ + reference("reference time", &refer); + + /* TEST STATIC */ + benchmark("STATIC", &teststatic); + + /* TEST STATIC,n */ + cksz = 1; + while (cksz <= itersperthr) { + sprintf(testName, "STATIC %d", cksz); + benchmark(testName, &teststaticn); + cksz *= 2; + } + + /* TEST DYNAMIC,n */ + cksz = 1; + while (cksz <= itersperthr) { + sprintf(testName, "DYNAMIC %d", cksz); + benchmark(testName, &testdynamicn); + cksz *= 2; + } + + /* TEST GUIDED,n */ + cksz = 1; + while (cksz <= itersperthr / nthreads) { + sprintf(testName, "GUIDED %d", cksz); + benchmark(testName, &testguidedn); + cksz *= 2; + } + + finalise(); + + return EXIT_SUCCESS; + +} + +void refer() { + int i, j; + for (j = 0; j < innerreps; j++) { + for (i = 0; i < itersperthr; i++) { + delay(delaylength); + } + } +} + +void teststatic() { + + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(static) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} + +void teststaticn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(static,cksz) + for (i = 0; i < itersperthr * nthreads; i++) 
{ + delay(delaylength); + } + } + } +} + +void testdynamicn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(dynamic,cksz) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} + +void testguidedn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(guided,cksz) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef SCHEDBENCH_H +#define SCHEDBENCH_H + +void refer(void); + +void teststatic(void); + +void teststaticn(void); + +void testdynamicn(void); + +void testguidedn(void); + +#endif //SCHEDBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.c @@ -0,0 +1,253 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "syncbench.h" + +omp_lock_t lock; + +int main(int argc, char **argv) { + + // Start Paraver tracing +#ifdef PARAVERTRACE + Extrae_init(); +#endif + + init(argc, argv); + + omp_init_lock(&lock); + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + /* TEST PARALLEL REGION */ + benchmark("PARALLEL", &testpr); + + /* TEST FOR */ + benchmark("FOR", &testfor); + + /* TEST PARALLEL FOR */ + benchmark("PARALLEL FOR", &testpfor); + + /* TEST BARRIER */ + benchmark("BARRIER", &testbar); + + /* TEST SINGLE */ + benchmark("SINGLE", &testsing); + + /* TEST CRITICAL*/ + benchmark("CRITICAL", &testcrit); + + /* TEST LOCK/UNLOCK */ + benchmark("LOCK/UNLOCK", &testlock); + + /* TEST ORDERED SECTION */ + benchmark("ORDERED", &testorder); + + /* GENERATE NEW REFERENCE TIME */ + reference("reference time 2", &referatom); + + /* TEST ATOMIC */ + benchmark("ATOMIC", &testatom); + + /* GENERATE NEW REFERENCE TIME */ + reference("reference time 3", &referred); + + /* TEST REDUCTION (1 var) */ + benchmark("REDUCTION", &testred); + +#ifdef PARAVERTRACE + Extrae_fini(); +#endif + + finalise(); + + return EXIT_SUCCESS; +} + +void refer() { + int j; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + } +} + +void referatom(){ + int j; + double aaaa = 0.0; + double epsilon = 1.0e-15; + double b, c; + b = 1.0; + c = (1.0 + epsilon); + for (j = 0; j < innerreps; j++) { + aaaa += b; + b *= c; + } + if (aaaa < 0.0) + printf("%f\n", aaaa); +} + +void referred() { + int j; + int aaaa = 0; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + aaaa += 1; + } +} + +void testpr() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel + { + delay(delaylength); + } + } +} + +void testfor() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < 
innerreps; j++) { +#pragma omp for + for (i = 0; i < nthreads; i++) { + delay(delaylength); + } + } + } +} + +void testpfor() { + int i, j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel for + for (i = 0; i < nthreads; i++) { + delay(delaylength); + } + } +} + +void testbar() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { + delay(delaylength); +#pragma omp barrier + } + } +} + +void testsing() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp single + delay(delaylength); + } + } +} + +void testcrit() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps / nthreads; j++) { +#pragma omp critical + { + delay(delaylength); + } + } + } +} + +void testlock() { + int j; + +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps / nthreads; j++) { + omp_set_lock(&lock); + delay(delaylength); + omp_unset_lock(&lock); + } + } +} + +void testorder() { + int j; +#pragma omp parallel for ordered schedule (static,1) + for (j = 0; j < (int)innerreps; j++) { +#pragma omp ordered + delay(delaylength); + } +} + +void testatom() { + int j; + double aaaa = 0.0; + double epsilon = 1.0e-15; + double b,c; + b = 1.0; + c = (1.0 + epsilon); +#pragma omp parallel private(j) firstprivate(b) + { + for (j = 0; j < innerreps / nthreads; j++) { +#pragma omp atomic + aaaa += b; + b *= c; + } + } + if (aaaa < 0.0) + printf("%f\n", aaaa); +} + +void testred() { + int j; + int aaaa = 0; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel reduction(+:aaaa) + { + delay(delaylength); + aaaa += 1; + } + } +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* 
Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + + +#ifndef SYNCBENCH_H +#define SYNCBENCH_H + +void refer(void); + +void referatom(void); + +void referred(void); + +void testpr(void); + +void testfor(void); + +void testpfor(void); + +void testbar(void); + +void testsing(void); + +void testcrit(void); + +void testlock(void); + +void testorder(void); + +void testatom(void); + +void testred(void); + +#endif //SYNCBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.c @@ -0,0 +1,331 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. 
* +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#include "common.h" +#include "taskbench.h" + +#define DEPTH 6 + +int main(int argc, char **argv) { + + init(argc, argv); + +#ifdef OMPVER3 + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + /* TEST PARALLEL TASK GENERATION */ + benchmark("PARALLEL TASK", &testParallelTaskGeneration); + + /* TEST MASTER TASK GENERATION */ + benchmark("MASTER TASK", &testMasterTaskGeneration); + + /* TEST MASTER TASK GENERATION WITH BUSY SLAVES */ + benchmark("MASTER TASK BUSY SLAVES", &testMasterTaskGenerationWithBusySlaves); + + /* TEST CONDITIONAL TASK GENERATION */ +#ifndef DISABLE_CONDITIONAL_TASK_TEST + benchmark("CONDITIONAL TASK", &testConditionalTaskGeneration); +#endif // DISABLE_CONDITIONAL_TASK_TEST + + /* TEST TASK WAIT */ + benchmark("TASK WAIT", &testTaskWait); + + /* TEST TASK BARRIER */ +#ifndef DISABLE_BARRIER_TEST + benchmark("TASK BARRIER", &testTaskBarrier); +#endif //DISABLE_BARRIER_TEST + +#ifndef DISABLE_NESTED_TASKS_TESTS + /* TEST NESTED TASK GENERATION */ + benchmark("NESTED TASK", &testNestedTaskGeneration); + + /* TEST NESTED MASTER TASK GENERATION */ + benchmark("NESTED MASTER TASK", &testNestedMasterTaskGeneration); + +#endif // DISABLE_NESTED_TASKS_TESTS + + /* GENERATE THE SECOND REFERENCE TIME */ + reference("reference time 2", &refer); + + /* TEST BRANCH TASK TREE */ + benchmark("BRANCH TASK TREE", &testBranchTaskGeneration); + + /* TEST LEAF TASK 
TREE */ + benchmark("LEAF TASK TREE", &testLeafTaskGeneration); + +#endif // OMPVER3 + + finalise(); + + return EXIT_SUCCESS; + +} + +/* Calculate the reference time. */ +void refer() { + int j; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + } + +} + +/* Calculate the second reference time. */ +void refer2() { + int j; + for (j = 0; j < (innerreps >> DEPTH) * (1 << DEPTH); j++) { + delay(delaylength); + }; + +} + +/* Test parallel task generation overhead */ +void testParallelTaskGeneration() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task + }; // for j + } // parallel + +} + +/* Test master task generation overhead */ +void testMasterTaskGeneration() { + int j; +#pragma omp parallel private(j) + { +#pragma omp master + { + /* Since this is executed by one thread we need innerreps * nthreads + iterations */ + for (j = 0; j < innerreps * nthreads; j++) { +#pragma omp task + { + delay(delaylength); + + } + + } /* End for j */ + } /* End master */ + } /* End parallel */ + +} + +/* Test master task generation overhead when the slave threads are busy */ +void testMasterTaskGenerationWithBusySlaves() { + int j; +#pragma omp parallel private( j ) + { + int thread_num = omp_get_thread_num(); + for (j = 0; j < innerreps; j ++ ) { + + if ( thread_num == 0 ) { +#pragma omp task + { + delay( delaylength ); + } // task + + } else { + delay( delaylength ); + + }; // if + }; // for j + } // parallel +} + +/* Measure overhead of checking if a task should be spawned. 
*/ +void testConditionalTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp task if(returnfalse()) + { + delay( delaylength ); + } + } + } +} + +#ifndef DISABLE_NESTED_TASKS_TESTS + +/* Measure overhead of nested tasks (all threads construct outer tasks) */ +void testNestedTaskGeneration() { + int i,j; +#pragma omp parallel private( i, j ) + { + for ( j = 0; j < innerreps / nthreads; j ++ ) { +#pragma omp task private( i ) + { + for ( i = 0; i < nthreads; i ++ ) { +#pragma omp task untied + { + delay( delaylength ); + + } // task + }; // for i + + // wait for inner tasks to complete +#pragma omp taskwait + + } // task + }; // for j + } // parallel +} + +/* Measure overhead of nested tasks (master thread constructs outer tasks) */ +void testNestedMasterTaskGeneration() { + int i, j; +#pragma omp parallel private( i, j ) + { +#pragma omp master + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task private( i ) + { + for ( i = 0; i < nthreads; i ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task + }; // for i + + // wait for inner tasks to complete +#pragma omp taskwait + + } // task + }; // for j + } // master + } // parallel +} +#endif // DISABLE_NESTED_TASKS_TESTS + +/* Measure overhead of taskwait (all threads construct tasks) */ +void testTaskWait() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task +#pragma omp taskwait + + }; // for j + } // parallel +} + +/* Measure overhead of tasking barrier (all threads construct tasks) */ +void testTaskBarrier() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task +#pragma omp barrier + + }; // for j + } // parallel +} + +/* Test parallel task generation overhead where work is done at all levels. 
*/ +void testBranchTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < (innerreps >> DEPTH); j++) { +#pragma omp task + { + branchTaskTree(DEPTH); + delay(delaylength); + } + + } + } +} + +void branchTaskTree(int tree_level) { + if ( tree_level > 0 ) { +#pragma omp task + { + branchTaskTree(tree_level - 1); + branchTaskTree(tree_level - 1); + delay(delaylength); + } + } +} + +/* Test parallel task generation overhead where work is done only at the leaf level. */ +void testLeafTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < (innerreps >> DEPTH); j++) { + leafTaskTree(DEPTH); + + } + } + +} + +void leafTaskTree(int tree_level) { + if ( tree_level == 0 ) { + delay(delaylength); + + } else { +#pragma omp task + { + leafTaskTree(tree_level - 1); + leafTaskTree(tree_level - 1); + } + } +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.h @@ -0,0 +1,66 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef TASKBENCH_H +#define TASKBENCH_H + +void refer(void); + +void refer2(void); + +void stats(double*, double*); + +void testParallelTaskGeneration(void); + +void testMasterTaskGeneration(void); + +void testMasterTaskGenerationWithBusySlaves(void); + +void testNestedTaskGeneration(void); + +void testNestedMasterTaskGeneration(void); + +void testTaskWait(void); + +void testTaskBarrier(void); + +void testConditionalTaskGeneration(void); + +void testBranchTaskGeneration(void); + +void branchTaskTree(int tree_level); + +void testLeafTaskGeneration(void); + +void leafTaskTree(int tree_level); + +#endif //TASKBENCH_H diff --git a/lecture10/openmp_overhead/plot_overhead.py b/lecture10/openmp_overhead/plot_overhead.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# File : plot_overhead.py +# Description: Plot OpenMP micro bench results +# Copyright 2022 Harvard University. All Rights Reserved. +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt +import numpy as np + +def plot(): + fig, ax = plt.subplots() + + d = np.loadtxt('proc_bind_false.dat') + # ax.grid(which='both') + ax.set_xlabel(r'Number of threads') + ax.set_ylabel(r'Overhead [$\mu s$]') + # yapf: disable + ax.errorbar(d[:, 0], d[:, 19], yerr=d[:, 20], marker='P', lw=1.4, ms=5, label=r'\texttt{reduction}') + ax.errorbar(d[:, 0], d[:, 1], yerr=d[:, 2], marker='o', lw=1.4, ms=5, label=r'\texttt{parallel}') + ax.errorbar(d[:, 0], d[:, 3], yerr=d[:, 4], marker='s', lw=1.4, ms=5, label=r'\texttt{for}') + ax.errorbar(d[:, 0], d[:, 5], yerr=d[:, 6], marker='^', lw=1.4, ms=5, label=r'\texttt{parallel for}') + ax.errorbar(d[:, 0], d[:, 7], yerr=d[:, 8], marker='*', lw=1.4, ms=5, label=r'\texttt{barrier}') + ax.errorbar(d[:, 0], d[:, 9], yerr=d[:, 10], marker='>', lw=1.4, ms=5, label=r'\texttt{single}') + ax.errorbar(d[:, 0], d[:, 11], yerr=d[:, 12], marker='x', lw=1.4, ms=5, label=r'\texttt{critical}') + 
ax.errorbar(d[:, 0], d[:, 13], yerr=d[:, 14], marker='<', lw=1.4, ms=5, label=r'\texttt{lock/unlock}') + ax.errorbar(d[:, 0], d[:, 17], yerr=d[:, 18], marker='d', lw=1.4, ms=5, label=r'\texttt{atomic}') + # yapf: enable + ax.legend(loc='upper left') + fig.savefig('openmp_overhead.svg', bbox_inches='tight') + +def main(): + plot() + +if __name__ == "__main__": + main() diff --git a/lecture10/openmp_overhead/run_benchmark.sh b/lecture10/openmp_overhead/run_benchmark.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# threads=(1 2 4 8 16 32 64) # AMD Opteron +threads=(1 2 4 8 16 32) # Intel Xeon + +run() { + fname="${1}"; shift + echo '# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction' >${fname} + for t in "${threads[@]}"; do + echo -n "${t}" >>${fname} + OMP_NUM_THREADS=$t ./openmpbench_C_v31/syncbench | \ + awk '/overhead/ {print $0}' | \ + awk -F '[^0-9.]+' '{printf "\t%e %e", $2, $3}' >>${fname} + echo '' >>${fname} + done +} + +export OMP_PROC_BIND='false' +run 'proc_bind_false.dat' + +export OMP_PROC_BIND='true' +run 'proc_bind_true.dat' diff --git a/lecture10/prefetch/Makefile b/lecture10/prefetch/Makefile @@ -0,0 +1,13 @@ +CXX = g++ +.PHONY: clean + +all: main main_noprefetch + +main: loop_unroll_prefetch.cpp + $(CXX) -O3 -fno-unroll-loops -o $@ $< + +main_noprefetch: loop_unroll_prefetch.cpp + $(CXX) -O3 -fno-unroll-loops -fno-prefetch-loop-arrays -o $@ $< + +clean: + rm -f main main_noprefetch diff --git a/lecture10/prefetch/loop_unroll_prefetch.cpp b/lecture10/prefetch/loop_unroll_prefetch.cpp @@ -0,0 +1,44 @@ +#include <chrono> +#include <iostream> + +#define N (1 << 20) +int main(void) +{ + double *A = new double[N]; + double *B = new double[N]; + double *C = new double[N]; + for (int i = 0; i < N; ++i) { + A[i] = 0; + B[i] = i + 0; + C[i] = i + 2; + } + + typedef std::chrono::high_resolution_clock Clock; + auto t1 = Clock::now(); + // manual 4-fold loop unroll + for (int i = 0; i < N; i += 4) { + 
__builtin_prefetch(&A[i + 4], 1, 1); + __builtin_prefetch(&B[i + 4], 0, 1); + __builtin_prefetch(&C[i + 4], 0, 1); + A[i + 0] = A[i + 0] * B[i + 0] + C[i + 0]; + A[i + 1] = A[i + 1] * B[i + 1] + C[i + 1]; + A[i + 2] = A[i + 2] * B[i + 2] + C[i + 2]; + A[i + 3] = A[i + 3] * B[i + 3] + C[i + 3]; + } + auto t2 = Clock::now(); + std::cout + << "Time: " + << std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() + << " ns\n"; + + volatile double sum = 0.0; + for (int i = 0; i < N; ++i) { + sum += A[i]; + } + std::cout << "Result: " << sum << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 0; +} diff --git a/lecture11/hello_mpi/.gitignore b/lecture11/hello_mpi/.gitignore @@ -0,0 +1 @@ +hello_mpi diff --git a/lecture11/hello_mpi/Makefile b/lecture11/hello_mpi/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +hello_mpi: hello_mpi.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f hello_mpi diff --git a/lecture11/hello_mpi/hello_mpi.cpp b/lecture11/hello_mpi/hello_mpi.cpp @@ -0,0 +1,17 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + + std::cout << "Hello MPI\n"; + + // for demonstration with system monitor + volatile bool cond = true; + while (cond) { + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture11/rank_size_proc/.gitignore b/lecture11/rank_size_proc/.gitignore @@ -0,0 +1 @@ +rank_size_proc diff --git a/lecture11/rank_size_proc/Makefile b/lecture11/rank_size_proc/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +rank_size_proc: rank_size_proc.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f rank_size_proc diff --git a/lecture11/rank_size_proc/rank_size_proc.cpp b/lecture11/rank_size_proc/rank_size_proc.cpp @@ -0,0 +1,35 @@ +#include <cassert> +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + int size, rank, len, err; + char 
hostname[MPI_MAX_PROCESSOR_NAME]; + + // initialize MPI + err = MPI_Init(&argc, &argv); + assert(err == MPI_SUCCESS); + + // get number of ranks in the MPI_COMM_WORLD communicator + err = MPI_Comm_size(MPI_COMM_WORLD, &size); + assert(err == MPI_SUCCESS); + + // get my rank + err = MPI_Comm_rank(MPI_COMM_WORLD, &rank); + assert(err == MPI_SUCCESS); + + // query processor name (the return value is implementation dependent) + err = MPI_Get_processor_name(hostname, &len); + assert(err == MPI_SUCCESS); + std::cout << "Rank " << rank << "/" << size << " running on: " << hostname + << '\n'; + + // call other MPI routines + + // terminate the MPI environment + err = MPI_Finalize(); + assert(err == MPI_SUCCESS); + + return 0; +} diff --git a/lecture11/thread_support/.gitignore b/lecture11/thread_support/.gitignore @@ -0,0 +1 @@ +thread_support diff --git a/lecture11/thread_support/Makefile b/lecture11/thread_support/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +thread_support: thread_support.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f thread_support diff --git a/lecture11/thread_support/thread_support.cpp b/lecture11/thread_support/thread_support.cpp @@ -0,0 +1,27 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + int rank, size, version, subversion, provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + + MPI_Get_version(&version, &subversion); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const bool isroot = (0 == rank); + if (isroot) { // only the root rank executes this code + std::cout << "MPI version: " << version << '.' 
<< subversion + << '\n'; + std::cout << "Total ranks: " << size << '\n'; + std::cout << "Thread support: " << provided + << " (requested MPI_THREAD_FUNNELED)\n"; + std::cout << "MPI_THREAD_SINGLE: " << MPI_THREAD_SINGLE << '\n'; + std::cout << "MPI_THREAD_FUNNELED: " << MPI_THREAD_FUNNELED << '\n'; + std::cout << "MPI_THREAD_SERIALIZED: " << MPI_THREAD_SERIALIZED << '\n'; + std::cout << "MPI_THREAD_MULTIPLE: " << MPI_THREAD_MULTIPLE << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/endianness/.gitignore b/lecture12/endianness/.gitignore @@ -0,0 +1 @@ +endianness diff --git a/lecture12/endianness/Makefile b/lecture12/endianness/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g -Wall -Wextra -Wpedantic +.PHONY: clean + +endianness: endianness.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f endianness diff --git a/lecture12/endianness/endianness.cpp b/lecture12/endianness/endianness.cpp @@ -0,0 +1,44 @@ +#include <iomanip> +#include <iostream> +#include <string> + +int main(void) +{ + // clang-format off + + // Little endian byte order: (e.g Intel, AMD) + // + // high address + // | + // | low address + // | | + int one = 1; // 0x00000001 + // || || + // || least significant byte + // || + // most significant byte + + // Big endian byte order: (e.g. IBM) + // + // high address + // | + // | low address + // | | + // 0x01000000 + // || || + // || least significant byte + // || + // most significant byte + + // clang-format on + + // create a pointer to the low address byte + char *low_address = (char *)&one; + + // print its value + const std::string endianness = (*low_address) ? 
"little" : "big"; + std::cout << "Low address byte = 0x" << std::hex << std::setfill('0') + << std::setw(2) << (int)(*low_address) << " (" << endianness + << " endian)\n"; + return 0; +} diff --git a/lecture12/mpi_bsend/.gitignore b/lecture12/mpi_bsend/.gitignore @@ -0,0 +1 @@ +mpi_bsend diff --git a/lecture12/mpi_bsend/Makefile b/lecture12/mpi_bsend/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_bsend: mpi_bsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_bsend diff --git a/lecture12/mpi_bsend/mpi_bsend.cpp b/lecture12/mpi_bsend/mpi_bsend.cpp @@ -0,0 +1,37 @@ +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + constexpr int n = 1000000; + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // allocate a memory buffer for MPI + int bufsize = 0; + MPI_Pack_size(n, MPI_INT, MPI_COMM_WORLD, &bufsize); + bufsize += MPI_BSEND_OVERHEAD; + char *mpibuf = new char[bufsize]; + MPI_Buffer_attach(mpibuf, bufsize); + + // send/receive buffers + int *sendbuf = new int[n]; + int *recvbuf = new int[n]; + + if (0 == rank) { + MPI_Bsend(sendbuf, n, MPI_INT, 1, 98, MPI_COMM_WORLD); // local + MPI_Recv(recvbuf, n, MPI_INT, 1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + MPI_Bsend(sendbuf, n, MPI_INT, 0, 99, MPI_COMM_WORLD); // local + MPI_Recv(recvbuf, n, MPI_INT, 0, 98, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + // clean up + MPI_Buffer_detach(mpibuf, &bufsize); // blocking + delete[] mpibuf; + delete[] sendbuf; + delete[] recvbuf; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_gatherv_scatterv/.gitignore b/lecture12/mpi_gatherv_scatterv/.gitignore @@ -0,0 +1 @@ +mpi_gatherv_scatterv diff --git a/lecture12/mpi_gatherv_scatterv/Makefile b/lecture12/mpi_gatherv_scatterv/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_gatherv_scatterv: mpi_gatherv_scatterv.cpp + $(CXX) 
$(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_gatherv_scatterv diff --git a/lecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp b/lecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp @@ -0,0 +1,78 @@ +#include <algorithm> +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const bool isroot = (0 == rank); + + std::vector<int> sendbuf(rank + 1, rank); + std::vector<int> recvbuf, recvcounts, displacement; + if (isroot) { + recvbuf.resize(size * (size + 1) / 2); + recvcounts.resize(size); + displacement.resize(size); + for (int i = 0; i < size; ++i) { + recvcounts[i] = i + 1; + displacement[i] = i * (i + 1) / 2; + } + } + + MPI_Gatherv(sendbuf.data(), + sendbuf.size(), + MPI_INT, + recvbuf.data(), + recvcounts.data(), + displacement.data(), + MPI_INT, + 0, + MPI_COMM_WORLD); + + if (isroot) { + std::cout << "Result of MPI_Gatherv (on root rank)\n"; + for (int r = 0; r < size; ++r) { + const int offset = displacement[r]; + for (int i = 0; i < recvcounts[r]; ++i) { + std::cout << ' ' << recvbuf[offset + i]; + } + std::cout << '\n'; + } + } + + // inverse operation + std::vector<int> sendcounts; + sendbuf.swap(recvbuf); + if (isroot) { + sendcounts.swap(recvcounts); + } + + MPI_Scatterv(sendbuf.data(), + sendcounts.data(), + displacement.data(), + MPI_INT, + recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + MPI_COMM_WORLD); + + if (isroot) { + std::cout << "\nResult of MPI_Scatterv\n"; + } + std::ostringstream out; + out << "Rank " << rank << ":"; + for (const auto v : recvbuf) { + out << ' ' << v; + } + out << '\n'; + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_get_count/.gitignore b/lecture12/mpi_get_count/.gitignore @@ -0,0 +1 @@ +mpi_get_count diff --git a/lecture12/mpi_get_count/Makefile b/lecture12/mpi_get_count/Makefile @@ -0,0 
+1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_get_count: mpi_get_count.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_get_count diff --git a/lecture12/mpi_get_count/mpi_get_count.cpp b/lecture12/mpi_get_count/mpi_get_count.cpp @@ -0,0 +1,33 @@ +#include <cassert> +#include <iostream> +#include <mpi.h> +#include <string> + +int main(int argc, char* argv[]) +{ + assert(argc == 2); // pass some string as argument + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (0 == rank) { + char message[1024]; + MPI_Status status; + MPI_Recv(message, 1024, MPI_CHAR, 1, 99, MPI_COMM_WORLD, &status); + + // get the count of characters + int count; + MPI_Get_count(&status, MPI_CHAR, &count); + assert(count < 1024); + message[count] = '\0'; // terminating null (received chars occupy [0, count)) + std::cout << "Rank " << rank << " received: " << message << " (" + << count << " characters)\n"; + } else { + const std::string message(argv[1]); + MPI_Send( + message.c_str(), message.size(), MPI_CHAR, 0, 99, MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_minloc/.gitignore b/lecture12/mpi_minloc/.gitignore @@ -0,0 +1 @@ +mpi_minloc diff --git a/lecture12/mpi_minloc/Makefile b/lecture12/mpi_minloc/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_minloc: mpi_minloc.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_minloc diff --git a/lecture12/mpi_minloc/mpi_minloc.cpp b/lecture12/mpi_minloc/mpi_minloc.cpp @@ -0,0 +1,31 @@ +#include <cstdlib> +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + srand48(rank); + const double value = drand48(); + std::cout << "Rank " << rank << ": value = " << value << '\n'; + + // packed value and rank ID + struct DoubleInt { + double val; + int rank; + } send, recv; + 
send.val = value; + send.rank = rank; + + MPI_Reduce(&send, &recv, 1, MPI_DOUBLE_INT, MPI_MINLOC, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "Minimum: value = " << recv.val << " on rank " << recv.rank + << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_p2p/.gitignore b/lecture12/mpi_p2p/.gitignore @@ -0,0 +1 @@ +mpi_p2p diff --git a/lecture12/mpi_p2p/Makefile b/lecture12/mpi_p2p/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_p2p: mpi_p2p.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_p2p diff --git a/lecture12/mpi_p2p/mpi_p2p.cpp b/lecture12/mpi_p2p/mpi_p2p.cpp @@ -0,0 +1,22 @@ +#include <mpi.h> +#include <iostream> + +// exchange rank IDs between 2 processes +// DISCLAIMER: this code contains a bug +int main(int argc, char* argv[]) +{ + int rank, recv; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (0 == rank) { + MPI_Send(&rank, 1, MPI_INT, 1, 98, MPI_COMM_WORLD); + MPI_Recv(&recv, 1, MPI_INT, 1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + MPI_Send(&rank, 1, MPI_INT, 0, 99, MPI_COMM_WORLD); + MPI_Recv(&recv, 1, MPI_INT, 0, 98, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + std::cout << "Rank " << rank << " got ID " << recv << " from other rank\n"; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/.gitignore b/lecture12/mpi_pi/.gitignore @@ -0,0 +1,3 @@ +mpi_pi +mpi_pi_reduction +mpi_pi_reduction_inplace diff --git a/lecture12/mpi_pi/Makefile b/lecture12/mpi_pi/Makefile @@ -0,0 +1,17 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_pi mpi_pi_reduction mpi_pi_reduction_inplace + +mpi_pi: mpi_pi.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_pi_reduction: mpi_pi_reduction.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_pi_reduction_inplace: mpi_pi_reduction_inplace.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_pi mpi_pi_reduction mpi_pi_reduction_inplace 
diff --git a/lecture12/mpi_pi/mpi_pi.cpp b/lecture12/mpi_pi/mpi_pi.cpp @@ -0,0 +1,45 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) + for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + if (0 == rank) { + double pi = sum; + double recv; + for (int r = 1; r < size; ++r) { + MPI_Recv(&recv, + 1, + MPI_DOUBLE, + r, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + pi += recv; + } + std::cout << "pi = " << 4.0 * pi << '\n'; + } else { + MPI_Send(&sum, 1, MPI_DOUBLE, 0, 123, MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/mpi_pi_reduction.cpp b/lecture12/mpi_pi/mpi_pi_reduction.cpp @@ -0,0 +1,33 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) 
+ for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + double pi = 0.0; + MPI_Reduce(&sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "pi = " << 4.0 * pi << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/mpi_pi_reduction_inplace.cpp b/lecture12/mpi_pi/mpi_pi_reduction_inplace.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) + for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + MPI_Reduce((0 == rank) ? 
MPI_IN_PLACE : &sum, + &sum, + 1, + MPI_DOUBLE, + MPI_SUM, + 0, + MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "pi = " << 4.0 * sum << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_rsend/.gitignore b/lecture12/mpi_rsend/.gitignore @@ -0,0 +1,2 @@ +mpi_rsend +mpi_irsend diff --git a/lecture12/mpi_rsend/Makefile b/lecture12/mpi_rsend/Makefile @@ -0,0 +1,20 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_rsend mpi_irsend + +run_rsend: mpi_rsend + mpirun -n 2 ./mpi_rsend + +run_irsend: mpi_irsend + mpirun -n 2 ./mpi_irsend + +mpi_rsend: mpi_rsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_irsend: mpi_irsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_rsend mpi_irsend diff --git a/lecture12/mpi_rsend/mpi_irsend.cpp b/lecture12/mpi_rsend/mpi_irsend.cpp @@ -0,0 +1,46 @@ +#include <iostream> +#include <mpi.h> +#include <numeric> +#include <vector> + +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // send/receive buffers + constexpr int n = 1000000; + std::vector<int> sendbuf(n, 1); + std::vector<int> recvbuf(n); + + // post non-blocking receive request first + MPI_Request recv_req = MPI_REQUEST_NULL; + if (1 == rank) { + MPI_Irecv(recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + 98, + MPI_COMM_WORLD, + &recv_req); + } + + MPI_Barrier(MPI_COMM_WORLD); // synchronize all processes + + if (0 == rank) { + MPI_Rsend( + sendbuf.data(), sendbuf.size(), MPI_INT, 1, 98, MPI_COMM_WORLD); + } + + // Rank 0 is OK to call this because its request is NULL, the other ranks + // blocks until the receive has completed. 
+ MPI_Wait(&recv_req, MPI_STATUS_IGNORE); + + std::cout << "Rank " << rank << ": result = " + << std::accumulate(recvbuf.begin(), recvbuf.end(), 0) + << std::endl; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_rsend/mpi_rsend.cpp b/lecture12/mpi_rsend/mpi_rsend.cpp @@ -0,0 +1,41 @@ +#include <iostream> +#include <mpi.h> +#include <numeric> +#include <unistd.h> +#include <vector> + +// DISCLAIMER: this code is not correct (the result of the receive is undefined) +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // send/receive buffers + constexpr int n = 1000000; + std::vector<int> sendbuf(n, 1); + std::vector<int> recvbuf(n); + + if (0 == rank) { + MPI_Rsend( + sendbuf.data(), sendbuf.size(), MPI_INT, 1, 98, MPI_COMM_WORLD); + } else { + // this should somewhat ensure that the receive is surely posted after + // the send above + sleep(10); + MPI_Recv(recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + 98, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + std::cout << "Rank " << rank << ": result = " + << std::accumulate(recvbuf.begin(), recvbuf.end(), 0) + << std::endl; + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_1D_diffusion/.gitignore b/lecture13/mpi_1D_diffusion/.gitignore @@ -0,0 +1,4 @@ +blocking +nonblocking +u.dat +u.png diff --git a/lecture13/mpi_1D_diffusion/IO.h b/lecture13/mpi_1D_diffusion/IO.h @@ -0,0 +1,45 @@ +#ifndef IO_H_C0WPQOQZ +#define IO_H_C0WPQOQZ + +#include <fstream> +#include <mpi.h> +#include <vector> + +void initialize(std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + const double dx = 1.0 / (size * N); + const double x0 = rank * N * dx; + for (int i = 0; i < N; ++i) { + const double x = x0 + (i + 0.5) * dx; + if (x > 0.4 && x < 0.6) { + v[i + 1] = 1.0; + } else { + v[i + 1] = 0.0; + } + } +} + +void dump(const 
std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + std::vector<double> u(size * N); + MPI_Gather( + &v[1], N, MPI_DOUBLE, u.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::ofstream out("u.dat", std::ios::app); + out << std::scientific; + for (size_t i = 0; i < u.size(); ++i) { + out << ' ' << u[i]; + } + out << '\n'; + } +} + +#endif /* IO_H_C0WPQOQZ */ diff --git a/lecture13/mpi_1D_diffusion/Makefile b/lecture13/mpi_1D_diffusion/Makefile @@ -0,0 +1,22 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: blocking nonblocking + +run_blocking: blocking + rm -f u.dat + mpirun -n 4 ./blocking + +run_nonblocking: nonblocking + rm -f u.dat + mpirun -n 4 ./nonblocking + +blocking: blocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +nonblocking: nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f u.dat u.png blocking nonblocking diff --git a/lecture13/mpi_1D_diffusion/blocking.cpp b/lecture13/mpi_1D_diffusion/blocking.cpp @@ -0,0 +1,45 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + for (int step = 0; step < nsteps; ++step) { + // clang-format off + MPI_Sendrecv(&curr[1], 1, MPI_DOUBLE, left, 123, + &curr[N + 1], 1, MPI_DOUBLE, right, 123, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + 
MPI_Sendrecv(&curr[N], 1, MPI_DOUBLE, right, 123, + &curr[0], 1, MPI_DOUBLE, left, 123, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // clang-format on + + // stencil scheme + for (int i = 1; i < N + 1; ++i) { + next[i] = + curr[i] + Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + } + if (step % 200 == 0) { + dump(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_1D_diffusion/nonblocking.cpp b/lecture13/mpi_1D_diffusion/nonblocking.cpp @@ -0,0 +1,54 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + auto applyStencil = [&](int i) { + next[i] = curr[i] + Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + }; + for (int step = 0; step < nsteps; ++step) { + MPI_Request reqs[4]; + // clang-format off + MPI_Irecv(&curr[0], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[0]); + MPI_Irecv(&curr[N + 1], 1, MPI_DOUBLE, right, 321, MPI_COMM_WORLD, &reqs[1]); + MPI_Isend(&curr[1], 1, MPI_DOUBLE, left, 321, MPI_COMM_WORLD, &reqs[2]); + MPI_Isend(&curr[N], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[3]); + // clang-format on + + // compute internal domain + for (int i = 2; i < N; ++i) { + applyStencil(i); + } + + // wait for completion + MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE); + + // complete boundary + applyStencil(1); // left + applyStencil(N); // right + + if (step % 200 == 0) { + dump(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + 
return 0; +} diff --git a/lecture13/mpi_1D_diffusion/plot.py b/lecture13/mpi_1D_diffusion/plot.py @@ -0,0 +1,19 @@ +# File : plot.py +# Description: Plot temporal evolution of diffusing quantity +# Copyright 2022 Harvard University. All Rights Reserved. +import numpy as np +import matplotlib.pyplot as plt + +def main(): + data = np.loadtxt('u.dat') + N = data.shape[1] + dx = 1.0 / N + for step in range(data.shape[0]): + x = (np.arange(N) + 0.5) * dx + plt.plot(x, data[step, :]) + plt.xlabel(r'$x$') + plt.ylabel(r'$u(x,t)$') + plt.savefig('u.png', dpi=300, bbox_inches='tight') + +if __name__ == "__main__": + main() diff --git a/lecture13/mpi_cyclic_shift/.gitignore b/lecture13/mpi_cyclic_shift/.gitignore @@ -0,0 +1 @@ +mpi_cyclic_shift diff --git a/lecture13/mpi_cyclic_shift/Makefile b/lecture13/mpi_cyclic_shift/Makefile @@ -0,0 +1,14 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_cyclic_shift mpi_sendrecv + +mpi_cyclic_shift: mpi_cyclic_shift.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_sendrecv: mpi_sendrecv.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_cyclic_shift mpi_sendrecv diff --git a/lecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp b/lecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp @@ -0,0 +1,59 @@ +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + std::ostringstream out; + std::vector<double> send(100, rank), recv(100); + for (int cycle = 0; cycle < 3; ++cycle) { + out << "Cycle " << cycle << ": "; + if (0 == rank % 2) { // even ranks send first + MPI_Send(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + MPI_COMM_WORLD); + MPI_Recv(recv.data(), + 
recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } else { + MPI_Recv(recv.data(), + recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + MPI_COMM_WORLD); + } + out << "Rank " << rank << ": sent right 100x " << send[0] + << ", received left 100x " << recv[0] << '\n'; + send.swap(recv); + } + + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_cyclic_shift/mpi_sendrecv.cpp b/lecture13/mpi_cyclic_shift/mpi_sendrecv.cpp @@ -0,0 +1,42 @@ +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + std::ostringstream out; + std::vector<double> send(100, rank), recv(100); + for (int cycle = 0; cycle < 3; ++cycle) { + out << "Cycle " << cycle << ": "; + MPI_Sendrecv(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + recv.data(), + recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + out << "Rank " << rank << ": sent right 100x " << send[0] + << ", received left 100x " << recv[0] << '\n'; + send.swap(recv); + } + + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_p2p_nonblocking/.gitignore b/lecture13/mpi_p2p_nonblocking/.gitignore @@ -0,0 +1 @@ +mpi_p2p_nonblocking diff --git a/lecture13/mpi_p2p_nonblocking/Makefile b/lecture13/mpi_p2p_nonblocking/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_p2p_nonblocking: mpi_p2p_nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_p2p_nonblocking diff --git 
a/lecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp b/lecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp @@ -0,0 +1,25 @@ +#include <mpi.h> +#include <iostream> + +// exchange rank IDs between 2 processes +// DISCLAIMER: this code contains a bug +int main(int argc, char* argv[]) +{ + int rank, recv; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Request request; + if (0 == rank) { + MPI_Irecv(&recv, 1, MPI_INT, 1, 99, MPI_COMM_WORLD, &request); + MPI_Send(&rank, 1, MPI_INT, 1, 98, MPI_COMM_WORLD); + } else { + MPI_Irecv(&recv, 1, MPI_INT, 0, 98, MPI_COMM_WORLD, &request); + MPI_Send(&rank, 1, MPI_INT, 0, 99, MPI_COMM_WORLD); + } + // The following line is required to fix the bug: + // MPI_Wait(&request, MPI_STATUS_IGNORE); // synchronization + std::cout << "Rank " << rank << " got ID " << recv << " from other rank\n"; + + MPI_Finalize(); + return 0; +} diff --git a/lecture14/mpi_1D_diffusion_IO/.gitignore b/lecture14/mpi_1D_diffusion_IO/.gitignore @@ -0,0 +1,4 @@ +nonblocking +*.dat +*.bin +*.png diff --git a/lecture14/mpi_1D_diffusion_IO/IO.h b/lecture14/mpi_1D_diffusion_IO/IO.h @@ -0,0 +1,144 @@ +#ifndef IO_H_C0WPQOQZ +#define IO_H_C0WPQOQZ + +#include <fstream> +#include <mpi.h> +#include <sstream> +#include <string> +#include <vector> + +void initialize(std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + const double dx = 1.0 / (size * N); + const double x0 = rank * N * dx; + for (int i = 0; i < N; ++i) { + const double x = x0 + (i + 0.5) * dx; + if (x > 0.4 && x < 0.6) { + v[i + 1] = 1.0; + } else { + v[i + 1] = 0.0; + } + } +} + +void dump_posix(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; // ignore ghost cells + std::vector<double> u(size * N); + MPI_Gather( + &v[1], N, 
MPI_DOUBLE, u.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (0 == rank) { // not parallel + std::ofstream out("u.dat", std::ios::app); + out << std::scientific; + for (size_t i = 0; i < u.size(); ++i) { + out << ' ' << u[i]; + } + out << '\n'; + } +} + +void dump_rank(const std::vector<double> &v) +{ + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + std::ofstream out("u_rank" + std::to_string(rank) + ".dat", std::ios::app); + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + out << '\n'; +} + +void dump_mpi_binary(const std::vector<double> &v) +{ + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_File fh; // MPI file handle + MPI_Offset bytes, file_offset; + + const int N = static_cast<int>(v.size()) - 2; // ignore ghost cells + bytes = N * sizeof(double); + file_offset = rank * bytes; + + MPI_File_open(MPI_COMM_WORLD, + "u_mpi.bin", + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &fh); + MPI_File_get_size(fh, &bytes); // append to file + file_offset += bytes; + MPI_File_write_at_all( + fh, file_offset, &v[1], N, MPI_DOUBLE, MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +void dump_mpi_ascii(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + std::ostringstream out; + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + if (rank == size - 1) { + out << '\n'; + } + const std::string ascii = out.str(); + + MPI_File fh; // MPI file handle + MPI_Offset bytes, file_offset = 0; + bytes = ascii.size() * sizeof(char); + MPI_Exscan(&bytes, &file_offset, 1, MPI_OFFSET, MPI_SUM, MPI_COMM_WORLD); + + MPI_File_open(MPI_COMM_WORLD, + "u_mpi.dat", + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &fh); + MPI_File_get_size(fh, &bytes); // append to file + file_offset += bytes; + MPI_File_write_at_all(fh, + file_offset, + ascii.data(), + 
ascii.size(), + MPI_CHAR, + MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +void dump_mpi_ascii_ordered(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + std::ostringstream out; + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + if (rank == size - 1) { + out << '\n'; + } + const std::string ascii = out.str(); + + MPI_File fh; // MPI file handle + MPI_File_open(MPI_COMM_WORLD, + "u_mpi_ordered.dat", + MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_APPEND, + MPI_INFO_NULL, + &fh); + MPI_File_write_ordered( + fh, ascii.data(), ascii.size(), MPI_CHAR, MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +#endif /* IO_H_C0WPQOQZ */ diff --git a/lecture14/mpi_1D_diffusion_IO/Makefile b/lecture14/mpi_1D_diffusion_IO/Makefile @@ -0,0 +1,13 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +run_nonblocking: clean nonblocking + mpirun -n 4 ./nonblocking + python plot.py + +nonblocking: nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f u*.dat u*.bin u.png IO.png nonblocking diff --git a/lecture14/mpi_1D_diffusion_IO/nonblocking.cpp b/lecture14/mpi_1D_diffusion_IO/nonblocking.cpp @@ -0,0 +1,58 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + auto applyStencil = [&](int i) { + next[i] = curr[i] 
+ Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + }; + for (int step = 0; step < nsteps; ++step) { + MPI_Request reqs[4]; + // clang-format off + MPI_Irecv(&curr[0], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[0]); + MPI_Irecv(&curr[N + 1], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[1]); + MPI_Isend(&curr[1], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[2]); + MPI_Isend(&curr[N], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[3]); + // clang-format on + + // compute internal domain + for (int i = 2; i < N; ++i) { + applyStencil(i); + } + + // wait for completion + MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE); + + // complete boundary + applyStencil(1); // left + applyStencil(N); // right + + if (step % 200 == 0) { + dump_posix(curr); + dump_rank(curr); + dump_mpi_binary(curr); + dump_mpi_ascii(curr); + dump_mpi_ascii_ordered(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture14/mpi_1D_diffusion_IO/plot.py b/lecture14/mpi_1D_diffusion_IO/plot.py @@ -0,0 +1,52 @@ +# File : plot.py +# Description: Plot temporal evolution of diffusing quantity +# Copyright 2022 Harvard University. All Rights Reserved. 
+import numpy as np +import matplotlib.pyplot as plt + +def plot(ax, fname, title, dx, N, *, start=0.0, binary=False): + if binary: + data = np.fromfile(fname, dtype=float) + nsteps = data.size // N + data = data.reshape((nsteps, N)) + else: + data = np.loadtxt(fname) + N = data.shape[1] + for step in range(data.shape[0]): + x = start + (np.arange(N) + 0.5) * dx + ax.plot(x, data[step, :]) + ax.text(0.03, 0.9, title, fontsize=6) + + +def main(): + N = 4 * 128 + dx = 1.0 / N + + fig, ax = plt.subplots(4, 2, sharex=True) + plot(ax[0][0], 'u.dat', 'POSIX', dx, N) + plot(ax[1][0], 'u_mpi.bin', 'MPI (binary)', dx, N, binary=True) + plot(ax[2][0], 'u_mpi.dat', 'MPI (ascii)', dx, N) + plot(ax[3][0], 'u_mpi_ordered.dat', 'MPI (ascii, ordered)', dx, N) + plot(ax[0][1], 'u_rank0.dat', 'MPI (rank 0)', dx, N, start=0 * 128 * dx) + plot(ax[1][1], 'u_rank1.dat', 'MPI (rank 1)', dx, N, start=1 * 128 * dx) + plot(ax[2][1], 'u_rank2.dat', 'MPI (rank 2)', dx, N, start=2 * 128 * dx) + plot(ax[3][1], 'u_rank3.dat', 'MPI (rank 3)', dx, N, start=3 * 128 * dx) + for a in ax.flatten(): + a.set_xlim([0, 1]) + a.set_ylim([-0.05, 1.05]) + ax[3][0].set_xlabel(r'$x$') + ax[3][1].set_xlabel(r'$x$') + for i in range(4): + ax[i][0].set_ylabel(r'$u(x,t)$') + plt.savefig('IO.png', dpi=600, bbox_inches='tight') + plt.close() + + fig, ax = plt.subplots() + plot(ax, 'u.dat', '', dx, N) + plt.xlabel(r'$x$') + plt.ylabel(r'$u(x,t)$') + plt.savefig('u.png', dpi=600, bbox_inches='tight') + plt.close() + +if __name__ == "__main__": + main() diff --git a/lecture14/parallel_hdf5/.gitignore b/lecture14/parallel_hdf5/.gitignore @@ -0,0 +1 @@ +parallel_hdf5 diff --git a/lecture14/parallel_hdf5/Makefile b/lecture14/parallel_hdf5/Makefile @@ -0,0 +1,12 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +parallel_hdf5: parallel_hdf5.cpp + $(CXX) $(CXXFLAGS) -o $@ $< -lhdf5 -lz -lm + +run: clean parallel_hdf5 + mpirun -n 8 ./parallel_hdf5 + +clean: + rm -f data.* 
parallel_hdf5 diff --git a/lecture14/parallel_hdf5/data.h5 b/lecture14/parallel_hdf5/data.h5 Binary files differ. diff --git a/lecture14/parallel_hdf5/data.xmf b/lecture14/parallel_hdf5/data.xmf @@ -0,0 +1,23 @@ +<?xml version="1.0" ?> +<!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []> +<Xdmf Version="2.0"> + <Domain> + <Grid GridType="Uniform"> + <Time Value="00000"/> + <Topology TopologyType="3DRECTMESH" Dimensions ="33 17 9"/> + <Geometry GeometryType ="ORIGIN_DXDYDZ"> + <DataItem Name ="Origin" Dimensions="3" NumberType="Float" Precision ="8" Format="XML"> + 0.000000e+00 0.000000e+00 0.000000e+00 + </DataItem> + <DataItem Name ="Spacing" Dimensions="3" NumberType="Float" Precision ="8" Format="XML"> + 3.125000e-02 3.125000e-02 3.125000e-02 + </DataItem> + </Geometry> + <Attribute Name="data" AttributeType="Scalar" Center="Cell"> + <DataItem Dimensions ="32 16 8" NumberType="Float" Precision="4" Format ="HDF"> + data.h5:/u + </DataItem> + </Attribute> + </Grid> + </Domain> +</Xdmf> diff --git a/lecture14/parallel_hdf5/hdf5IO.h b/lecture14/parallel_hdf5/hdf5IO.h @@ -0,0 +1,98 @@ +// File : hdf5IO.h +// Description: HDF5 parallel MPI dumper +// Copyright 2023 Harvard University. All Rights Reserved. 
+#ifndef HDF5IO_H_XUXZ7EB0 +#define HDF5IO_H_XUXZ7EB0 + +#include "xdmf_wrapper.h" +#include <hdf5.h> +#include <iostream> +#include <mpi.h> +#include <string> +#include <vector> + +void write_hdf5_mpi(const std::string fname, + const std::vector<float> &u, + const int Nx, + const int Ny, + const int Nz, + const MPI_Comm cart_comm) +{ + int rank, dims[3], periods[3], coords[3]; + MPI_Comm_rank(cart_comm, &rank); + MPI_Cart_get(cart_comm, 3, dims, periods, coords); + + // open HDF5 file + H5open(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); // file access policy + H5Pset_fapl_mpio(plist_id, cart_comm, MPI_INFO_NULL); + hid_t file_id = H5Fcreate( + (fname + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + // rank local data and offset and global file dimensions + hsize_t file_count[3] = {static_cast<unsigned int>(Nz), + static_cast<unsigned int>(Ny), + static_cast<unsigned int>(Nx)}; + hsize_t file_offset[3] = { + static_cast<unsigned int>(coords[2]) * Nz, + static_cast<unsigned int>(coords[1]) * Ny, + static_cast<unsigned int>(coords[0]) * Nx, + }; + hsize_t file_dims[3] = { + static_cast<unsigned int>(dims[2]) * Nz, + static_cast<unsigned int>(dims[1]) * Ny, + static_cast<unsigned int>(dims[0]) * Nx, + }; + if (0 == rank) { + std::cout << "HDF5 file size: " + << (file_dims[0] * file_dims[1] * file_dims[2] * + sizeof(float)) / + (1024.0 * 1024.0) + << " MB\n"; + } + + // dataspace for dataset + hid_t fspace_id = H5Screate_simple(3, file_dims, NULL); + hid_t mspace_id = H5Screate_simple(3, file_count, NULL); + + // chunked dataset + plist_id = H5Pcreate(H5P_DATASET_CREATE); + H5Pset_chunk(plist_id, 3, file_count); + hid_t dataset_id = H5Dcreate(file_id, + "u", + H5T_NATIVE_FLOAT, + fspace_id, + H5P_DEFAULT, + plist_id, + H5P_DEFAULT); + H5Pclose(plist_id); + H5Sclose(fspace_id); + + // define the hyperslab + fspace_id = H5Dget_space(dataset_id); + H5Sselect_hyperslab( + fspace_id, H5S_SELECT_SET, file_offset, NULL, file_count, 
NULL); + + // create property list for collective write + plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + + // write the data + H5Dwrite( + dataset_id, H5T_NATIVE_FLOAT, mspace_id, fspace_id, plist_id, u.data()); + + H5Dclose(dataset_id); + H5Sclose(fspace_id); + H5Sclose(mspace_id); + H5Pclose(plist_id); + H5Fclose(file_id); + H5close(); + + // root rank writes XDMF wrapper + if (0 == rank) { + xdmf_wrapper(fname, file_dims); + } +} + +#endif /* HDF5IO_H_XUXZ7EB0 */ diff --git a/lecture14/parallel_hdf5/hdf5_in_python.py b/lecture14/parallel_hdf5/hdf5_in_python.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +# File : hdf5_in_python.py +# Description: Example to load a HDF5 data file in Python +# Copyright 2023 Harvard University. All Rights Reserved. +import h5py as h5 + + +def load_hdf5(fname, group): + with h5.File(fname, 'r') as data: + return data[group][()] + + +if __name__ == '__main__': + data = load_hdf5( + 'data.h5', group='u' + ) # you could have many groups in a HDF5 file! + print(data.shape) # print dimension of data + print(data[0, 0, :]) # prints the first 8 data entries in field `u` diff --git a/lecture14/parallel_hdf5/parallel_hdf5.cpp b/lecture14/parallel_hdf5/parallel_hdf5.cpp @@ -0,0 +1,44 @@ +// File : parallel_hdf5.cpp +// Description: Example parallel I/O using HDF5 and MPI +// To compile this code on the academic cluster load the following: +// +// module load gcc/10.2.0-fasrc01 +// openmpi/4.1.1-fasrc01 +// hdf5/1.12.1-fasrc01 +// +// Copyright 2023 Harvard University. All Rights Reserved. 
+#include "hdf5IO.h" +#include "xdmf_wrapper.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + if (8 != size) { // need p x p x p = 8 ranks for this demo code + MPI_Abort(MPI_COMM_WORLD, MPI_ERR_TOPOLOGY); + } + + // create Cartesian communicator + const int dims[] = {2, 2, 2}; + const int periods[] = {false, false, false}; + MPI_Comm cart_comm; + MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, false, &cart_comm); + + // subdomain size and data u + int rank; + MPI_Comm_rank(cart_comm, &rank); + constexpr int Nx = 4; + constexpr int Ny = 8; + constexpr int Nz = 16; + std::vector<float> u(Nx * Ny * Nz, static_cast<float>(rank)); + + write_hdf5_mpi("data", u, Nx, Ny, Nz, cart_comm); + + MPI_Comm_free(&cart_comm); + MPI_Finalize(); + return 0; +} diff --git a/lecture14/parallel_hdf5/submit.sh b/lecture14/parallel_hdf5/submit.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# File : submit.sh +# Description: Example HDF5 application job submission script using 8 MPI ranks +# Copyright 2023 Harvard University. All Rights Reserved. +#SBATCH --job-name=parallel_hdf5 +#SBATCH --output=parallel_hdf5_%j.out +#SBATCH --error=parallel_hdf5_%j.err +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=1 +#SBATCH --time=00:15:00 + +module purge +module load \ + gcc/10.2.0-fasrc01 \ + openmpi/4.1.1-fasrc01 \ + hdf5/1.12.1-fasrc01 + +srun -n1 -c1 make clean parallel_hdf5 + +srun ./parallel_hdf5 + +exit $? diff --git a/lecture14/parallel_hdf5/xdmf_wrapper.h b/lecture14/parallel_hdf5/xdmf_wrapper.h @@ -0,0 +1,46 @@ +// File : xdmf_wrapper.h +// Description: XDMF wrapper for HDF5 file +// https://www.xdmf.org/index.php/XDMF_Model_and_Format +// Copyright 2023 Harvard University. All Rights Reserved. 
+#ifndef XDMF_WRAPPER_H_9PHDQVEM +#define XDMF_WRAPPER_H_9PHDQVEM + +#include <cstdio> +#include <string> + +void xdmf_wrapper(const std::string &fname, + const unsigned long long file_dims[]) +{ + const double dx = + 1.0 / std::max(file_dims[0], std::max(file_dims[1], file_dims[2])); + FILE *xmf = 0; + xmf = fopen((fname + ".xmf").c_str(), "w"); + // clang-format off + fprintf(xmf, "<?xml version=\"1.0\" ?>\n"); + fprintf(xmf, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []>\n"); + fprintf(xmf, "<Xdmf Version=\"2.0\">\n"); + fprintf(xmf, " <Domain>\n"); + fprintf(xmf, " <Grid GridType=\"Uniform\">\n"); + fprintf(xmf, " <Time Value=\"00000\"/>\n"); + fprintf(xmf, " <Topology TopologyType=\"3DRECTMESH\" Dimensions =\"%llu %llu %llu\"/>\n", file_dims[0] + 1, file_dims[1] + 1, file_dims[2] + 1); + fprintf(xmf, " <Geometry GeometryType =\"ORIGIN_DXDYDZ\">\n"); + fprintf(xmf, " <DataItem Name =\"Origin\" Dimensions=\"3\" NumberType=\"Float\" Precision =\"8\" Format=\"XML\">\n"); + fprintf(xmf, " 0.000000e+00 0.000000e+00 0.000000e+00\n"); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " <DataItem Name =\"Spacing\" Dimensions=\"3\" NumberType=\"Float\" Precision =\"8\" Format=\"XML\">\n"); + fprintf(xmf, " %e %e %e\n", dx, dx, dx); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " </Geometry>\n"); + fprintf(xmf, " <Attribute Name=\"data\" AttributeType=\"Scalar\" Center=\"Cell\">\n"); + fprintf(xmf, " <DataItem Dimensions =\"%llu %llu %llu\" NumberType=\"Float\" Precision=\"4\" Format =\"HDF\">\n", file_dims[0], file_dims[1], file_dims[2]); + fprintf(xmf, " %s:/u\n", (fname + ".h5").c_str()); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " </Attribute>\n"); + fprintf(xmf, " </Grid>\n"); + fprintf(xmf, " </Domain>\n"); + fprintf(xmf, "</Xdmf>\n"); + // clang-format on + fclose(xmf); +} + +#endif /* XDMF_WRAPPER_H_9PHDQVEM */ diff --git a/lecture15/papi_examples_HW2/asm/.gitignore b/lecture15/papi_examples_HW2/asm/.gitignore @@ -0,0 +1 @@ +test diff --git 
a/lecture15/papi_examples_HW2/asm/Makefile b/lecture15/papi_examples_HW2/asm/Makefile @@ -0,0 +1,12 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic +LIBS = -lpapi + +.PHONY: clean + +test: test.cpp kernel.s + $(CXX) -O2 -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib \ + $(CXXFLAGS) -o $@ $^ $(LIBS) + +clean: + rm -f test diff --git a/lecture15/papi_examples_HW2/asm/kernel.s b/lecture15/papi_examples_HW2/asm/kernel.s @@ -0,0 +1,19 @@ + .intel_syntax noprefix + .text + .globl _Z4testPc + .type _Z4testPc, @function +_Z4testPc: + # Stack operations that end up in L1,L2,L3 caches. The stack size on Linux + # is usually around 8MB (see `ulimit -s`) so most of the stack operations + # will have around 4 cycle latency (L1 ideally). + # PAPI counts these in PAPI_LST_INS counter also (NOT WHAT WE WANT!) + mov QWORD PTR [rsp-8], rdi + mov rax, QWORD PTR [rsp-8] + # data cache miss: read from RAM (about 200 cycles) + # PAPI counts this in PAPI_LST_INS counter (WHAT WE WANT) + mov cl, BYTE PTR [rdi] + # write to pointer A[0] = 1 [this will cause the printed result to be 1] + # (about 200 cycles) + # PAPI counts this in PAPI_LST_INS counter (WHAT WE WANT) + mov BYTE PTR [rdi], 0x1 + ret diff --git a/lecture15/papi_examples_HW2/asm/test.cpp b/lecture15/papi_examples_HW2/asm/test.cpp @@ -0,0 +1,61 @@ +#include "papi.h" +#include <iostream> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +void test(char *A); + +int main() +{ + char A[100] = {0}; + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_LST_INS, PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCA}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // 
start PAPI measurement + PAPI_start(event_set); + + // run code to be measured + test(A); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_lst = counters[0]; // total load/store + const long long total_l1m = counters[1]; // total L1 load misses + const long long total_l2m = counters[2]; // total L2 load misses + const long long total_l3m = counters[3]; // total L3 load misses + // clang-format on + + int sum = 0; + for (int i = 0; i < 100; ++i) { + sum += A[i]; + } + + std::cout << "Result: " << sum << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1m << '\n'; + std::cout << "Total L2 data misses: " << total_l2m << '\n'; + std::cout << "Total L3 data accesses: " << total_l3m << '\n'; + std::cout << "Total load/store: " << total_lst << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/daxpy/.gitignore b/lecture15/papi_examples_HW2/daxpy/.gitignore @@ -0,0 +1,2 @@ +daxpy_O* +*.swp diff --git a/lecture15/papi_examples_HW2/daxpy/Makefile b/lecture15/papi_examples_HW2/daxpy/Makefile @@ -0,0 +1,25 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +.PHONY: clean + +all: daxpy_O0 daxpy_O1 daxpy_O2 daxpy_O3_autovec daxpy_O3_noautovec + +daxpy_O0: daxpy.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O1: daxpy.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O2: daxpy.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O3_autovec: daxpy.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O3_noautovec: daxpy.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f daxpy_O* diff --git a/lecture15/papi_examples_HW2/daxpy/daxpy.cpp b/lecture15/papi_examples_HW2/daxpy/daxpy.cpp @@ -0,0 
+1,106 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef double Real; + +void daxpy(const Real a, const Real *x, Real *y, const int n) +{ + // 2n flops + // 3n memory accesses + for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; + } +} + +int main(int argc, char *argv[]) +{ + int n = 1000000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> x(n, 1.0); + std::vector<Real> y(n, 0.001); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, + PAPI_TOT_INS, + PAPI_LST_INS, + PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + daxpy(1.0, x.data(), y.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); + + // run code to be measured + daxpy(1.0, x.data(), y.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 
2 * n; + const size_t mem_ops = 3 * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 1.0e-9; // Gflop/s + const double sum = std::accumulate(y.begin(), y.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " << 2 * n * sizeof(Real) / 1024 + << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/daxpy/disassemble.sh b/lecture15/papi_examples_HW2/daxpy/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='daxpy' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/dgemm/.gitignore b/lecture15/papi_examples_HW2/dgemm/.gitignore @@ -0,0 +1,2 @@ +dgemm_O* +*.swp diff --git a/lecture15/papi_examples_HW2/dgemm/Makefile b/lecture15/papi_examples_HW2/dgemm/Makefile @@ -0,0 +1,29 @@ +CXX = g++ 
+CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +# CXXFLAGS += -DCBLAS +# LIBS += -lopenblas +# CXXFLAGS += -DEIGEN -march=broadwell + +.PHONY: clean + +all: dgemm_O0 dgemm_O1 dgemm_O2 dgemm_O3_autovec dgemm_O3_noautovec + +dgemm_O0: dgemm.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O1: dgemm.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O2: dgemm.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O3_autovec: dgemm.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O3_noautovec: dgemm.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f dgemm_O* diff --git a/lecture15/papi_examples_HW2/dgemm/dgemm.cpp b/lecture15/papi_examples_HW2/dgemm/dgemm.cpp @@ -0,0 +1,139 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +#ifdef CBLAS +#include <cblas.h> +#elif defined(EIGEN) +#include <eigen3/Eigen/Dense> +// Wrapper to map pointer to Eigen matrix type (needed in benchmark) +template <typename T> +using EMat = Eigen::Map< + Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; +#endif /* CBLAS */ + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef double Real; + +void dgemm(const Real *A, const Real *B, Real *__restrict__ C, const size_t n) +{ +#ifdef CBLAS + cblas_dgemm(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + n, + n, + 1.0, + A, + n, + B, + n, + 1.0, + C, + n); +#elif defined(EIGEN) + // Matrix wrappers + const EMat<Real> Ap(const_cast<Real *>(A), n, n); + const EMat<Real> Bp(const_cast<Real *>(B), n, n); + EMat<Real> Cp(C, n, n); + Cp.noalias() += Ap * Bp; +#else + for (size_t i = 0; i < n; ++i) { + for (size_t k = 
0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +#endif /* CBLAS */ +} + +int main(int argc, char *argv[]) +{ + int n = 1000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> A(n * n, 0.1); + std::vector<Real> B(n * n, 0.2); + std::vector<Real> C(n * n, 0.3); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + dgemm(A.data(), B.data(), C.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); + + // run code to be measured + dgemm(A.data(), B.data(), C.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 2 * n * n * n + n * n; + const size_t mem_ops = 4 * n * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 
1.0e-9; // Gflop/s + const double sum = std::accumulate(C.begin(), C.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " + << 3 * n * n * sizeof(Real) / 1024 << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/dgemm/disassemble.sh b/lecture15/papi_examples_HW2/dgemm/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='dgemm' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/sgemv/.gitignore b/lecture15/papi_examples_HW2/sgemv/.gitignore @@ -0,0 +1,2 @@ +sgemv_O* +*.swp diff --git a/lecture15/papi_examples_HW2/sgemv/Makefile b/lecture15/papi_examples_HW2/sgemv/Makefile @@ -0,0 +1,25 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +.PHONY: clean + +all: sgemv_O0 sgemv_O1 sgemv_O2 sgemv_O3_autovec sgemv_O3_noautovec + +sgemv_O0: sgemv.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O1: sgemv.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O2: sgemv.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + 
+sgemv_O3_autovec: sgemv.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O3_noautovec: sgemv.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f sgemv_O* diff --git a/lecture15/papi_examples_HW2/sgemv/disassemble.sh b/lecture15/papi_examples_HW2/sgemv/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='sgemv' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/sgemv/sgemv.cpp b/lecture15/papi_examples_HW2/sgemv/sgemv.cpp @@ -0,0 +1,104 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef float Real; + +void sgemv(const Real *A, const Real *x, Real *__restrict__ y, const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} + +int main(int argc, char *argv[]) +{ + int n = 1000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> A(n * n, 0.1); + std::vector<Real> x(n, 1.0); + std::vector<Real> y(n, 0.001); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + sgemv(A.data(), x.data(), y.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); 
+ + // run code to be measured + sgemv(A.data(), x.data(), y.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 2 * n * n + n; + const size_t mem_ops = 2 * n * n + 2 * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 1.0e-9; // Gflop/s + const double sum = std::accumulate(y.begin(), y.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " + << (n * n + 2 * n) * sizeof(Real) / 1024 << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance 
[Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture16/allocator_test/.gitignore b/lecture16/allocator_test/.gitignore @@ -0,0 +1 @@ +allocator_test diff --git a/lecture16/allocator_test/Makefile b/lecture16/allocator_test/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g -Wall -Wextra -Wpedantic +.PHONY: clean + +allocator_test: allocator_test.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f allocator_test diff --git a/lecture16/allocator_test/allocator_test.cpp b/lecture16/allocator_test/allocator_test.cpp @@ -0,0 +1,55 @@ +#include <cstdio> +#include <cstdlib> + +#define _ALIGN_ 64 + +struct BadObject_t { + char buf[57]; // 57 byte buffer +}; + +struct NiceObject_t { + char buf[57]; // 57 byte buffer + char pad[7]; // 7 byte padding +}; + +void testAddress(void *base, + const size_t align_at, + const char *msg, + const size_t size) +{ + printf("%s [%lu]: misaligned by %lu bytes\n", + msg, + size, + reinterpret_cast<size_t>(base) % align_at); +} + +template <typename T> +void testAlignment() +{ + constexpr size_t size = sizeof(T); + + // C++ way + T *p = new T[2]; + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + delete[] p; + + // C way + p = reinterpret_cast<T *>(malloc(2 * size)); + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + free(p); + + // POSIX memalign way + posix_memalign((void **)&p, _ALIGN_, 2 * size); + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + free(p); +} + +int main() +{ + testAlignment<BadObject_t>(); + testAlignment<NiceObject_t>(); + return 0; +} diff --git a/lecture16/pointer_dereference/.gitignore b/lecture16/pointer_dereference/.gitignore @@ -0,0 +1 @@ +pointer_dereference diff --git a/lecture16/pointer_dereference/Makefile b/lecture16/pointer_dereference/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g 
-Wall -Wextra -Wpedantic +.PHONY: clean + +pointer_dereference: pointer_dereference.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f pointer_dereference diff --git a/lecture16/pointer_dereference/pointer_dereference.cpp b/lecture16/pointer_dereference/pointer_dereference.cpp @@ -0,0 +1,15 @@ +#include <iostream> + +int main() +{ + int *p = new int; + *p = 1; + + std::cout << "Pointer address: " << &p << '\n'; + std::cout << "Address pointer points to: " << p << '\n'; + std::cout << "Hex value stored at address: " << std::hex << "0x0" << *p + << '\n'; + + delete p; + return 0; +} diff --git a/lecture17/registers/.gitignore b/lecture17/registers/.gitignore @@ -0,0 +1,2 @@ +*.o +registers diff --git a/lecture17/registers/Makefile b/lecture17/registers/Makefile @@ -0,0 +1,12 @@ +.PHONY: clean + +# linker +registers: registers.o + ld -o $@ $< + +# assembly +registers.o: registers.s + as -o $@ $< + +clean: + rm -f registers registers.o diff --git a/lecture17/registers/registers.s b/lecture17/registers/registers.s @@ -0,0 +1,11 @@ +.text +.intel_syntax noprefix +.global _start +_start: + mov rax, 0x1 # initialize rax with value 1 + mov rbx, 0x2 # initialize rbx with value 2 + add rax, rbx # add them together and store result in rax + # this exits the program without libc and only works on 64-bit Linux: + mov rax, 60 + xor rdi, rdi + syscall diff --git a/lecture18_19/aliasing/.gitignore b/lecture18_19/aliasing/.gitignore @@ -0,0 +1 @@ +align diff --git a/lecture18_19/aliasing/Makefile b/lecture18_19/aliasing/Makefile @@ -0,0 +1,12 @@ +.PHONY: clean + +all: main aliasing.s + +aliasing.s: aliasing.c + gcc -O3 -S -masm=intel -o $@ $< + +main: main.c aliasing.c + gcc -O3 -o $@ $^ + +clean: + rm -f aliasing.s main diff --git a/lecture18_19/aliasing/aliasing.c b/lecture18_19/aliasing/aliasing.c @@ -0,0 +1,13 @@ +int f(int *a, int *b) +{ + *a = 1; + *b = 2; + return *a; +} + +int g(int *__restrict__ a, int *__restrict__ b) +{ + *a = 1; + *b = 2; + return *a; +} diff --git 
a/lecture18_19/aliasing/main.c b/lecture18_19/aliasing/main.c @@ -0,0 +1,15 @@ +#include <stdio.h> + +int f(int *a, int *b); +int g(int *a, int *b); + +int main(void) +{ + int a = 0; + int *b = &a; // legal alias + printf("%d\n", f(&a, b)); // expect 2 + + a = 0; + printf("%d\n", g(&a, b)); // expect 2(?) + return 0; +} diff --git a/lecture18_19/align/.gitignore b/lecture18_19/align/.gitignore @@ -0,0 +1 @@ +align diff --git a/lecture18_19/align/Makefile b/lecture18_19/align/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +align: align.c + gcc -o $@ $< + +clean: + rm -f align diff --git a/lecture18_19/align/align.c b/lecture18_19/align/align.c @@ -0,0 +1,30 @@ +#include <stdio.h> +#include <stdlib.h> + +int main(void) +{ + printf("Byte alignment of double is: %zu\n", _Alignof(double)); + + printf("\nSTACK ALLOCATION:\n"); + _Alignas(8) double d; + // _Alignas(32) double d; + // _Alignas(64) double d; + printf("Address of d % 2 = %zu\n", ((size_t)&d) % 2); + printf("Address of d % 4 = %zu\n", ((size_t)&d) % 4); + printf("Address of d % 8 = %zu\n", ((size_t)&d) % 8); + printf("Address of d % 16 = %zu\n", ((size_t)&d) % 16); + printf("Address of d % 32 = %zu\n", ((size_t)&d) % 32); + printf("Address of d % 64 = %zu\n", ((size_t)&d) % 64); + + printf("\nHEAP ALLOCATION:\n"); + double *pd; + posix_memalign((void **)&pd, 8, sizeof(double)); // segfault if alignment <8 + printf("Address of pd % 2 = %zu\n", ((size_t)pd) % 2); + printf("Address of pd % 4 = %zu\n", ((size_t)pd) % 4); + printf("Address of pd % 8 = %zu\n", ((size_t)pd) % 8); + printf("Address of pd % 16 = %zu\n", ((size_t)pd) % 16); + printf("Address of pd % 32 = %zu\n", ((size_t)pd) % 32); + printf("Address of pd % 64 = %zu\n", ((size_t)pd) % 64); + free(pd); + return 0; +} diff --git a/lecture18_19/autovec/Makefile b/lecture18_19/autovec/Makefile @@ -0,0 +1,24 @@ +.PHONY: clean + +all: sse4.s avx2.s avx2_fma.s sse4_omp.s avx2_omp.s avx2_fma_omp.s + +sse4.s: f.c + gcc -O3 -msse4 -S -masm=intel -o $@ $< + +avx2.s: 
f.c + gcc -O3 -mavx2 -S -masm=intel -o $@ $< + +avx2_fma.s: f.c + gcc -O3 -mavx2 -mfma -S -masm=intel -o $@ $< + +sse4_omp.s: f.c + gcc -O3 -msse4 -fopenmp -S -masm=intel -o $@ $< + +avx2_omp.s: f.c + gcc -O3 -mavx2 -fopenmp -S -masm=intel -o $@ $< + +avx2_fma_omp.s: f.c + gcc -O3 -mavx2 -mfma -fopenmp -S -masm=intel -o $@ $< + +clean: + rm -f sse4*.s avx2*.s avx2_fma*.s diff --git a/lecture18_19/autovec/f.c b/lecture18_19/autovec/f.c @@ -0,0 +1,20 @@ +// #define RESTRICT +// #define IVDEP + +#ifdef RESTRICT +void f(float *__restrict__ dst, float *__restrict__ src, float a) +#else +void f(float *dst, float *src, float a) +#endif /* RESTRICT */ +{ +#ifdef IVDEP +#pragma GCC ivdep +#else +#ifdef _OPENMP +#pragma omp simd +#endif /* _OPENMP */ +#endif /* IVDEP */ + for (int i = 0; i < 1024; ++i) { + dst[i] = dst[i] + a * src[i]; + } +} diff --git a/lecture18_19/intrinsics/_mm_set_ps1/.gitignore b/lecture18_19/intrinsics/_mm_set_ps1/.gitignore @@ -0,0 +1,3 @@ +*.s +*.o +main diff --git a/lecture18_19/intrinsics/_mm_set_ps1/Makefile b/lecture18_19/intrinsics/_mm_set_ps1/Makefile @@ -0,0 +1,15 @@ +.PHONY: clean + +all: sse4.s ones.o main + +sse4.s: ones.c + gcc -O3 -msse4 -S -masm=intel -o $@ $< + +main: main.c ones.o + gcc -g -O0 -o $@ $^ + +ones.o: ones.c + gcc -g -O3 -msse4 -c -o $@ $< + +clean: + rm -f sse4*.s ones.o main diff --git a/lecture18_19/intrinsics/_mm_set_ps1/main.c b/lecture18_19/intrinsics/_mm_set_ps1/main.c @@ -0,0 +1,9 @@ +#include <x86intrin.h> + +__m128 ones(); + +int main(void) +{ + __m128 ret = ones(); + return 0; +} diff --git a/lecture18_19/intrinsics/_mm_set_ps1/ones.c b/lecture18_19/intrinsics/_mm_set_ps1/ones.c @@ -0,0 +1,6 @@ +#include <x86intrin.h> +__m128 ones(void) +{ + // broadcast value 1.0f (float) in all 4 SIMD lanes + return _mm_set_ps1(1.0f); +} diff --git a/lecture18_19/intrinsics/constants/set1_ps.c b/lecture18_19/intrinsics/constants/set1_ps.c @@ -0,0 +1,3 @@ +#include <x86intrin.h> +__m128 set1_ps() { return _mm_set1_ps(1.0f); } 
// compile-time constant +__m128 set1_ps(float x) { return _mm_set1_ps(x); } // set by argument diff --git a/lecture18_19/intrinsics/constants/set_ps.c b/lecture18_19/intrinsics/constants/set_ps.c @@ -0,0 +1,5 @@ +#include <x86intrin.h> +__m128 set_ps(float x0, float x1, float x2, float x3) +{ + return _mm_set_ps(x3, x2, x1, x0); +} diff --git a/lecture18_19/intrinsics/constants/set_ss.c b/lecture18_19/intrinsics/constants/set_ss.c @@ -0,0 +1,3 @@ +#include <x86intrin.h> +__m128 set_ss() { return _mm_set_ss(1.0f); } // compile-time constant +__m128 set_ss(float x) { return _mm_set_ss(x); } // set by argument diff --git a/lecture18_19/intrinsics/constants/setr_ps.c b/lecture18_19/intrinsics/constants/setr_ps.c @@ -0,0 +1,5 @@ +#include <x86intrin.h> +__m128 setr_ps(float x0, float x1, float x2, float x3) +{ + return _mm_setr_ps(x3, x2, x1, x0); +} diff --git a/lecture18_19/intrinsics/constants/setzero.c b/lecture18_19/intrinsics/constants/setzero.c @@ -0,0 +1,2 @@ +#include <x86intrin.h> +__m128 setzero() { return _mm_setzero_ps(); } diff --git a/lecture18_19/intrinsics/load_store/aligned_load_store.c b/lecture18_19/intrinsics/load_store/aligned_load_store.c @@ -0,0 +1,7 @@ +#include <x86intrin.h> +void f(float *a, float *b) +{ + __m128 r0 = _mm_load_ps(a); + r0 = _mm_add_ps(r0, _mm_load_ps(a + 4)); + _mm_store_ps(b, r0); +} diff --git a/lecture18_19/intrinsics/load_store/reverse_load_store.c b/lecture18_19/intrinsics/load_store/reverse_load_store.c @@ -0,0 +1,2 @@ +#include <x86intrin.h> +void f(float *a, float *b) { _mm_storeu_ps(b, _mm_loadr_ps(a)); } diff --git a/lecture18_19/intrinsics/load_store/unaligned_load_store.c b/lecture18_19/intrinsics/load_store/unaligned_load_store.c @@ -0,0 +1,7 @@ +#include <x86intrin.h> +void f(float *a, float *b) +{ + __m128 r0 = _mm_loadu_ps(a); + r0 = _mm_add_ps(r0, _mm_loadu_ps(a + 4)); + _mm_storeu_ps(b, r0); +} diff --git a/lecture18_19/intrinsics/transpose/transpose.c b/lecture18_19/intrinsics/transpose/transpose.c @@ -0,0 
+1,15 @@ +#include <x86intrin.h> +void transpose4(float A[4][4]) +{ + __m128 a = _mm_load_ps(&A[0][0]); + __m128 b = _mm_load_ps(&A[1][0]); + __m128 c = _mm_load_ps(&A[2][0]); + __m128 d = _mm_load_ps(&A[3][0]); + + _MM_TRANSPOSE4_PS(a, b, c, d); + + _mm_store_ps(&A[0][0], a); + _mm_store_ps(&A[1][0], b); + _mm_store_ps(&A[2][0], c); + _mm_store_ps(&A[3][0], d); +} diff --git a/lecture18_19/saxpy/.gitignore b/lecture18_19/saxpy/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture18_19/saxpy/Makefile b/lecture18_19/saxpy/Makefile @@ -0,0 +1,17 @@ +CXX = gcc +.PHONY: clean + +main: main.c saxpy.o saxpy_SSE.o saxpy_SSE_FMA.o + $(CXX) -O1 -fopenmp -o $@ $^ + +saxpy.o: saxpy.c + $(CXX) -O1 -c -o $@ $< + +saxpy_SSE.o: saxpy_SSE.c + $(CXX) -O3 -msse4 -c -o $@ $< + +saxpy_SSE_FMA.o: saxpy_SSE_FMA.c + $(CXX) -O3 -msse4 -mfma -c -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture18_19/saxpy/main.c b/lecture18_19/saxpy/main.c @@ -0,0 +1,66 @@ +#include <omp.h> +#include <stdio.h> +#include <x86intrin.h> + +#define N (1 << 11) // both vectors fit in cache +// #define N (1 << 26) // vectors too large to fit in cache -> bandwidth bound + +// compute kernels +void saxpy(float *x, float *y, float a, size_t n); +void saxpy_SSE(float *x, float *y, float a, size_t n); +void saxpy_SSE_FMA(float *x, float *y, float a, size_t n); + +void initialize(float *x, size_t n) +{ + for (size_t i = 0; i < n; ++i) { + x[i] = 1.0f; + } +} + +int main(void) +{ + const int n_sample = 50; + + // allocate aligned vectors: + float *x = (float *)_mm_malloc(N * sizeof(float), 16); + float *y = (float *)_mm_malloc(N * sizeof(float), 16); + + // initialize data: + initialize(x, N); + initialize(y, N); + + // scalar: + saxpy(x, y, 2.0, N); // warm-up cache + double t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy(x, y, 2.0, N); + } + double t1 = omp_get_wtime(); + const double t_gold = (t1 - t0) / n_sample; + printf("Scalar saxpy: %e sec\n", t_gold); + + // SIMD (SSE, 128-bit): 
+ saxpy_SSE(x, y, 2.0, N); // warm-up cache + t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy_SSE(x, y, 2.0, N); + } + t1 = omp_get_wtime(); + const double t_sse = (t1 - t0) / n_sample; + printf("SSE saxpy: %e sec (%.2fx speedup)\n", t_sse, t_gold / t_sse); + + // SIMD w/ FMA (SSE, 128-bit): + saxpy_SSE_FMA(x, y, 2.0, N); // warm-up cache + t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy_SSE_FMA(x, y, 2.0, N); + } + t1 = omp_get_wtime(); + const double t_fma = (t1 - t0) / n_sample; + printf("SSE FMA saxpy: %e sec (%.2fx speedup)\n", t_fma, t_gold / t_fma); + + // clean up: + _mm_free(x); + _mm_free(y); + return 0; +} diff --git a/lecture18_19/saxpy/saxpy.c b/lecture18_19/saxpy/saxpy.c @@ -0,0 +1,7 @@ +#include <stddef.h> +void saxpy(float *x, float *y, float a, size_t n) +{ + for (size_t i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; + } +} diff --git a/lecture18_19/saxpy/saxpy.s b/lecture18_19/saxpy/saxpy.s @@ -0,0 +1,26 @@ + .file "saxpy.c" + .intel_syntax noprefix + .text + .globl saxpy + .type saxpy, @function +saxpy: +.LFB0: + .cfi_startproc + test rdx, rdx + je .L1 + mov eax, 0 +.L3: + movaps xmm1, xmm0 + mulss xmm1, DWORD PTR [rdi+rax*4] + addss xmm1, DWORD PTR [rsi+rax*4] + movss DWORD PTR [rsi+rax*4], xmm1 + add rax, 1 + cmp rdx, rax + jne .L3 +.L1: + ret + .cfi_endproc +.LFE0: + .size saxpy, .-saxpy + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture18_19/saxpy/saxpy_SSE.c b/lecture18_19/saxpy/saxpy_SSE.c @@ -0,0 +1,12 @@ +#include <x86intrin.h> + +void saxpy_SSE(float *x, float *y, float a, size_t n) +{ + const __m128 a4 = _mm_set1_ps(a); + // assumes n % 4 == 0 + for (size_t i = 0; i < n; i += 4) { + __m128 r0 = _mm_load_ps(x + i); + __m128 r1 = _mm_load_ps(y + i); + _mm_store_ps(y + i, _mm_add_ps(_mm_mul_ps(r0, a4), r1)); + } +} diff --git a/lecture18_19/saxpy/saxpy_SSE.s b/lecture18_19/saxpy/saxpy_SSE.s @@ -0,0 +1,30 @@ + .file "saxpy_SSE.c" + .intel_syntax noprefix + 
.text + .p2align 4 + .globl saxpy_SSE + .type saxpy_SSE, @function +saxpy_SSE: +.LFB5667: + .cfi_startproc + shufps xmm0, xmm0, 0 + test rdx, rdx + je .L1 + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L3: + movaps xmm1, XMMWORD PTR [rdi+rax*4] + mulps xmm1, xmm0 + addps xmm1, XMMWORD PTR [rsi+rax*4] + movaps XMMWORD PTR [rsi+rax*4], xmm1 + add rax, 4 + cmp rdx, rax + ja .L3 +.L1: + ret + .cfi_endproc +.LFE5667: + .size saxpy_SSE, .-saxpy_SSE + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture18_19/saxpy/saxpy_SSE_FMA.c b/lecture18_19/saxpy/saxpy_SSE_FMA.c @@ -0,0 +1,11 @@ +#include <x86intrin.h> + +void saxpy_SSE_FMA(float *x, float *y, float a, size_t n) +{ + const __m128 a4 = _mm_set1_ps(a); + // assumes n % 4 == 0 + for (size_t i = 0; i < n; i += 4) { + _mm_store_ps(y + i, + _mm_fmadd_ps(a4, _mm_load_ps(x + i), _mm_load_ps(y + i))); + } +} diff --git a/lecture18_19/sgemv/.gitignore b/lecture18_19/sgemv/.gitignore @@ -0,0 +1 @@ +*.swp diff --git a/lecture18_19/sgemv/Makefile b/lecture18_19/sgemv/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O2 -S -masm=intel +.PHONY: clean + +sgemv.s: sgemv.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f sgemv.s diff --git a/lecture18_19/sgemv/sgemv.cpp b/lecture18_19/sgemv/sgemv.cpp @@ -0,0 +1,22 @@ +#include <cstdlib> + +void sgemv_noaliasing(const float *A, + const float *x, + float *__restrict__ y, + const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} + +void sgemv_aliasing(const float *A, const float *x, float *y, const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} diff --git a/lecture18_19/sgemv/sgemv.s b/lecture18_19/sgemv/sgemv.s @@ -0,0 +1,73 @@ + .file "sgemv.cpp" + .intel_syntax noprefix + .text + .p2align 4 + .globl _Z16sgemv_noaliasingPKfS0_Pfm + .type _Z16sgemv_noaliasingPKfS0_Pfm, @function 
+_Z16sgemv_noaliasingPKfS0_Pfm: +.LFB20: + .cfi_startproc + test rcx, rcx + je .L1 + lea r8, 0[0+rcx*4] + lea r9, [rdx+r8] + .p2align 4,,10 + .p2align 3 +.L3: + movss xmm1, DWORD PTR [rdx] + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L4: + movss xmm0, DWORD PTR [rdi+rax*4] + mulss xmm0, DWORD PTR [rsi+rax*4] + add rax, 1 + addss xmm1, xmm0 + cmp rcx, rax + jne .L4 + movss DWORD PTR [rdx], xmm1 + add rdx, 4 + add rdi, r8 + cmp r9, rdx + jne .L3 +.L1: + ret + .cfi_endproc +.LFE20: + .size _Z16sgemv_noaliasingPKfS0_Pfm, .-_Z16sgemv_noaliasingPKfS0_Pfm + .p2align 4 + .globl _Z14sgemv_aliasingPKfS0_Pfm + .type _Z14sgemv_aliasingPKfS0_Pfm, @function +_Z14sgemv_aliasingPKfS0_Pfm: +.LFB21: + .cfi_startproc + test rcx, rcx + je .L10 + lea r8, 0[0+rcx*4] + lea r9, [rdx+r8] + .p2align 4,,10 + .p2align 3 +.L12: + movss xmm1, DWORD PTR [rdx] + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L13: + movss xmm0, DWORD PTR [rdi+rax*4] + mulss xmm0, DWORD PTR [rsi+rax*4] + add rax, 1 + addss xmm1, xmm0 + movss DWORD PTR [rdx], xmm1 + cmp rcx, rax + jne .L13 + add rdx, 4 + add rdi, r8 + cmp r9, rdx + jne .L12 +.L10: + ret + .cfi_endproc +.LFE21: + .size _Z14sgemv_aliasingPKfS0_Pfm, .-_Z14sgemv_aliasingPKfS0_Pfm + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture20/ispc_data_layout/.gitignore b/lecture20/ispc_data_layout/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_data_layout/Makefile b/lecture20/ispc_data_layout/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c kernels.o + $(CXX) $(CXXFLAGS) -g -O0 -o $@ $^ + +kernels.o: kernels.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_data_layout/kernels.ispc b/lecture20/ispc_data_layout/kernels.ispc @@ -0,0 +1,40 @@ +export void aos() +{ + struct Foo { + float x, y, z; + }; + uniform Foo a[4] = { + {0, 1, 2}, + {3, 4, 5}, + {6, 7, 8}, + {9, 10, 11}, + }; + int index = programIndex; // varying int + float v = a[index].x; +} + +export void hybrid_soa() +{ + struct Foo4 { + float x[4], y[4], z[4]; + }; + uniform Foo4 a[2] = { + {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, + {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}, + }; + int index = programIndex; // varying int + float v = a[index / 4].x[index & 3]; +} + +export void short_soa() +{ + struct Foo { + float x, y, z; + }; + soa<4> struct Foo a[2] = { + {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, + {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}, + }; + int index = programIndex; // varying int + float v = a[index].x; +} diff --git a/lecture20/ispc_data_layout/main.c b/lecture20/ispc_data_layout/main.c @@ -0,0 +1,11 @@ +int aos(); +int hybrid_soa(); +int short_soa(); + +int main(void) +{ + aos(); + hybrid_soa(); + short_soa(); + return 0; +} diff --git a/lecture20/ispc_foreach/.gitignore b/lecture20/ispc_foreach/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_foreach/Makefile b/lecture20/ispc_foreach/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c kernels.o + $(CXX) $(CXXFLAGS) -g -O0 -o $@ $^ + +kernels.o: kernels.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_foreach/kernels.ispc b/lecture20/ispc_foreach/kernels.ispc @@ -0,0 +1,17 @@ +export uniform int f(const uniform int a[], const uniform int n) +{ + varying int sum4 = 0.0; + for (uniform int i = 0; i < n; i += programCount) { + sum4 += a[i + programIndex]; + } + return reduce_add(sum4); +} + +export uniform int g(const uniform int a[], const uniform int n) +{ + varying int sum4 = 0.0; + foreach (i = 0 ... n) { + sum4 += a[i]; + } + return reduce_add(sum4); +} diff --git a/lecture20/ispc_foreach/main.c b/lecture20/ispc_foreach/main.c @@ -0,0 +1,10 @@ +int f(); +int g(); + +int main(void) +{ + int ary[] = {1, 2, 3, 4}; + int a = f(ary, 4); + int b = g(ary, 4); + return a + b; +} diff --git a/lecture20/ispc_pointers_arrays/.gitignore b/lecture20/ispc_pointers_arrays/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_pointers_arrays/Makefile b/lecture20/ispc_pointers_arrays/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c f.o + $(CXX) $(CXXFLAGS) -g -O1 -o $@ $^ + +f.o: f.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_pointers_arrays/f.ispc b/lecture20/ispc_pointers_arrays/f.ispc @@ -0,0 +1,35 @@ +export uniform int f() +{ + // uniform array shared among gang + uniform int a[] = {0, 1, 2, 3}; + + // varying array per programIndex + varying int b[] = {0, 1, 2, 3, 4}; + + if (programIndex == 0) { + a[programIndex] = -1; + b[programIndex] = -1; + } + if (programIndex == 1) { + // shared among gang + a[programIndex] = -2; + + // can only access data for this program instance + b[programIndex - 1] = -2; + b[programIndex + 0] = -3; + b[programIndex + 1] = -4; + } + + // you can also define pointers to arrays + uniform int *uniform uupa = a; // OK + uniform int *varying uvpa = a; // OK (address of array is broadcast) + // varying int *uniform vupa = a; // NOT OK! + // varying int *varying vvpa = a; // NOT OK! + + // uniform int *uniform uupb = b; // NOT OK + // uniform int *varying uvpb = b; // NOT OK + varying int *uniform vupb = b; // OK + varying int *varying vvpb = b; // OK + + return reduce_add(a[programIndex]) + reduce_add(b[programIndex]); +} diff --git a/lecture20/ispc_pointers_arrays/main.c b/lecture20/ispc_pointers_arrays/main.c @@ -0,0 +1,8 @@ +#include <stdio.h> +int f(); + +int main(void) +{ + printf("f() = %d\n", f()); + return 0; +} diff --git a/lecture20/maximal_convergence/.gitignore b/lecture20/maximal_convergence/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/maximal_convergence/Makefile b/lecture20/maximal_convergence/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX ?= g++ +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic -std=c++11 +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 +.PHONY: clean + +main: main.cpp f.o + $(CXX) $(CXXFLAGS) -g -O1 -o $@ $^ + +f.o: f.ispc + $(ISPC) -g -O0 $(extra) -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/maximal_convergence/f.ispc b/lecture20/maximal_convergence/f.ispc @@ -0,0 +1,14 @@ +export uniform int f() +{ + varying int a = {1, -1, 1, 1}; // vector (could omit `varying` as well) + varying int b = {-1, -1, -1, -1}; // vector + + if (a < 0) { + a = 0; + } else { + a += b; + } + + uniform int x = 1; // scalar storage + return x + reduce_add(a); // expect: 1 +} diff --git a/lecture20/maximal_convergence/main.cpp b/lecture20/maximal_convergence/main.cpp @@ -0,0 +1,8 @@ +#include <iostream> +extern "C" int f(void); + +int main(void) +{ + std::cout << f() << std::endl; + return 0; +} diff --git a/lecture20/sse_vector/.gitignore b/lecture20/sse_vector/.gitignore @@ -0,0 +1 @@ +sse_vector diff --git a/lecture20/sse_vector/Makefile b/lecture20/sse_vector/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +sse_vector: sse_vector.c + gcc -g -O0 -msse -o $@ $^ + +clean: + rm -f sse_vector diff --git a/lecture20/sse_vector/sse_vector.c b/lecture20/sse_vector/sse_vector.c @@ -0,0 +1,8 @@ +#include <x86intrin.h> + +int main() +{ + __m128 a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); + + return 0; +} diff --git a/lecture21/cuda_vec_add/.gitignore b/lecture21/cuda_vec_add/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture21/cuda_vec_add/Makefile b/lecture21/cuda_vec_add/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +main: add_kernel_grid.cu + nvcc -O2 -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture21/cuda_vec_add/add_kernel_grid.cu b/lecture21/cuda_vec_add/add_kernel_grid.cu @@ -0,0 +1,51 @@ +// File : add_kernel_grid.cu +// Description: Simple vector add kernel (launch grid: 4096 blocks, @256) +// Copyright 2022 Harvard University. All Rights Reserved. 
+#include <cmath> +#include <iostream> + +// Kernel function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < n; i += stride) + y[i] = x[i] + y[i]; +} + +int main(void) +{ + int N = 1 << 20; + float *x, *y; + + // Allocate Unified Memory – accessible from CPU or GPU + cudaMallocManaged(&x, N * sizeof(float)); + cudaMallocManaged(&y, N * sizeof(float)); + + // initialize x and y arrays on the host + for (int i = 0; i < N; i++) { + x[i] = 1.0f; + y[i] = 2.0f; + } + + // Run kernel on 1M elements on the GPU + int block_size = 256; + int n_blocks = (N + block_size - 1) / block_size; + add<<<n_blocks, block_size>>>(N, x, y); + + // Wait for GPU to finish before accessing on host. CUDA kernel calls do + // not block the calling CPU thread. + cudaDeviceSynchronize(); + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < N; i++) + maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f)); + std::cout << "Max error: " << maxError << std::endl; + + // Free memory + cudaFree(x); + cudaFree(y); + + return 0; +}