cs205-lecture-examples

Example codes used during Harvard CS205 lectures
git clone https://git.0xfab.ch/cs205-lecture-examples.git
Log | Files | Refs | README | LICENSE

commit 6c242dbac5429593c3e32d236372913eed5ea3d8
parent e151e67906899c205dde309dca6452b00bb78743
Author: Fabian Wermelinger <fab4100@posteo.net>
Date:   Thu, 15 Aug 2024 17:10:25 +0200

Add lecture codes

Diffstat:
Alecture02/compilation_phases/.gitignore | 4++++
Alecture02/compilation_phases/Makefile | 22++++++++++++++++++++++
Alecture02/compilation_phases/hello.c | 8++++++++
Alecture02/heap_and_stack/.gitignore | 2++
Alecture02/heap_and_stack/Makefile | 13+++++++++++++
Alecture02/heap_and_stack/heap_stack.c | 23+++++++++++++++++++++++
Alecture02/heap_and_stack/stackoverflow.c | 22++++++++++++++++++++++
Alecture03/multi_index_and_flat_index/.gitignore | 1+
Alecture03/multi_index_and_flat_index/Makefile | 8++++++++
Alecture03/multi_index_and_flat_index/multi_and_flat.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture04/counter/.gitignore | 1+
Alecture04/counter/Makefile | 8++++++++
Alecture04/counter/counter.cpp | 25+++++++++++++++++++++++++
Alecture05/address_sanitizer/openmp/.gitignore | 3+++
Alecture05/address_sanitizer/openmp/Makefile | 23+++++++++++++++++++++++
Alecture05/address_sanitizer/openmp/two_ompthreads.cpp | 28++++++++++++++++++++++++++++
Alecture05/address_sanitizer/pthreads/.gitignore | 3+++
Alecture05/address_sanitizer/pthreads/Makefile | 23+++++++++++++++++++++++
Alecture05/address_sanitizer/pthreads/two_pthreads.cpp | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture05/sequential_consistency/.gitignore | 2++
Alecture05/sequential_consistency/Makefile | 14++++++++++++++
Alecture05/sequential_consistency/README.md | 5+++++
Alecture05/sequential_consistency/scenario_1_3.cpp | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture05/sequential_consistency/scenario_2.cpp | 34++++++++++++++++++++++++++++++++++
Alecture06/omp_conditional/.gitignore | 2++
Alecture06/omp_conditional/Makefile | 14++++++++++++++
Alecture06/omp_conditional/omp_conditional.cpp | 12++++++++++++
Alecture06/omp_demo/.gitignore | 1+
Alecture06/omp_demo/Makefile | 9+++++++++
Alecture06/omp_demo/omp_demo.cpp | 19+++++++++++++++++++
Alecture07_08/omp_critical/.gitignore | 2++
Alecture07_08/omp_critical/Makefile | 14++++++++++++++
Alecture07_08/omp_critical/omp_critical.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/.gitignore | 5+++++
Alecture07_08/omp_data_attributes/Makefile | 29+++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_firstprivate.cpp | 24++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_lastprivate.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_private.cpp | 24++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_region.cpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_static.cpp | 23+++++++++++++++++++++++
Alecture07_08/omp_data_attributes/omp_threadprivate.cpp | 33+++++++++++++++++++++++++++++++++
Alecture07_08/omp_default/.gitignore | 1+
Alecture07_08/omp_default/Makefile | 9+++++++++
Alecture07_08/omp_default/omp_default.cpp | 15+++++++++++++++
Alecture07_08/omp_device/.gitignore | 1+
Alecture07_08/omp_device/Makefile | 15+++++++++++++++
Alecture07_08/omp_device/omp_device.cpp | 19+++++++++++++++++++
Alecture07_08/omp_schedule/.gitignore | 1+
Alecture07_08/omp_schedule/Makefile | 9+++++++++
Alecture07_08/omp_schedule/omp_schedule.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture07_08/omp_scoping/Makefile | 12++++++++++++
Alecture07_08/omp_scoping/main.c | 18++++++++++++++++++
Alecture07_08/omp_scoping/myfunc.c | 6++++++
Alecture07_08/omp_wtime/.gitignore | 1+
Alecture07_08/omp_wtime/Makefile | 9+++++++++
Alecture07_08/omp_wtime/omp_wtime.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/.gitignore | 4++++
Alecture09/thread_affinity/Makefile | 20++++++++++++++++++++
Alecture09/thread_affinity/affinity.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_close.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_master.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/affinity_spread.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture09/thread_affinity/helper.h | 26++++++++++++++++++++++++++
Alecture09/thread_affinity/omp_places_cores.sh | 2++
Alecture09/thread_affinity/omp_places_sockets.sh | 2++
Alecture09/thread_affinity/omp_places_threads.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_close.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_false.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_master.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_spread.sh | 2++
Alecture09/thread_affinity/omp_proc_bind_true.sh | 2++
Alecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf | 0
Alecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat | 8++++++++
Alecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat | 8++++++++
Alecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf | 0
Alecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat | 7+++++++
Alecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat | 7+++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Licence.txt | 202+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile | 69+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi | 13+++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun | 10++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel | 11+++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/README.txt | 113+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/arraybench.c | 133+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/arraybench.h | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/common.c | 374+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/common.h | 80+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/schedbench.c | 145+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/schedbench.h | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/syncbench.c | 253+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/syncbench.h | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/taskbench.c | 331+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/openmpbench_C_v31/taskbench.h | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/plot_overhead.py | 35+++++++++++++++++++++++++++++++++++
Alecture10/openmp_overhead/run_benchmark.sh | 21+++++++++++++++++++++
Alecture10/prefetch/Makefile | 13+++++++++++++
Alecture10/prefetch/loop_unroll_prefetch.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture11/hello_mpi/.gitignore | 1+
Alecture11/hello_mpi/Makefile | 9+++++++++
Alecture11/hello_mpi/hello_mpi.cpp | 17+++++++++++++++++
Alecture11/rank_size_proc/.gitignore | 1+
Alecture11/rank_size_proc/Makefile | 9+++++++++
Alecture11/rank_size_proc/rank_size_proc.cpp | 35+++++++++++++++++++++++++++++++++++
Alecture11/thread_support/.gitignore | 1+
Alecture11/thread_support/Makefile | 9+++++++++
Alecture11/thread_support/thread_support.cpp | 27+++++++++++++++++++++++++++
Alecture12/endianness/.gitignore | 1+
Alecture12/endianness/Makefile | 9+++++++++
Alecture12/endianness/endianness.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_bsend/.gitignore | 1+
Alecture12/mpi_bsend/Makefile | 9+++++++++
Alecture12/mpi_bsend/mpi_bsend.cpp | 37+++++++++++++++++++++++++++++++++++++
Alecture12/mpi_gatherv_scatterv/.gitignore | 1+
Alecture12/mpi_gatherv_scatterv/Makefile | 9+++++++++
Alecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_get_count/.gitignore | 1+
Alecture12/mpi_get_count/Makefile | 9+++++++++
Alecture12/mpi_get_count/mpi_get_count.cpp | 33+++++++++++++++++++++++++++++++++
Alecture12/mpi_minloc/.gitignore | 1+
Alecture12/mpi_minloc/Makefile | 9+++++++++
Alecture12/mpi_minloc/mpi_minloc.cpp | 31+++++++++++++++++++++++++++++++
Alecture12/mpi_p2p/.gitignore | 1+
Alecture12/mpi_p2p/Makefile | 9+++++++++
Alecture12/mpi_p2p/mpi_p2p.cpp | 22++++++++++++++++++++++
Alecture12/mpi_pi/.gitignore | 3+++
Alecture12/mpi_pi/Makefile | 17+++++++++++++++++
Alecture12/mpi_pi/mpi_pi.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_pi/mpi_pi_reduction.cpp | 33+++++++++++++++++++++++++++++++++
Alecture12/mpi_pi/mpi_pi_reduction_inplace.cpp | 38++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_rsend/.gitignore | 2++
Alecture12/mpi_rsend/Makefile | 20++++++++++++++++++++
Alecture12/mpi_rsend/mpi_irsend.cpp | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture12/mpi_rsend/mpi_rsend.cpp | 41+++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/.gitignore | 4++++
Alecture13/mpi_1D_diffusion/IO.h | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/Makefile | 22++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/blocking.cpp | 45+++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/nonblocking.cpp | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_1D_diffusion/plot.py | 19+++++++++++++++++++
Alecture13/mpi_cyclic_shift/.gitignore | 1+
Alecture13/mpi_cyclic_shift/Makefile | 14++++++++++++++
Alecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp | 59+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_cyclic_shift/mpi_sendrecv.cpp | 42++++++++++++++++++++++++++++++++++++++++++
Alecture13/mpi_p2p_nonblocking/.gitignore | 1+
Alecture13/mpi_p2p_nonblocking/Makefile | 9+++++++++
Alecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp | 25+++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/.gitignore | 4++++
Alecture14/mpi_1D_diffusion_IO/IO.h | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/Makefile | 13+++++++++++++
Alecture14/mpi_1D_diffusion_IO/nonblocking.cpp | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/mpi_1D_diffusion_IO/plot.py | 52++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/.gitignore | 1+
Alecture14/parallel_hdf5/Makefile | 12++++++++++++
Alecture14/parallel_hdf5/data.h5 | 0
Alecture14/parallel_hdf5/data.xmf | 23+++++++++++++++++++++++
Alecture14/parallel_hdf5/hdf5IO.h | 98+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/hdf5_in_python.py | 18++++++++++++++++++
Alecture14/parallel_hdf5/parallel_hdf5.cpp | 44++++++++++++++++++++++++++++++++++++++++++++
Alecture14/parallel_hdf5/submit.sh | 23+++++++++++++++++++++++
Alecture14/parallel_hdf5/xdmf_wrapper.h | 46++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/asm/.gitignore | 1+
Alecture15/papi_examples_HW2/asm/Makefile | 12++++++++++++
Alecture15/papi_examples_HW2/asm/kernel.s | 19+++++++++++++++++++
Alecture15/papi_examples_HW2/asm/test.cpp | 61+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/.gitignore | 2++
Alecture15/papi_examples_HW2/daxpy/Makefile | 25+++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/daxpy.cpp | 106+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/daxpy/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/dgemm/.gitignore | 2++
Alecture15/papi_examples_HW2/dgemm/Makefile | 29+++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/dgemm/dgemm.cpp | 139+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture15/papi_examples_HW2/dgemm/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/sgemv/.gitignore | 2++
Alecture15/papi_examples_HW2/sgemv/Makefile | 25+++++++++++++++++++++++++
Alecture15/papi_examples_HW2/sgemv/disassemble.sh | 9+++++++++
Alecture15/papi_examples_HW2/sgemv/sgemv.cpp | 104+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture16/allocator_test/.gitignore | 1+
Alecture16/allocator_test/Makefile | 9+++++++++
Alecture16/allocator_test/allocator_test.cpp | 55+++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture16/pointer_dereference/.gitignore | 1+
Alecture16/pointer_dereference/Makefile | 9+++++++++
Alecture16/pointer_dereference/pointer_dereference.cpp | 15+++++++++++++++
Alecture17/registers/.gitignore | 2++
Alecture17/registers/Makefile | 12++++++++++++
Alecture17/registers/registers.s | 11+++++++++++
Alecture18_19/aliasing/.gitignore | 1+
Alecture18_19/aliasing/Makefile | 12++++++++++++
Alecture18_19/aliasing/aliasing.c | 13+++++++++++++
Alecture18_19/aliasing/main.c | 15+++++++++++++++
Alecture18_19/align/.gitignore | 1+
Alecture18_19/align/Makefile | 7+++++++
Alecture18_19/align/align.c | 30++++++++++++++++++++++++++++++
Alecture18_19/autovec/Makefile | 24++++++++++++++++++++++++
Alecture18_19/autovec/f.c | 20++++++++++++++++++++
Alecture18_19/intrinsics/_mm_set_ps1/.gitignore | 3+++
Alecture18_19/intrinsics/_mm_set_ps1/Makefile | 15+++++++++++++++
Alecture18_19/intrinsics/_mm_set_ps1/main.c | 9+++++++++
Alecture18_19/intrinsics/_mm_set_ps1/ones.c | 6++++++
Alecture18_19/intrinsics/constants/set1_ps.c | 3+++
Alecture18_19/intrinsics/constants/set_ps.c | 5+++++
Alecture18_19/intrinsics/constants/set_ss.c | 3+++
Alecture18_19/intrinsics/constants/setr_ps.c | 5+++++
Alecture18_19/intrinsics/constants/setzero.c | 2++
Alecture18_19/intrinsics/load_store/aligned_load_store.c | 7+++++++
Alecture18_19/intrinsics/load_store/reverse_load_store.c | 2++
Alecture18_19/intrinsics/load_store/unaligned_load_store.c | 7+++++++
Alecture18_19/intrinsics/transpose/transpose.c | 15+++++++++++++++
Alecture18_19/saxpy/.gitignore | 2++
Alecture18_19/saxpy/Makefile | 17+++++++++++++++++
Alecture18_19/saxpy/main.c | 66++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy.c | 7+++++++
Alecture18_19/saxpy/saxpy.s | 26++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy_SSE.c | 12++++++++++++
Alecture18_19/saxpy/saxpy_SSE.s | 30++++++++++++++++++++++++++++++
Alecture18_19/saxpy/saxpy_SSE_FMA.c | 11+++++++++++
Alecture18_19/sgemv/.gitignore | 1+
Alecture18_19/sgemv/Makefile | 9+++++++++
Alecture18_19/sgemv/sgemv.cpp | 22++++++++++++++++++++++
Alecture18_19/sgemv/sgemv.s | 73+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Alecture20/ispc_data_layout/.gitignore | 2++
Alecture20/ispc_data_layout/Makefile | 17+++++++++++++++++
Alecture20/ispc_data_layout/kernels.ispc | 40++++++++++++++++++++++++++++++++++++++++
Alecture20/ispc_data_layout/main.c | 11+++++++++++
Alecture20/ispc_foreach/.gitignore | 2++
Alecture20/ispc_foreach/Makefile | 17+++++++++++++++++
Alecture20/ispc_foreach/kernels.ispc | 17+++++++++++++++++
Alecture20/ispc_foreach/main.c | 10++++++++++
Alecture20/ispc_pointers_arrays/.gitignore | 2++
Alecture20/ispc_pointers_arrays/Makefile | 17+++++++++++++++++
Alecture20/ispc_pointers_arrays/f.ispc | 35+++++++++++++++++++++++++++++++++++
Alecture20/ispc_pointers_arrays/main.c | 8++++++++
Alecture20/maximal_convergence/.gitignore | 2++
Alecture20/maximal_convergence/Makefile | 17+++++++++++++++++
Alecture20/maximal_convergence/f.ispc | 14++++++++++++++
Alecture20/maximal_convergence/main.cpp | 8++++++++
Alecture20/sse_vector/.gitignore | 1+
Alecture20/sse_vector/Makefile | 7+++++++
Alecture20/sse_vector/sse_vector.c | 8++++++++
Alecture21/cuda_vec_add/.gitignore | 2++
Alecture21/cuda_vec_add/Makefile | 7+++++++
Alecture21/cuda_vec_add/add_kernel_grid.cu | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
245 files changed, 6296 insertions(+), 0 deletions(-)

diff --git a/lecture02/compilation_phases/.gitignore b/lecture02/compilation_phases/.gitignore @@ -0,0 +1,4 @@ +hello +hello.i +hello.o +hello.s diff --git a/lecture02/compilation_phases/Makefile b/lecture02/compilation_phases/Makefile @@ -0,0 +1,22 @@ +CC = gcc +.PHONY: all clean + +all: hello hello.o hello.i hello.s dump + +hello: hello.c + $(CC) -o $@ $< + +hello.o: hello.c + $(CC) -c -o $@ $< + +hello.i: hello.c + $(CC) -E -o $@ $< + +hello.s: hello.c + $(CC) -S -o $@ $< + +dump: hello.o + objdump -d $< + +clean: + rm -f hello hello.o hello.i hello.s diff --git a/lecture02/compilation_phases/hello.c b/lecture02/compilation_phases/hello.c @@ -0,0 +1,8 @@ +#include <stdio.h> + +int main(void) +{ + // Comments are removed by the pre-processor + printf("hello, world!\n"); + return 0; +} diff --git a/lecture02/heap_and_stack/.gitignore b/lecture02/heap_and_stack/.gitignore @@ -0,0 +1,2 @@ +heap_stack +stackoverflow diff --git a/lecture02/heap_and_stack/Makefile b/lecture02/heap_and_stack/Makefile @@ -0,0 +1,13 @@ +CC = gcc +.PHONY: all clean + +all: heap_stack stackoverflow + +heap_stack: heap_stack.c + $(CC) -O2 -o $@ $< + +stackoverflow: stackoverflow.c + $(CC) -O2 -o $@ $< + +clean: + rm -f heap_stack stackoverflow diff --git a/lecture02/heap_and_stack/heap_stack.c b/lecture02/heap_and_stack/heap_stack.c @@ -0,0 +1,23 @@ +#include <stdio.h> +#include <stdlib.h> + +int main(void) +{ + // Allocated on stack when main function runs. This is an example of + // automatic memory allocation managed by the compiler using the stack. + char on_stack[1 << 10]; // allocates 1024 bytes on the stack + + // Allocated on the heap at runtime. 
This is an example of dynamic memory + // allocation managed by the operating system at runtime (this memory + // request involves the OS kernel) + char *pointer = (char *)malloc(1 << 10); // allocates 1024 bytes on the heap + + // Compare the memory addresses of the two allocations + // clang-format off + printf("Address of variable `on_stack` (on stack): %p\n", (void *)&on_stack); + printf("Address of variable `pointer` (on stack): %p\n", (void *)&pointer); + printf("Address where `pointer` points to (on heap): %p\n", (void *)pointer); + // clang-format on + + return 0; +} diff --git a/lecture02/heap_and_stack/stackoverflow.c b/lecture02/heap_and_stack/stackoverflow.c @@ -0,0 +1,22 @@ +#include <stdio.h> + +int main(void) +{ + // On Linux system the size of the stack can be checked (and configured) + // with + // ulimit -s + // On my system the stack size is 8192kB (8MB). If your program allocates + // more than 8MB on the stack your application will fail with a + // segmentation fault. The stack is used for automatic memory allocation + // when functions execute and should not be used for simulation data for + // example. Such large data must always be allocated on the heap. If you + // have a very deep recursion in your code that calls many function which + // all in total require more than 8MB (on my system) you will also run into + // a stack overflow. 
+ + // The following code is buggy and will crash due to stack overflow + char large_memory[8 * (1 << 20)]; // 8MB of memory on stack + printf("Memory access will fail: %d\n", large_memory[0]); // segfault + + return 0; +} diff --git a/lecture03/multi_index_and_flat_index/.gitignore b/lecture03/multi_index_and_flat_index/.gitignore @@ -0,0 +1 @@ +main diff --git a/lecture03/multi_index_and_flat_index/Makefile b/lecture03/multi_index_and_flat_index/Makefile @@ -0,0 +1,8 @@ +CXX = g++ +.PHONY: clean + +main: multi_and_flat.cpp + $(CXX) -O1 -o $@ $< + +clean: + rm -f main diff --git a/lecture03/multi_index_and_flat_index/multi_and_flat.cpp b/lecture03/multi_index_and_flat_index/multi_and_flat.cpp @@ -0,0 +1,45 @@ +#include <iostream> + +#define N 2 +#define M 4 + +void initialize(int *p) +{ + for (int i = 0; i < N * M; ++i) { + p[i] = i; // 0, 1, 2, 3, ... + } +} + +int main(void) +{ + int A[N][M]; + int *p = &A[0][0]; // pointer to first element in A + initialize(p); + + // multi-dimensional indexing + std::cout << "Multi-dimensional indexing:\n"; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << '\t' << A[i][j]; + } + std::cout << '\n'; + } + + // flat indexing + std::cout << "\nFlat indexing:\n"; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < M; ++j) { + std::cout << '\t' << p[i * M + j]; // j is fastest moving index! 
+ } + std::cout << '\n'; + } + + // representation in memory + std::cout << "\nRow-major layout in memory:\n"; + for (int i = 0; i < N * M; ++i) { + std::cout << '\t' << p[i]; + } + std::cout << '\n'; + + return 0; +} diff --git a/lecture04/counter/.gitignore b/lecture04/counter/.gitignore @@ -0,0 +1 @@ +counter diff --git a/lecture04/counter/Makefile b/lecture04/counter/Makefile @@ -0,0 +1,8 @@ +CXX = g++ +.PHONY: clean + +counter: counter.cpp + $(CXX) -fopenmp -o $@ $< + +clean: + rm -f counter diff --git a/lecture04/counter/counter.cpp b/lecture04/counter/counter.cpp @@ -0,0 +1,25 @@ +#include <iostream> + +// DISCLAIMER: this code contains a race condition +class Counter +{ +public: + Counter() : count_(0) {} + + void increment() { ++count_; } + unsigned int get_count() const { return count_; } + +private: + unsigned int count_; +}; + +int main(void) +{ + Counter counter; +#pragma omp parallel for + for (int i = 0; i < 100; ++i) { + counter.increment(); + } + std::cout << counter.get_count() << '\n'; + return 0; +} diff --git a/lecture05/address_sanitizer/openmp/.gitignore b/lecture05/address_sanitizer/openmp/.gitignore @@ -0,0 +1,3 @@ +asm +main +sanitized diff --git a/lecture05/address_sanitizer/openmp/Makefile b/lecture05/address_sanitizer/openmp/Makefile @@ -0,0 +1,23 @@ +CXX = g++ +CXXFLAGS = -g -O0 -fopenmp +.PHONY: clean + +# Since GCC 4.8 (LLVM clang also supports this) you can use the +# -fsanitize=thread option to check if there are race conditions detected. But +# be aware the tool is not perfect but can be helpful when hunting bugs! 
See +# this link for more info: +# https://clang.llvm.org/docs/ThreadSanitizer.html + +all: main sanitized + +main: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +sanitized: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -fsanitize=thread -o $@ $< + +asm: two_ompthreads.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f main sanitized asm diff --git a/lecture05/address_sanitizer/openmp/two_ompthreads.cpp b/lecture05/address_sanitizer/openmp/two_ompthreads.cpp @@ -0,0 +1,28 @@ +#include <iostream> +#include <omp.h> + +// DISCLAIMER: this code contains a race condition. Beware of false positives +// from the thread sanitizer due to the OpenMP runtime, which TSan +// has a hard time to understand. See the pthreads version for the +// proof. +int main(void) +{ + // shared variables + int A = 0; + int B = 0; + +#pragma omp parallel num_threads(2) // use 2 threads + { + const int tid = omp_get_thread_num(); // my ID + if (0 == tid) { + A = 1; + const int x = B; // thread local variable + std::cout << "Thread 0: x = " << x << std::endl; + } else if (1 == tid) { + B = 1; + const int y = A; // thread local variable + std::cout << "Thread 1: y = " << y << std::endl; + } + } + return 0; +} diff --git a/lecture05/address_sanitizer/pthreads/.gitignore b/lecture05/address_sanitizer/pthreads/.gitignore @@ -0,0 +1,3 @@ +asm +main +sanitized diff --git a/lecture05/address_sanitizer/pthreads/Makefile b/lecture05/address_sanitizer/pthreads/Makefile @@ -0,0 +1,23 @@ +CXX = g++ +CXXFLAGS = -g -O0 -pthread +.PHONY: clean + +# Since GCC 4.8 (LLVM clang also supports this) you can use the +# -fsanitize=thread option to check if there are race conditions detected. But +# be aware the tool is not perfect but can be helpful when hunting bugs! 
See +# this link for more info: +# https://clang.llvm.org/docs/ThreadSanitizer.html + +all: main sanitized + +main: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +sanitized: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -fsanitize=thread -o $@ $< + +asm: two_pthreads.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f main sanitized asm diff --git a/lecture05/address_sanitizer/pthreads/two_pthreads.cpp b/lecture05/address_sanitizer/pthreads/two_pthreads.cpp @@ -0,0 +1,63 @@ +#include <cassert> +#include <iostream> +#include <pthread.h> + +// DISCLAIMER: this code contains a race condition. + +// global barrier +pthread_barrier_t barrier; + +// shared variables +int A = 0; +int B = 0; + +// work done by the threads +void *worker(void *t) +{ + long tid = (long)t; + + if (0 == tid) { + A = 1; + // pthread_barrier_wait(&barrier); + const int x = B; // thread local variable + std::cout << "Thread 0: x = " << x << std::endl; + } else if (1 == tid) { + B = 1; + // pthread_barrier_wait(&barrier); + const int y = A; // thread local variable + std::cout << "Thread 1: y = " << y << std::endl; + } + + return nullptr; +} + +int main(void) +{ + // we use two threads + constexpr int nthreads = 2; + + // allocate thread array + pthread_t threads[nthreads]; + + // initialize barrier + int s = pthread_barrier_init(&barrier, nullptr, nthreads); + assert(s == 0); + + // fork + for (long i = 0; i < nthreads; ++i) { + s = pthread_create(&threads[i], nullptr, worker, (void *)i); + assert(s == 0); + } + + // join + for (int i = 0; i < nthreads; ++i) { + s = pthread_join(threads[i], nullptr); + assert(s == 0); + } + + // clean up + s = pthread_barrier_destroy(&barrier); + assert(s == 0); + + return 0; +} diff --git a/lecture05/sequential_consistency/.gitignore b/lecture05/sequential_consistency/.gitignore @@ -0,0 +1,2 @@ +scenario_1_3 +scenario_2 diff --git a/lecture05/sequential_consistency/Makefile b/lecture05/sequential_consistency/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -g -O1 
-fopenmp +.PHONY: clean + +all: scenario_1_3 scenario_2 + +scenario_1_3: scenario_1_3.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +scenario_2: scenario_2.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f scenario_1_3 scenario_2 diff --git a/lecture05/sequential_consistency/README.md b/lecture05/sequential_consistency/README.md @@ -0,0 +1,5 @@ +The code in this directory accommodates slide 12 of lecture 5. + +All examples assume two threads are running. The code (partial orders) the +threads execute involves read and write access to shared variables `A` and `B`. +The variables `x` and `y` are thread private. diff --git a/lecture05/sequential_consistency/scenario_1_3.cpp b/lecture05/sequential_consistency/scenario_1_3.cpp @@ -0,0 +1,51 @@ +#include <iostream> +#include <omp.h> +#include <random> +#include <unistd.h> + +// shared variables +int A = 0; +int B = 0; + +void task1() +{ + A = 1; + const int x = B; + std::cout << "Thread 0: x = " << x << std::endl; +} + +void task2() +{ + B = 1; + const int y = A; + std::cout << "Thread 1: y = " << y << std::endl; +} + +int main(void) +{ +#pragma omp parallel num_threads(2) // use 2 threads + { + // generate some randomness to simulate load imbalance + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution<int> uniform(10, 50); + usleep(uniform(rng)); + + const int tid = omp_get_thread_num(); // my ID + + // the following pragma necessarily serializes the code -> sequential + // consistency (synchronization primitive) +#pragma omp critical + { + // This code must be in a critical section because task1 and task2 + // read and write from/to shared memory. 
Depending on which thread + // acquires the lock first, you will either get scenario 1 or 3: + if (0 == tid) { + task1(); + } else { + task2(); + } + } + } + return 0; +} diff --git a/lecture05/sequential_consistency/scenario_2.cpp b/lecture05/sequential_consistency/scenario_2.cpp @@ -0,0 +1,34 @@ +#include <iostream> +#include <omp.h> + +// shared variables +int A = 0; +int B = 0; + +int main(void) +{ +#pragma omp parallel num_threads(2) // use 2 threads + { + const int tid = omp_get_thread_num(); // my ID + if (0 == tid) { + A = 1; + // the following pragma necessarily serializes the code -> + // sequential consistency (synchronization primitive) +#pragma omp barrier + const int x = B; + +#pragma omp critical // avoid garbled output (only for cosmetics) + std::cout << "Thread 0: x = " << x << std::endl; + } else { + B = 1; + // the following pragma necessarily serializes the code -> + // sequential consistency (synchronization primitive) +#pragma omp barrier + const int y = A; + +#pragma omp critical // avoid garbled output (only for cosmetics) + std::cout << "Thread 1: y = " << y << std::endl; + } + } + return 0; +} diff --git a/lecture06/omp_conditional/.gitignore b/lecture06/omp_conditional/.gitignore @@ -0,0 +1,2 @@ +with_omp +without_omp diff --git a/lecture06/omp_conditional/Makefile b/lecture06/omp_conditional/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -O1 +.PHONY: clean + +all: with_omp without_omp + +with_omp: omp_conditional.cpp + $(CXX) $(CXXFLAGS) -fopenmp -o $@ $< + +without_omp: omp_conditional.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f with_omp without_omp diff --git a/lecture06/omp_conditional/omp_conditional.cpp b/lecture06/omp_conditional/omp_conditional.cpp @@ -0,0 +1,12 @@ +#include <iostream> + +int main(void) +{ +#ifdef _OPENMP + std::cout << "Compiled with -fopenmp (OpenMP Standard: " << _OPENMP + << ")\n"; +#else + std::cout << "Compiled without -fopenmp\n"; +#endif /* _OPENMP */ + return 0; +} diff --git 
a/lecture06/omp_demo/.gitignore b/lecture06/omp_demo/.gitignore @@ -0,0 +1 @@ +demo diff --git a/lecture06/omp_demo/Makefile b/lecture06/omp_demo/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 +.PHONY: clean + +demo: omp_demo.cpp + $(CXX) $(CXXFLAGS) -fopenmp -o $@ $< + +clean: + rm -f demo diff --git a/lecture06/omp_demo/omp_demo.cpp b/lecture06/omp_demo/omp_demo.cpp @@ -0,0 +1,19 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + for (int i = 1; i <= 4; ++i) + { + std::cout << "Loop counter = " << i << ":\n"; +#pragma omp parallel num_threads(i) if (parallel: i > 2) + { + const int tid = omp_get_thread_num(); // get my ID + // the next line is a critical section! (std::cout is not + // thread-safe) +#pragma omp critical + std::cout << "\tHello from thread " << tid << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_critical/.gitignore b/lecture07_08/omp_critical/.gitignore @@ -0,0 +1,2 @@ +omp_critical +omp_critical_two diff --git a/lecture07_08/omp_critical/Makefile b/lecture07_08/omp_critical/Makefile @@ -0,0 +1,14 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: omp_critical omp_critical_two + +omp_critical: omp_critical.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_critical_two: omp_critical.cpp + $(CXX) $(CXXFLAGS) -DTWO_CRITICAL -o $@ $< + +clean: + rm -f omp_critical omp_critical_two diff --git a/lecture07_08/omp_critical/omp_critical.cpp b/lecture07_08/omp_critical/omp_critical.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <omp.h> +#include <unistd.h> + +int main(void) +{ + int a = 0; + int b = 0; + // benchmark execution time of parallel region + const double t0 = omp_get_wtime(); +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#ifdef TWO_CRITICAL +#pragma omp critical(critical_a) +#else +#pragma omp critical +#endif /* TWO_CRITICAL */ + { + a += tid; + usleep(50000); + } +#ifdef TWO_CRITICAL +#pragma omp critical(critical_b) +#else +#pragma omp critical +#endif /* TWO_CRITICAL */ + { + // if 
the operation would involve a read or write from/to `a`, then + // this would be a race condition + b += tid; + usleep(50000); + } + } + const double dt = omp_get_wtime() - t0; + std::cout << "Elapsed time: " << dt << " seconds\n"; + return a + b; +} diff --git a/lecture07_08/omp_data_attributes/.gitignore b/lecture07_08/omp_data_attributes/.gitignore @@ -0,0 +1,5 @@ +omp_private +omp_region +omp_region.s +omp_static +omp_threadprivate diff --git a/lecture07_08/omp_data_attributes/Makefile b/lecture07_08/omp_data_attributes/Makefile @@ -0,0 +1,29 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: omp_private omp_firstprivate omp_lastprivate omp_static omp_threadprivate omp_region omp_region.s + +omp_private: omp_private.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_firstprivate: omp_firstprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_lastprivate: omp_lastprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_static: omp_static.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_threadprivate: omp_threadprivate.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_region: omp_region.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +omp_region.s: omp_region.cpp + $(CXX) $(CXXFLAGS) -S -o $@ $< + +clean: + rm -f omp_private omp_firstprivate omp_lastprivate omp_static omp_threadprivate omp_region omp_region.s diff --git a/lecture07_08/omp_data_attributes/omp_firstprivate.cpp b/lecture07_08/omp_data_attributes/omp_firstprivate.cpp @@ -0,0 +1,24 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + int shared = -1; // shared variable + +#pragma omp parallel firstprivate(shared) // create private copy (initialized!) + { + const int tid = omp_get_thread_num(); +#pragma omp single + { + shared = tid; + } // no need to wait -> `shared` is private -> no race condition below! 
+ +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\tvalue of shared (private copy) = " << shared + << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_lastprivate.cpp b/lecture07_08/omp_data_attributes/omp_lastprivate.cpp @@ -0,0 +1,41 @@ +#include <iostream> + +int main(void) +{ + int a = -1; +#pragma omp parallel + { +#pragma omp for lastprivate(a) // private copy of a (uninitialized!) + for (int i = 0; i < 100; ++i) { + a = i; // no race condition -> `a` is private copy in loop body + } +#pragma omp single + { + std::cout << "After consecutive iterations: a = " << a << std::endl; + a = -1; + } // implicit barrier required here -> otherwise race condition on `a` + // below! + +#pragma omp for lastprivate(a) // private copy of a (uninitialized!) + for (int i = 0; i < 100; ++i) { + // An optimizing compiler will remove this loop completely and let + // one thread write to the shared variable a = 66 because the given + // lastprivate(a) OpenMP clause will still apply even if the + // optimizer phase removes the loop (see for yourself what happens + // if you change it to private(a)). + // + // If the loop was not optimized away, it would be executed in + // parallel and the last element 99 in the iteration sequence would + // apply to the lastprivate(a) update. Because that thread will + // have never written to its private copy, it would result in + // undefined behavior because lastprivate(a) like private(a) do not + // initialize the private copies! It would be a bug. + if (66 == i) { + a = i; + } + } +#pragma omp single + std::cout << "After non-consecutive iterations: a = " << a << std::endl; + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_private.cpp b/lecture07_08/omp_data_attributes/omp_private.cpp @@ -0,0 +1,24 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ + int shared = -1; // shared variable + +#pragma omp parallel private(shared) // create private copy (uninitialized!) 
+ { + const int tid = omp_get_thread_num(); +#pragma omp single nowait + { + shared = tid; + } // no need to wait -> `shared` is private -> no race condition below! + +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\tvalue of shared (private copy) = " << shared + << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_region.cpp b/lecture07_08/omp_data_attributes/omp_region.cpp @@ -0,0 +1,54 @@ +// vim: foldmethod=marker +#include <iostream> +#include <omp.h> + +void declare_static(void) +{ + const int tid = omp_get_thread_num(); // reference + + // declared_in_region is shared among threads who call this function. + // NOTE: the assignment here is not a race condition -> the compiler + // initializes this variable at compile time, not the threads at runtime! + static int declared_in_region = 1234567890; + // + // PROOF: see assembly code + // all threads print the initial value of declared_in_region {{{1 +#pragma omp single // write header line (only one needed) + { + std::cout << "Initial value of declared_in_region:\n"; + } // implicit barrier here +#pragma omp critical // all threads must write this + { + std::cout << "\ttid " << tid << ":\t" << declared_in_region << '\n'; + } // NO implicit barrier here! + // we must synchronize here, otherwise a thread might continue while others + // still need to print the std::cout line above. Test it by commenting out + // the barrier. +#pragma omp barrier + // 1}}} + + // DISCLAIMER: the next commented line would be a race condition! 
+ // declared_in_region = tid; + // + // PROOF: declared_in_region is shared +#pragma omp single + { + declared_in_region = tid; + } // others wait here +#pragma omp single nowait + { + std::cout << "I am tid " << tid + << " and `declared_in_region` has been written to by tid " + << declared_in_region << '\n'; + } +} + +int main(void) +{ +#pragma omp parallel + { + declare_static(); // all threads call this function + } + + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_static.cpp b/lecture07_08/omp_data_attributes/omp_static.cpp @@ -0,0 +1,23 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { + static int static_tid; // static storage duration -> shared + + const int tid = omp_get_thread_num(); +#pragma omp single + { + static_tid = tid; + } // implicit barrier required here -> no race condition below! + +#pragma omp critical + { + std::cout << "Thread " << tid + << ":\treading static_tid = " << static_tid << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_data_attributes/omp_threadprivate.cpp b/lecture07_08/omp_data_attributes/omp_threadprivate.cpp @@ -0,0 +1,33 @@ +#include <iostream> +#include <omp.h> + +// static (global) in compilation unit (shared by default) +static int global = -1; + +// predetermine `global` as thread private (has to be done before first use). +// Without this pragma global will be shared +#pragma omp threadprivate(global) + +void access_global(void) +{ + const int tid = omp_get_thread_num(); // reference + + // DISCLAIMER: this may be a race condition, depending on threadprivate + // above! 
+ global = tid; + +#pragma omp critical // all threads must write this + { + std::cout << "\ttid " << tid << ":\t" << global << '\n'; + } +} + +int main(void) +{ +#pragma omp parallel + { + access_global(); + } + + return 0; +} diff --git a/lecture07_08/omp_default/.gitignore b/lecture07_08/omp_default/.gitignore @@ -0,0 +1 @@ +omp_default diff --git a/lecture07_08/omp_default/Makefile b/lecture07_08/omp_default/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_default: omp_default.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_default diff --git a/lecture07_08/omp_default/omp_default.cpp b/lecture07_08/omp_default/omp_default.cpp @@ -0,0 +1,15 @@ +#include <omp.h> + +int main(void) +{ + int var; +#pragma omp parallel default(none) // shared(var) + { +#pragma omp critical + { + // var was not explicitly specified -> will not compile + var = omp_get_thread_num(); + } + } + return 0; +} diff --git a/lecture07_08/omp_device/.gitignore b/lecture07_08/omp_device/.gitignore @@ -0,0 +1 @@ +omp_device diff --git a/lecture07_08/omp_device/Makefile b/lecture07_08/omp_device/Makefile @@ -0,0 +1,15 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean run_dynamic run_static + +omp_device: omp_device.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +run_dynamic: omp_device + OMP_NUM_THREADS=8 OMP_DYNAMIC=true ./omp_device + +run_static: omp_device + OMP_NUM_THREADS=8 OMP_DYNAMIC=false ./omp_device + +clean: + rm -f omp_device diff --git a/lecture07_08/omp_device/omp_device.cpp b/lecture07_08/omp_device/omp_device.cpp @@ -0,0 +1,19 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { +#pragma omp single + { + const int nprocs = omp_get_num_procs(); + const int nthreads = omp_get_num_threads(); + const int maxthreads = omp_get_max_threads(); + std::cout << "Number of processors: " << nprocs << '\n'; + std::cout << "Number of threads in team: " << nthreads << '\n'; + std::cout << "Maximum number of threads: " << 
maxthreads << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_schedule/.gitignore b/lecture07_08/omp_schedule/.gitignore @@ -0,0 +1 @@ +omp_schedule diff --git a/lecture07_08/omp_schedule/Makefile b/lecture07_08/omp_schedule/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_schedule: omp_schedule.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_schedule diff --git a/lecture07_08/omp_schedule/omp_schedule.cpp b/lecture07_08/omp_schedule/omp_schedule.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <omp.h> + +int main(void) +{ +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + + omp_sched_t sched; + int chunk; + +#pragma omp single + { + // check the default before we change: + omp_get_schedule(&sched, &chunk); + std::cout << "Default: sched=" << sched << "; chunk=" << chunk + << " (tid=" << tid << ")\n"; + + // DISCLAIMER: this will only set the schedule for the calling + // thread! + // If the second arg is < 1: OpenMP uses the default chunk_size for + // the chosen scheduling policy, for omp_sched_auto the second arg + // is irrelevant: + omp_set_schedule(omp_sched_guided, 8); + } + + // see what you get with all threads + omp_get_schedule(&sched, &chunk); + +#pragma omp critical + { + std::cout << "Thread " << tid << ":\tsched=" << sched + << "; chunk=" << chunk << '\n'; + } + } + return 0; +} diff --git a/lecture07_08/omp_scoping/Makefile b/lecture07_08/omp_scoping/Makefile @@ -0,0 +1,12 @@ +CXX = gcc +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +main: main.c myfunc.o + $(CXX) $(CXXFLAGS) -o $@ $^ + +myfunc.o: myfunc.c + $(CXX) $(CXXFLAGS) -c -o $@ $< + +clean: + rm -f main myfunc.o diff --git a/lecture07_08/omp_scoping/main.c b/lecture07_08/omp_scoping/main.c @@ -0,0 +1,18 @@ +#include <omp.h> + +void myfunc(int A[], const int N, const int tid); + +int main(void) +{ + const int N = 1000; + int A[N]; + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + myfunc(A, N, tid); + } + 
// The #pragma omp for directive has no effect in dynamic extent + myfunc(A, N, 0); + return 0; +} diff --git a/lecture07_08/omp_scoping/myfunc.c b/lecture07_08/omp_scoping/myfunc.c @@ -0,0 +1,6 @@ +void myfunc(int A[], const int N, const int tid) +{ +#pragma omp for + for (int i = 0; i < N; ++i) + A[i] = tid; +} diff --git a/lecture07_08/omp_wtime/.gitignore b/lecture07_08/omp_wtime/.gitignore @@ -0,0 +1 @@ +omp_wtime diff --git a/lecture07_08/omp_wtime/Makefile b/lecture07_08/omp_wtime/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +omp_wtime: omp_wtime.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f omp_wtime diff --git a/lecture07_08/omp_wtime/omp_wtime.cpp b/lecture07_08/omp_wtime/omp_wtime.cpp @@ -0,0 +1,35 @@ +#include <iostream> +#include <omp.h> +#include <random> +#include <unistd.h> + +int main(void) +{ +#pragma omp parallel + { +#pragma omp single + { + const double resolution = omp_get_wtick(); + std::cout << "Clock resolution: " << std::scientific << resolution + << " sec\n"; + } + + const int tid = omp_get_thread_num(); + // the random number generator must be thread private -> race condition + // otherwise! 
+ std::mt19937 gen(tid); + std::uniform_int_distribution uniform(1, 500); + + const int mytime = uniform(gen) * 1000; // micro seconds + const double t0 = omp_get_wtime(); + usleep(mytime); + const double dt = omp_get_wtime() - t0; + +#pragma omp critical + { + std::cout << "Thread " << tid << ":\t" << std::scientific << dt + << " sec elapsed [expected: " << mytime * 1.0e-6 << "]\n"; + } + } + return 0; +} diff --git a/lecture09/thread_affinity/.gitignore b/lecture09/thread_affinity/.gitignore @@ -0,0 +1,4 @@ +affinity +affinity_close +affinity_spread +affinity_master diff --git a/lecture09/thread_affinity/Makefile b/lecture09/thread_affinity/Makefile @@ -0,0 +1,20 @@ +CXX = g++ +CXXFLAGS = -O1 -fopenmp +.PHONY: clean + +all: affinity affinity_close affinity_spread affinity_master + +affinity: affinity.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_close: affinity_close.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_spread: affinity_spread.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +affinity_master: affinity_master.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f affinity affinity_close affinity_spread affinity_master diff --git a/lecture09/thread_affinity/affinity.cpp b/lecture09/thread_affinity/affinity.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} 
diff --git a/lecture09/thread_affinity/affinity_close.cpp b/lecture09/thread_affinity/affinity_close.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(close) + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/affinity_master.cpp b/lecture09/thread_affinity/affinity_master.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> +#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(master) // use `primary` in OpenMP v5.1 + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/affinity_spread.cpp b/lecture09/thread_affinity/affinity_spread.cpp @@ -0,0 +1,35 @@ +#include "helper.h" +#include <iostream> 
+#include <omp.h> +#include <sstream> +#include <vector> + +int main(void) +{ + std::vector<std::string> ordered_output; + +#pragma omp parallel proc_bind(spread) + { + const int tid = omp_get_thread_num(); + const int nthreads = omp_get_num_threads(); +#pragma omp single + ordered_output.resize(nthreads); + + int cpuid, nodeid; + tacc_rdtscp(&cpuid, &nodeid); + + const std::string hostname = gethostname(); + + std::ostringstream mystream; + mystream << "[thread=" << tid << "/" << nthreads << "]\t" + << "Running on host " << hostname.c_str() << "\tCPU " << cpuid + << "\tNUMA NODE " << nodeid << '\n'; + ordered_output[tid] = mystream.str(); + } + + for (const auto &s : ordered_output) { + std::cout << s; + } + + return 0; +} diff --git a/lecture09/thread_affinity/helper.h b/lecture09/thread_affinity/helper.h @@ -0,0 +1,26 @@ +#ifndef HELPER_H_T9QL1OGR +#define HELPER_H_T9QL1OGR + +#include <string> +#include <unistd.h> + +// return cpu and numa node id +// https://stackoverflow.com/questions/16862620/numa-get-current-node-core +unsigned long tacc_rdtscp(int *core, int *node) +{ + unsigned long a,d,c; + __asm__ volatile("rdtscp" : "=a" (a), "=d" (d), "=c" (c)); + *node= (c & 0xFFF000)>>12; + *core = c & 0xFFF; + return ((unsigned long)a) | (((unsigned long)d) << 32);; +} + +// get hostname of node +std::string gethostname(void) +{ + std::string hostname(1024, '\0'); + gethostname(&hostname.front(), hostname.size()); + return hostname; +} + +#endif /* HELPER_H_T9QL1OGR */ diff --git a/lecture09/thread_affinity/omp_places_cores.sh b/lecture09/thread_affinity/omp_places_cores.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='cores' ./affinity diff --git a/lecture09/thread_affinity/omp_places_sockets.sh b/lecture09/thread_affinity/omp_places_sockets.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='sockets' ./affinity diff --git a/lecture09/thread_affinity/omp_places_threads.sh b/lecture09/thread_affinity/omp_places_threads.sh 
@@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PLACES='threads' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_close.sh b/lecture09/thread_affinity/omp_proc_bind_close.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='close' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_false.sh b/lecture09/thread_affinity/omp_proc_bind_false.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='false' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_master.sh b/lecture09/thread_affinity/omp_proc_bind_master.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='master' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_spread.sh b/lecture09/thread_affinity/omp_proc_bind_spread.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='spread' ./affinity diff --git a/lecture09/thread_affinity/omp_proc_bind_true.sh b/lecture09/thread_affinity/omp_proc_bind_true.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash +OMP_NUM_THREADS=16 OMP_PROC_BIND='true' ./affinity diff --git a/lecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf b/lecture10/openmp_overhead/AMD_Opteron/openmp_overhead.pdf Binary files differ. 
diff --git a/lecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat b/lecture10/openmp_overhead/AMD_Opteron/proc_bind_false.dat @@ -0,0 +1,8 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 1.402517e+00 2.040600e-02 9.638550e-01 1.060350e-01 1.407802e+00 3.316600e-02 9.672210e-01 6.728000e-03 9.601640e-01 6.021000e-03 1.316700e-02 2.729000e-03 2.073300e-02 1.648000e-03 8.487000e-03 1.494000e-03 2.950700e-02 2.240000e-04 1.467251e+00 3.183000e-02 +2 3.377395e+00 1.364400e-01 1.180594e+00 1.220300e-02 3.449368e+00 2.590500e-02 1.229767e+00 9.989000e-03 1.174797e+00 1.923300e-02 1.595980e-01 2.828300e-02 1.604950e-01 2.559000e-03 1.190669e+00 7.037000e-03 2.340520e-01 4.240000e-03 3.332934e+00 3.320900e-02 +4 3.472150e+00 1.092780e-01 1.297534e+00 1.140100e-01 3.849536e+00 3.932600e-02 1.285642e+00 2.517300e-02 1.520340e+00 1.487600e-02 5.914140e-01 1.249400e-02 5.639350e-01 8.597000e-03 9.850230e-01 5.318000e-03 5.995950e-01 3.523400e-02 3.844296e+00 1.692840e-01 +8 5.642615e+00 4.424500e-02 2.428380e+00 1.504480e-01 5.853155e+00 8.348600e-02 2.516674e+00 2.587440e-01 2.774199e+00 1.654420e-01 1.363838e+00 1.112150e-01 1.480740e+00 8.594900e-02 9.917380e-01 4.070400e-02 1.194033e+00 1.714040e-01 6.632784e+00 9.369850e-01 +16 9.253381e+00 6.832900e-02 4.036582e+00 1.559480e-01 9.907635e+00 8.686000e-02 4.085306e+00 2.851050e-01 6.297392e+00 6.432800e-02 5.481086e+00 3.757630e-01 5.427509e+00 5.264830e-01 1.048119e+00 3.806800e-02 2.159469e+00 3.160940e-01 1.361704e+01 9.204500e-02 +32 1.907739e+01 8.178810e-01 9.850260e+00 2.483310e-01 1.961147e+01 5.749560e-01 9.472037e+00 1.867770e-01 1.260011e+01 3.464590e-01 6.311141e+00 6.595200e-01 6.336537e+00 5.468470e-01 1.075643e+00 5.919800e-02 1.972749e+00 2.196170e-01 1.753772e+01 1.873420e-01 +64 3.966405e+01 1.517342e+00 2.037396e+01 3.263870e-01 3.982416e+01 2.570315e+00 2.378785e+01 3.674876e+00 3.555074e+01 7.258960e-01 1.290547e+01 9.898270e-01 1.279011e+01 
1.402299e+00 1.994510e+00 2.245406e+00 6.436716e+00 2.400394e+00 5.790044e+01 3.615972e+00 diff --git a/lecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat b/lecture10/openmp_overhead/AMD_Opteron/proc_bind_true.dat @@ -0,0 +1,8 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 1.331503e+00 3.412900e-02 9.543120e-01 3.804000e-03 1.319009e+00 1.155800e-02 9.673880e-01 5.830000e-03 9.758090e-01 1.115710e-01 1.657900e-02 1.376000e-03 2.218500e-02 9.500000e-04 8.604000e-03 1.165000e-03 3.166400e-02 1.290000e-04 1.380004e+00 9.710000e-03 +2 2.111130e+00 1.984500e-02 9.014150e-01 2.217300e-02 2.118879e+00 6.912300e-02 8.981230e-01 1.857600e-02 8.820540e-01 1.878000e-02 1.411180e-01 1.614200e-02 1.361210e-01 1.582300e-02 2.049800e-01 4.185200e-02 1.143560e-01 5.316100e-02 2.178722e+00 1.131700e-02 +4 3.377356e+00 1.873800e-02 1.189846e+00 1.187900e-02 3.727039e+00 8.104910e-01 1.532612e+00 2.901200e-02 1.922584e+00 1.012850e-01 6.452860e-01 3.303500e-02 5.906120e-01 4.363800e-02 6.596520e-01 3.989000e-03 7.609000e-01 1.503940e-01 4.120677e+00 5.567500e-02 +8 5.242987e+00 4.543800e-02 2.616701e+00 3.318600e-02 5.388962e+00 3.659300e-02 2.980622e+00 1.598151e+00 4.435692e+00 3.942300e-02 2.326891e+00 1.345630e-01 2.621963e+00 1.714290e-01 6.337460e-01 4.899000e-03 1.612638e+00 2.495420e-01 6.703456e+00 9.251780e-01 +16 1.062893e+01 9.372600e-02 4.779814e+00 3.467300e-02 1.108604e+01 8.040300e-02 4.514903e+00 3.296900e-02 8.394583e+00 1.162530e-01 3.032364e+00 2.738930e-01 3.503220e+00 2.002430e-01 6.613400e-01 1.308900e-02 1.895607e+00 1.709140e-01 1.271062e+01 9.121110e-01 +32 2.334865e+01 3.237670e-01 1.098605e+01 2.548060e-01 2.420802e+01 4.485240e-01 1.058177e+01 1.130700e-01 2.544649e+01 7.436380e-01 8.309794e+00 6.075940e-01 8.515407e+00 5.558380e-01 6.330950e-01 1.718000e-02 2.892609e+00 3.540830e-01 2.937155e+01 1.409414e+00 +64 4.689721e+01 2.414586e+00 2.405726e+01 1.071561e+00 4.803695e+01 9.345560e-01 
2.145390e+01 1.830180e+00 4.902376e+01 2.166267e+00 1.071435e+01 1.146510e+00 1.048317e+01 9.043630e-01 5.841290e-01 1.429300e-02 4.735033e+00 7.720940e-01 5.227324e+01 1.618774e+00 diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf b/lecture10/openmp_overhead/Intel_Xeon2683v4/openmp_overhead.pdf Binary files differ. diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat b/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_false.dat @@ -0,0 +1,7 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 7.937410e-01 1.405980e-01 3.000770e-01 1.206040e-01 4.544730e-01 1.123700e-02 2.687350e-01 1.593800e-02 2.730820e-01 1.165900e-02 2.865200e-02 1.129000e-02 2.914300e-02 1.115500e-02 3.834300e-02 1.210000e-02 1.108800e-02 1.120000e-04 5.043380e-01 1.135200e-02 +2 2.296052e+00 3.159780e-01 5.839310e-01 8.841900e-02 2.338442e+00 4.084700e-02 5.603300e-01 2.168600e-02 5.850920e-01 2.377500e-02 2.069800e-02 1.844200e-02 1.026780e-01 2.057000e-02 3.690150e-01 2.261200e-02 5.144500e-02 7.918000e-03 2.410278e+00 2.044020e-01 +4 3.596719e+00 8.907000e-02 9.386340e-01 1.131030e-01 3.260242e+00 8.585100e-02 8.744930e-01 5.123000e-02 1.064314e+00 3.732200e-02 1.319100e-01 4.183900e-02 2.094040e-01 3.205800e-02 2.848370e-01 2.944000e-02 6.543800e-02 1.459200e-02 3.787471e+00 1.519180e-01 +8 4.852901e+00 3.355180e-01 1.610962e+00 1.007260e-01 4.687913e+00 1.203670e-01 1.537271e+00 4.187100e-02 2.084429e+00 6.085100e-02 1.180850e-01 2.578100e-02 2.970390e-01 2.665600e-02 4.483030e-01 2.467000e-02 1.066430e-01 2.261700e-02 6.218094e+00 2.258230e-01 +16 8.113731e+00 2.710340e-01 2.883032e+00 9.578900e-02 7.760440e+00 7.466270e-01 2.792584e+00 2.279100e-02 4.360566e+00 4.993000e-02 2.259970e-01 9.855000e-03 3.028450e-01 1.392300e-02 3.789050e-01 6.589200e-02 1.573170e-01 2.331600e-02 1.036501e+01 3.679590e-01 +32 1.608341e+01 1.229300e+00 6.486622e+00 1.396300e-01 1.380231e+01 1.676100e-01 
6.275401e+00 9.366000e-02 9.445811e+00 2.926130e-01 2.089660e-01 2.764600e-02 4.907780e-01 3.136800e-02 4.022040e-01 2.630400e-02 1.350580e-01 5.654100e-02 2.042983e+01 7.251660e-01 diff --git a/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat b/lecture10/openmp_overhead/Intel_Xeon2683v4/proc_bind_true.dat @@ -0,0 +1,7 @@ +# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction +1 4.835700e-01 1.885300e-02 2.935020e-01 2.998700e-02 4.894620e-01 1.666000e-02 2.892380e-01 2.749000e-02 2.904410e-01 1.838200e-02 1.611700e-02 1.957300e-02 1.696700e-02 1.636500e-02 2.689700e-02 1.681900e-02 1.111900e-02 1.970000e-04 5.234470e-01 5.894000e-03 +2 2.186639e+00 1.256210e-01 7.102430e-01 8.755600e-02 2.391640e+00 1.740600e-02 6.741400e-01 2.361500e-02 6.415170e-01 9.107000e-03 7.414600e-02 7.116000e-03 1.516090e-01 1.201600e-02 3.238180e-01 8.829000e-03 4.892700e-02 7.032000e-03 2.390954e+00 1.887100e-01 +4 3.448366e+00 1.703340e-01 1.056021e+00 1.249040e-01 3.242441e+00 5.770700e-02 8.994180e-01 2.416800e-02 1.244440e+00 3.263000e-02 1.071660e-01 1.635700e-02 2.538550e-01 1.493900e-02 4.703550e-01 2.858200e-02 1.017960e-01 2.866100e-02 4.026791e+00 1.076080e-01 +8 4.858263e+00 2.654620e-01 1.616485e+00 7.317800e-02 4.621479e+00 1.826780e-01 1.574272e+00 3.862300e-02 2.098022e+00 4.825200e-02 2.062030e-01 1.756400e-02 2.892610e-01 1.873800e-02 3.959510e-01 1.332400e-02 1.123810e-01 2.337200e-02 6.513596e+00 6.708810e-01 +16 7.681152e+00 3.059600e-01 2.693141e+00 1.062240e-01 7.003080e+00 8.182500e-02 2.629225e+00 1.980100e-02 3.907993e+00 3.472200e-02 2.274840e-01 1.167300e-02 3.173700e-01 2.057600e-02 4.397950e-01 4.770000e-02 1.369940e-01 3.960400e-02 9.669193e+00 8.244130e-01 +32 1.507245e+01 8.887600e-01 5.700451e+00 2.852420e-01 1.394916e+01 9.880200e-02 5.628993e+00 3.959500e-02 8.543289e+00 8.844400e-02 2.853500e-01 1.297900e-02 4.435590e-01 2.863600e-02 4.402650e-01 2.675900e-02 1.696370e-01 2.608900e-02 
2.016809e+01 8.817090e-01 diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Licence.txt b/lecture10/openmp_overhead/openmpbench_C_v31/Licence.txt @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [2011] [The University of Edinburgh] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile @@ -0,0 +1,69 @@ +include Makefile.defs + +# If CFLAGS_CRAY is empty set it to CFLAGS +ifeq ($(CFLAGS_CRAY),) +CFLAGS_CRAY = ${CFLAGS} +endif + +.c.o: + ${CC} ${CFLAGS} $(OMPFLAG) -c $*.c + +SYNCOBJS = syncbench.o common.o +SCHEDOBJS = schedbench.o common_sched.o +ARRAYOBJS = arraybench_$(IDA).o common.o +TASKOBJS = taskbench.o common.o +SCHEDFLAGS = -DSCHEDBENCH + +all: syncbench schedbench taskbench + $(MAKE) IDA=1 prog + $(MAKE) IDA=3 prog + $(MAKE) IDA=9 prog + $(MAKE) IDA=27 prog + $(MAKE) IDA=81 prog + $(MAKE) IDA=243 prog + $(MAKE) IDA=729 prog + $(MAKE) IDA=2187 prog + $(MAKE) IDA=6561 prog + $(MAKE) IDA=19683 prog + $(MAKE) IDA=59049 prog + +prog: arraybench_$(IDA) + +syncbench: $(SYNCOBJS) + $(CC) -o syncbench $(LDFLAGS) $(SYNCOBJS) $(CLOCKOBJS) $(LIBS) -lm + +# Rule to ensure the lower optimisation level is picked up for common.c +# with the Cray compiler +common.o: + ${CC} ${CFLAGS_CRAY} $(OMPFLAG) -c $*.c + +# Separate rule to build common_sched.o as we need to ensure the correct +# DEFAULT_DELAY_TIME is used. 
+common_sched.o: + ${CC} ${CFLAGS_CRAY} $(SCHEDFLAGS) $(OMPFLAG) -o common_sched.o -c common.c + +schedbench: $(SCHEDOBJS) + $(CC) -o schedbench $(LDFLAGS) $(SCHEDOBJS) $(CLOCKOBJS) $(LIBS) -lm + +# Multiple header files due to multiple array sizes, makes header file arraybench_*.h +arraybench_$(IDA).h: arraybench.h + $(CPP) -DIDA=$(IDA) $(OMPFLAG) -P arraybench.h -o $@ + +# Multiple object files due to multiple array sizes, makes object file arraybench_*.o +arraybench_$(IDA).o: arraybench_$(IDA).h arraybench.c + $(CC) $(CFLAGS) -DIDA=$(IDA) $(OMPFLAG) arraybench.c -c -o $@ + +# Multiple executables due to multiple array sizes, makes exe file arraybench_* +arraybench_$(IDA): $(ARRAYOBJS) $(CLOCKOBJS) arraybench.c + $(CC) $(LDFLAGS) $(ARRAYOBJS) $(CLOCKOBJS) $(LIBS) -lm -o $@ + +taskbench: $(TASKOBJS) + $(CC) -o taskbench $(LDFLAGS) $(OMPFLAG) $(TASKOBJS) $(CLOCKOBJS) $(LIBS) -lm + +clean: + -rm *.o syncbench schedbench arraybench_* taskbench + +clean-all: clean + -rm OpenMPBench.* *.all + + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC=gcc +CFLAGS = -O1 -lm -fopenmp +LDFLAGS = -O0 -lm -fopenmp +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.cray @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC=cc +CFLAGS = -O1 -lm +LDFLAGS = -O0 -lm +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.hector.pgi @@ -0,0 
+1,12 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +# Nested tasks don't work on HECToR Phase 3 for the PGI compiler +# and thus are disabled. +OMPFLAG = -DOMPVER2 -DOMPVER3 -DDISABLE_NESTED_TASKS_TESTS + +CC =cc +CFLAGS = -fast -mp=nonuma -lm +LDFLAGS = -fast -mp=nonuma -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.gnu @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = /usr/local/bin/gcc-4.6 +CFLAGS = -fopenmp -O1 -lm +LDFLAGS = -fopenmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.magny0.sun @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DOMPVER2 -DOMPVER3 + +CC = /home/h012/fiona/SolarisStudio12.2-linux-x86-tar-ML/solstudio12.2/bin/suncc +CFLAGS = -xopenmp -xO3 -lm +LDFLAGS = -xopenmp -xO3 -lm +CPP = /usr/bin/cpp +LIBS = diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.gnu @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = gcc +CFLAGS = -fopenmp -O1 -lm +LDFLAGS = -fopenmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git 
a/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel b/lecture10/openmp_overhead/openmpbench_C_v31/Makefile.defs.stokes.intel @@ -0,0 +1,10 @@ +# Uncomment the following line to use OpenMP 2.0 features +#OMPFLAG = -DOMPVER2 +# Uncomment the following line to use OpenMP 3.0 features +OMPFLAG = -DDISABLE_BARRIER_TEST -DOMPVER2 -DOMPVER3 + +CC = icc +CFLAGS = -openmp -O1 -lm +LDFLAGS = -openmp -O1 -lm +CPP = /usr/bin/cpp +LIBS = +\ No newline at end of file diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/README.txt b/lecture10/openmp_overhead/openmpbench_C_v31/README.txt @@ -0,0 +1,113 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +=============== + Licence +=============== +This software is released under the licence in Licence.txt + +=============== + Introduction +=============== +Overheads due to synchronisation, loop scheduling, array operations and +task scheduling are an important factor in determining the performance of +shared memory parallel programs. We have designed a set of microbenchmarks +to measure these classes of overhead for language constructs in OpenMP. + +=============== + Installation +=============== + 1. Unpack the tar file + + 2. Edit the Makefile.defs as follows: + * Set CC to the C compiler you wish to use (e.g. gcc pgcc icc xlc etc) + * Set CFLAGS to any required C compiler flags to enable processing of + OpenMP directives (e.g. -fopenmp -mp, -omp); standard optimisation is + also recommended (e.g. -O). + * Set LDFLAGS to any required C linker flags + * Set CPP to the local C-Preprocessor (e.g. /usr/local/bin/cpp) to + make the C compiler invoke cpp on .c and .h files + * To benchmark OpenMP 2.0 features can be invoked by setting the flag + OMPFLAG = -DOMPVER2 + * To benchmark OpenMP 2.0 & 3.0 features can be invoked by setting the flag + OMPFLAG = -DOMPVER2 -DOMPVER3 + * If neither of these flags are set then OpenMP 1.0 compatibility is + ensured. + +3. Type "make" to build all 4 benchmarks or "make benchmark" where benchmark + is one of syncbench, taskbench, schedbench. By default "make" will build + executables with array sizes ranging in powers of 3 from 1 to 59049. To + build the array benchmark with an array size of arraysize, use + "make IDA=arraysize prog" where arraysize is a positive integer. + + +Example Makefile.defs.* files are supplied for several machines and +compiler versions, e.g. 
+ Makefile.defs.hector.* - Cray XE6 + Makefile.defs.magny0.* - 48 core AMD Magny Cours machine + Makefile.defs.stokes.* - SGI Altix ICE 8200EX + + +=============== + Running +=============== + +1. Set OMP_NUM_THREADS to the number of OpenMP threads you want to run with, + e.g. export OMP_NUM_THREADS = 4 + OMP_NUM_THREADS should be less than or equal to the number of physical + cores available to you. + +2. Run the benchmark with: + ./benchmark + + The output will go to STDOUT and thus you will probably want to re-direct + this to a file. ./benchmark --help will give the usage options. + + +================= +Additional notes +================= + + 1. If you encounter problems with the value of innerreps becoming too + large (an error will be reported) try recompiling with a lower level of + optimisation, ideally with inlining turned off. + + 2. It is common to observe significant variability between the overhead + values obtained on different runs of the benchmark programs. Therefore, + it is advisable to run each benchmark, say, 10-20 times and average the + results obtained. + + 3. You should use whatever mechanisms are at your disposal to ensure that + threads have exclusive or almost exclusive access to processors. You + should rejects runs where the standard deviation or number of outliers is + large: this is a good indication that the benchmark did not have almost + exclusive access to processors. diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.c b/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.c @@ -0,0 +1,133 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. 
* +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "arraybench.h" + +double btest[IDA]; +double atest[IDA]; + +#pragma omp threadprivate (btest) + +int main(int argc, char **argv) { + + init(argc, argv); + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + char testName[32]; + + /* TEST PRIVATE */ + sprintf(testName, "PRIVATE %d", IDA); + benchmark(testName, &testprivnew); + + /* TEST FIRSTPRIVATE */ + sprintf(testName, "FIRSTPRIVATE %d", IDA); + benchmark(testName, &testfirstprivnew); + +#ifdef OMPVER2 + /* TEST COPYPRIVATE */ + sprintf(testName, "COPYPRIVATE %d", IDA); + benchmark(testName, &testcopyprivnew); +#endif + + /* TEST THREADPRIVATE - COPYIN */ + sprintf(testName, "COPYIN %d", IDA); + benchmark(testName, &testthrprivnew); + + finalise(); + + return EXIT_SUCCESS; + +} + +void refer() { + int j; + double a[1]; + for (j = 0; j < innerreps; j++) { + array_delay(delaylength, a); + } +} + +void testfirstprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel firstprivate(atest) + { + array_delay(delaylength, atest); + } + } +} + +void testprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel private(atest) + { + array_delay(delaylength, atest); + } + } +} + +#ifdef 
OMPVER2 +void testcopyprivnew() +{ + int j; + for (j=0; j<innerreps; j++) { +#pragma omp parallel private(atest) + { +#pragma omp single copyprivate(atest) + { + array_delay(delaylength, atest); + } + } + } +} + +#endif + +void testthrprivnew() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel copyin(btest) + { + array_delay(delaylength, btest); + } + } + +} diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.h b/lecture10/openmp_overhead/openmpbench_C_v31/arraybench.h @@ -0,0 +1,52 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef ARRAYBENCH_H +#define ARRAYBENCH_H + +void refer(); + +void testfirstprivnew(); + +void testprivnew(); + +#ifdef OMPVER2 + +void testcopyprivnew(); + +#endif + +void testthrprivnew(); + +void stats(double*, double*); + +#endif //ARRAYBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/common.c b/lecture10/openmp_overhead/openmpbench_C_v31/common.c @@ -0,0 +1,374 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <math.h> +#include <omp.h> + +#include "common.h" + +#define CONF95 1.96 + +int nthreads = -1; // Number of OpenMP threads +int delaylength = -1; // The number of iterations to delay for +int outerreps = -1; // Outer repetitions +double delaytime = -1.0; // Length of time to delay for in microseconds +double targettesttime = 0.0; // The length of time in microseconds that the test + // should run for. +unsigned long innerreps; // Inner repetitions +double *times; // Array of doubles storing the benchmark times in microseconds +double referencetime; // The average reference time in microseconds to perform + // outerreps runs +double referencesd; // The standard deviation in the reference time in + // microseconds for outerreps runs. +double testtime; // The average test time in microseconds for + // outerreps runs +double testsd; // The standard deviation in the test time in + // microseconds for outerreps runs. 
+ +void usage(char *argv[]) { + printf("Usage: %s.x \n" + "\t--outer-repetitions <outer-repetitions> (default %d)\n" + "\t--test-time <target-test-time> (default %0.2f microseconds)\n" + "\t--delay-time <delay-time> (default %0.4f microseconds)\n" + "\t--delay-length <delay-length> " + "(default auto-generated based on processor speed)\n", + argv[0], + DEFAULT_OUTER_REPS, DEFAULT_TEST_TARGET_TIME, DEFAULT_DELAY_TIME); +} + +void parse_args(int argc, char *argv[]) { + // Parse the parameters + int arg; + for (arg = 1; arg < argc; arg++) { + if (strcmp(argv[arg], "--delay-time") == 0.0) { + delaytime = atof(argv[++arg]); + if (delaytime == 0.0) { + printf("Invalid float:--delay-time: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "--outer-repetitions") == 0) { + outerreps = atoi(argv[++arg]); + if (outerreps == 0) { + printf("Invalid integer:--outer-repetitions: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "--test-time") == 0) { + targettesttime = atof(argv[++arg]); + if (targettesttime == 0) { + printf("Invalid integer:--test-time: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + + } else if (strcmp(argv[arg], "-h") == 0) { + usage(argv); + exit(EXIT_SUCCESS); + + } else { + printf("Invalid parameters: %s\n", argv[arg]); + usage(argv); + exit(EXIT_FAILURE); + } + } +} + +int getdelaylengthfromtime(double delaytime) { + int i, reps; + double lapsedtime, starttime; // seconds + + reps = 1000; + lapsedtime = 0.0; + + delaytime = delaytime/1.0E6; // convert from microseconds to seconds + + // Note: delaytime is local to this function and thus the conversion + // does not propagate to the main code. + + // Here we want to use the delaytime in microseconds to find the + // delaylength in iterations. We start with delaylength=0 and + // increase until we get a large enough delaytime, return delaylength + // in iterations. 
+ + delaylength = 0; + delay(delaylength); + + while (lapsedtime < delaytime) { + delaylength = delaylength * 1.1 + 1; + starttime = getclock(); + for (i = 0; i < reps; i++) { + delay(delaylength); + } + lapsedtime = (getclock() - starttime) / (double) reps; + } + return delaylength; + +} + +unsigned long getinnerreps(void (*test)(void)) { + innerreps = 10L; // some initial value + double time = 0.0; + + while (time < targettesttime) { + double start = getclock(); + test(); + time = (getclock() - start) * 1.0e6; + innerreps *=2; + + // Test to stop code if compiler is optimising reference time expressions away + if (innerreps > (targettesttime*1.0e15)) { + printf("Compiler has optimised reference loop away, STOP! \n"); + printf("Try recompiling with lower optimisation level \n"); + exit(1); + } + } + return innerreps; +} + +void printheader(char *name) { + printf("\n"); + printf("--------------------------------------------------------\n"); + printf("Computing %s time using %lu reps\n", name, innerreps); +} + +void stats(double *mtp, double *sdp) { + + double meantime, totaltime, sumsq, mintime, maxtime, sd, cutoff; + + int i, nr; + + mintime = 1.0e10; + maxtime = 0.; + totaltime = 0.; + + for (i = 1; i <= outerreps; i++) { + mintime = (mintime < times[i]) ? mintime : times[i]; + maxtime = (maxtime > times[i]) ? maxtime : times[i]; + totaltime += times[i]; + } + + meantime = totaltime / outerreps; + sumsq = 0; + + for (i = 1; i <= outerreps; i++) { + sumsq += (times[i] - meantime) * (times[i] - meantime); + } + sd = sqrt(sumsq / (outerreps - 1)); + + cutoff = 3.0 * sd; + + nr = 0; + + for (i = 1; i <= outerreps; i++) { + if (fabs(times[i] - meantime) > cutoff) + nr++; + } + + printf("\n"); + printf("Sample_size Average Min Max S.D. 
Outliers\n"); + printf(" %d %f %f %f %f %d\n", + outerreps, meantime, mintime, maxtime, sd, nr); + printf("\n"); + + *mtp = meantime; + *sdp = sd; + +} + +void printfooter(char *name, double testtime, double testsd, + double referencetime, double refsd) { + printf("%s time = %f microseconds +/- %f\n", + name, testtime, CONF95*testsd); + printf("%s overhead = %f microseconds +/- %f\n", + name, testtime-referencetime, CONF95*(testsd+referencesd)); + +} + +void printreferencefooter(char *name, double referencetime, double referencesd) { + printf("%s time = %f microseconds +/- %f\n", + name, referencetime, CONF95 * referencesd); +} + +void init(int argc, char **argv) +{ +#pragma omp parallel + { +#pragma omp master + { + nthreads = omp_get_num_threads(); + } + + } + + parse_args(argc, argv); + + if (outerreps == -1) { + outerreps = DEFAULT_OUTER_REPS; + } + if (targettesttime == 0.0) { + targettesttime = DEFAULT_TEST_TARGET_TIME; + } + if (delaytime == -1.0) { + delaytime = DEFAULT_DELAY_TIME; + } + delaylength = getdelaylengthfromtime(delaytime); // Always need to compute delaylength in iterations + + times = malloc((outerreps+1) * sizeof(double)); + + printf("Running OpenMP benchmark version 3.0\n" + "\t%d thread(s)\n" + "\t%d outer repetitions\n" + "\t%0.2f test time (microseconds)\n" + "\t%d delay length (iterations) \n" + "\t%f delay time (microseconds)\n", + nthreads, + outerreps, targettesttime, + delaylength, delaytime); +} + +void finalise(void) { + free(times); + +} + +void initreference(char *name) { + printheader(name); + +} + +/* Calculate the reference time. 
*/ +void reference(char *name, void (*refer)(void)) { + int k; + double start; + + // Calculate the required number of innerreps + innerreps = getinnerreps(refer); + + initreference(name); + + for (k = 0; k <= outerreps; k++) { + start = getclock(); + refer(); + times[k] = (getclock() - start) * 1.0e6 / (double) innerreps; + } + + finalisereference(name); + +} + +void finalisereference(char *name) { + stats(&referencetime, &referencesd); + printreferencefooter(name, referencetime, referencesd); + +} + +void intitest(char *name) { + printheader(name); + +} + +void finalisetest(char *name) { + stats(&testtime, &testsd); + printfooter(name, testtime, testsd, referencetime, referencesd); + +} + +/* Function to run a microbenchmark test*/ +void benchmark(char *name, void (*test)(void)) +{ + int k; + double start; + + // Calculate the required number of innerreps + innerreps = getinnerreps(test); + + intitest(name); + + for (k=0; k<=outerreps; k++) { + start = getclock(); + test(); + times[k] = (getclock() - start) * 1.0e6 / (double) innerreps; + } + + finalisetest(name); + +} + +// For the Cray compiler on HECToR we need to turn off optimisation +// for the delay and array_delay functions. Other compilers should +// not be afffected. +#pragma _CRI noopt +void delay(int delaylength) { + + int i; + float a = 0.; + + for (i = 0; i < delaylength; i++) + a += i; + if (a < 0) + printf("%f \n", a); + +} + +void array_delay(int delaylength, double a[1]) { + + int i; + a[0] = 1.0; + for (i = 0; i < delaylength; i++) + a[0] += i; + if (a[0] < 0) + printf("%f \n", a[0]); + +} +// Re-enable optimisation for remainder of source. +#pragma _CRI opt + +double getclock() { + double time; + // Returns a value in seconds of the time elapsed from some arbitrary, + // but consistent point. 
+ double omp_get_wtime(void); + time = omp_get_wtime(); + return time; +} + +int returnfalse() { + return 0; + +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/common.h b/lecture10/openmp_overhead/openmpbench_C_v31/common.h @@ -0,0 +1,80 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#ifndef COMMON_H +#define COMMON_H + +#define DEFAULT_DELAY_LENGTH -1 // -1 means the delay length should be auto generated +#define DEFAULT_OUTER_REPS 20 // Outer repetitions +#define DEFAULT_TEST_TARGET_TIME 1000.0 // Test Target time in microseconds. 
+#ifdef SCHEDBENCH +#define DEFAULT_DELAY_TIME 15.0 // Default delaytime in microseconds for schedbench +#else +#define DEFAULT_DELAY_TIME 0.10 // Default delaytime in microseconds +#endif + +extern int nthreads; // Number of OpenMP threads +extern int delaylength; // The number of iterations to delay for +extern int outerreps; // Outer repetitions +extern unsigned long innerreps; // Inner repetitions +extern double delaytime; // Delay time in microseconds +extern double targettesttime; // The length of time in microseconds the test + // should run for +extern double *times; // Array to store results in + +void init(int argc, char **argv); + +void initreference(char *name); + +void finalisereference(char *name); + +void intitest(char *name); + +void finalisetest(char *name); + +double getclock(); + +void delay(int delaylength); + +void array_delay(int delaylength, double a[1]); + +int getdelaylengthfromtime(double delaytime); + +int returnfalse(void); + +void finalise(void); + +void benchmark(char *name, void (*test)(void)); + +void reference(char *name, void (*refer)(void)); + +#endif //COMMON_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.c @@ -0,0 +1,145 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. 
* +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "schedbench.h" + +int cksz, itersperthr = 128; +char testName[32]; + +int main(int argc, char **argv) { + + init(argc, argv); + + /* GENERATE REFERENCE TIME */ + reference("reference time", &refer); + + /* TEST STATIC */ + benchmark("STATIC", &teststatic); + + /* TEST STATIC,n */ + cksz = 1; + while (cksz <= itersperthr) { + sprintf(testName, "STATIC %d", cksz); + benchmark(testName, &teststaticn); + cksz *= 2; + } + + /* TEST DYNAMIC,n */ + cksz = 1; + while (cksz <= itersperthr) { + sprintf(testName, "DYNAMIC %d", cksz); + benchmark(testName, &testdynamicn); + cksz *= 2; + } + + /* TEST GUIDED,n */ + cksz = 1; + while (cksz <= itersperthr / nthreads) { + sprintf(testName, "GUIDED %d", cksz); + benchmark(testName, &testguidedn); + cksz *= 2; + } + + finalise(); + + return EXIT_SUCCESS; + +} + +void refer() { + int i, j; + for (j = 0; j < innerreps; j++) { + for (i = 0; i < itersperthr; i++) { + delay(delaylength); + } + } +} + +void teststatic() { + + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(static) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} + +void teststaticn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(static,cksz) + for (i = 0; i < itersperthr * nthreads; i++) 
{ + delay(delaylength); + } + } + } +} + +void testdynamicn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(dynamic,cksz) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} + +void testguidedn() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp for schedule(guided,cksz) + for (i = 0; i < itersperthr * nthreads; i++) { + delay(delaylength); + } + } + } +} diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/schedbench.h @@ -0,0 +1,46 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef SCHEDBENCH_H +#define SCHEDBENCH_H + +void refer(void); + +void teststatic(void); + +void teststaticn(void); + +void testdynamicn(void); + +void testguidedn(void); + +#endif //SCHEDBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.c @@ -0,0 +1,253 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <omp.h> + +#include "common.h" +#include "syncbench.h" + +omp_lock_t lock; + +int main(int argc, char **argv) { + + // Start Paraver tracing +#ifdef PARAVERTRACE + Extrae_init(); +#endif + + init(argc, argv); + + omp_init_lock(&lock); + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + /* TEST PARALLEL REGION */ + benchmark("PARALLEL", &testpr); + + /* TEST FOR */ + benchmark("FOR", &testfor); + + /* TEST PARALLEL FOR */ + benchmark("PARALLEL FOR", &testpfor); + + /* TEST BARRIER */ + benchmark("BARRIER", &testbar); + + /* TEST SINGLE */ + benchmark("SINGLE", &testsing); + + /* TEST CRITICAL*/ + benchmark("CRITICAL", &testcrit); + + /* TEST LOCK/UNLOCK */ + benchmark("LOCK/UNLOCK", &testlock); + + /* TEST ORDERED SECTION */ + benchmark("ORDERED", &testorder); + + /* GENERATE NEW REFERENCE TIME */ + reference("reference time 2", &referatom); + + /* TEST ATOMIC */ + benchmark("ATOMIC", &testatom); + + /* GENERATE NEW REFERENCE TIME */ + reference("reference time 3", &referred); + + /* TEST REDUCTION (1 var) */ + benchmark("REDUCTION", &testred); + +#ifdef PARAVERTRACE + Extrae_fini(); +#endif + + finalise(); + + return EXIT_SUCCESS; +} + +void refer() { + int j; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + } +} + +void referatom(){ + int j; + double aaaa = 0.0; + double epsilon = 1.0e-15; + double b, c; + b = 1.0; + c = (1.0 + epsilon); + for (j = 0; j < innerreps; j++) { + aaaa += b; + b *= c; + } + if (aaaa < 0.0) + printf("%f\n", aaaa); +} + +void referred() { + int j; + int aaaa = 0; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + aaaa += 1; + } +} + +void testpr() { + int j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel + { + delay(delaylength); + } + } +} + +void testfor() { + int i, j; +#pragma omp parallel private(j) + { + for (j = 0; j < 
innerreps; j++) { +#pragma omp for + for (i = 0; i < nthreads; i++) { + delay(delaylength); + } + } + } +} + +void testpfor() { + int i, j; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel for + for (i = 0; i < nthreads; i++) { + delay(delaylength); + } + } +} + +void testbar() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { + delay(delaylength); +#pragma omp barrier + } + } +} + +void testsing() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp single + delay(delaylength); + } + } +} + +void testcrit() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps / nthreads; j++) { +#pragma omp critical + { + delay(delaylength); + } + } + } +} + +void testlock() { + int j; + +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps / nthreads; j++) { + omp_set_lock(&lock); + delay(delaylength); + omp_unset_lock(&lock); + } + } +} + +void testorder() { + int j; +#pragma omp parallel for ordered schedule (static,1) + for (j = 0; j < (int)innerreps; j++) { +#pragma omp ordered + delay(delaylength); + } +} + +void testatom() { + int j; + double aaaa = 0.0; + double epsilon = 1.0e-15; + double b,c; + b = 1.0; + c = (1.0 + epsilon); +#pragma omp parallel private(j) firstprivate(b) + { + for (j = 0; j < innerreps / nthreads; j++) { +#pragma omp atomic + aaaa += b; + b *= c; + } + } + if (aaaa < 0.0) + printf("%f\n", aaaa); +} + +void testred() { + int j; + int aaaa = 0; + for (j = 0; j < innerreps; j++) { +#pragma omp parallel reduction(+:aaaa) + { + delay(delaylength); + aaaa += 1; + } + } +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/syncbench.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* 
Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + + +#ifndef SYNCBENCH_H +#define SYNCBENCH_H + +void refer(void); + +void referatom(void); + +void referred(void); + +void testpr(void); + +void testfor(void); + +void testpfor(void); + +void testbar(void); + +void testsing(void); + +void testcrit(void); + +void testlock(void); + +void testorder(void); + +void testatom(void); + +void testred(void); + +#endif //SYNCBENCH_H diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.c b/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.c @@ -0,0 +1,331 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. 
* +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +* * +****************************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include <omp.h> + +#include "common.h" +#include "taskbench.h" + +#define DEPTH 6 + +int main(int argc, char **argv) { + + init(argc, argv); + +#ifdef OMPVER3 + + /* GENERATE REFERENCE TIME */ + reference("reference time 1", &refer); + + /* TEST PARALLEL TASK GENERATION */ + benchmark("PARALLEL TASK", &testParallelTaskGeneration); + + /* TEST MASTER TASK GENERATION */ + benchmark("MASTER TASK", &testMasterTaskGeneration); + + /* TEST MASTER TASK GENERATION WITH BUSY SLAVES */ + benchmark("MASTER TASK BUSY SLAVES", &testMasterTaskGenerationWithBusySlaves); + + /* TEST CONDITIONAL TASK GENERATION */ +#ifndef DISABLE_CONDITIONAL_TASK_TEST + benchmark("CONDITIONAL TASK", &testConditionalTaskGeneration); +#endif // DISABLE_CONDITIONAL_TASK_TEST + + /* TEST TASK WAIT */ + benchmark("TASK WAIT", &testTaskWait); + + /* TEST TASK BARRIER */ +#ifndef DISABLE_BARRIER_TEST + benchmark("TASK BARRIER", &testTaskBarrier); +#endif //DISABLE_BARRIER_TEST + +#ifndef DISABLE_NESTED_TASKS_TESTS + /* TEST NESTED TASK GENERATION */ + benchmark("NESTED TASK", &testNestedTaskGeneration); + + /* TEST NESTED MASTER TASK GENERATION */ + benchmark("NESTED MASTER TASK", &testNestedMasterTaskGeneration); + +#endif // DISABLE_NESTED_TASKS_TESTS + + /* GENERATE THE SECOND REFERENCE TIME */ + reference("reference time 2", &refer); + + /* TEST BRANCH TASK TREE */ + benchmark("BRANCH TASK TREE", &testBranchTaskGeneration); + + /* TEST LEAF TASK 
TREE */ + benchmark("LEAF TASK TREE", &testLeafTaskGeneration); + +#endif // OMPVER3 + + finalise(); + + return EXIT_SUCCESS; + +} + +/* Calculate the reference time. */ +void refer() { + int j; + for (j = 0; j < innerreps; j++) { + delay(delaylength); + } + +} + +/* Calculate the second reference time. */ +void refer2() { + int j; + for (j = 0; j < (innerreps >> DEPTH) * (1 << DEPTH); j++) { + delay(delaylength); + }; + +} + +/* Test parallel task generation overhead */ +void testParallelTaskGeneration() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task + }; // for j + } // parallel + +} + +/* Test master task generation overhead */ +void testMasterTaskGeneration() { + int j; +#pragma omp parallel private(j) + { +#pragma omp master + { + /* Since this is executed by one thread we need innerreps * nthreads + iterations */ + for (j = 0; j < innerreps * nthreads; j++) { +#pragma omp task + { + delay(delaylength); + + } + + } /* End for j */ + } /* End master */ + } /* End parallel */ + +} + +/* Test master task generation overhead when the slave threads are busy */ +void testMasterTaskGenerationWithBusySlaves() { + int j; +#pragma omp parallel private( j ) + { + int thread_num = omp_get_thread_num(); + for (j = 0; j < innerreps; j ++ ) { + + if ( thread_num == 0 ) { +#pragma omp task + { + delay( delaylength ); + } // task + + } else { + delay( delaylength ); + + }; // if + }; // for j + } // parallel +} + +/* Measure overhead of checking if a task should be spawned. 
*/ +void testConditionalTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < innerreps; j++) { +#pragma omp task if(returnfalse()) + { + delay( delaylength ); + } + } + } +} + +#ifndef DISABLE_NESTED_TASKS_TESTS + +/* Measure overhead of nested tasks (all threads construct outer tasks) */ +void testNestedTaskGeneration() { + int i,j; +#pragma omp parallel private( i, j ) + { + for ( j = 0; j < innerreps / nthreads; j ++ ) { +#pragma omp task private( i ) + { + for ( i = 0; i < nthreads; i ++ ) { +#pragma omp task untied + { + delay( delaylength ); + + } // task + }; // for i + + // wait for inner tasks to complete +#pragma omp taskwait + + } // task + }; // for j + } // parallel +} + +/* Measure overhead of nested tasks (master thread constructs outer tasks) */ +void testNestedMasterTaskGeneration() { + int i, j; +#pragma omp parallel private( i, j ) + { +#pragma omp master + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task private( i ) + { + for ( i = 0; i < nthreads; i ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task + }; // for i + + // wait for inner tasks to complete +#pragma omp taskwait + + } // task + }; // for j + } // master + } // parallel +} +#endif // DISABLE_NESTED_TASKS_TESTS + +/* Measure overhead of taskwait (all threads construct tasks) */ +void testTaskWait() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task +#pragma omp taskwait + + }; // for j + } // parallel +} + +/* Measure overhead of tasking barrier (all threads construct tasks) */ +void testTaskBarrier() { + int j; +#pragma omp parallel private( j ) + { + for ( j = 0; j < innerreps; j ++ ) { +#pragma omp task + { + delay( delaylength ); + + } // task +#pragma omp barrier + + }; // for j + } // parallel +} + +/* Test parallel task generation overhead where work is done at all levels. 
*/ +void testBranchTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < (innerreps >> DEPTH); j++) { +#pragma omp task + { + branchTaskTree(DEPTH); + delay(delaylength); + } + + } + } +} + +void branchTaskTree(int tree_level) { + if ( tree_level > 0 ) { +#pragma omp task + { + branchTaskTree(tree_level - 1); + branchTaskTree(tree_level - 1); + delay(delaylength); + } + } +} + +/* Test parallel task generation overhead where work is done only at the leaf level. */ +void testLeafTaskGeneration() { + int j; +#pragma omp parallel private(j) + { + for (j = 0; j < (innerreps >> DEPTH); j++) { + leafTaskTree(DEPTH); + + } + } + +} + +void leafTaskTree(int tree_level) { + if ( tree_level == 0 ) { + delay(delaylength); + + } else { +#pragma omp task + { + leafTaskTree(tree_level - 1); + leafTaskTree(tree_level - 1); + } + } +} + diff --git a/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.h b/lecture10/openmp_overhead/openmpbench_C_v31/taskbench.h @@ -0,0 +1,66 @@ +/**************************************************************************** +* * +* OpenMP MicroBenchmark Suite - Version 3.1 * +* * +* produced by * +* * +* Mark Bull, Fiona Reid and Nix Mc Donnell * +* * +* at * +* * +* Edinburgh Parallel Computing Centre * +* * +* email: markb@epcc.ed.ac.uk or fiona@epcc.ed.ac.uk * +* * +* * +* This version copyright (c) The University of Edinburgh, 2015. * +* * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. 
* +* * +****************************************************************************/ + +#ifndef TASKBENCH_H +#define TASKBENCH_H + +void refer(void); + +void refer2(void); + +void stats(double*, double*); + +void testParallelTaskGeneration(void); + +void testMasterTaskGeneration(void); + +void testMasterTaskGenerationWithBusySlaves(void); + +void testNestedTaskGeneration(void); + +void testNestedMasterTaskGeneration(void); + +void testTaskWait(void); + +void testTaskBarrier(void); + +void testConditionalTaskGeneration(void); + +void testBranchTaskGeneration(void); + +void branchTaskTree(int tree_level); + +void testLeafTaskGeneration(void); + +void leafTaskTree(int tree_level); + +#endif //TASKBENCH_H diff --git a/lecture10/openmp_overhead/plot_overhead.py b/lecture10/openmp_overhead/plot_overhead.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# File : plot_overhead.py +# Description: Plot OpenMP micro bench results +# Copyright 2022 Harvard University. All Rights Reserved. +import matplotlib as mpl +mpl.use('Agg') +import matplotlib.pyplot as plt +import numpy as np + +def plot(): + fig, ax = plt.subplots() + + d = np.loadtxt('proc_bind_false.dat') + # ax.grid(which='both') + ax.set_xlabel(r'Number of threads') + ax.set_ylabel(r'Overhead [$\mu s$]') + # yapf: disable + ax.errorbar(d[:, 0], d[:, 19], yerr=d[:, 20], marker='P', lw=1.4, ms=5, label=r'\texttt{reduction}') + ax.errorbar(d[:, 0], d[:, 1], yerr=d[:, 2], marker='o', lw=1.4, ms=5, label=r'\texttt{parallel}') + ax.errorbar(d[:, 0], d[:, 3], yerr=d[:, 4], marker='s', lw=1.4, ms=5, label=r'\texttt{for}') + ax.errorbar(d[:, 0], d[:, 5], yerr=d[:, 6], marker='^', lw=1.4, ms=5, label=r'\texttt{parallel for}') + ax.errorbar(d[:, 0], d[:, 7], yerr=d[:, 8], marker='*', lw=1.4, ms=5, label=r'\texttt{barrier}') + ax.errorbar(d[:, 0], d[:, 9], yerr=d[:, 10], marker='>', lw=1.4, ms=5, label=r'\texttt{single}') + ax.errorbar(d[:, 0], d[:, 11], yerr=d[:, 12], marker='x', lw=1.4, ms=5, label=r'\texttt{critical}') + 
ax.errorbar(d[:, 0], d[:, 13], yerr=d[:, 14], marker='<', lw=1.4, ms=5, label=r'\texttt{lock/unlock}') + ax.errorbar(d[:, 0], d[:, 17], yerr=d[:, 18], marker='d', lw=1.4, ms=5, label=r'\texttt{atomic}') + # yapf: enable + ax.legend(loc='upper left') + fig.savefig('openmp_overhead.svg', bbox_inches='tight') + +def main(): + plot() + +if __name__ == "__main__": + main() diff --git a/lecture10/openmp_overhead/run_benchmark.sh b/lecture10/openmp_overhead/run_benchmark.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash +# threads=(1 2 4 8 16 32 64) # AMD Opteron +threads=(1 2 4 8 16 32) # Intel Xeon + +run() { + fname="${1}"; shift + echo '# threads parallel for parallel_for barrier single critical lock_unlock ordered atomic reduction' >${fname} + for t in "${threads[@]}"; do + echo -n "${t}" >>${fname} + OMP_NUM_THREADS=$t ./openmpbench_C_v31/syncbench | \ + awk '/overhead/ {print $0}' | \ + awk -F '[^0-9.]+' '{printf "\t%e %e", $2, $3}' >>${fname} + echo '' >>${fname} + done +} + +export OMP_PROC_BIND='false' +run 'proc_bind_false.dat' + +export OMP_PROC_BIND='true' +run 'proc_bind_true.dat' diff --git a/lecture10/prefetch/Makefile b/lecture10/prefetch/Makefile @@ -0,0 +1,13 @@ +CXX = g++ +.PHONY: clean + +all: main main_noprefetch + +main: loop_unroll_prefetch.cpp + $(CXX) -O3 -fno-unroll-loops -o $@ $< + +main_noprefetch: loop_unroll_prefetch.cpp + $(CXX) -O3 -fno-unroll-loops -fno-prefetch-loop-arrays -o $@ $< + +clean: + rm -f main main_noprefetch diff --git a/lecture10/prefetch/loop_unroll_prefetch.cpp b/lecture10/prefetch/loop_unroll_prefetch.cpp @@ -0,0 +1,44 @@ +#include <chrono> +#include <iostream> + +#define N (1 << 20) +int main(void) +{ + double *A = new double[N]; + double *B = new double[N]; + double *C = new double[N]; + for (int i = 0; i < N; ++i) { + A[i] = 0; + B[i] = i + 0; + C[i] = i + 2; + } + + typedef std::chrono::high_resolution_clock Clock; + auto t1 = Clock::now(); + // manual 4-fold loop unroll + for (int i = 0; i < N; i += 4) { + 
__builtin_prefetch(&A[i + 4], 1, 1); + __builtin_prefetch(&B[i + 4], 0, 1); + __builtin_prefetch(&C[i + 4], 0, 1); + A[i + 0] = A[i + 0] * B[i + 0] + C[i + 0]; + A[i + 1] = A[i + 1] * B[i + 1] + C[i + 1]; + A[i + 2] = A[i + 2] * B[i + 2] + C[i + 2]; + A[i + 3] = A[i + 3] * B[i + 3] + C[i + 3]; + } + auto t2 = Clock::now(); + std::cout + << "Time: " + << std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count() + << " ns\n"; + + volatile double sum = 0.0; + for (int i = 0; i < N; ++i) { + sum += A[i]; + } + std::cout << "Result: " << sum << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 0; +} diff --git a/lecture11/hello_mpi/.gitignore b/lecture11/hello_mpi/.gitignore @@ -0,0 +1 @@ +hello_mpi diff --git a/lecture11/hello_mpi/Makefile b/lecture11/hello_mpi/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +hello_mpi: hello_mpi.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f hello_mpi diff --git a/lecture11/hello_mpi/hello_mpi.cpp b/lecture11/hello_mpi/hello_mpi.cpp @@ -0,0 +1,17 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + + std::cout << "Hello MPI\n"; + + // for demonstration with system monitor + volatile bool cond = true; + while (cond) { + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture11/rank_size_proc/.gitignore b/lecture11/rank_size_proc/.gitignore @@ -0,0 +1 @@ +rank_size_proc diff --git a/lecture11/rank_size_proc/Makefile b/lecture11/rank_size_proc/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +rank_size_proc: rank_size_proc.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f rank_size_proc diff --git a/lecture11/rank_size_proc/rank_size_proc.cpp b/lecture11/rank_size_proc/rank_size_proc.cpp @@ -0,0 +1,35 @@ +#include <cassert> +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + int size, rank, len, err; + char 
hostname[MPI_MAX_PROCESSOR_NAME]; + + // initialize MPI + err = MPI_Init(&argc, &argv); + assert(err == MPI_SUCCESS); + + // get number of ranks in the MPI_COMM_WORLD communicator + err = MPI_Comm_size(MPI_COMM_WORLD, &size); + assert(err == MPI_SUCCESS); + + // get my rank + err = MPI_Comm_rank(MPI_COMM_WORLD, &rank); + assert(err == MPI_SUCCESS); + + // query processor name (the return value is implementation dependent) + err = MPI_Get_processor_name(hostname, &len); + assert(err == MPI_SUCCESS); + std::cout << "Rank " << rank << "/" << size << " running on: " << hostname + << '\n'; + + // call other MPI routines + + // terminate the MPI environment + err = MPI_Finalize(); + assert(err == MPI_SUCCESS); + + return 0; +} diff --git a/lecture11/thread_support/.gitignore b/lecture11/thread_support/.gitignore @@ -0,0 +1 @@ +thread_support diff --git a/lecture11/thread_support/Makefile b/lecture11/thread_support/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic +.PHONY: clean + +thread_support: thread_support.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f thread_support diff --git a/lecture11/thread_support/thread_support.cpp b/lecture11/thread_support/thread_support.cpp @@ -0,0 +1,27 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char *argv[]) +{ + int rank, size, version, subversion, provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + + MPI_Get_version(&version, &subversion); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const bool isroot = (0 == rank); + if (isroot) { // only the root rank executes this code + std::cout << "MPI version: " << version << '.' 
<< subversion + << '\n'; + std::cout << "Total ranks: " << size << '\n'; + std::cout << "Thread support: " << provided + << " (requested MPI_THREAD_FUNNELED)\n"; + std::cout << "MPI_THREAD_SINGLE: " << MPI_THREAD_SINGLE << '\n'; + std::cout << "MPI_THREAD_FUNNELED: " << MPI_THREAD_FUNNELED << '\n'; + std::cout << "MPI_THREAD_SERIALIZED: " << MPI_THREAD_SERIALIZED << '\n'; + std::cout << "MPI_THREAD_MULTIPLE: " << MPI_THREAD_MULTIPLE << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/endianness/.gitignore b/lecture12/endianness/.gitignore @@ -0,0 +1 @@ +endianness diff --git a/lecture12/endianness/Makefile b/lecture12/endianness/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g -Wall -Wextra -Wpedantic +.PHONY: clean + +endianness: endianness.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f endianness diff --git a/lecture12/endianness/endianness.cpp b/lecture12/endianness/endianness.cpp @@ -0,0 +1,44 @@ +#include <iomanip> +#include <iostream> +#include <string> + +int main(void) +{ + // clang-format off + + // Little endian byte order: (e.g Intel, AMD) + // + // high address + // | + // | low address + // | | + int one = 1; // 0x00000001 + // || || + // || least significant byte + // || + // most significant byte + + // Big endian byte order: (e.g. IBM) + // + // high address + // | + // | low address + // | | + // 0x01000000 + // || || + // || least significant byte + // || + // most significant byte + + // clang-format on + + // create a pointer to the low address byte + char *low_address = (char *)&one; + + // print its value + const std::string endianness = (*low_address) ? 
"little" : "big"; + std::cout << "Low address byte = 0x" << std::hex << std::setfill('0') + << std::setw(2) << (int)(*low_address) << " (" << endianness + << " endian)\n"; + return 0; +} diff --git a/lecture12/mpi_bsend/.gitignore b/lecture12/mpi_bsend/.gitignore @@ -0,0 +1 @@ +mpi_bsend diff --git a/lecture12/mpi_bsend/Makefile b/lecture12/mpi_bsend/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_bsend: mpi_bsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_bsend diff --git a/lecture12/mpi_bsend/mpi_bsend.cpp b/lecture12/mpi_bsend/mpi_bsend.cpp @@ -0,0 +1,37 @@ +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + constexpr int n = 1000000; + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // allocate a memory buffer for MPI + int bufsize = 0; + MPI_Pack_size(n, MPI_INT, MPI_COMM_WORLD, &bufsize); + bufsize += MPI_BSEND_OVERHEAD; + char *mpibuf = new char[bufsize]; + MPI_Buffer_attach(mpibuf, bufsize); + + // send/receive buffers + int *sendbuf = new int[n]; + int *recvbuf = new int[n]; + + if (0 == rank) { + MPI_Bsend(sendbuf, n, MPI_INT, 1, 98, MPI_COMM_WORLD); // local + MPI_Recv(recvbuf, n, MPI_INT, 1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + MPI_Bsend(sendbuf, n, MPI_INT, 0, 99, MPI_COMM_WORLD); // local + MPI_Recv(recvbuf, n, MPI_INT, 0, 98, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + // clean up + MPI_Buffer_detach(mpibuf, &bufsize); // blocking + delete[] mpibuf; + delete[] sendbuf; + delete[] recvbuf; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_gatherv_scatterv/.gitignore b/lecture12/mpi_gatherv_scatterv/.gitignore @@ -0,0 +1 @@ +mpi_gatherv_scatterv diff --git a/lecture12/mpi_gatherv_scatterv/Makefile b/lecture12/mpi_gatherv_scatterv/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_gatherv_scatterv: mpi_gatherv_scatterv.cpp + $(CXX) 
$(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_gatherv_scatterv diff --git a/lecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp b/lecture12/mpi_gatherv_scatterv/mpi_gatherv_scatterv.cpp @@ -0,0 +1,78 @@ +#include <algorithm> +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const bool isroot = (0 == rank); + + std::vector<int> sendbuf(rank + 1, rank); + std::vector<int> recvbuf, recvcounts, displacement; + if (isroot) { + recvbuf.resize(size * (size + 1) / 2); + recvcounts.resize(size); + displacement.resize(size); + for (int i = 0; i < size; ++i) { + recvcounts[i] = i + 1; + displacement[i] = i * (i + 1) / 2; + } + } + + MPI_Gatherv(sendbuf.data(), + sendbuf.size(), + MPI_INT, + recvbuf.data(), + recvcounts.data(), + displacement.data(), + MPI_INT, + 0, + MPI_COMM_WORLD); + + if (isroot) { + std::cout << "Result of MPI_Gatherv (on root rank)\n"; + for (int r = 0; r < size; ++r) { + const int offset = displacement[r]; + for (int i = 0; i < recvcounts[r]; ++i) { + std::cout << ' ' << recvbuf[offset + i]; + } + std::cout << '\n'; + } + } + + // inverse operation + std::vector<int> sendcounts; + sendbuf.swap(recvbuf); + if (isroot) { + sendcounts.swap(recvcounts); + } + + MPI_Scatterv(sendbuf.data(), + sendcounts.data(), + displacement.data(), + MPI_INT, + recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + MPI_COMM_WORLD); + + if (isroot) { + std::cout << "\nResult of MPI_Scatterv\n"; + } + std::ostringstream out; + out << "Rank " << rank << ":"; + for (const auto v : recvbuf) { + out << ' ' << v; + } + out << '\n'; + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_get_count/.gitignore b/lecture12/mpi_get_count/.gitignore @@ -0,0 +1 @@ +mpi_get_count diff --git a/lecture12/mpi_get_count/Makefile b/lecture12/mpi_get_count/Makefile @@ -0,0 
+1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_get_count: mpi_get_count.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_get_count diff --git a/lecture12/mpi_get_count/mpi_get_count.cpp b/lecture12/mpi_get_count/mpi_get_count.cpp @@ -0,0 +1,33 @@ +#include <cassert> +#include <iostream> +#include <mpi.h> +#include <string> + +int main(int argc, char* argv[]) +{ + assert(argc == 2); // pass some string as argument + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (0 == rank) { + char message[1024]; + MPI_Status status; + MPI_Recv(message, 1024, MPI_CHAR, 1, 99, MPI_COMM_WORLD, &status); + + // get the count of characters + int count; + MPI_Get_count(&status, MPI_CHAR, &count); + assert(count < 1024); + message[count] = '\0'; // terminating null (received chars occupy [0, count)) + std::cout << "Rank " << rank << " received: " << message << " (" + << count << " characters)\n"; + } else { + const std::string message(argv[1]); + MPI_Send( + message.c_str(), message.size(), MPI_CHAR, 0, 99, MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_minloc/.gitignore b/lecture12/mpi_minloc/.gitignore @@ -0,0 +1 @@ +mpi_minloc diff --git a/lecture12/mpi_minloc/Makefile b/lecture12/mpi_minloc/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_minloc: mpi_minloc.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_minloc diff --git a/lecture12/mpi_minloc/mpi_minloc.cpp b/lecture12/mpi_minloc/mpi_minloc.cpp @@ -0,0 +1,31 @@ +#include <cstdlib> +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + srand48(rank); + const double value = drand48(); + std::cout << "Rank " << rank << ": value = " << value << '\n'; + + // packed value and rank ID + struct DoubleInt { + double val; + int rank; + } send, recv; + 
send.val = value; + send.rank = rank; + + MPI_Reduce(&send, &recv, 1, MPI_DOUBLE_INT, MPI_MINLOC, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "Minimum: value = " << recv.val << " on rank " << recv.rank + << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_p2p/.gitignore b/lecture12/mpi_p2p/.gitignore @@ -0,0 +1 @@ +mpi_p2p diff --git a/lecture12/mpi_p2p/Makefile b/lecture12/mpi_p2p/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_p2p: mpi_p2p.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_p2p diff --git a/lecture12/mpi_p2p/mpi_p2p.cpp b/lecture12/mpi_p2p/mpi_p2p.cpp @@ -0,0 +1,22 @@ +#include <mpi.h> +#include <iostream> + +// exchange rank IDs between 2 processes +// DISCLAIMER: this code contains a bug +int main(int argc, char* argv[]) +{ + int rank, recv; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (0 == rank) { + MPI_Send(&rank, 1, MPI_INT, 1, 98, MPI_COMM_WORLD); + MPI_Recv(&recv, 1, MPI_INT, 1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else { + MPI_Send(&rank, 1, MPI_INT, 0, 99, MPI_COMM_WORLD); + MPI_Recv(&recv, 1, MPI_INT, 0, 98, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + std::cout << "Rank " << rank << " got ID " << recv << " from other rank\n"; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/.gitignore b/lecture12/mpi_pi/.gitignore @@ -0,0 +1,3 @@ +mpi_pi +mpi_pi_reduction +mpi_pi_reduction_inplace diff --git a/lecture12/mpi_pi/Makefile b/lecture12/mpi_pi/Makefile @@ -0,0 +1,17 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_pi mpi_pi_reduction mpi_pi_reduction_inplace + +mpi_pi: mpi_pi.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_pi_reduction: mpi_pi_reduction.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_pi_reduction_inplace: mpi_pi_reduction_inplace.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_pi mpi_pi_reduction mpi_pi_reduction_inplace 
diff --git a/lecture12/mpi_pi/mpi_pi.cpp b/lecture12/mpi_pi/mpi_pi.cpp @@ -0,0 +1,45 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) + for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + if (0 == rank) { + double pi = sum; + double recv; + for (int r = 1; r < size; ++r) { + MPI_Recv(&recv, + 1, + MPI_DOUBLE, + r, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + pi += recv; + } + std::cout << "pi = " << 4.0 * pi << '\n'; + } else { + MPI_Send(&sum, 1, MPI_DOUBLE, 0, 123, MPI_COMM_WORLD); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/mpi_pi_reduction.cpp b/lecture12/mpi_pi/mpi_pi_reduction.cpp @@ -0,0 +1,33 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) 
+ for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + double pi = 0.0; + MPI_Reduce(&sum, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "pi = " << 4.0 * pi << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_pi/mpi_pi_reduction_inplace.cpp b/lecture12/mpi_pi/mpi_pi_reduction_inplace.cpp @@ -0,0 +1,38 @@ +#include <iostream> +#include <mpi.h> + +int main(int argc, char* argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + const size_t n = 100000000; + + // domain decomposition + const size_t chunk_size = (n + size - 1) / size; + const size_t my_start = rank * chunk_size; + const size_t my_end = std::min((rank + 1) * chunk_size, n); + + // partial sum + double sum = 0.0; // rank local sum (on each process, no race condition!) + for (size_t k = my_start; k < my_end; ++k) { + sum += (1.0 - 2.0 * (k % 2)) / (2.0 * k + 1.0); + } + + // reduction + MPI_Reduce((0 == rank) ? 
MPI_IN_PLACE : &sum, + &sum, + 1, + MPI_DOUBLE, + MPI_SUM, + 0, + MPI_COMM_WORLD); + if (0 == rank) { + std::cout << "pi = " << 4.0 * sum << '\n'; + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_rsend/.gitignore b/lecture12/mpi_rsend/.gitignore @@ -0,0 +1,2 @@ +mpi_rsend +mpi_irsend diff --git a/lecture12/mpi_rsend/Makefile b/lecture12/mpi_rsend/Makefile @@ -0,0 +1,20 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_rsend mpi_irsend + +run_rsend: mpi_rsend + mpirun -n 2 ./mpi_rsend + +run_irsend: mpi_irsend + mpirun -n 2 ./mpi_irsend + +mpi_rsend: mpi_rsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_irsend: mpi_irsend.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_rsend mpi_irsend diff --git a/lecture12/mpi_rsend/mpi_irsend.cpp b/lecture12/mpi_rsend/mpi_irsend.cpp @@ -0,0 +1,46 @@ +#include <iostream> +#include <mpi.h> +#include <numeric> +#include <vector> + +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // send/receive buffers + constexpr int n = 1000000; + std::vector<int> sendbuf(n, 1); + std::vector<int> recvbuf(n); + + // post non-blocking receive request first + MPI_Request recv_req = MPI_REQUEST_NULL; + if (1 == rank) { + MPI_Irecv(recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + 98, + MPI_COMM_WORLD, + &recv_req); + } + + MPI_Barrier(MPI_COMM_WORLD); // synchronize all processes + + if (0 == rank) { + MPI_Rsend( + sendbuf.data(), sendbuf.size(), MPI_INT, 1, 98, MPI_COMM_WORLD); + } + + // Rank 0 is OK to call this because its request is NULL, the other ranks + // blocks until the receive has completed. 
+ MPI_Wait(&recv_req, MPI_STATUS_IGNORE); + + std::cout << "Rank " << rank << ": result = " + << std::accumulate(recvbuf.begin(), recvbuf.end(), 0) + << std::endl; + + MPI_Finalize(); + return 0; +} diff --git a/lecture12/mpi_rsend/mpi_rsend.cpp b/lecture12/mpi_rsend/mpi_rsend.cpp @@ -0,0 +1,41 @@ +#include <iostream> +#include <mpi.h> +#include <numeric> +#include <unistd.h> +#include <vector> + +// DISCLAIMER: this code is not correct (the result of the receive is undefined) +int main(int argc, char* argv[]) +{ + int rank; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + // send/receive buffers + constexpr int n = 1000000; + std::vector<int> sendbuf(n, 1); + std::vector<int> recvbuf(n); + + if (0 == rank) { + MPI_Rsend( + sendbuf.data(), sendbuf.size(), MPI_INT, 1, 98, MPI_COMM_WORLD); + } else { + // this should somewhat ensure that the receive is surely posted after + // the send above + sleep(10); + MPI_Recv(recvbuf.data(), + recvbuf.size(), + MPI_INT, + 0, + 98, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } + + std::cout << "Rank " << rank << ": result = " + << std::accumulate(recvbuf.begin(), recvbuf.end(), 0) + << std::endl; + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_1D_diffusion/.gitignore b/lecture13/mpi_1D_diffusion/.gitignore @@ -0,0 +1,4 @@ +blocking +nonblocking +u.dat +u.png diff --git a/lecture13/mpi_1D_diffusion/IO.h b/lecture13/mpi_1D_diffusion/IO.h @@ -0,0 +1,45 @@ +#ifndef IO_H_C0WPQOQZ +#define IO_H_C0WPQOQZ + +#include <fstream> +#include <mpi.h> +#include <vector> + +void initialize(std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + const double dx = 1.0 / (size * N); + const double x0 = rank * N * dx; + for (int i = 0; i < N; ++i) { + const double x = x0 + (i + 0.5) * dx; + if (x > 0.4 && x < 0.6) { + v[i + 1] = 1.0; + } else { + v[i + 1] = 0.0; + } + } +} + +void dump(const 
std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + std::vector<double> u(size * N); + MPI_Gather( + &v[1], N, MPI_DOUBLE, u.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (0 == rank) { + std::ofstream out("u.dat", std::ios::app); + out << std::scientific; + for (size_t i = 0; i < u.size(); ++i) { + out << ' ' << u[i]; + } + out << '\n'; + } +} + +#endif /* IO_H_C0WPQOQZ */ diff --git a/lecture13/mpi_1D_diffusion/Makefile b/lecture13/mpi_1D_diffusion/Makefile @@ -0,0 +1,22 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: blocking nonblocking + +run_blocking: blocking + rm -f u.dat + mpirun -n 4 ./blocking + +run_nonblocking: nonblocking + rm -f u.dat + mpirun -n 4 ./nonblocking + +blocking: blocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +nonblocking: nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f u.dat u.png blocking nonblocking diff --git a/lecture13/mpi_1D_diffusion/blocking.cpp b/lecture13/mpi_1D_diffusion/blocking.cpp @@ -0,0 +1,45 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + for (int step = 0; step < nsteps; ++step) { + // clang-format off + MPI_Sendrecv(&curr[1], 1, MPI_DOUBLE, left, 123, + &curr[N + 1], 1, MPI_DOUBLE, right, 123, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + 
MPI_Sendrecv(&curr[N], 1, MPI_DOUBLE, right, 123, + &curr[0], 1, MPI_DOUBLE, left, 123, + MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // clang-format on + + // stencil scheme + for (int i = 1; i < N + 1; ++i) { + next[i] = + curr[i] + Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + } + if (step % 200 == 0) { + dump(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_1D_diffusion/nonblocking.cpp b/lecture13/mpi_1D_diffusion/nonblocking.cpp @@ -0,0 +1,54 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + auto applyStencil = [&](int i) { + next[i] = curr[i] + Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + }; + for (int step = 0; step < nsteps; ++step) { + MPI_Request reqs[4]; + // clang-format off + MPI_Irecv(&curr[0], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[0]); + MPI_Irecv(&curr[N + 1], 1, MPI_DOUBLE, right, 321, MPI_COMM_WORLD, &reqs[1]); + MPI_Isend(&curr[1], 1, MPI_DOUBLE, left, 321, MPI_COMM_WORLD, &reqs[2]); + MPI_Isend(&curr[N], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[3]); + // clang-format on + + // compute internal domain + for (int i = 2; i < N; ++i) { + applyStencil(i); + } + + // wait for completion + MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE); + + // complete boundary + applyStencil(1); // left + applyStencil(N); // right + + if (step % 200 == 0) { + dump(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + 
return 0; +} diff --git a/lecture13/mpi_1D_diffusion/plot.py b/lecture13/mpi_1D_diffusion/plot.py @@ -0,0 +1,19 @@ +# File : plot.py +# Description: Plot temporal evolution of diffusing quantity +# Copyright 2022 Harvard University. All Rights Reserved. +import numpy as np +import matplotlib.pyplot as plt + +def main(): + data = np.loadtxt('u.dat') + N = data.shape[1] + dx = 1.0 / N + for step in range(data.shape[0]): + x = (np.arange(N) + 0.5) * dx + plt.plot(x, data[step, :]) + plt.xlabel(r'$x$') + plt.ylabel(r'$u(x,t)$') + plt.savefig('u.png', dpi=300, bbox_inches='tight') + +if __name__ == "__main__": + main() diff --git a/lecture13/mpi_cyclic_shift/.gitignore b/lecture13/mpi_cyclic_shift/.gitignore @@ -0,0 +1 @@ +mpi_cyclic_shift diff --git a/lecture13/mpi_cyclic_shift/Makefile b/lecture13/mpi_cyclic_shift/Makefile @@ -0,0 +1,14 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +all: mpi_cyclic_shift mpi_sendrecv + +mpi_cyclic_shift: mpi_cyclic_shift.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +mpi_sendrecv: mpi_sendrecv.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_cyclic_shift mpi_sendrecv diff --git a/lecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp b/lecture13/mpi_cyclic_shift/mpi_cyclic_shift.cpp @@ -0,0 +1,59 @@ +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + std::ostringstream out; + std::vector<double> send(100, rank), recv(100); + for (int cycle = 0; cycle < 3; ++cycle) { + out << "Cycle " << cycle << ": "; + if (0 == rank % 2) { // even ranks send first + MPI_Send(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + MPI_COMM_WORLD); + MPI_Recv(recv.data(), + 
recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + } else { + MPI_Recv(recv.data(), + recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + MPI_Send(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + MPI_COMM_WORLD); + } + out << "Rank " << rank << ": sent right 100x " << send[0] + << ", received left 100x " << recv[0] << '\n'; + send.swap(recv); + } + + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_cyclic_shift/mpi_sendrecv.cpp b/lecture13/mpi_cyclic_shift/mpi_sendrecv.cpp @@ -0,0 +1,42 @@ +#include <iostream> +#include <mpi.h> +#include <sstream> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + std::ostringstream out; + std::vector<double> send(100, rank), recv(100); + for (int cycle = 0; cycle < 3; ++cycle) { + out << "Cycle " << cycle << ": "; + MPI_Sendrecv(send.data(), + send.size(), + MPI_DOUBLE, + right, + 123, + recv.data(), + recv.size(), + MPI_DOUBLE, + left, + 123, + MPI_COMM_WORLD, + MPI_STATUS_IGNORE); + out << "Rank " << rank << ": sent right 100x " << send[0] + << ", received left 100x " << recv[0] << '\n'; + send.swap(recv); + } + + std::cout << out.str(); + + MPI_Finalize(); + return 0; +} diff --git a/lecture13/mpi_p2p_nonblocking/.gitignore b/lecture13/mpi_p2p_nonblocking/.gitignore @@ -0,0 +1 @@ +mpi_p2p_nonblocking diff --git a/lecture13/mpi_p2p_nonblocking/Makefile b/lecture13/mpi_p2p_nonblocking/Makefile @@ -0,0 +1,9 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +mpi_p2p_nonblocking: mpi_p2p_nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f mpi_p2p_nonblocking diff --git 
a/lecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp b/lecture13/mpi_p2p_nonblocking/mpi_p2p_nonblocking.cpp @@ -0,0 +1,25 @@ +#include <mpi.h> +#include <iostream> + +// exchange rank IDs between 2 processes +// DISCLAIMER: this code contains a bug +int main(int argc, char* argv[]) +{ + int rank, recv; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Request request; + if (0 == rank) { + MPI_Irecv(&recv, 1, MPI_INT, 1, 99, MPI_COMM_WORLD, &request); + MPI_Send(&rank, 1, MPI_INT, 1, 98, MPI_COMM_WORLD); + } else { + MPI_Irecv(&recv, 1, MPI_INT, 0, 98, MPI_COMM_WORLD, &request); + MPI_Send(&rank, 1, MPI_INT, 0, 99, MPI_COMM_WORLD); + } + // The following line is required to fix the bug: + // MPI_Wait(&request, MPI_STATUS_IGNORE); // synchronization + std::cout << "Rank " << rank << " got ID " << recv << " from other rank\n"; + + MPI_Finalize(); + return 0; +} diff --git a/lecture14/mpi_1D_diffusion_IO/.gitignore b/lecture14/mpi_1D_diffusion_IO/.gitignore @@ -0,0 +1,4 @@ +nonblocking +*.dat +*.bin +*.png diff --git a/lecture14/mpi_1D_diffusion_IO/IO.h b/lecture14/mpi_1D_diffusion_IO/IO.h @@ -0,0 +1,144 @@ +#ifndef IO_H_C0WPQOQZ +#define IO_H_C0WPQOQZ + +#include <fstream> +#include <mpi.h> +#include <sstream> +#include <string> +#include <vector> + +void initialize(std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; + const double dx = 1.0 / (size * N); + const double x0 = rank * N * dx; + for (int i = 0; i < N; ++i) { + const double x = x0 + (i + 0.5) * dx; + if (x > 0.4 && x < 0.6) { + v[i + 1] = 1.0; + } else { + v[i + 1] = 0.0; + } + } +} + +void dump_posix(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + const int N = static_cast<int>(v.size()) - 2; // ignore ghost cells + std::vector<double> u(size * N); + MPI_Gather( + &v[1], N, 
MPI_DOUBLE, u.data(), N, MPI_DOUBLE, 0, MPI_COMM_WORLD); + if (0 == rank) { // not parallel + std::ofstream out("u.dat", std::ios::app); + out << std::scientific; + for (size_t i = 0; i < u.size(); ++i) { + out << ' ' << u[i]; + } + out << '\n'; + } +} + +void dump_rank(const std::vector<double> &v) +{ + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + std::ofstream out("u_rank" + std::to_string(rank) + ".dat", std::ios::app); + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + out << '\n'; +} + +void dump_mpi_binary(const std::vector<double> &v) +{ + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_File fh; // MPI file handle + MPI_Offset bytes, file_offset; + + const int N = static_cast<int>(v.size()) - 2; // ignore ghost cells + bytes = N * sizeof(double); + file_offset = rank * bytes; + + MPI_File_open(MPI_COMM_WORLD, + "u_mpi.bin", + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &fh); + MPI_File_get_size(fh, &bytes); // append to file + file_offset += bytes; + MPI_File_write_at_all( + fh, file_offset, &v[1], N, MPI_DOUBLE, MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +void dump_mpi_ascii(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + std::ostringstream out; + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + if (rank == size - 1) { + out << '\n'; + } + const std::string ascii = out.str(); + + MPI_File fh; // MPI file handle + MPI_Offset bytes, file_offset = 0; + bytes = ascii.size() * sizeof(char); + MPI_Exscan(&bytes, &file_offset, 1, MPI_OFFSET, MPI_SUM, MPI_COMM_WORLD); + + MPI_File_open(MPI_COMM_WORLD, + "u_mpi.dat", + MPI_MODE_CREATE | MPI_MODE_WRONLY, + MPI_INFO_NULL, + &fh); + MPI_File_get_size(fh, &bytes); // append to file + file_offset += bytes; + MPI_File_write_at_all(fh, + file_offset, + ascii.data(), + 
ascii.size(), + MPI_CHAR, + MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +void dump_mpi_ascii_ordered(const std::vector<double> &v) +{ + int rank, size; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + std::ostringstream out; + out << std::scientific; + for (int i = 1; i < (int)v.size() - 1; ++i) { + out << ' ' << v[i]; // skip ghost cells + } + if (rank == size - 1) { + out << '\n'; + } + const std::string ascii = out.str(); + + MPI_File fh; // MPI file handle + MPI_File_open(MPI_COMM_WORLD, + "u_mpi_ordered.dat", + MPI_MODE_CREATE | MPI_MODE_WRONLY | MPI_MODE_APPEND, + MPI_INFO_NULL, + &fh); + MPI_File_write_ordered( + fh, ascii.data(), ascii.size(), MPI_CHAR, MPI_STATUS_IGNORE); + MPI_File_close(&fh); +} + +#endif /* IO_H_C0WPQOQZ */ diff --git a/lecture14/mpi_1D_diffusion_IO/Makefile b/lecture14/mpi_1D_diffusion_IO/Makefile @@ -0,0 +1,13 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +run_nonblocking: clean nonblocking + mpirun -n 4 ./nonblocking + python plot.py + +nonblocking: nonblocking.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f u*.dat u*.bin u.png IO.png nonblocking diff --git a/lecture14/mpi_1D_diffusion_IO/nonblocking.cpp b/lecture14/mpi_1D_diffusion_IO/nonblocking.cpp @@ -0,0 +1,58 @@ +#include "IO.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + int rank, size; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + // source and destination (periodic boundaries) + const int left = (rank + size - 1) % size; + const int right = (rank + size + 1) % size; + + const int nsteps = 1000; // number of time steps + const int N = 128; // number of cells in subdomain + const double Fo = 0.45; // Fourier number + std::vector<double> curr(N + 2); // +2 ghost cells + std::vector<double> next(N + 2); // +2 ghost cells + initialize(curr); + auto applyStencil = [&](int i) { + next[i] = curr[i] 
+ Fo * (curr[i - 1] - 2.0 * curr[i] + curr[i + 1]); + }; + for (int step = 0; step < nsteps; ++step) { + MPI_Request reqs[4]; + // clang-format off + MPI_Irecv(&curr[0], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[0]); + MPI_Irecv(&curr[N + 1], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[1]); + MPI_Isend(&curr[1], 1, MPI_DOUBLE, left, 123, MPI_COMM_WORLD, &reqs[2]); + MPI_Isend(&curr[N], 1, MPI_DOUBLE, right, 123, MPI_COMM_WORLD, &reqs[3]); + // clang-format on + + // compute internal domain + for (int i = 2; i < N; ++i) { + applyStencil(i); + } + + // wait for completion + MPI_Waitall(4, reqs, MPI_STATUSES_IGNORE); + + // complete boundary + applyStencil(1); // left + applyStencil(N); // right + + if (step % 200 == 0) { + dump_posix(curr); + dump_rank(curr); + dump_mpi_binary(curr); + dump_mpi_ascii(curr); + dump_mpi_ascii_ordered(curr); + } + curr.swap(next); + } + + MPI_Finalize(); + return 0; +} diff --git a/lecture14/mpi_1D_diffusion_IO/plot.py b/lecture14/mpi_1D_diffusion_IO/plot.py @@ -0,0 +1,52 @@ +# File : plot.py +# Description: Plot temporal evolution of diffusing quantity +# Copyright 2022 Harvard University. All Rights Reserved. 
+import numpy as np +import matplotlib.pyplot as plt + +def plot(ax, fname, title, dx, N, *, start=0.0, binary=False): + if binary: + data = np.fromfile(fname, dtype=float) + nsteps = data.size // N + data = data.reshape((nsteps, N)) + else: + data = np.loadtxt(fname) + N = data.shape[1] + for step in range(data.shape[0]): + x = start + (np.arange(N) + 0.5) * dx + ax.plot(x, data[step, :]) + ax.text(0.03, 0.9, title, fontsize=6) + + +def main(): + N = 4 * 128 + dx = 1.0 / N + + fig, ax = plt.subplots(4, 2, sharex=True) + plot(ax[0][0], 'u.dat', 'POSIX', dx, N) + plot(ax[1][0], 'u_mpi.bin', 'MPI (binary)', dx, N, binary=True) + plot(ax[2][0], 'u_mpi.dat', 'MPI (ascii)', dx, N) + plot(ax[3][0], 'u_mpi_ordered.dat', 'MPI (ascii, ordered)', dx, N) + plot(ax[0][1], 'u_rank0.dat', 'MPI (rank 0)', dx, N, start=0 * 128 * dx) + plot(ax[1][1], 'u_rank1.dat', 'MPI (rank 1)', dx, N, start=1 * 128 * dx) + plot(ax[2][1], 'u_rank2.dat', 'MPI (rank 2)', dx, N, start=2 * 128 * dx) + plot(ax[3][1], 'u_rank3.dat', 'MPI (rank 3)', dx, N, start=3 * 128 * dx) + for a in ax.flatten(): + a.set_xlim([0, 1]) + a.set_ylim([-0.05, 1.05]) + ax[3][0].set_xlabel(r'$x$') + ax[3][1].set_xlabel(r'$x$') + for i in range(4): + ax[i][0].set_ylabel(r'$u(x,t)$') + plt.savefig('IO.png', dpi=600, bbox_inches='tight') + plt.close() + + fig, ax = plt.subplots() + plot(ax, 'u.dat', '', dx, N) + plt.xlabel(r'$x$') + plt.ylabel(r'$u(x,t)$') + plt.savefig('u.png', dpi=600, bbox_inches='tight') + plt.close() + +if __name__ == "__main__": + main() diff --git a/lecture14/parallel_hdf5/.gitignore b/lecture14/parallel_hdf5/.gitignore @@ -0,0 +1 @@ +parallel_hdf5 diff --git a/lecture14/parallel_hdf5/Makefile b/lecture14/parallel_hdf5/Makefile @@ -0,0 +1,12 @@ +CXX = mpic++ +CXXFLAGS = -O1 -Wall -Wextra -Wpedantic -DOMPI_SKIP_MPICXX +.PHONY: clean + +parallel_hdf5: parallel_hdf5.cpp + $(CXX) $(CXXFLAGS) -o $@ $< -lhdf5 -lz -lm + +run: clean parallel_hdf5 + mpirun -n 8 ./parallel_hdf5 + +clean: + rm -f data.* 
parallel_hdf5 diff --git a/lecture14/parallel_hdf5/data.h5 b/lecture14/parallel_hdf5/data.h5 Binary files differ. diff --git a/lecture14/parallel_hdf5/data.xmf b/lecture14/parallel_hdf5/data.xmf @@ -0,0 +1,23 @@ +<?xml version="1.0" ?> +<!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []> +<Xdmf Version="2.0"> + <Domain> + <Grid GridType="Uniform"> + <Time Value="00000"/> + <Topology TopologyType="3DRECTMESH" Dimensions ="33 17 9"/> + <Geometry GeometryType ="ORIGIN_DXDYDZ"> + <DataItem Name ="Origin" Dimensions="3" NumberType="Float" Precision ="8" Format="XML"> + 0.000000e+00 0.000000e+00 0.000000e+00 + </DataItem> + <DataItem Name ="Spacing" Dimensions="3" NumberType="Float" Precision ="8" Format="XML"> + 3.125000e-02 3.125000e-02 3.125000e-02 + </DataItem> + </Geometry> + <Attribute Name="data" AttributeType="Scalar" Center="Cell"> + <DataItem Dimensions ="32 16 8" NumberType="Float" Precision="4" Format ="HDF"> + data.h5:/u + </DataItem> + </Attribute> + </Grid> + </Domain> +</Xdmf> diff --git a/lecture14/parallel_hdf5/hdf5IO.h b/lecture14/parallel_hdf5/hdf5IO.h @@ -0,0 +1,98 @@ +// File : hdf5IO.h +// Description: HDF5 parallel MPI dumper +// Copyright 2023 Harvard University. All Rights Reserved. 
+#ifndef HDF5IO_H_XUXZ7EB0 +#define HDF5IO_H_XUXZ7EB0 + +#include "xdmf_wrapper.h" +#include <hdf5.h> +#include <iostream> +#include <mpi.h> +#include <string> +#include <vector> + +void write_hdf5_mpi(const std::string fname, + const std::vector<float> &u, + const int Nx, + const int Ny, + const int Nz, + const MPI_Comm cart_comm) +{ + int rank, dims[3], periods[3], coords[3]; + MPI_Comm_rank(cart_comm, &rank); + MPI_Cart_get(cart_comm, 3, dims, periods, coords); + + // open HDF5 file + H5open(); + hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); // file access policy + H5Pset_fapl_mpio(plist_id, cart_comm, MPI_INFO_NULL); + hid_t file_id = H5Fcreate( + (fname + ".h5").c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); + H5Pclose(plist_id); + + // rank local data and offset and global file dimensions + hsize_t file_count[3] = {static_cast<unsigned int>(Nz), + static_cast<unsigned int>(Ny), + static_cast<unsigned int>(Nx)}; + hsize_t file_offset[3] = { + static_cast<unsigned int>(coords[2]) * Nz, + static_cast<unsigned int>(coords[1]) * Ny, + static_cast<unsigned int>(coords[0]) * Nx, + }; + hsize_t file_dims[3] = { + static_cast<unsigned int>(dims[2]) * Nz, + static_cast<unsigned int>(dims[1]) * Ny, + static_cast<unsigned int>(dims[0]) * Nx, + }; + if (0 == rank) { + std::cout << "HDF5 file size: " + << (file_dims[0] * file_dims[1] * file_dims[2] * + sizeof(float)) / + (1024.0 * 1024.0) + << " MB\n"; + } + + // dataspace for dataset + hid_t fspace_id = H5Screate_simple(3, file_dims, NULL); + hid_t mspace_id = H5Screate_simple(3, file_count, NULL); + + // chunked dataset + plist_id = H5Pcreate(H5P_DATASET_CREATE); + H5Pset_chunk(plist_id, 3, file_count); + hid_t dataset_id = H5Dcreate(file_id, + "u", + H5T_NATIVE_FLOAT, + fspace_id, + H5P_DEFAULT, + plist_id, + H5P_DEFAULT); + H5Pclose(plist_id); + H5Sclose(fspace_id); + + // define the hyperslab + fspace_id = H5Dget_space(dataset_id); + H5Sselect_hyperslab( + fspace_id, H5S_SELECT_SET, file_offset, NULL, file_count, 
NULL); + + // create property list for collective write + plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE); + + // write the data + H5Dwrite( + dataset_id, H5T_NATIVE_FLOAT, mspace_id, fspace_id, plist_id, u.data()); + + H5Dclose(dataset_id); + H5Sclose(fspace_id); + H5Sclose(mspace_id); + H5Pclose(plist_id); + H5Fclose(file_id); + H5close(); + + // root rank writes XDMF wrapper + if (0 == rank) { + xdmf_wrapper(fname, file_dims); + } +} + +#endif /* HDF5IO_H_XUXZ7EB0 */ diff --git a/lecture14/parallel_hdf5/hdf5_in_python.py b/lecture14/parallel_hdf5/hdf5_in_python.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 +# File : hdf5_in_python.py +# Description: Example to load a HDF5 data file in Python +# Copyright 2023 Harvard University. All Rights Reserved. +import h5py as h5 + + +def load_hdf5(fname, group): + with h5.File(fname, 'r') as data: + return data[group][()] + + +if __name__ == '__main__': + data = load_hdf5( + 'data.h5', group='u' + ) # you could have many groups in a HDF5 file! + print(data.shape) # print dimension of data + print(data[0, 0, :]) # prints the first 8 data entries in field `u` diff --git a/lecture14/parallel_hdf5/parallel_hdf5.cpp b/lecture14/parallel_hdf5/parallel_hdf5.cpp @@ -0,0 +1,44 @@ +// File : parallel_hdf5.cpp +// Description: Example parallel I/O using HDF5 and MPI +// To compile this code on the academic cluster load the following: +// +// module load gcc/10.2.0-fasrc01 +// openmpi/4.1.1-fasrc01 +// hdf5/1.12.1-fasrc01 +// +// Copyright 2023 Harvard University. All Rights Reserved. 
+#include "hdf5IO.h" +#include "xdmf_wrapper.h" +#include <mpi.h> +#include <vector> + +int main(int argc, char *argv[]) +{ + MPI_Init(&argc, &argv); + + int size; + MPI_Comm_size(MPI_COMM_WORLD, &size); + if (8 != size) { // need p x p x p = 8 ranks for this demo code + MPI_Abort(MPI_COMM_WORLD, MPI_ERR_TOPOLOGY); + } + + // create Cartesian communicator + const int dims[] = {2, 2, 2}; + const int periods[] = {false, false, false}; + MPI_Comm cart_comm; + MPI_Cart_create(MPI_COMM_WORLD, 3, dims, periods, false, &cart_comm); + + // subdomain size and data u + int rank; + MPI_Comm_rank(cart_comm, &rank); + constexpr int Nx = 4; + constexpr int Ny = 8; + constexpr int Nz = 16; + std::vector<float> u(Nx * Ny * Nz, static_cast<float>(rank)); + + write_hdf5_mpi("data", u, Nx, Ny, Nz, cart_comm); + + MPI_Comm_free(&cart_comm); + MPI_Finalize(); + return 0; +} diff --git a/lecture14/parallel_hdf5/submit.sh b/lecture14/parallel_hdf5/submit.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# File : submit.sh +# Description: Example HDF5 application job submission script using 8 MPI ranks +# Copyright 2023 Harvard University. All Rights Reserved. +#SBATCH --job-name=parallel_hdf5 +#SBATCH --output=parallel_hdf5_%j.out +#SBATCH --error=parallel_hdf5_%j.err +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=1 +#SBATCH --time=00:15:00 + +module purge +module load \ + gcc/10.2.0-fasrc01 \ + openmpi/4.1.1-fasrc01 \ + hdf5/1.12.1-fasrc01 + +srun -n1 -c1 make clean parallel_hdf5 + +srun ./parallel_hdf5 + +exit $? diff --git a/lecture14/parallel_hdf5/xdmf_wrapper.h b/lecture14/parallel_hdf5/xdmf_wrapper.h @@ -0,0 +1,46 @@ +// File : xdmf_wrapper.h +// Description: XDMF wrapper for HDF5 file +// https://www.xdmf.org/index.php/XDMF_Model_and_Format +// Copyright 2023 Harvard University. All Rights Reserved. 
+#ifndef XDMF_WRAPPER_H_9PHDQVEM +#define XDMF_WRAPPER_H_9PHDQVEM + +#include <cstdio> +#include <string> + +void xdmf_wrapper(const std::string &fname, + const unsigned long long file_dims[]) +{ + const double dx = + 1.0 / std::max(file_dims[0], std::max(file_dims[1], file_dims[2])); + FILE *xmf = 0; + xmf = fopen((fname + ".xmf").c_str(), "w"); + // clang-format off + fprintf(xmf, "<?xml version=\"1.0\" ?>\n"); + fprintf(xmf, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []>\n"); + fprintf(xmf, "<Xdmf Version=\"2.0\">\n"); + fprintf(xmf, " <Domain>\n"); + fprintf(xmf, " <Grid GridType=\"Uniform\">\n"); + fprintf(xmf, " <Time Value=\"00000\"/>\n"); + fprintf(xmf, " <Topology TopologyType=\"3DRECTMESH\" Dimensions =\"%llu %llu %llu\"/>\n", file_dims[0] + 1, file_dims[1] + 1, file_dims[2] + 1); + fprintf(xmf, " <Geometry GeometryType =\"ORIGIN_DXDYDZ\">\n"); + fprintf(xmf, " <DataItem Name =\"Origin\" Dimensions=\"3\" NumberType=\"Float\" Precision =\"8\" Format=\"XML\">\n"); + fprintf(xmf, " 0.000000e+00 0.000000e+00 0.000000e+00\n"); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " <DataItem Name =\"Spacing\" Dimensions=\"3\" NumberType=\"Float\" Precision =\"8\" Format=\"XML\">\n"); + fprintf(xmf, " %e %e %e\n", dx, dx, dx); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " </Geometry>\n"); + fprintf(xmf, " <Attribute Name=\"data\" AttributeType=\"Scalar\" Center=\"Cell\">\n"); + fprintf(xmf, " <DataItem Dimensions =\"%llu %llu %llu\" NumberType=\"Float\" Precision=\"4\" Format =\"HDF\">\n", file_dims[0], file_dims[1], file_dims[2]); + fprintf(xmf, " %s:/u\n", (fname + ".h5").c_str()); + fprintf(xmf, " </DataItem>\n"); + fprintf(xmf, " </Attribute>\n"); + fprintf(xmf, " </Grid>\n"); + fprintf(xmf, " </Domain>\n"); + fprintf(xmf, "</Xdmf>\n"); + // clang-format on + fclose(xmf); +} + +#endif /* XDMF_WRAPPER_H_9PHDQVEM */ diff --git a/lecture15/papi_examples_HW2/asm/.gitignore b/lecture15/papi_examples_HW2/asm/.gitignore @@ -0,0 +1 @@ +test diff --git 
a/lecture15/papi_examples_HW2/asm/Makefile b/lecture15/papi_examples_HW2/asm/Makefile @@ -0,0 +1,12 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic +LIBS = -lpapi + +.PHONY: clean + +test: test.cpp kernel.s + $(CXX) -O2 -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib \ + $(CXXFLAGS) -o $@ $^ $(LIBS) + +clean: + rm -f test diff --git a/lecture15/papi_examples_HW2/asm/kernel.s b/lecture15/papi_examples_HW2/asm/kernel.s @@ -0,0 +1,19 @@ + .intel_syntax noprefix + .text + .globl _Z4testPc + .type _Z4testPc, @function +_Z4testPc: + # Stack operations that end up in L1,L2,L3 caches. The stack size on Linux + # is usually around 8MB (see `ulimit -s`) so most of the stack operations + # will have around 4 cycle latency (L1 ideally). + # PAPI counts these in PAPI_LST_INS counter also (NOT WHAT WE WANT!) + mov QWORD PTR [rsp-8], rdi + mov rax, QWORD PTR [rsp-8] + # data cache miss: read from RAM (about 200 cycles) + # PAPI counts this in PAPI_LST_INS counter (WHAT WE WANT) + mov cl, BYTE PTR [rdi] + # write to pointer A[0] = 1 [this will cause the printed result to be 1] + # (about 200 cycles) + # PAPI counts this in PAPI_LST_INS counter (WHAT WE WANT) + mov BYTE PTR [rdi], 0x1 + ret diff --git a/lecture15/papi_examples_HW2/asm/test.cpp b/lecture15/papi_examples_HW2/asm/test.cpp @@ -0,0 +1,61 @@ +#include "papi.h" +#include <iostream> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +void test(char *A); + +int main() +{ + char A[100] = {0}; + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_LST_INS, PAPI_L1_DCM, PAPI_L2_DCM, PAPI_L3_DCA}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // 
start PAPI measurement + PAPI_start(event_set); + + // run code to be measured + test(A); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_lst = counters[0]; // total load/store + const long long total_l1m = counters[1]; // total L1 load misses + const long long total_l2m = counters[2]; // total L2 load misses + const long long total_l3m = counters[3]; // total L3 load misses + // clang-format on + + int sum = 0; + for (int i = 0; i < 100; ++i) { + sum += A[i]; + } + + std::cout << "Result: " << sum << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1m << '\n'; + std::cout << "Total L2 data misses: " << total_l2m << '\n'; + std::cout << "Total L3 data accesses: " << total_l3m << '\n'; + std::cout << "Total load/store: " << total_lst << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/daxpy/.gitignore b/lecture15/papi_examples_HW2/daxpy/.gitignore @@ -0,0 +1,2 @@ +daxpy_O* +*.swp diff --git a/lecture15/papi_examples_HW2/daxpy/Makefile b/lecture15/papi_examples_HW2/daxpy/Makefile @@ -0,0 +1,25 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +.PHONY: clean + +all: daxpy_O0 daxpy_O1 daxpy_O2 daxpy_O3_autovec daxpy_O3_noautovec + +daxpy_O0: daxpy.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O1: daxpy.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O2: daxpy.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O3_autovec: daxpy.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +daxpy_O3_noautovec: daxpy.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f daxpy_O* diff --git a/lecture15/papi_examples_HW2/daxpy/daxpy.cpp b/lecture15/papi_examples_HW2/daxpy/daxpy.cpp @@ -0,0 
+1,106 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef double Real; + +void daxpy(const Real a, const Real *x, Real *y, const int n) +{ + // 2n flops + // 3n memory accesses + for (int i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; + } +} + +int main(int argc, char *argv[]) +{ + int n = 1000000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> x(n, 1.0); + std::vector<Real> y(n, 0.001); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, + PAPI_TOT_INS, + PAPI_LST_INS, + PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + daxpy(1.0, x.data(), y.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); + + // run code to be measured + daxpy(1.0, x.data(), y.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 
2 * n; + const size_t mem_ops = 3 * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 1.0e-9; // Gflop/s + const double sum = std::accumulate(y.begin(), y.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " << 2 * n * sizeof(Real) / 1024 + << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/daxpy/disassemble.sh b/lecture15/papi_examples_HW2/daxpy/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='daxpy' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/dgemm/.gitignore b/lecture15/papi_examples_HW2/dgemm/.gitignore @@ -0,0 +1,2 @@ +dgemm_O* +*.swp diff --git a/lecture15/papi_examples_HW2/dgemm/Makefile b/lecture15/papi_examples_HW2/dgemm/Makefile @@ -0,0 +1,29 @@ +CXX = g++ 
+CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +# CXXFLAGS += -DCBLAS +# LIBS += -lopenblas +# CXXFLAGS += -DEIGEN -march=broadwell + +.PHONY: clean + +all: dgemm_O0 dgemm_O1 dgemm_O2 dgemm_O3_autovec dgemm_O3_noautovec + +dgemm_O0: dgemm.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O1: dgemm.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O2: dgemm.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O3_autovec: dgemm.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +dgemm_O3_noautovec: dgemm.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f dgemm_O* diff --git a/lecture15/papi_examples_HW2/dgemm/dgemm.cpp b/lecture15/papi_examples_HW2/dgemm/dgemm.cpp @@ -0,0 +1,139 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +#ifdef CBLAS +#include <cblas.h> +#elif defined(EIGEN) +#include <eigen3/Eigen/Dense> +// Wrapper to map pointer to Eigen matrix type (needed in benchmark) +template <typename T> +using EMat = Eigen::Map< + Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>; +#endif /* CBLAS */ + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef double Real; + +void dgemm(const Real *A, const Real *B, Real *__restrict__ C, const size_t n) +{ +#ifdef CBLAS + cblas_dgemm(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + n, + n, + n, + 1.0, + A, + n, + B, + n, + 1.0, + C, + n); +#elif defined(EIGEN) + // Matrix wrappers + const EMat<Real> Ap(const_cast<Real *>(A), n, n); + const EMat<Real> Bp(const_cast<Real *>(B), n, n); + EMat<Real> Cp(C, n, n); + Cp.noalias() += Ap * Bp; +#else + for (size_t i = 0; i < n; ++i) { + for (size_t k = 
0; k < n; ++k) { + for (size_t j = 0; j < n; ++j) { + C[i * n + j] += A[i * n + k] * B[k * n + j]; + } + } + } +#endif /* CBLAS */ +} + +int main(int argc, char *argv[]) +{ + int n = 1000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> A(n * n, 0.1); + std::vector<Real> B(n * n, 0.2); + std::vector<Real> C(n * n, 0.3); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + dgemm(A.data(), B.data(), C.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); + + // run code to be measured + dgemm(A.data(), B.data(), C.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 2 * n * n * n + n * n; + const size_t mem_ops = 4 * n * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 
1.0e-9; // Gflop/s + const double sum = std::accumulate(C.begin(), C.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " + << 3 * n * n * sizeof(Real) / 1024 << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture15/papi_examples_HW2/dgemm/disassemble.sh b/lecture15/papi_examples_HW2/dgemm/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='dgemm' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/sgemv/.gitignore b/lecture15/papi_examples_HW2/sgemv/.gitignore @@ -0,0 +1,2 @@ +sgemv_O* +*.swp diff --git a/lecture15/papi_examples_HW2/sgemv/Makefile b/lecture15/papi_examples_HW2/sgemv/Makefile @@ -0,0 +1,25 @@ +CXX = g++ +CXXFLAGS = -g -Wall -Wextra -Wpedantic -I$(SHARED_DATA)/local/include -L$(SHARED_DATA)/local/lib +LIBS = -lpapi + +.PHONY: clean + +all: sgemv_O0 sgemv_O1 sgemv_O2 sgemv_O3_autovec sgemv_O3_noautovec + +sgemv_O0: sgemv.cpp + $(CXX) -O0 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O1: sgemv.cpp + $(CXX) -O1 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O2: sgemv.cpp + $(CXX) -O2 $(CXXFLAGS) -o $@ $< $(LIBS) + 
+sgemv_O3_autovec: sgemv.cpp + $(CXX) -O3 $(CXXFLAGS) -o $@ $< $(LIBS) + +sgemv_O3_noautovec: sgemv.cpp + $(CXX) -O3 -fno-tree-vectorize $(CXXFLAGS) -o $@ $< $(LIBS) + +clean: + rm -f sgemv_O* diff --git a/lecture15/papi_examples_HW2/sgemv/disassemble.sh b/lecture15/papi_examples_HW2/sgemv/disassemble.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +exec='sgemv' +if [[ $# -ne 1 ]]; then + cat <<EOF + USAGE: $0 <$exec executable> +EOF +fi + +gdb $1 -batch -ex 'set disassembly-flavor intel' -ex "disassemble $exec" diff --git a/lecture15/papi_examples_HW2/sgemv/sgemv.cpp b/lecture15/papi_examples_HW2/sgemv/sgemv.cpp @@ -0,0 +1,104 @@ +#include "papi.h" +#include <cstdlib> +#include <iostream> +#include <numeric> +#include <vector> + +// Broadwell CPU on cluster, you can get one with +// salloc -N1 -c32 -t 01:00:00 +// +// Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 40960K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 40960 + +typedef float Real; + +void sgemv(const Real *A, const Real *x, Real *__restrict__ y, const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} + +int main(int argc, char *argv[]) +{ + int n = 1000; + if (argc > 1) { + n = atoi(argv[1]); + } + std::vector<Real> A(n * n, 0.1); + std::vector<Real> x(n, 1.0); + std::vector<Real> y(n, 0.001); + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + sgemv(A.data(), x.data(), y.data(), n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); 
+ + // run code to be measured + sgemv(A.data(), x.data(), y.data(), n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any + const long long total_load_stores = counters[2]; // number of such instructions + const long long total_l1d_misses = counters[3]; // number of access request to cache line + // clang-format on + + const size_t flops = 2 * n * n + n; + const size_t mem_ops = 2 * n * n + 2 * n; + const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds + const double IPC = static_cast<double>(total_instructions) / total_cycles; + const double OI = + static_cast<double>(flops) / (total_load_stores * sizeof(Real)); + const double OI_theory = + static_cast<double>(flops) / (mem_ops * sizeof(Real)); + const double float_perf = flops / twall * 1.0e-9; // Gflop/s + const double sum = std::accumulate(y.begin(), y.end(), 0.0); + + std::cout << "Result: " << sum << '\n'; + std::cout << "Total cycles: " << total_cycles << '\n'; + std::cout << "Total instructions: " << total_instructions << '\n'; + std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; + std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; + std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; + std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; + std::cout << "Total problem size: " + << (n * n + 2 * n) * sizeof(Real) / 1024 << " KB\n"; + std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; + std::cout << "Total load/store: " << total_load_stores + << " (expected: " << mem_ops << ")\n"; + std::cout << "Operational intensity: " << std::scientific << OI + << " (expected: " << OI_theory << ")\n"; + std::cout << "Performance 
[Gflop/s]: " << float_perf << '\n'; + std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; + + return 0; +} diff --git a/lecture16/allocator_test/.gitignore b/lecture16/allocator_test/.gitignore @@ -0,0 +1 @@ +allocator_test diff --git a/lecture16/allocator_test/Makefile b/lecture16/allocator_test/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g -Wall -Wextra -Wpedantic +.PHONY: clean + +allocator_test: allocator_test.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f allocator_test diff --git a/lecture16/allocator_test/allocator_test.cpp b/lecture16/allocator_test/allocator_test.cpp @@ -0,0 +1,55 @@ +#include <cstdio> +#include <cstdlib> + +#define _ALIGN_ 64 + +struct BadObject_t { + char buf[57]; // 57 byte buffer +}; + +struct NiceObject_t { + char buf[57]; // 57 byte buffer + char pad[7]; // 7 byte padding +}; + +void testAddress(void *base, + const size_t align_at, + const char *msg, + const size_t size) +{ + printf("%s [%lu]: misaligned by %lu bytes\n", + msg, + size, + reinterpret_cast<size_t>(base) % align_at); +} + +template <typename T> +void testAlignment() +{ + constexpr size_t size = sizeof(T); + + // C++ way + T *p = new T[2]; + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + delete[] p; + + // C way + p = reinterpret_cast<T *>(malloc(2 * size)); + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + free(p); + + // POSIX memalign way + posix_memalign((void **)&p, _ALIGN_, 2 * size); + testAddress(&p[0], _ALIGN_, "p[0]", size); + testAddress(&p[1], _ALIGN_, "p[1]", size); + free(p); +} + +int main() +{ + testAlignment<BadObject_t>(); + testAlignment<NiceObject_t>(); + return 0; +} diff --git a/lecture16/pointer_dereference/.gitignore b/lecture16/pointer_dereference/.gitignore @@ -0,0 +1 @@ +pointer_dereference diff --git a/lecture16/pointer_dereference/Makefile b/lecture16/pointer_dereference/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O1 -g 
-Wall -Wextra -Wpedantic +.PHONY: clean + +pointer_dereference: pointer_dereference.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f pointer_dereference diff --git a/lecture16/pointer_dereference/pointer_dereference.cpp b/lecture16/pointer_dereference/pointer_dereference.cpp @@ -0,0 +1,15 @@ +#include <iostream> + +int main() +{ + int *p = new int; + *p = 1; + + std::cout << "Pointer address: " << &p << '\n'; + std::cout << "Address pointer points to: " << p << '\n'; + std::cout << "Hex value stored at address: " << std::hex << "0x0" << *p + << '\n'; + + delete p; + return 0; +} diff --git a/lecture17/registers/.gitignore b/lecture17/registers/.gitignore @@ -0,0 +1,2 @@ +*.o +registers diff --git a/lecture17/registers/Makefile b/lecture17/registers/Makefile @@ -0,0 +1,12 @@ +.PHONY: clean + +# linker +registers: registers.o + ld -o $@ $< + +# assembly +registers.o: registers.s + as -o $@ $< + +clean: + rm -f registers registers.o diff --git a/lecture17/registers/registers.s b/lecture17/registers/registers.s @@ -0,0 +1,11 @@ +.text +.intel_syntax noprefix +.global _start +_start: + mov rax, 0x1 # initialize rax with value 1 + mov rbx, 0x2 # initialize rbx with value 2 + add rax, rbx # add them together and store result in rax + # this exits the program without libc and only works on 64-bit Linux: + mov rax, 60 + xor rdi, rdi + syscall diff --git a/lecture18_19/aliasing/.gitignore b/lecture18_19/aliasing/.gitignore @@ -0,0 +1 @@ +align diff --git a/lecture18_19/aliasing/Makefile b/lecture18_19/aliasing/Makefile @@ -0,0 +1,12 @@ +.PHONY: clean + +all: main aliasing.s + +aliasing.s: aliasing.c + gcc -O3 -S -masm=intel -o $@ $< + +main: main.c aliasing.c + gcc -O3 -o $@ $^ + +clean: + rm -f aliasing.s main diff --git a/lecture18_19/aliasing/aliasing.c b/lecture18_19/aliasing/aliasing.c @@ -0,0 +1,13 @@ +int f(int *a, int *b) +{ + *a = 1; + *b = 2; + return *a; +} + +int g(int *__restrict__ a, int *__restrict__ b) +{ + *a = 1; + *b = 2; + return *a; +} diff --git 
a/lecture18_19/aliasing/main.c b/lecture18_19/aliasing/main.c @@ -0,0 +1,15 @@ +#include <stdio.h> + +int f(int *a, int *b); +int g(int *a, int *b); + +int main(void) +{ + int a = 0; + int *b = &a; // legal alias + printf("%d\n", f(&a, b)); // expect 2 + + a = 0; + printf("%d\n", g(&a, b)); // expect 2(?) + return 0; +} diff --git a/lecture18_19/align/.gitignore b/lecture18_19/align/.gitignore @@ -0,0 +1 @@ +align diff --git a/lecture18_19/align/Makefile b/lecture18_19/align/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +align: align.c + gcc -o $@ $< + +clean: + rm -f align diff --git a/lecture18_19/align/align.c b/lecture18_19/align/align.c @@ -0,0 +1,30 @@ +#include <stdio.h> +#include <stdlib.h> + +int main(void) +{ + printf("Byte alignment of double is: %zu\n", _Alignof(double)); + + printf("\nSTACK ALLOCATION:\n"); + _Alignas(8) double d; + // _Alignas(32) double d; + // _Alignas(64) double d; + printf("Address of d % 2 = %zu\n", ((size_t)&d) % 2); + printf("Address of d % 4 = %zu\n", ((size_t)&d) % 4); + printf("Address of d % 8 = %zu\n", ((size_t)&d) % 8); + printf("Address of d % 16 = %zu\n", ((size_t)&d) % 16); + printf("Address of d % 32 = %zu\n", ((size_t)&d) % 32); + printf("Address of d % 64 = %zu\n", ((size_t)&d) % 64); + + printf("\nHEAP ALLOCATION:\n"); + double *pd; + posix_memalign((void **)&pd, 8, sizeof(double)); // segfault if alignment <8 + printf("Address of pd % 2 = %zu\n", ((size_t)pd) % 2); + printf("Address of pd % 4 = %zu\n", ((size_t)pd) % 4); + printf("Address of pd % 8 = %zu\n", ((size_t)pd) % 8); + printf("Address of pd % 16 = %zu\n", ((size_t)pd) % 16); + printf("Address of pd % 32 = %zu\n", ((size_t)pd) % 32); + printf("Address of pd % 64 = %zu\n", ((size_t)pd) % 64); + free(pd); + return 0; +} diff --git a/lecture18_19/autovec/Makefile b/lecture18_19/autovec/Makefile @@ -0,0 +1,24 @@ +.PHONY: clean + +all: sse4.s avx2.s avx2_fma.s sse4_omp.s avx2_omp.s avx2_fma_omp.s + +sse4.s: f.c + gcc -O3 -msse4 -S -masm=intel -o $@ $< + +avx2.s: 
f.c + gcc -O3 -mavx2 -S -masm=intel -o $@ $< + +avx2_fma.s: f.c + gcc -O3 -mavx2 -mfma -S -masm=intel -o $@ $< + +sse4_omp.s: f.c + gcc -O3 -msse4 -fopenmp -S -masm=intel -o $@ $< + +avx2_omp.s: f.c + gcc -O3 -mavx2 -fopenmp -S -masm=intel -o $@ $< + +avx2_fma_omp.s: f.c + gcc -O3 -mavx2 -mfma -fopenmp -S -masm=intel -o $@ $< + +clean: + rm -f sse4*.s avx2*.s avx2_fma*.s diff --git a/lecture18_19/autovec/f.c b/lecture18_19/autovec/f.c @@ -0,0 +1,20 @@ +// #define RESTRICT +// #define IVDEP + +#ifdef RESTRICT +void f(float *__restrict__ dst, float *__restrict__ src, float a) +#else +void f(float *dst, float *src, float a) +#endif /* RESTRICT */ +{ +#ifdef IVDEP +#pragma GCC ivdep +#else +#ifdef _OPENMP +#pragma omp simd +#endif /* _OPENMP */ +#endif /* IVDEP */ + for (int i = 0; i < 1024; ++i) { + dst[i] = dst[i] + a * src[i]; + } +} diff --git a/lecture18_19/intrinsics/_mm_set_ps1/.gitignore b/lecture18_19/intrinsics/_mm_set_ps1/.gitignore @@ -0,0 +1,3 @@ +*.s +*.o +main diff --git a/lecture18_19/intrinsics/_mm_set_ps1/Makefile b/lecture18_19/intrinsics/_mm_set_ps1/Makefile @@ -0,0 +1,15 @@ +.PHONY: clean + +all: sse4.s ones.o main + +sse4.s: ones.c + gcc -O3 -msse4 -S -masm=intel -o $@ $< + +main: main.c ones.o + gcc -g -O0 -o $@ $^ + +ones.o: ones.c + gcc -g -O3 -msse4 -c -o $@ $< + +clean: + rm -f sse4*.s ones.o main diff --git a/lecture18_19/intrinsics/_mm_set_ps1/main.c b/lecture18_19/intrinsics/_mm_set_ps1/main.c @@ -0,0 +1,9 @@ +#include <x86intrin.h> + +__m128 ones(); + +int main(void) +{ + __m128 ret = ones(); + return 0; +} diff --git a/lecture18_19/intrinsics/_mm_set_ps1/ones.c b/lecture18_19/intrinsics/_mm_set_ps1/ones.c @@ -0,0 +1,6 @@ +#include <x86intrin.h> +__m128 ones(void) +{ + // broadcast value 1.0f (float) in all 4 SIMD lanes + return _mm_set_ps1(1.0f); +} diff --git a/lecture18_19/intrinsics/constants/set1_ps.c b/lecture18_19/intrinsics/constants/set1_ps.c @@ -0,0 +1,3 @@ +#include <x86intrin.h> +__m128 set1_ps() { return _mm_set1_ps(1.0f); } 
// compile-time constant +__m128 set1_ps(float x) { return _mm_set1_ps(x); } // set by argument diff --git a/lecture18_19/intrinsics/constants/set_ps.c b/lecture18_19/intrinsics/constants/set_ps.c @@ -0,0 +1,5 @@ +#include <x86intrin.h> +__m128 set_ps(float x0, float x1, float x2, float x3) +{ + return _mm_set_ps(x3, x2, x1, x0); +} diff --git a/lecture18_19/intrinsics/constants/set_ss.c b/lecture18_19/intrinsics/constants/set_ss.c @@ -0,0 +1,3 @@ +#include <x86intrin.h> +__m128 set_ss() { return _mm_set_ss(1.0f); } // compile-time constant +__m128 set_ss(float x) { return _mm_set_ss(x); } // set by argument diff --git a/lecture18_19/intrinsics/constants/setr_ps.c b/lecture18_19/intrinsics/constants/setr_ps.c @@ -0,0 +1,5 @@ +#include <x86intrin.h> +__m128 setr_ps(float x0, float x1, float x2, float x3) +{ + return _mm_setr_ps(x3, x2, x1, x0); +} diff --git a/lecture18_19/intrinsics/constants/setzero.c b/lecture18_19/intrinsics/constants/setzero.c @@ -0,0 +1,2 @@ +#include <x86intrin.h> +__m128 setzero() { return _mm_setzero_ps(); } diff --git a/lecture18_19/intrinsics/load_store/aligned_load_store.c b/lecture18_19/intrinsics/load_store/aligned_load_store.c @@ -0,0 +1,7 @@ +#include <x86intrin.h> +void f(float *a, float *b) +{ + __m128 r0 = _mm_load_ps(a); + r0 = _mm_add_ps(r0, _mm_load_ps(a + 4)); + _mm_store_ps(b, r0); +} diff --git a/lecture18_19/intrinsics/load_store/reverse_load_store.c b/lecture18_19/intrinsics/load_store/reverse_load_store.c @@ -0,0 +1,2 @@ +#include <x86intrin.h> +void f(float *a, float *b) { _mm_storeu_ps(b, _mm_loadr_ps(a)); } diff --git a/lecture18_19/intrinsics/load_store/unaligned_load_store.c b/lecture18_19/intrinsics/load_store/unaligned_load_store.c @@ -0,0 +1,7 @@ +#include <x86intrin.h> +void f(float *a, float *b) +{ + __m128 r0 = _mm_loadu_ps(a); + r0 = _mm_add_ps(r0, _mm_loadu_ps(a + 4)); + _mm_storeu_ps(b, r0); +} diff --git a/lecture18_19/intrinsics/transpose/transpose.c b/lecture18_19/intrinsics/transpose/transpose.c @@ -0,0 
+1,15 @@ +#include <x86intrin.h> +void transpose4(float A[4][4]) +{ + __m128 a = _mm_load_ps(&A[0][0]); + __m128 b = _mm_load_ps(&A[1][0]); + __m128 c = _mm_load_ps(&A[2][0]); + __m128 d = _mm_load_ps(&A[3][0]); + + _MM_TRANSPOSE4_PS(a, b, c, d); + + _mm_store_ps(&A[0][0], a); + _mm_store_ps(&A[1][0], b); + _mm_store_ps(&A[2][0], c); + _mm_store_ps(&A[3][0], d); +} diff --git a/lecture18_19/saxpy/.gitignore b/lecture18_19/saxpy/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture18_19/saxpy/Makefile b/lecture18_19/saxpy/Makefile @@ -0,0 +1,17 @@ +CXX = gcc +.PHONY: clean + +main: main.c saxpy.o saxpy_SSE.o saxpy_SSE_FMA.o + $(CXX) -O1 -fopenmp -o $@ $^ + +saxpy.o: saxpy.c + $(CXX) -O1 -c -o $@ $< + +saxpy_SSE.o: saxpy_SSE.c + $(CXX) -O3 -msse4 -c -o $@ $< + +saxpy_SSE_FMA.o: saxpy_SSE_FMA.c + $(CXX) -O3 -msse4 -mfma -c -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture18_19/saxpy/main.c b/lecture18_19/saxpy/main.c @@ -0,0 +1,66 @@ +#include <omp.h> +#include <stdio.h> +#include <x86intrin.h> + +#define N (1 << 11) // both vectors fit in cache +// #define N (1 << 26) // vectors too large to fit in cache -> bandwidth bound + +// compute kernels +void saxpy(float *x, float *y, float a, size_t n); +void saxpy_SSE(float *x, float *y, float a, size_t n); +void saxpy_SSE_FMA(float *x, float *y, float a, size_t n); + +void initialize(float *x, size_t n) +{ + for (size_t i = 0; i < n; ++i) { + x[i] = 1.0f; + } +} + +int main(void) +{ + const int n_sample = 50; + + // allocate aligned vectors: + float *x = (float *)_mm_malloc(N * sizeof(float), 16); + float *y = (float *)_mm_malloc(N * sizeof(float), 16); + + // initialize data: + initialize(x, N); + initialize(y, N); + + // scalar: + saxpy(x, y, 2.0, N); // warm-up cache + double t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy(x, y, 2.0, N); + } + double t1 = omp_get_wtime(); + const double t_gold = (t1 - t0) / n_sample; + printf("Scalar saxpy: %e sec\n", t_gold); + + // SIMD (SSE, 128-bit): 
+ saxpy_SSE(x, y, 2.0, N); // warm-up cache + t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy_SSE(x, y, 2.0, N); + } + t1 = omp_get_wtime(); + const double t_sse = (t1 - t0) / n_sample; + printf("SSE saxpy: %e sec (%.2fx speedup)\n", t_sse, t_gold / t_sse); + + // SIMD w/ FMA (SSE, 128-bit): + saxpy_SSE_FMA(x, y, 2.0, N); // warm-up cache + t0 = omp_get_wtime(); + for (int i = 0; i < n_sample; ++i) { + saxpy_SSE_FMA(x, y, 2.0, N); + } + t1 = omp_get_wtime(); + const double t_fma = (t1 - t0) / n_sample; + printf("SSE FMA saxpy: %e sec (%.2fx speedup)\n", t_fma, t_gold / t_fma); + + // clean up: + _mm_free(x); + _mm_free(y); + return 0; +} diff --git a/lecture18_19/saxpy/saxpy.c b/lecture18_19/saxpy/saxpy.c @@ -0,0 +1,7 @@ +#include <stddef.h> +void saxpy(float *x, float *y, float a, size_t n) +{ + for (size_t i = 0; i < n; ++i) { + y[i] = a * x[i] + y[i]; + } +} diff --git a/lecture18_19/saxpy/saxpy.s b/lecture18_19/saxpy/saxpy.s @@ -0,0 +1,26 @@ + .file "saxpy.c" + .intel_syntax noprefix + .text + .globl saxpy + .type saxpy, @function +saxpy: +.LFB0: + .cfi_startproc + test rdx, rdx + je .L1 + mov eax, 0 +.L3: + movaps xmm1, xmm0 + mulss xmm1, DWORD PTR [rdi+rax*4] + addss xmm1, DWORD PTR [rsi+rax*4] + movss DWORD PTR [rsi+rax*4], xmm1 + add rax, 1 + cmp rdx, rax + jne .L3 +.L1: + ret + .cfi_endproc +.LFE0: + .size saxpy, .-saxpy + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture18_19/saxpy/saxpy_SSE.c b/lecture18_19/saxpy/saxpy_SSE.c @@ -0,0 +1,12 @@ +#include <x86intrin.h> + +void saxpy_SSE(float *x, float *y, float a, size_t n) +{ + const __m128 a4 = _mm_set1_ps(a); + // assumes n % 4 == 0 + for (size_t i = 0; i < n; i += 4) { + __m128 r0 = _mm_load_ps(x + i); + __m128 r1 = _mm_load_ps(y + i); + _mm_store_ps(y + i, _mm_add_ps(_mm_mul_ps(r0, a4), r1)); + } +} diff --git a/lecture18_19/saxpy/saxpy_SSE.s b/lecture18_19/saxpy/saxpy_SSE.s @@ -0,0 +1,30 @@ + .file "saxpy_SSE.c" + .intel_syntax noprefix + 
.text + .p2align 4 + .globl saxpy_SSE + .type saxpy_SSE, @function +saxpy_SSE: +.LFB5667: + .cfi_startproc + shufps xmm0, xmm0, 0 + test rdx, rdx + je .L1 + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L3: + movaps xmm1, XMMWORD PTR [rdi+rax*4] + mulps xmm1, xmm0 + addps xmm1, XMMWORD PTR [rsi+rax*4] + movaps XMMWORD PTR [rsi+rax*4], xmm1 + add rax, 4 + cmp rdx, rax + ja .L3 +.L1: + ret + .cfi_endproc +.LFE5667: + .size saxpy_SSE, .-saxpy_SSE + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture18_19/saxpy/saxpy_SSE_FMA.c b/lecture18_19/saxpy/saxpy_SSE_FMA.c @@ -0,0 +1,11 @@ +#include <x86intrin.h> + +void saxpy_SSE_FMA(float *x, float *y, float a, size_t n) +{ + const __m128 a4 = _mm_set1_ps(a); + // assumes n % 4 == 0 + for (size_t i = 0; i < n; i += 4) { + _mm_store_ps(y + i, + _mm_fmadd_ps(a4, _mm_load_ps(x + i), _mm_load_ps(y + i))); + } +} diff --git a/lecture18_19/sgemv/.gitignore b/lecture18_19/sgemv/.gitignore @@ -0,0 +1 @@ +*.swp diff --git a/lecture18_19/sgemv/Makefile b/lecture18_19/sgemv/Makefile @@ -0,0 +1,9 @@ +CXX = g++ +CXXFLAGS = -O2 -S -masm=intel +.PHONY: clean + +sgemv.s: sgemv.cpp + $(CXX) $(CXXFLAGS) -o $@ $< + +clean: + rm -f sgemv.s diff --git a/lecture18_19/sgemv/sgemv.cpp b/lecture18_19/sgemv/sgemv.cpp @@ -0,0 +1,22 @@ +#include <cstdlib> + +void sgemv_noaliasing(const float *A, + const float *x, + float *__restrict__ y, + const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} + +void sgemv_aliasing(const float *A, const float *x, float *y, const size_t n) +{ + for (size_t j = 0; j < n; ++j) { + for (size_t i = 0; i < n; ++i) { + y[j] += A[j * n + i] * x[i]; + } + } +} diff --git a/lecture18_19/sgemv/sgemv.s b/lecture18_19/sgemv/sgemv.s @@ -0,0 +1,73 @@ + .file "sgemv.cpp" + .intel_syntax noprefix + .text + .p2align 4 + .globl _Z16sgemv_noaliasingPKfS0_Pfm + .type _Z16sgemv_noaliasingPKfS0_Pfm, @function 
+_Z16sgemv_noaliasingPKfS0_Pfm: +.LFB20: + .cfi_startproc + test rcx, rcx + je .L1 + lea r8, 0[0+rcx*4] + lea r9, [rdx+r8] + .p2align 4,,10 + .p2align 3 +.L3: + movss xmm1, DWORD PTR [rdx] + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L4: + movss xmm0, DWORD PTR [rdi+rax*4] + mulss xmm0, DWORD PTR [rsi+rax*4] + add rax, 1 + addss xmm1, xmm0 + cmp rcx, rax + jne .L4 + movss DWORD PTR [rdx], xmm1 + add rdx, 4 + add rdi, r8 + cmp r9, rdx + jne .L3 +.L1: + ret + .cfi_endproc +.LFE20: + .size _Z16sgemv_noaliasingPKfS0_Pfm, .-_Z16sgemv_noaliasingPKfS0_Pfm + .p2align 4 + .globl _Z14sgemv_aliasingPKfS0_Pfm + .type _Z14sgemv_aliasingPKfS0_Pfm, @function +_Z14sgemv_aliasingPKfS0_Pfm: +.LFB21: + .cfi_startproc + test rcx, rcx + je .L10 + lea r8, 0[0+rcx*4] + lea r9, [rdx+r8] + .p2align 4,,10 + .p2align 3 +.L12: + movss xmm1, DWORD PTR [rdx] + xor eax, eax + .p2align 4,,10 + .p2align 3 +.L13: + movss xmm0, DWORD PTR [rdi+rax*4] + mulss xmm0, DWORD PTR [rsi+rax*4] + add rax, 1 + addss xmm1, xmm0 + movss DWORD PTR [rdx], xmm1 + cmp rcx, rax + jne .L13 + add rdx, 4 + add rdi, r8 + cmp r9, rdx + jne .L12 +.L10: + ret + .cfi_endproc +.LFE21: + .size _Z14sgemv_aliasingPKfS0_Pfm, .-_Z14sgemv_aliasingPKfS0_Pfm + .ident "GCC: (GNU) 11.2.0" + .section .note.GNU-stack,"",@progbits diff --git a/lecture20/ispc_data_layout/.gitignore b/lecture20/ispc_data_layout/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_data_layout/Makefile b/lecture20/ispc_data_layout/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c kernels.o + $(CXX) $(CXXFLAGS) -g -O0 -o $@ $^ + +kernels.o: kernels.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_data_layout/kernels.ispc b/lecture20/ispc_data_layout/kernels.ispc @@ -0,0 +1,40 @@ +export void aos() +{ + struct Foo { + float x, y, z; + }; + uniform Foo a[4] = { + {0, 1, 2}, + {3, 4, 5}, + {6, 7, 8}, + {9, 10, 11}, + }; + int index = programIndex; // varying int + float v = a[index].x; +} + +export void hybrid_soa() +{ + struct Foo4 { + float x[4], y[4], z[4]; + }; + uniform Foo4 a[2] = { + {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, + {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}, + }; + int index = programIndex; // varying int + float v = a[index / 4].x[index & 3]; +} + +export void short_soa() +{ + struct Foo { + float x, y, z; + }; + soa<4> struct Foo a[2] = { + {{0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}}, + {{12, 13, 14, 15}, {16, 17, 18, 19}, {20, 21, 22, 23}}, + }; + int index = programIndex; // varying int + float v = a[index].x; +} diff --git a/lecture20/ispc_data_layout/main.c b/lecture20/ispc_data_layout/main.c @@ -0,0 +1,11 @@ +int aos(); +int hybrid_soa(); +int short_soa(); + +int main(void) +{ + aos(); + hybrid_soa(); + short_soa(); + return 0; +} diff --git a/lecture20/ispc_foreach/.gitignore b/lecture20/ispc_foreach/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_foreach/Makefile b/lecture20/ispc_foreach/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c kernels.o + $(CXX) $(CXXFLAGS) -g -O0 -o $@ $^ + +kernels.o: kernels.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_foreach/kernels.ispc b/lecture20/ispc_foreach/kernels.ispc @@ -0,0 +1,17 @@ +export uniform int f(const uniform int a[], const uniform int n) +{ + varying int sum4 = 0.0; + for (uniform int i = 0; i < n; i += programCount) { + sum4 += a[i + programIndex]; + } + return reduce_add(sum4); +} + +export uniform int g(const uniform int a[], const uniform int n) +{ + varying int sum4 = 0.0; + foreach (i = 0 ... n) { + sum4 += a[i]; + } + return reduce_add(sum4); +} diff --git a/lecture20/ispc_foreach/main.c b/lecture20/ispc_foreach/main.c @@ -0,0 +1,10 @@ +int f(); +int g(); + +int main(void) +{ + int ary[] = {1, 2, 3, 4}; + int a = f(ary, 4); + int b = g(ary, 4); + return a + b; +} diff --git a/lecture20/ispc_pointers_arrays/.gitignore b/lecture20/ispc_pointers_arrays/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/ispc_pointers_arrays/Makefile b/lecture20/ispc_pointers_arrays/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX = gcc +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 --pic +.PHONY: clean + +main: main.c f.o + $(CXX) $(CXXFLAGS) -g -O1 -o $@ $^ + +f.o: f.ispc + $(ISPC) -g -O0 -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/ispc_pointers_arrays/f.ispc b/lecture20/ispc_pointers_arrays/f.ispc @@ -0,0 +1,35 @@ +export uniform int f() +{ + // uniform array shared among gang + uniform int a[] = {0, 1, 2, 3}; + + // varying array per programIndex + varying int b[] = {0, 1, 2, 3, 4}; + + if (programIndex == 0) { + a[programIndex] = -1; + b[programIndex] = -1; + } + if (programIndex == 1) { + // shared among gang + a[programIndex] = -2; + + // can only access data for this program instance + b[programIndex - 1] = -2; + b[programIndex + 0] = -3; + b[programIndex + 1] = -4; + } + + // you can also define pointers to arrays + uniform int *uniform uupa = a; // OK + uniform int *varying uvpa = a; // OK (address of array is broadcast) + // varying int *uniform vupa = a; // NOT OK! + // varying int *varying vvpa = a; // NOT OK! + + // uniform int *uniform uupb = b; // NOT OK + // uniform int *varying uvpb = b; // NOT OK + varying int *uniform vupb = b; // OK + varying int *varying vvpb = b; // OK + + return reduce_add(a[programIndex]) + reduce_add(b[programIndex]); +} diff --git a/lecture20/ispc_pointers_arrays/main.c b/lecture20/ispc_pointers_arrays/main.c @@ -0,0 +1,8 @@ +#include <stdio.h> +int f(); + +int main(void) +{ + printf("f() = %d\n", f()); + return 0; +} diff --git a/lecture20/maximal_convergence/.gitignore b/lecture20/maximal_convergence/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture20/maximal_convergence/Makefile b/lecture20/maximal_convergence/Makefile @@ -0,0 +1,17 @@ +# File : Makefile +# Description: Compile targets +# Copyright 2022 Harvard University. All Rights Reserved. 
+CXX ?= g++ +ISPC = ispc +CXXFLAGS = -Wall -Wextra -Wpedantic -std=c++11 +ISPCFLAGS = --arch=x86-64 --target=sse4-i32x4 +.PHONY: clean + +main: main.cpp f.o + $(CXX) $(CXXFLAGS) -g -O1 -o $@ $^ + +f.o: f.ispc + $(ISPC) -g -O0 $(extra) -DNDEBUG $(ISPCFLAGS) -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture20/maximal_convergence/f.ispc b/lecture20/maximal_convergence/f.ispc @@ -0,0 +1,14 @@ +export uniform int f() +{ + varying int a = {1, -1, 1, 1}; // vector (could omit `varying` as well) + varying int b = {-1, -1, -1, -1}; // vector + + if (a < 0) { + a = 0; + } else { + a += b; + } + + uniform int x = 1; // scalar storage + return x + reduce_add(a); // expect: 1 +} diff --git a/lecture20/maximal_convergence/main.cpp b/lecture20/maximal_convergence/main.cpp @@ -0,0 +1,8 @@ +#include <iostream> +extern "C" int f(void); + +int main(void) +{ + std::cout << f() << std::endl; + return 0; +} diff --git a/lecture20/sse_vector/.gitignore b/lecture20/sse_vector/.gitignore @@ -0,0 +1 @@ +sse_vector diff --git a/lecture20/sse_vector/Makefile b/lecture20/sse_vector/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +sse_vector: sse_vector.c + gcc -g -O0 -msse -o $@ $^ + +clean: + rm -f sse_vector diff --git a/lecture20/sse_vector/sse_vector.c b/lecture20/sse_vector/sse_vector.c @@ -0,0 +1,8 @@ +#include <x86intrin.h> + +int main() +{ + __m128 a = _mm_set_ps(4.0, 3.0, 2.0, 1.0); + + return 0; +} diff --git a/lecture21/cuda_vec_add/.gitignore b/lecture21/cuda_vec_add/.gitignore @@ -0,0 +1,2 @@ +main +*.o diff --git a/lecture21/cuda_vec_add/Makefile b/lecture21/cuda_vec_add/Makefile @@ -0,0 +1,7 @@ +.PHONY: clean + +main: add_kernel_grid.cu + nvcc -O2 -o $@ $< + +clean: + rm -f main *.o diff --git a/lecture21/cuda_vec_add/add_kernel_grid.cu b/lecture21/cuda_vec_add/add_kernel_grid.cu @@ -0,0 +1,51 @@ +// File : add_kernel_grid.cu +// Description: Simple vector add kernel (launch grid: 4096 blocks, @256) +// Copyright 2022 Harvard University. All Rights Reserved. 
+#include <cmath> +#include <iostream> + +// Kernel function to add the elements of two arrays +__global__ void add(int n, float *x, float *y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = tid; i < n; i += stride) + y[i] = x[i] + y[i]; +} + +int main(void) +{ + int N = 1 << 20; + float *x, *y; + + // Allocate Unified Memory – accessible from CPU or GPU + cudaMallocManaged(&x, N * sizeof(float)); + cudaMallocManaged(&y, N * sizeof(float)); + + // initialize x and y arrays on the host + for (int i = 0; i < N; i++) { + x[i] = 1.0f; + y[i] = 2.0f; + } + + // Run kernel on 1M elements on the GPU + int block_size = 256; + int n_blocks = (N + block_size - 1) / block_size; + add<<<n_blocks, block_size>>>(N, x, y); + + // Wait for GPU to finish before accessing on host. CUDA kernel calls do + // not block the calling CPU thread. + cudaDeviceSynchronize(); + + // Check for errors (all values should be 3.0f) + float maxError = 0.0f; + for (int i = 0; i < N; i++) + maxError = std::fmax(maxError, std::fabs(y[i] - 3.0f)); + std::cout << "Max error: " << maxError << std::endl; + + // Free memory + cudaFree(x); + cudaFree(y); + + return 0; +}