Merge pull request #7 from markdryan/benchmarking

Add benchmark code and fix various issues
intel · Mar 1, 2022 · 61cdceb · 61cdceb
2 parents ab388ad + 673a807
commit 61cdceb
Show file tree

Hide file tree

Showing 329 changed files with 7,395 additions and 487 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -26,6 +26,10 @@ jobs:
       - uses: actions/checkout@v2
 
       # Runs a set of commands using the runners shell
+
+      - name: Install dependencies  # refresh the apt index first so the install does not fail on a stale package list
+        run: sudo apt-get update && sudo apt-get install -y libbenchmark-dev elfutils
+
       - name: build code
         run: ./verify.sh
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required (VERSION 3.16.3)
 project(optimization C CXX ASM)
 
+find_package(benchmark QUIET)
+
 if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
   enable_language(ASM_MASM)
 endif()

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -1,3 +1,5 @@
 # Intel
 - [email protected]
-- [email protected]
+- [email protected]
+- [email protected]
+- [email protected]
diff --git a/README.md b/README.md
@@ -17,8 +17,9 @@ To run the unit tests
 5. make && make test
 
 GCC 8.1 or higher is required to build the unit tests.  The unit tests are
-compiled with --march=haswell and so a Haswell CPU or later is required to run
-them.  Tests that execute instructions not present on Haswell will be
+compiled with -march=haswell and so a fourth-generation Intel® Core™ (Haswell)
+CPU or later is required to run them.  Tests that execute instructions not present
+on fourth-generation Intel® Core™ (Haswell) will be
 skipped if the CPU on which they are run does not support those instructions.
 
 The code samples can also be compiled with clang:
@@ -44,18 +45,25 @@ Dependency- Visual Studio 2019
 5. To Build- build "ALL_BUILD" project
 6. To Run tests- build "RUN_TESTS" project.
 
+## Building the Benchmarks
+
+Benchmark code is supplied for some of the code samples.  These benchmarks are
+built using [Google's Benchmark project](https://github.com/google/benchmark).
+If Benchmark is installed and discoverable by CMake, the benchmarks for the code
+samples will be built automatically when you run `make`.
+
 ## CPU Requirements
 
-The code samples assume that they are being run on a Haswell processor
+The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor
 or later and do not perform runtime checks for the instructions that
-they use that are present in Haswell, for example, FMA or AVX-2.
+they use that are present in fourth-generation Intel® Core™ (Haswell), for example, FMA or AVX2.
 Some of the code samples may then crash if they are run
 on a device that does not support these instructions.
 
-The code samples do however check for post Haswell instruction sets such as AVX-512 and VNNI
-before running.  Tests will skip if they detect that the post Haswell instructions
+The code samples do, however, check for instruction sets introduced after fourth-generation Intel® Core™ (Haswell), such as AVX-512 and VNNI,
+before running.  Tests will skip if they detect that the newer instructions
 they need are not present.   Some of the newest examples use new instructions only found
-in SkylakeX or later processors.  If you have an older CPU
+in seventh-generation Intel® Core™ (SkylakeX) or later processors.  If you have an older CPU
 in your PC you may find that everything builds on your system
 but that some of the tests are skipped or crash (if you don't have AVX2) when run. In this case,
 to fully run the tests, you need to run them under the SDE.

diff --git a/chap15/ex1/CMakeLists.txt b/chap15/ex1/CMakeLists.txt
@@ -1,3 +1,10 @@
-add_executable(avx_ex1_tests ex1_test.cpp transform_sse.c transform_avx.c)
+set(avx_ex1_srcs transform_sse.c transform_avx.c)
+add_executable(avx_ex1_tests ex1_test.cpp ${avx_ex1_srcs})
 target_link_libraries(avx_ex1_tests gtest_main)
+
+if(benchmark_FOUND)  # benchmark target is optional: built only when find_package(benchmark) succeeded
+  add_executable(avx_ex1_bench ex1_bench.cpp ${avx_ex1_srcs})
+  target_link_libraries(avx_ex1_bench PRIVATE benchmark::benchmark)
+endif()
+
 add_test(NAME avx_ex1_test COMMAND avx_ex1_tests)
diff --git a/chap15/ex1/ex1_bench.cpp b/chap15/ex1/ex1_bench.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (C) 2021 by Intel Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <benchmark/benchmark.h>
+#include <xmmintrin.h>
+
+#include "transform_avx.h"
+#include "transform_sse.h"
+
+static void BM_transform_sse(benchmark::State &state) // Benchmark: repeatedly runs transform_sse() over len floats.
+{
+	int len = state.range(0); // number of floats, taken from the benchmark's ->Arg() value
+	// Allocate the input and output vectors dynamically with 16-byte
+	// alignment
+	float *pInVector = (float *)_mm_malloc(len * sizeof(float), 16);
+	float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 16);
+	// Fill the input with a constant; the output vector is not initialised here
+	for (int i = 0; i < len; i++)
+		pInVector[i] = 1;
+	float cos_teta = 0.8660254037; // cosine of 30 degrees
+	float sin_teta = 0.5; // sine of 30 degrees
+
+	for (auto _ : state) { // iterate as many times as the benchmark state requests
+		transform_sse(sin_teta, cos_teta, pInVector, pOutVector, len);
+	}
+	state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * // iterations * len * sizeof(float):
+				int64_t(sizeof(pInVector[0]))); // only the input vector's bytes are counted
+
+	_mm_free(pInVector);
+	_mm_free(pOutVector);
+}
+
+static void BM_transform_avx(benchmark::State &state) // Benchmark: repeatedly runs transform_avx() over len floats.
+{
+	int len = state.range(0); // number of floats, taken from the benchmark's ->Arg() value
+	// Allocate the input and output vectors dynamically with 32-byte
+	// alignment
+	float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32);
+	float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32);
+	// Fill the input with a constant; the output vector is not initialised here
+	for (int i = 0; i < len; i++)
+		pInVector[i] = 1;
+	float cos_teta = 0.8660254037; // cosine of 30 degrees
+	float sin_teta = 0.5; // sine of 30 degrees
+
+	for (auto _ : state) { // iterate as many times as the benchmark state requests
+		transform_avx(sin_teta, cos_teta, pInVector, pOutVector, len);
+	}
+	state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * // iterations * len * sizeof(float):
+				int64_t(sizeof(pInVector[0]))); // only the input vector's bytes are counted
+
+	_mm_free(pInVector);
+	_mm_free(pOutVector);
+}
+
+BENCHMARK(BM_transform_sse)
+    ->Arg(1 << 6)
+    ->Arg(1 << 8)
+    ->Arg(1 << 10)
+    ->Arg(1 << 12)
+    ->Arg(1 << 14)
+    ->Arg(1 << 16)
+    ->Arg(1 << 18);
+BENCHMARK(BM_transform_avx)
+    ->Arg(1 << 6)
+    ->Arg(1 << 8)
+    ->Arg(1 << 10)
+    ->Arg(1 << 12)
+    ->Arg(1 << 14)
+    ->Arg(1 << 16)
+    ->Arg(1 << 18);
+BENCHMARK_MAIN();
diff --git a/chap15/ex1/ex1_test.cpp b/chap15/ex1/ex1_test.cpp
@@ -40,15 +40,14 @@ TEST(avx_1, transform_sse)
 	    true);
 
 	for (int i = 0; i < len; i += 2) {
-		if (i & 1) {
-			float cosx = pInVector[i + 1] * cos_teta;
-			float sinx = pInVector[i + 1] * sin_teta;
-			ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]);
-		} else {
-			float cosx = pInVector[i] * cos_teta;
-			float sinx = pInVector[i] * sin_teta;
-			ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]);
-		}
+		// Assert X'
+		float cosx = pInVector[i] * cos_teta;
+		float siny = pInVector[i + 1] * sin_teta;
+		ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]);
+		// Assert Y'
+		float sinx = pInVector[i] * sin_teta;
+		float cosy = pInVector[i + 1] * cos_teta;
+		ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]);
 	}
 
 	ASSERT_EQ(
@@ -91,15 +90,14 @@ TEST(avx_1, transform_avx)
 	    true);
 
 	for (int i = 0; i < len; i += 2) {
-		if (i & 1) {
-			float cosx = pInVector[i + 1] * cos_teta;
-			float sinx = pInVector[i + 1] * sin_teta;
-			ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]);
-		} else {
-			float cosx = pInVector[i] * cos_teta;
-			float sinx = pInVector[i] * sin_teta;
-			ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]);
-		}
+		// Assert X'
+		float cosx = pInVector[i] * cos_teta;
+		float siny = pInVector[i + 1] * sin_teta;
+		ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]);
+		// Assert Y'
+		float sinx = pInVector[i] * sin_teta;
+		float cosy = pInVector[i + 1] * cos_teta;
+		ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]);
 	}
 
 	ASSERT_EQ(

diff --git a/chap15/ex10/CMakeLists.txt b/chap15/ex10/CMakeLists.txt
@@ -1,9 +1,14 @@
-set(avx_ex10_srcs ex10_test.cpp saxpy32.c)
 if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
-set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.s)
+set(avx_ex10_ass saxpy32.s)
 elseif(MSVC)
-set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.asm)
+set(avx_ex10_ass saxpy32.asm)
 endif()
-add_executable(avx_ex10_tests ${avx_ex10_srcs})
+add_executable(avx_ex10_tests ex10_test.cpp saxpy32.c ${avx_ex10_ass})
 target_link_libraries(avx_ex10_tests gtest_main)
+
+if(benchmark_FOUND)  # benchmark target is optional: built only when find_package(benchmark) succeeded
+  add_executable(avx_ex10_bench ex10_bench.cpp ${avx_ex10_ass})
+  target_link_libraries(avx_ex10_bench PRIVATE benchmark::benchmark)
+endif()
+
 add_test(NAME avx_ex10_test COMMAND avx_ex10_tests)
diff --git a/chap15/ex10/ex10_bench.cpp b/chap15/ex10/ex10_bench.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (C) 2022 by Intel Corporation
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+ * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+ * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+ * PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <benchmark/benchmark.h>
+#include <xmmintrin.h>
+
+#include "saxpy32.h"
+
+static void init_sources(float *src, float *src2, int len) // Fills the first len elements of both source arrays.
+{
+	for (int i = 0; i < len; i++) {
+		src[i] = 2.0f * i; // src holds 0, 2, 4, ...
+		src2[i] = 3.0f * i; // src2 holds 0, 3, 6, ...
+	}
+}
+
+static void BM_saxpy_avx_aligned(benchmark::State &state) // Benchmark: saxpy32() with all three buffers 32-byte aligned.
+{
+	int len = state.range(0); // number of floats, taken from the benchmark's ->Arg() value
+	float *src = (float *)_mm_malloc(len * sizeof(float), 32);
+	float *src2 = (float *)_mm_malloc(len * sizeof(float), 32);
+	float *dest = (float *)_mm_malloc(len * sizeof(float), 32);
+
+	init_sources(src, src2, len);
+
+	for (auto _ : state) { // iterate as many times as the benchmark state requests
+		saxpy32(src, src2, len * sizeof(float), dest, 10.0); // third argument is the length in bytes, not elements
+	}
+
+	state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * // two floats counted per element per iteration
+				int64_t(sizeof(float) * 2)); // presumably the two source arrays - TODO confirm
+
+	_mm_free(dest);
+	_mm_free(src2);
+	_mm_free(src);
+}
+
+static void BM_saxpy_avx_misaligned1(benchmark::State &state) // Benchmark: saxpy32() with only the first source buffer misaligned.
+{
+	int len = state.range(0); // number of floats, taken from the benchmark's ->Arg() value
+	float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); // one spare float so src can be shifted
+	float *src = &src_mem[1]; // starts one float past the 32-byte aligned base
+	float *src2 = (float *)_mm_malloc(len * sizeof(float), 32);
+	float *dest = (float *)_mm_malloc(len * sizeof(float), 32);
+
+	init_sources(src, src2, len);
+
+	for (auto _ : state) { // iterate as many times as the benchmark state requests
+		saxpy32(src, src2, len * sizeof(float), dest, 10.0); // third argument is the length in bytes, not elements
+	}
+
+	state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * // two floats counted per element per iteration
+				int64_t(sizeof(float) * 2)); // presumably the two source arrays - TODO confirm
+
+	_mm_free(dest);
+	_mm_free(src2);
+	_mm_free(src_mem); // free the base pointer returned by _mm_malloc, not the shifted src
+}
+
+static void BM_saxpy_avx_misaligned3(benchmark::State &state) // Benchmark: saxpy32() with both sources and the destination misaligned.
+{
+	int len = state.range(0); // number of floats, taken from the benchmark's ->Arg() value
+	float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); // one spare float so src can be shifted
+	float *src = &src_mem[1]; // starts one float past the 32-byte aligned base
+	float *src2_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);
+	float *src2 = &src2_mem[1]; // starts one float past the 32-byte aligned base
+	float *dest_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32);
+	float *dest = &dest_mem[1]; // starts one float past the 32-byte aligned base
+
+	init_sources(src, src2, len);
+
+	for (auto _ : state) { // iterate as many times as the benchmark state requests
+		saxpy32(src, src2, len * sizeof(float), dest, 10.0); // third argument is the length in bytes, not elements
+	}
+
+	state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * // two floats counted per element per iteration
+				int64_t(sizeof(float) * 2)); // presumably the two source arrays - TODO confirm
+
+	_mm_free(dest_mem); // free the base pointers returned by _mm_malloc, not the shifted ones
+	_mm_free(src2_mem);
+	_mm_free(src_mem);
+}
+
+BENCHMARK(BM_saxpy_avx_aligned)
+    ->Arg(1 << 6)
+    ->Arg(1 << 8)
+    ->Arg(1 << 10)
+    ->Arg(1 << 12)
+    ->Arg(1 << 14)
+    ->Arg(1 << 16)
+    ->Arg(1 << 18);
+BENCHMARK(BM_saxpy_avx_misaligned1)
+    ->Arg(1 << 6)
+    ->Arg(1 << 8)
+    ->Arg(1 << 10)
+    ->Arg(1 << 12)
+    ->Arg(1 << 14)
+    ->Arg(1 << 16)
+    ->Arg(1 << 18);
+BENCHMARK(BM_saxpy_avx_misaligned3)
+    ->Arg(1 << 6)
+    ->Arg(1 << 8)
+    ->Arg(1 << 10)
+    ->Arg(1 << 12)
+    ->Arg(1 << 14)
+    ->Arg(1 << 16)
+    ->Arg(1 << 18);
+BENCHMARK_MAIN();
diff --git a/chap15/ex10/ex10_test.cpp b/chap15/ex10/ex10_test.cpp
@@ -28,7 +28,8 @@ static float src[MAX_SIZE] __attribute__((aligned(32)));
 static float dest[MAX_SIZE] __attribute__((aligned(32)));
 static float src2[MAX_SIZE] __attribute__((aligned(32)));
 #endif
-void init_sources()
+
+static void init_sources()
 {
 	for (int i = 0; i < MAX_SIZE; i++) {
 		src[i] = 2.0f * i;

diff --git a/chap15/ex10/saxpy32.s b/chap15/ex10/saxpy32.s
@@ -54,3 +54,7 @@ start_loop:
 	vzeroupper
 	pop rbx
 	ret
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/chap15/ex12/CMakeLists.txt b/chap15/ex12/CMakeLists.txt
@@ -1,9 +1,14 @@
-set(avx_ex12_srcs ex12_test.cpp saxpy32.c saxpy16.c)
 if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
-set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.s saxpy16.s)
+set(avx_ex12_ass saxpy32.s saxpy16.s)
 elseif(MSVC)
-set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.asm saxpy16.asm)
+set(avx_ex12_ass saxpy32.asm saxpy16.asm)
 endif()
-add_executable(avx_ex12_tests ${avx_ex12_srcs})
+add_executable(avx_ex12_tests ex12_test.cpp saxpy32.c saxpy16.c ${avx_ex12_ass})
 target_link_libraries(avx_ex12_tests gtest_main)
+
+if(benchmark_FOUND)  # benchmark target is optional: built only when find_package(benchmark) succeeded
+  add_executable(avx_ex12_bench ex12_bench.cpp ${avx_ex12_ass})
+  target_link_libraries(avx_ex12_bench PRIVATE benchmark::benchmark)
+endif()
+
 add_test(NAME avx_ex12_test COMMAND avx_ex12_tests)