diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 147398a..878f4a0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,6 +26,10 @@ jobs: - uses: actions/checkout@v2 # Runs a set of commands using the runners shell + + - name: Install dependencies + run: sudo apt-get update && sudo apt-get install -y libbenchmark-dev elfutils + - name: build code run: ./verify.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 1bc020c..772bec8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,8 @@ cmake_minimum_required (VERSION 3.16.3) project(optimization C CXX ASM) +find_package(benchmark QUIET) + if (CMAKE_CXX_COMPILER_ID MATCHES MSVC) enable_language(ASM_MASM) endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3978526..456100b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -1,3 +1,5 @@ # Intel - mark.d.ryan@intel.com -- Laxman.Sole@intel.com \ No newline at end of file +- Laxman.Sole@intel.com +- athenas.jimenez.gonzalez@intel.com +- joe.konno@intel.com diff --git a/README.md b/README.md index 3d90bb0..a4e5d47 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,9 @@ To run the unit tests 5. make && make test GCC 8.1 or higher is required to build the unit tests. The unit tests are -compiled with --march=haswell and so a Haswell CPU or later is required to run -them. Tests that execute instructions not present on Haswell will be +compiled with --march=haswell and so a fourth-generation Intel® Core™ (Haswell) +CPU or later is required to run them. Tests that execute instructions not present +on fourth-generation Intel® Core™ (Haswell) will be skipped if the CPU on which they are run does not support those instructions. The code samples can also be compiled with clang: @@ -44,18 +45,25 @@ Dependency- Visual Studio 2019 5. To Build- build "ALL_BUILD" project 6. To Run tests- build "RUN_TESTS" project. +## Building the Benchmarks + +Benchmark code is supplied for some of the code samples. 
These benchmarks are +built using [Google's Benchmark project](https://github.com/google/benchmark). +If Benchmark is installed and discoverable by CMake, the benchmarks for the code +samples will be automatically built when you run `make`. + ## CPU Requirements -The code samples assume that they are being run on a Haswell processor +The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor or later and do not perform runtime checks for the instructions that -they use that are present in Haswell, for example, FMA or AVX-2. +they use that are present in fourth-generation Intel® Core™ (Haswell), for example, FMA or AVX2. Some of the code samples may then crash if they are run on a device that does not support these instructions. -The code samples do however check for post Haswell instruction sets such as AVX-512 and VNNI -before running. Tests will skip if they detect that the post Haswell instructions +The code samples do, however, check for post fourth-generation Intel® Core™ (Haswell) instruction sets such as AVX-512 and VNNI +before running. Tests will skip if they detect that the post fourth-generation Intel® Core™ (Haswell) instructions they need are not present. Some of the newest examples use new instructions only found -in SkylakeX or later processors. If you have an older CPU +in seventh-generation Intel® Core™ (SkylakeX) or later processors. If you have an older CPU in your PC you may find that everything builds on your system but that some of the tests are skipped or crash (if you don't have AVX2) when run. In this case, to fully run the tests, you need to run them under the SDE. 
diff --git a/chap15/ex1/CMakeLists.txt b/chap15/ex1/CMakeLists.txt index 0492c0f..c1e5e58 100644 --- a/chap15/ex1/CMakeLists.txt +++ b/chap15/ex1/CMakeLists.txt @@ -1,3 +1,10 @@ -add_executable(avx_ex1_tests ex1_test.cpp transform_sse.c transform_avx.c) +set(avx_ex1_srcs transform_sse.c transform_avx.c) +add_executable(avx_ex1_tests ex1_test.cpp ${avx_ex1_srcs}) target_link_libraries(avx_ex1_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex1_bench ex1_bench.cpp ${avx_ex1_srcs}) + target_link_libraries(avx_ex1_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex1_test COMMAND avx_ex1_tests) diff --git a/chap15/ex1/ex1_bench.cpp b/chap15/ex1/ex1_bench.cpp new file mode 100644 index 0000000..f7f678f --- /dev/null +++ b/chap15/ex1/ex1_bench.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "transform_avx.h" +#include "transform_sse.h" + +static void BM_transform_sse(benchmark::State &state) +{ + int len = state.range(0); + // Dynamic memory allocation with 16byte + // alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 16); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 16); + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + for (auto _ : state) { + transform_sse(sin_teta, cos_teta, pInVector, pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +static void BM_transform_avx(benchmark::State &state) +{ + int len = state.range(0); + // Dynamic memory allocation with 32byte + // alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32); + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + for (auto _ : state) { + transform_avx(sin_teta, cos_teta, pInVector, pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +BENCHMARK(BM_transform_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_transform_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex1/ex1_test.cpp b/chap15/ex1/ex1_test.cpp index 57998c0..a97cece 100644 --- a/chap15/ex1/ex1_test.cpp +++ b/chap15/ex1/ex1_test.cpp @@ -40,15 +40,14 @@ TEST(avx_1, transform_sse) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i 
+ 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ( @@ -91,15 +90,14 @@ TEST(avx_1, transform_avx) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i + 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ( diff --git a/chap15/ex10/CMakeLists.txt b/chap15/ex10/CMakeLists.txt index f7b4ff8..094df68 100644 --- a/chap15/ex10/CMakeLists.txt +++ b/chap15/ex10/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex10_srcs ex10_test.cpp saxpy32.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.s) +set(avx_ex10_ass saxpy32.s) elseif(MSVC) -set(avx_ex10_srcs ${avx_ex10_srcs} saxpy32.asm) +set(avx_ex10_ass saxpy32.asm) endif() -add_executable(avx_ex10_tests ${avx_ex10_srcs}) +add_executable(avx_ex10_tests ex10_test.cpp saxpy32.c ${avx_ex10_ass}) target_link_libraries(avx_ex10_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex10_bench 
ex10_bench.cpp ${avx_ex10_ass}) + target_link_libraries(avx_ex10_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex10_test COMMAND avx_ex10_tests) diff --git a/chap15/ex10/ex10_bench.cpp b/chap15/ex10/ex10_bench.cpp new file mode 100644 index 0000000..0567415 --- /dev/null +++ b/chap15/ex10/ex10_bench.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "saxpy32.h" + +static void init_sources(float *src, float *src2, int len) +{ + for (int i = 0; i < len; i++) { + src[i] = 2.0f * i; + src2[i] = 3.0f * i; + } +} + +static void BM_saxpy_avx_aligned(benchmark::State &state) +{ + int len = state.range(0); + float *src = (float *)_mm_malloc(len * sizeof(float), 32); + float *src2 = (float *)_mm_malloc(len * sizeof(float), 32); + float *dest = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy32(src, src2, len * sizeof(float), dest, 10.0); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * 2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src); +} + +static void BM_saxpy_avx_misaligned1(benchmark::State &state) +{ + int len = state.range(0); + float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src = &src_mem[1]; + float *src2 = (float *)_mm_malloc(len * sizeof(float), 32); + float *dest = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy32(src, src2, len * sizeof(float), dest, 10.0); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * 2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src_mem); +} + +static void BM_saxpy_avx_misaligned3(benchmark::State &state) +{ + int len = state.range(0); + float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src = &src_mem[1]; + float *src2_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src2 = &src2_mem[1]; + float *dest_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *dest = &dest_mem[1]; + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy32(src, src2, len * sizeof(float), dest, 10.0); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * 2)); + + 
_mm_free(dest_mem); + _mm_free(src2_mem); + _mm_free(src_mem); +} + +BENCHMARK(BM_saxpy_avx_aligned) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy_avx_misaligned1) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy_avx_misaligned3) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex10/ex10_test.cpp b/chap15/ex10/ex10_test.cpp index a6fd37f..338b936 100644 --- a/chap15/ex10/ex10_test.cpp +++ b/chap15/ex10/ex10_test.cpp @@ -28,7 +28,8 @@ static float src[MAX_SIZE] __attribute__((aligned(32))); static float dest[MAX_SIZE] __attribute__((aligned(32))); static float src2[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() + +static void init_sources() { for (int i = 0; i < MAX_SIZE; i++) { src[i] = 2.0f * i; diff --git a/chap15/ex10/saxpy32.s b/chap15/ex10/saxpy32.s index f2b763c..4b9e2d7 100644 --- a/chap15/ex10/saxpy32.s +++ b/chap15/ex10/saxpy32.s @@ -54,3 +54,7 @@ start_loop: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex12/CMakeLists.txt b/chap15/ex12/CMakeLists.txt index c515e92..a147ad2 100644 --- a/chap15/ex12/CMakeLists.txt +++ b/chap15/ex12/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex12_srcs ex12_test.cpp saxpy32.c saxpy16.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.s saxpy16.s) +set(avx_ex12_ass saxpy32.s saxpy16.s) elseif(MSVC) -set(avx_ex12_srcs ${avx_ex12_srcs} saxpy32.asm saxpy16.asm) +set(avx_ex12_ass saxpy32.asm saxpy16.asm) endif() -add_executable(avx_ex12_tests ${avx_ex12_srcs}) +add_executable(avx_ex12_tests ex12_test.cpp saxpy32.c saxpy16.c 
${avx_ex12_ass}) target_link_libraries(avx_ex12_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex12_bench ex12_bench.cpp ${avx_ex12_ass}) + target_link_libraries(avx_ex12_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex12_test COMMAND avx_ex12_tests) diff --git a/chap15/ex12/ex12_bench.cpp b/chap15/ex12/ex12_bench.cpp new file mode 100644 index 0000000..85e586e --- /dev/null +++ b/chap15/ex12/ex12_bench.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "saxpy16.h" +#include "saxpy32.h" + +static void init_sources(float *src, float *src2, int len) +{ + for (int i = 0; i < len; i++) { + src[i] = 2.0f * i; + src2[i] = 3.0f * i; + } +} + +static void BM_saxpy32(benchmark::State &state) +{ + int len = state.range(0); + float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src = &src_mem[1]; + float *src2_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src2 = &src2_mem[1]; + float *dest = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy32(src, src2, len * sizeof(float), dest, 10.0); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * 2)); + + _mm_free(dest); + _mm_free(src2_mem); + _mm_free(src_mem); +} + +static void BM_saxpy16(benchmark::State &state) +{ + int len = state.range(0); + float *src_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src = &src_mem[1]; + float *src2_mem = (float *)_mm_malloc((len + 1) * sizeof(float), 32); + float *src2 = &src2_mem[1]; + float *dest = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy16(src, src2, len * sizeof(float), dest, 10.0); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * 2)); + + _mm_free(dest); + _mm_free(src2_mem); + _mm_free(src_mem); +} + +BENCHMARK(BM_saxpy32) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy16) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex12/ex12_test.cpp b/chap15/ex12/ex12_test.cpp index 4894074..7a94e44 100644 --- a/chap15/ex12/ex12_test.cpp +++ b/chap15/ex12/ex12_test.cpp @@ -30,7 +30,7 @@ static float 
dest[MAX_SIZE] __attribute__((aligned(16))); static float src2[MAX_SIZE] __attribute__((aligned(16))); #endif -void init_sources() +static void init_sources() { for (int i = 0; i < MAX_SIZE; i++) { src[i] = 2.0f * i; diff --git a/chap15/ex12/saxpy16.s b/chap15/ex12/saxpy16.s index 5516633..700a3b3 100644 --- a/chap15/ex12/saxpy16.s +++ b/chap15/ex12/saxpy16.s @@ -59,3 +59,7 @@ start_loop: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex12/saxpy32.s b/chap15/ex12/saxpy32.s index 5344113..673e3df 100644 --- a/chap15/ex12/saxpy32.s +++ b/chap15/ex12/saxpy32.s @@ -54,3 +54,7 @@ start_loop: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex14/CMakeLists.txt b/chap15/ex14/CMakeLists.txt index 6dc0667..fc2cb66 100644 --- a/chap15/ex14/CMakeLists.txt +++ b/chap15/ex14/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex14_srcs ex14_test.cpp cond_scalar.c cond_vmaskmov.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex14_srcs ${avx_ex14_srcs} cond_scalar.s cond_vmaskmov.s) +set(avx_ex14_ass cond_scalar.s cond_vmaskmov.s) elseif(MSVC) -set(avx_ex14_srcs ${avx_ex14_srcs} cond_scalar.asm cond_vmaskmov.asm) +set(avx_ex14_ass cond_scalar.asm cond_vmaskmov.asm) endif() -add_executable(avx_ex14_tests ${avx_ex14_srcs}) - +add_executable(avx_ex14_tests ex14_test.cpp cond_scalar.c cond_vmaskmov.c ${avx_ex14_ass}) target_link_libraries(avx_ex14_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex14_bench ex14_bench.cpp ${avx_ex14_ass}) + target_link_libraries(avx_ex14_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex14_test COMMAND avx_ex14_tests) diff --git a/chap15/ex14/cond_scalar.s b/chap15/ex14/cond_scalar.s index 089f2ea..74f2e14 100644 --- a/chap15/ex14/cond_scalar.s +++ b/chap15/ex14/cond_scalar.s @@ 
-61,3 +61,7 @@ mul: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex14/cond_vmaskmov.s b/chap15/ex14/cond_vmaskmov.s index 0f948ce..aa72158 100644 --- a/chap15/ex14/cond_vmaskmov.s +++ b/chap15/ex14/cond_vmaskmov.s @@ -60,3 +60,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex14/ex14_bench.cpp b/chap15/ex14/ex14_bench.cpp new file mode 100644 index 0000000..4966496 --- /dev/null +++ b/chap15/ex14/ex14_bench.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "cond_scalar.h" +#include "cond_vmaskmov.h" + +static void init_sources(float *a, float *c, float *d, float *e, int len) +{ + for (int i = 0; i < len; i++) { + a[i] = (float)(i & 1); + e[i] = (float)i; + c[i] = (float)i * 2; + d[i] = (float)i * 3; + } +} + +static void BM_cond_scalar(benchmark::State &state) +{ + int len = state.range(0); + float *a = (float *)_mm_malloc(len * sizeof(float), 32); + float *b = (float *)_mm_malloc(len * sizeof(float), 32); + float *c = (float *)_mm_malloc(len * sizeof(float), 32); + float *d = (float *)_mm_malloc(len * sizeof(float), 32); + float *e = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(a, c, d, e, len); + + for (auto _ : state) { + cond_scalar(a, b, d, c, e, len); + } + _mm_free(a); + _mm_free(b); + _mm_free(c); + _mm_free(d); + _mm_free(e); +} + +static void BM_cond_vmaskmov(benchmark::State &state) +{ + int len = state.range(0); + float *a = (float *)_mm_malloc(len * sizeof(float), 32); + float *b = (float *)_mm_malloc(len * sizeof(float), 32); + float *c = (float *)_mm_malloc(len * sizeof(float), 32); + float *d = (float *)_mm_malloc(len * sizeof(float), 32); + float *e = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(a, c, d, e, len); + + for (auto _ : state) { + cond_vmaskmov(a, b, d, c, e, len); + } + _mm_free(a); + _mm_free(b); + _mm_free(c); + _mm_free(d); + _mm_free(e); +} + +BENCHMARK(BM_cond_scalar) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_cond_vmaskmov) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex14/ex14_test.cpp b/chap15/ex14/ex14_test.cpp index 635871f..22ebc22 100644 --- a/chap15/ex14/ex14_test.cpp +++ b/chap15/ex14/ex14_test.cpp @@ -34,7 +34,7 @@ static float d[MAX_SIZE] __attribute__((aligned(32))); static float e[MAX_SIZE] 
__attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { a[i] = (float)(i & 1); diff --git a/chap15/ex16/CMakeLists.txt b/chap15/ex16/CMakeLists.txt index 87bfea5..d0e2310 100644 --- a/chap15/ex16/CMakeLists.txt +++ b/chap15/ex16/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex16_srcs ex16_test.cpp three_tap_sse.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex16_srcs ${avx_ex16_srcs} three_tap_sse.s) +set(avx_ex16_ass three_tap_sse.s) elseif(MSVC) -set(avx_ex16_srcs ${avx_ex16_srcs} three_tap_sse.asm) +set(avx_ex16_ass three_tap_sse.asm) endif() -add_executable(avx_ex16_tests ${avx_ex16_srcs}) +add_executable(avx_ex16_tests ex16_test.cpp three_tap_sse.c ${avx_ex16_ass}) target_link_libraries(avx_ex16_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex16_bench ex16_bench.cpp ${avx_ex16_ass}) + target_link_libraries(avx_ex16_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex16_test COMMAND avx_ex16_tests) diff --git a/chap15/ex16/ex16_bench.cpp b/chap15/ex16/ex16_bench.cpp new file mode 100644 index 0000000..13c0fa5 --- /dev/null +++ b/chap15/ex16/ex16_bench.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "three_tap_sse.h" + +static void init_sources(float *a, float *coeff, int len) +{ + coeff[0] = 1; + coeff[1] = 3; + coeff[2] = 7; + + for (int i = 0; i < len; i++) + a[i] = (float)i; +} + +static void BM_three_tap_sse(benchmark::State &state) +{ + int len = state.range(0); + float coeff[3]; + float *a = (float *)_mm_malloc(len * sizeof(float), 16); + float *out = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(a, coeff, len); + + for (auto _ : state) { + three_tap_sse(a, coeff, out, len - 2); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float))); // size of input array a in bytes, per iteration + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_three_tap_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex16/three_tap_sse.s b/chap15/ex16/three_tap_sse.s index 9b0d528..3e5c7e0 100644 --- a/chap15/ex16/three_tap_sse.s +++ b/chap15/ex16/three_tap_sse.s @@ -68,3 +68,7 @@ loop_start: pop r15 pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex17/CMakeLists.txt b/chap15/ex17/CMakeLists.txt index 3d18719..3f46a61 100644 --- a/chap15/ex17/CMakeLists.txt +++ b/chap15/ex17/CMakeLists.txt @@ -1,9 +1,15 @@ -set(avx_ex17_srcs ex17_test.cpp three_tap_avx.c) +set(avx_ex17_srcs) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex17_srcs ${avx_ex17_srcs} three_tap_avx.s) +set(avx_ex17_ass three_tap_avx.s) elseif(MSVC) -set(avx_ex17_srcs ${avx_ex17_srcs} three_tap_avx.asm) +set(avx_ex17_ass three_tap_avx.asm) endif() -add_executable(avx_ex17_tests ${avx_ex17_srcs}) +add_executable(avx_ex17_tests ex17_test.cpp three_tap_avx.c ${avx_ex17_ass}) target_link_libraries(avx_ex17_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex17_bench ex17_bench.cpp 
${avx_ex17_ass}) + target_link_libraries(avx_ex17_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex17_test COMMAND avx_ex17_tests) diff --git a/chap15/ex17/ex17_bench.cpp b/chap15/ex17/ex17_bench.cpp new file mode 100644 index 0000000..5abc8f9 --- /dev/null +++ b/chap15/ex17/ex17_bench.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "three_tap_avx.h" + +static void init_sources(float *a, float *coeff, int len) +{ + coeff[0] = 1; + coeff[1] = 3; + coeff[2] = 7; + + for (int i = 0; i < len; i++) + a[i] = (float)i; +} + +static void BM_three_tap_avx(benchmark::State &state) +{ + int len = state.range(0); + float coeff[3]; + float *a = (float *)_mm_malloc(len * sizeof(float), 32); + float *out = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(a, coeff, len); + + for (auto _ : state) { + three_tap_avx(a, coeff, out, len - 2); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float))); // size of input array a in bytes, per iteration + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_three_tap_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex17/three_tap_avx.s b/chap15/ex17/three_tap_avx.s index b71313b..d2ebb67 100644 --- a/chap15/ex17/three_tap_avx.s +++ b/chap15/ex17/three_tap_avx.s @@ -70,3 +70,7 @@ loop_start: pop r15 pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex18/CMakeLists.txt b/chap15/ex18/CMakeLists.txt index 375e25b..ab55d9d 100644 --- a/chap15/ex18/CMakeLists.txt +++ b/chap15/ex18/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex18_srcs ex18_test.cpp three_tap_mixed_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex18_srcs ${avx_ex18_srcs} three_tap_mixed_avx.s) +set(avx_ex18_ass three_tap_mixed_avx.s) elseif(MSVC) -set(avx_ex18_srcs ${avx_ex18_srcs} three_tap_mixed_avx.asm) +set(avx_ex18_ass three_tap_mixed_avx.asm) endif() -add_executable(avx_ex18_tests ${avx_ex18_srcs}) +add_executable(avx_ex18_tests ex18_test.cpp three_tap_mixed_avx.c ${avx_ex18_ass}) target_link_libraries(avx_ex18_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex18_bench 
ex18_bench.cpp ${avx_ex18_ass}) + target_link_libraries(avx_ex18_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex18_test COMMAND avx_ex18_tests) diff --git a/chap15/ex18/ex18_bench.cpp b/chap15/ex18/ex18_bench.cpp new file mode 100644 index 0000000..c41284c --- /dev/null +++ b/chap15/ex18/ex18_bench.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "three_tap_mixed_avx.h" + +static void init_sources(float *a, float *coeff, int len) +{ + coeff[0] = 1; + coeff[1] = 3; + coeff[2] = 7; + + for (int i = 0; i < len; i++) + a[i] = (float)i; +} + +static void BM_three_tap_mixed_avx(benchmark::State &state) +{ + int len = state.range(0); + float coeff[3]; + float *a = (float *)_mm_malloc(len * sizeof(float), 32); + float *out = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(a, coeff, len); + + for (auto _ : state) { + three_tap_mixed_avx(a, coeff, out, len - 2); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float))); // size of input array a in bytes, per iteration + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_three_tap_mixed_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex18/ex18_test.cpp b/chap15/ex18/ex18_test.cpp index 601776e..fbe9688 100644 --- a/chap15/ex18/ex18_test.cpp +++ b/chap15/ex18/ex18_test.cpp @@ -29,7 +29,7 @@ static float coeff[3] __attribute__((aligned(32))); static float out[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { coeff[0] = 1; coeff[1] = 3; @@ -40,7 +40,7 @@ void init_sources() } } -TEST(avx_18, three_tap_avx) +TEST(avx_18, three_tap_mixed_avx) { init_sources(); ASSERT_EQ(three_tap_mixed_avx_check(a, coeff, out, MAX_SIZE - 2), true); diff --git a/chap15/ex18/three_tap_mixed_avx.s b/chap15/ex18/three_tap_mixed_avx.s index 50d46ab..2290042 100644 --- a/chap15/ex18/three_tap_mixed_avx.s +++ b/chap15/ex18/three_tap_mixed_avx.s @@ -77,3 +77,7 @@ loop_start: pop r15 pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex19/CMakeLists.txt b/chap15/ex19/CMakeLists.txt index c7dee8a..66a8c31 100644 --- a/chap15/ex19/CMakeLists.txt +++ b/chap15/ex19/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex19_srcs 
ex19_test.cpp vshufps_transpose.c vblendps_transpose.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex19_srcs ${avx_ex19_srcs} vshufps_transpose.s vblendps_transpose.s) +set(avx_ex19_ass vshufps_transpose.s vblendps_transpose.s) elseif(MSVC) -set(avx_ex19_srcs ${avx_ex19_srcs} vshufps_transpose.asm vblendps_transpose.asm) +set(avx_ex19_ass vshufps_transpose.asm vblendps_transpose.asm) endif() -add_executable(avx_ex19_tests ${avx_ex19_srcs}) +add_executable(avx_ex19_tests ex19_test.cpp vshufps_transpose.c vblendps_transpose.c ${avx_ex19_ass}) + +IF( benchmark_FOUND ) + add_executable(avx_ex19_bench ex19_bench.cpp ${avx_ex19_ass}) + target_link_libraries(avx_ex19_bench benchmark::benchmark) +ENDIF() target_link_libraries(avx_ex19_tests gtest_main) add_test(NAME avx_ex19_test COMMAND avx_ex19_tests) diff --git a/chap15/ex19/ex19_bench.cpp b/chap15/ex19/ex19_bench.cpp new file mode 100644 index 0000000..729bea5 --- /dev/null +++ b/chap15/ex19/ex19_bench.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "vblendps_transpose.h" +#include "vshufps_transpose.h" + +const int MAX_SIZE = 8; /* Must be 8 */ + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(32)) static float x[MAX_SIZE][MAX_SIZE]; +__declspec(align(32)) static float y[MAX_SIZE][MAX_SIZE]; +#else +static float x[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); +static float y[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); +#endif + +static void init_sources() +{ + for (size_t i = 0; i < MAX_SIZE; i++) + for (size_t j = 0; j < MAX_SIZE; j++) { + x[i][j] = (float)i * MAX_SIZE + j; + } +} + +static void BM_vshufps_transpose(benchmark::State &state) +{ + int len = state.range(0); + + init_sources(); + + for (auto _ : state) { + vshufps_transpose(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * MAX_SIZE * MAX_SIZE)); +} + +static void BM_blendps_transpose(benchmark::State &state) +{ + int len = state.range(0); + + init_sources(); + + for (auto _ : state) { + vblendps_transpose(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * MAX_SIZE * MAX_SIZE)); +} + +BENCHMARK(BM_vshufps_transpose) + ->Arg(1 << 4) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12); +BENCHMARK(BM_blendps_transpose) + ->Arg(1 << 4) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12); +BENCHMARK_MAIN(); diff --git a/chap15/ex19/ex19_test.cpp b/chap15/ex19/ex19_test.cpp index c1de15c..437583e 100644 --- a/chap15/ex19/ex19_test.cpp +++ b/chap15/ex19/ex19_test.cpp @@ -18,7 +18,7 @@ #include "vblendps_transpose.h" #include "vshufps_transpose.h" -const int MAX_SIZE = 8; /* Must divisible by 8 */ +const int MAX_SIZE = 8; /* Must be 8 */ struct unaligned_matrix { float dummy; @@ -35,7 +35,7 @@ static float y[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); static unaligned_matrix unaligned_m __attribute__((aligned(32))); 
#endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) for (size_t j = 0; j < MAX_SIZE; j++) { diff --git a/chap15/ex19/vblendps_transpose.c b/chap15/ex19/vblendps_transpose.c index 8745d36..f0b36c4 100644 --- a/chap15/ex19/vblendps_transpose.c +++ b/chap15/ex19/vblendps_transpose.c @@ -33,8 +33,8 @@ bool vblendps_transpose_check(float in[][8], float out[][8], size_t len) return false; /* - * len is the matrix width and height divided by 8. It must be greater - * than 0. + * len is the number of times the transpose should be repeated. + * Must not be 0. */ if (len == 0) diff --git a/chap15/ex19/vblendps_transpose.s b/chap15/ex19/vblendps_transpose.s index da924cb..37653e2 100644 --- a/chap15/ex19/vblendps_transpose.s +++ b/chap15/ex19/vblendps_transpose.s @@ -83,3 +83,7 @@ loop1: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex19/vshufps_transpose.c b/chap15/ex19/vshufps_transpose.c index 8959637..992452a 100644 --- a/chap15/ex19/vshufps_transpose.c +++ b/chap15/ex19/vshufps_transpose.c @@ -33,8 +33,8 @@ bool vshufps_transpose_check(float in[][8], float out[][8], size_t len) return false; /* - * len is the matrix width and height divided by 8. It must be greater - * than 0. + * len is the number of times the transpose should be repeated. + * Must not be 0. 
*/ if (len == 0) diff --git a/chap15/ex19/vshufps_transpose.s b/chap15/ex19/vshufps_transpose.s index f56e308..79ae922 100644 --- a/chap15/ex19/vshufps_transpose.s +++ b/chap15/ex19/vshufps_transpose.s @@ -83,3 +83,7 @@ loop1: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex2/CMakeLists.txt b/chap15/ex2/CMakeLists.txt index 3f9ef20..44357b2 100644 --- a/chap15/ex2/CMakeLists.txt +++ b/chap15/ex2/CMakeLists.txt @@ -1,9 +1,15 @@ -set(avx_ex2_srcs ex2_test.cpp transform_sse.c transform_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex2_srcs ${avx_ex2_srcs} transform_sse.s transform_avx.s) +set(avx_ex2_ass transform_sse.s transform_avx.s) elseif(MSVC) -set(avx_ex2_srcs ${avx_ex2_srcs} transform_sse.asm transform_avx.asm) +set(avx_ex2_ass transform_sse.asm transform_avx.asm) endif() -add_executable(avx_ex2_tests ${avx_ex2_srcs}) + +add_executable(avx_ex2_tests ex2_test.cpp transform_sse.c transform_avx.c ${avx_ex2_ass}) target_link_libraries(avx_ex2_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex2_bench ex2_bench.cpp ${avx_ex2_ass}) + target_link_libraries(avx_ex2_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex2_test COMMAND avx_ex2_tests) diff --git a/chap15/ex2/ex2_bench.cpp b/chap15/ex2/ex2_bench.cpp new file mode 100644 index 0000000..413e8ea --- /dev/null +++ b/chap15/ex2/ex2_bench.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "transform_avx.h" +#include "transform_sse.h" + +static void BM_transform_sse(benchmark::State &state) +{ + int len = state.range(0); + + // Dynamic memory allocation with 16byte + // alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 16); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 16); + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + float cos_teta = 0.8660254037f; + float sin_teta = 0.5f; + + // clang-format off + + // Static memory allocation of 4 floats with 16byte alignment +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher + __declspec(align(16)) float cos_sin_teta_vec[4] = { + cos_teta, sin_teta, cos_teta, sin_teta}; + __declspec(align(16)) float sin_cos_teta_vec[4] = { + sin_teta, cos_teta, sin_teta, cos_teta}; +#else + float cos_sin_teta_vec[4] __attribute__((aligned(16))) = { + cos_teta, sin_teta, cos_teta, sin_teta}; + float sin_cos_teta_vec[4] __attribute__((aligned(16))) = { + sin_teta, cos_teta, sin_teta, cos_teta}; +#endif + + // clang-format on + + for (auto _ : state) { + transform_sse(cos_sin_teta_vec, sin_cos_teta_vec, pInVector, + pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} +static void BM_transform_avx(benchmark::State &state) +{ + int len = state.range(0); + // Dynamic memory allocation with 32byte alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32); + + // init data + for (int i = 0; 
i < len; i++) + pInVector[i] = 1; + + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + // clang-format off + + //Static memory allocation of 8 floats with 32byte alignments +#ifdef _MSC_VER + __declspec(align(32)) float cos_sin_teta_vec[8] = { +#else + float cos_sin_teta_vec[8] __attribute__((aligned(32))) = { +#endif + cos_teta, sin_teta, cos_teta, sin_teta, + cos_teta, sin_teta, cos_teta, sin_teta + }; +#ifdef _MSC_VER + __declspec(align(32)) float sin_cos_teta_vec[8] = { +#else + float sin_cos_teta_vec[8] __attribute__((aligned(32))) = { +#endif + sin_teta, cos_teta, sin_teta, cos_teta, + sin_teta, cos_teta, sin_teta, cos_teta + }; + + // clang-format on + + for (auto _ : state) { + transform_avx(cos_sin_teta_vec, sin_cos_teta_vec, pInVector, + pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +BENCHMARK(BM_transform_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_transform_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex2/ex2_test.cpp b/chap15/ex2/ex2_test.cpp index dca9e09..0894781 100644 --- a/chap15/ex2/ex2_test.cpp +++ b/chap15/ex2/ex2_test.cpp @@ -57,13 +57,14 @@ TEST(avx_2, transform_sse) true); ; for (int i = 0; i + 1 < len; i += 2) { + // Assert X' float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - - cosx = pInVector[i + 1] * cos_teta; - sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i + 1]); + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } 
ASSERT_EQ(transform_sse_check(cos_sin_teta_vec, sin_cos_teta_vec, NULL, @@ -121,13 +122,14 @@ TEST(avx_2, transform_avx) true); for (int i = 0; i + 1 < len; i += 2) { + // Assert X' float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - - cosx = pInVector[i + 1] * cos_teta; - sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i + 1]); + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ(transform_avx_check(cos_sin_teta_vec, sin_cos_teta_vec, NULL, diff --git a/chap15/ex2/transform_avx.s b/chap15/ex2/transform_avx.s index b38d3c6..d7fdd09 100644 --- a/chap15/ex2/transform_avx.s +++ b/chap15/ex2/transform_avx.s @@ -67,3 +67,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex2/transform_sse.s b/chap15/ex2/transform_sse.s index 64d2085..9ebb546 100644 --- a/chap15/ex2/transform_sse.s +++ b/chap15/ex2/transform_sse.s @@ -67,3 +67,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex20/CMakeLists.txt b/chap15/ex20/CMakeLists.txt index c177f6a..1f2201e 100644 --- a/chap15/ex20/CMakeLists.txt +++ b/chap15/ex20/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex20_srcs ex20_test.cpp vinsertps_transpose.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex20_srcs ${avx_ex20_srcs} vinsertps_transpose.s) +set(avx_ex20_ass ${avx_ex20_ass} vinsertps_transpose.s) elseif(MSVC) -set(avx_ex20_srcs ${avx_ex20_srcs} vinsertps_transpose.asm) +set(avx_ex20_ass ${avx_ex20_ass} vinsertps_transpose.asm) endif() -add_executable(avx_ex20_tests ${avx_ex20_srcs}) 
+add_executable(avx_ex20_tests ex20_test.cpp vinsertps_transpose.c ${avx_ex20_ass}) target_link_libraries(avx_ex20_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex20_bench ex20_bench.cpp ${avx_ex20_ass}) + target_link_libraries(avx_ex20_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex20_test COMMAND avx_ex20_tests) diff --git a/chap15/ex20/ex20_bench.cpp b/chap15/ex20/ex20_bench.cpp new file mode 100644 index 0000000..989f8ab --- /dev/null +++ b/chap15/ex20/ex20_bench.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "vinsertps_transpose.h" + +const int MAX_SIZE = 8; /* Must be 8 */ + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(32)) static float x[MAX_SIZE][MAX_SIZE]; +__declspec(align(32)) static float y[MAX_SIZE][MAX_SIZE]; +#else +static float x[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); +static float y[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); +#endif + +static void init_sources() +{ + for (size_t i = 0; i < MAX_SIZE; i++) + for (size_t j = 0; j < MAX_SIZE; j++) { + x[i][j] = (float)i * MAX_SIZE + j; + } +} + +static void BM_vinsertps_transpose(benchmark::State &state) +{ + int len = state.range(0); + + init_sources(); + + for (auto _ : state) { + vinsertps_transpose(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(float) * MAX_SIZE * MAX_SIZE)); +} + +BENCHMARK(BM_vinsertps_transpose) + ->Arg(1 << 4) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12); +BENCHMARK_MAIN(); diff --git a/chap15/ex20/ex20_test.cpp b/chap15/ex20/ex20_test.cpp index 69bbf6a..1676d4b 100644 --- a/chap15/ex20/ex20_test.cpp +++ b/chap15/ex20/ex20_test.cpp @@ -34,7 +34,7 @@ static float y[MAX_SIZE][MAX_SIZE] __attribute__((aligned(32))); static unaligned_matrix unaligned_m __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) for (size_t j = 0; j < MAX_SIZE; j++) { diff --git a/chap15/ex20/vinsertps_transpose.c b/chap15/ex20/vinsertps_transpose.c index 4e0e62a..da7a091 100644 --- a/chap15/ex20/vinsertps_transpose.c +++ b/chap15/ex20/vinsertps_transpose.c @@ -33,8 +33,8 @@ bool vinsertps_transpose_check(float in[][8], float out[][8], size_t len) return false; /* - * len is the matrix width and height divided by 8. It must be greater - * than 0. + * len is the number of times the transpose should be repeated. + * Must not be 0. 
*/ if (len == 0) diff --git a/chap15/ex20/vinsertps_transpose.s b/chap15/ex20/vinsertps_transpose.s index fd6771c..de1d68f 100644 --- a/chap15/ex20/vinsertps_transpose.s +++ b/chap15/ex20/vinsertps_transpose.s @@ -79,3 +79,7 @@ loop1: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex21/CMakeLists.txt b/chap15/ex21/CMakeLists.txt index a999453..f25c141 100644 --- a/chap15/ex21/CMakeLists.txt +++ b/chap15/ex21/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex21_srcs ex21_test.cpp mul_cpx_reg.c mul_cpx_mem.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex21_srcs ${avx_ex21_srcs} mul_cpx_reg.s mul_cpx_mem.s) +set(avx_ex21_ass mul_cpx_reg.s mul_cpx_mem.s) elseif(MSVC) -set(avx_ex21_srcs ${avx_ex21_srcs} mul_cpx_reg.asm mul_cpx_mem.asm) +set(avx_ex21_ass mul_cpx_reg.asm mul_cpx_mem.asm) endif() -add_executable(avx_ex21_tests ${avx_ex21_srcs}) +add_executable(avx_ex21_tests ex21_test.cpp mul_cpx_reg.c mul_cpx_mem.c ${avx_ex21_ass}) target_link_libraries(avx_ex21_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex21_bench ex21_bench.cpp ${avx_ex21_ass}) + target_link_libraries(avx_ex21_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex21_test COMMAND avx_ex21_tests) diff --git a/chap15/ex21/ex21_bench.cpp b/chap15/ex21/ex21_bench.cpp new file mode 100644 index 0000000..a2531b7 --- /dev/null +++ b/chap15/ex21/ex21_bench.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "mul_cpx_mem.h" +#include "mul_cpx_reg.h" + +static void init_sources(complex_num *x, complex_num *y, int len) +{ + for (int i = 0; i < len; i++) { + x[i].real = (float)i; + x[i].imaginary = (float)i + 1; + y[i].real = x[i].real * 2; + y[i].imaginary = x[i].imaginary * 2; + } +} + +static void BM_mul_cpx_reg(benchmark::State &state) +{ + int len = state.range(0); + complex_num *x = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + complex_num *y = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + complex_num *z = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + + init_sources(x, y, len); + + for (auto _ : state) { + mul_cpx_reg(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +static void BM_mul_cpx_mem(benchmark::State &state) +{ + int len = state.range(0); + complex_num *x = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + complex_num *y = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + complex_num *z = + (complex_num *)_mm_malloc(len * sizeof(complex_num), 32); + + init_sources(x, y, len); + + for (auto _ : state) { + mul_cpx_mem(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_mul_cpx_reg) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_mul_cpx_mem) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 
10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex21/ex21_test.cpp b/chap15/ex21/ex21_test.cpp index 72327d6..571453d 100644 --- a/chap15/ex21/ex21_test.cpp +++ b/chap15/ex21/ex21_test.cpp @@ -30,7 +30,7 @@ static complex_num y[MAX_SIZE] __attribute__((aligned(32))); static complex_num z[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i].real = (float)i; diff --git a/chap15/ex21/mul_cpx_mem.asm b/chap15/ex21/mul_cpx_mem.asm index 0ce2960..db34739 100644 --- a/chap15/ex21/mul_cpx_mem.asm +++ b/chap15/ex21/mul_cpx_mem.asm @@ -60,8 +60,8 @@ loop1: jl loop1 vzeroupper - vmovaps xmmword ptr[rsp+16], xmm7 - vmovaps xmmword ptr[rsp], xmm6 + vmovaps xmm7, xmmword ptr[rsp+16] + vmovaps xmm6, xmmword ptr[rsp] add rsp, 32 pop rbx ret diff --git a/chap15/ex21/mul_cpx_mem.s b/chap15/ex21/mul_cpx_mem.s index 528ce52..cb4f8df 100644 --- a/chap15/ex21/mul_cpx_mem.s +++ b/chap15/ex21/mul_cpx_mem.s @@ -63,3 +63,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex21/mul_cpx_reg.s b/chap15/ex21/mul_cpx_reg.s index 8e679b2..118a04d 100644 --- a/chap15/ex21/mul_cpx_reg.s +++ b/chap15/ex21/mul_cpx_reg.s @@ -64,3 +64,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex22/CMakeLists.txt b/chap15/ex22/CMakeLists.txt index 5f9ae6b..e680849 100644 --- a/chap15/ex22/CMakeLists.txt +++ b/chap15/ex22/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex22_srcs ex22_test.cpp divps_sse.c vdivps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex22_srcs ${avx_ex22_srcs} divps_sse.s vdivps_avx.s) +set(avx_ex22_ass divps_sse.s vdivps_avx.s) elseif(MSVC) -set(avx_ex22_srcs 
${avx_ex22_srcs} divps_sse.asm vdivps_avx.asm) +set(avx_ex22_ass divps_sse.asm vdivps_avx.asm) endif() -add_executable(avx_ex22_tests ${avx_ex22_srcs}) - +add_executable(avx_ex22_tests ex22_test.cpp divps_sse.c vdivps_avx.c ${avx_ex22_ass}) target_link_libraries(avx_ex22_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex22_bench ex22_bench.cpp ${avx_ex22_ass}) + target_link_libraries(avx_ex22_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex22_test COMMAND avx_ex22_tests) diff --git a/chap15/ex22/divps_sse.s b/chap15/ex22/divps_sse.s index 2e7a5c7..cba31c5 100644 --- a/chap15/ex22/divps_sse.s +++ b/chap15/ex22/divps_sse.s @@ -50,3 +50,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex22/ex22_bench.cpp b/chap15/ex22/ex22_bench.cpp new file mode 100644 index 0000000..1306f83 --- /dev/null +++ b/chap15/ex22/ex22_bench.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "divps_sse.h" +#include "vdivps_avx.h" + +static void init_sources(float *x, float *y, int len) +{ + for (int i = 0; i < len; i++) { + x[i] = i * 1.0f; + y[i] = (len - i) * 1.0f; + } +} + +static void BM_divps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + float *z = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, y, len); + + for (auto _ : state) { + divps_sse(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +static void BM_vdivps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + float *z = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, y, len); + + for (auto _ : state) { + vdivps_avx(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_divps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vdivps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex22/vdivps_avx.s b/chap15/ex22/vdivps_avx.s index e573f79..988f9b8 100644 --- a/chap15/ex22/vdivps_avx.s +++ b/chap15/ex22/vdivps_avx.s @@ -51,3 +51,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex23/CMakeLists.txt b/chap15/ex23/CMakeLists.txt index 7d71fab..56fd7fb 100644 --- a/chap15/ex23/CMakeLists.txt +++ b/chap15/ex23/CMakeLists.txt @@ 
-1,10 +1,14 @@ -set(avx_ex23_srcs ex23_test.cpp rcpps_sse.c vrcpps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex23_srcs ${avx_ex23_srcs} rcpps_sse.s vrcpps_avx.s) +set(avx_ex23_ass rcpps_sse.s vrcpps_avx.s) elseif(MSVC) -set(avx_ex23_srcs ${avx_ex23_srcs} rcpps_sse.asm vrcpps_avx.asm) +set(avx_ex23_ass rcpps_sse.asm vrcpps_avx.asm) endif() -add_executable(avx_ex23_tests ${avx_ex23_srcs}) - +add_executable(avx_ex23_tests ex23_test.cpp rcpps_sse.c vrcpps_avx.c ${avx_ex23_ass}) target_link_libraries(avx_ex23_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex23_bench ex23_bench.cpp ${avx_ex23_ass}) + target_link_libraries(avx_ex23_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex23_test COMMAND avx_ex23_tests) diff --git a/chap15/ex23/ex23_bench.cpp b/chap15/ex23/ex23_bench.cpp new file mode 100644 index 0000000..cc39063 --- /dev/null +++ b/chap15/ex23/ex23_bench.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "rcpps_sse.h" +#include "vrcpps_avx.h" + +static void init_sources(float *x, float *y, int len) +{ + for (int i = 0; i < len; i++) { + x[i] = i * 1.0f; + y[i] = (len - i) * 1.0f; + } +} + +static void BM_rcpps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + float *z = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, y, len); + + for (auto _ : state) { + rcpps_sse(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +static void BM_vrcpps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + float *z = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, y, len); + + for (auto _ : state) { + vrcpps_avx(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_rcpps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vrcpps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex23/ex23_test.cpp b/chap15/ex23/ex23_test.cpp index 51b7877..94dbf59 100644 --- a/chap15/ex23/ex23_test.cpp +++ b/chap15/ex23/ex23_test.cpp @@ -30,7 +30,7 @@ static float y[MAX_SIZE] __attribute__((aligned(32))); static float z[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex23/rcpps_sse.s b/chap15/ex23/rcpps_sse.s 
index c5a391f..531aded 100644 --- a/chap15/ex23/rcpps_sse.s +++ b/chap15/ex23/rcpps_sse.s @@ -51,3 +51,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex23/vrcpps_avx.s b/chap15/ex23/vrcpps_avx.s index bcace94..c554788 100644 --- a/chap15/ex23/vrcpps_avx.s +++ b/chap15/ex23/vrcpps_avx.s @@ -52,3 +52,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex24/CMakeLists.txt b/chap15/ex24/CMakeLists.txt index 232f791..59373f5 100644 --- a/chap15/ex24/CMakeLists.txt +++ b/chap15/ex24/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex24_srcs ex24_test.cpp rcpps_mul_sse.c vrcpps_mul_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex24_srcs ${avx_ex24_srcs} rcpps_mul_sse.s vrcpps_mul_avx.s) +set(avx_ex24_ass rcpps_mul_sse.s vrcpps_mul_avx.s) elseif(MSVC) -set(avx_ex24_srcs ${avx_ex24_srcs} rcpps_mul_sse.asm vrcpps_mul_avx.asm) +set(avx_ex24_ass rcpps_mul_sse.asm vrcpps_mul_avx.asm) endif() -add_executable(avx_ex24_tests ${avx_ex24_srcs}) - +add_executable(avx_ex24_tests ex24_test.cpp rcpps_mul_sse.c vrcpps_mul_avx.c ${avx_ex24_ass}) target_link_libraries(avx_ex24_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex24_bench ex24_bench.cpp ${avx_ex24_ass}) + target_link_libraries(avx_ex24_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex24_test COMMAND avx_ex24_tests) diff --git a/chap15/ex24/ex24_bench.cpp b/chap15/ex24/ex24_bench.cpp new file mode 100644 index 0000000..c7010e7 --- /dev/null +++ b/chap15/ex24/ex24_bench.cpp @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "rcpps_mul_sse.h" +#include "vrcpps_mul_avx.h" + +static void init_sources(float *x, float *y, int len) +{ + for (int i = 0; i < len; i++) { + x[i] = i * 1.0f; + y[i] = (len - i) * 1.0f; + } +} + +static void BM_rcpps_mul_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + float *z = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, y, len); + + for (auto _ : state) { + rcpps_mul_sse(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +static void BM_vrcpps_mul_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + float *z = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, y, len); + + for (auto _ : state) { + vrcpps_mul_avx(x, y, z, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x) * 2)); + + _mm_free(z); + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_rcpps_mul_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vrcpps_mul_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + 
->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex24/rcpps_mul_sse.s b/chap15/ex24/rcpps_mul_sse.s index c1983bf..dc3c276 100644 --- a/chap15/ex24/rcpps_mul_sse.s +++ b/chap15/ex24/rcpps_mul_sse.s @@ -56,3 +56,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex24/vrcpps_mul_avx.s b/chap15/ex24/vrcpps_mul_avx.s index b74464d..f19a0a0 100644 --- a/chap15/ex24/vrcpps_mul_avx.s +++ b/chap15/ex24/vrcpps_mul_avx.s @@ -56,3 +56,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex25/CMakeLists.txt b/chap15/ex25/CMakeLists.txt index cd249d1..4374767 100644 --- a/chap15/ex25/CMakeLists.txt +++ b/chap15/ex25/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex25_srcs ex25_test.cpp sqrtps_divps_sse.c vsqrtps_vdivps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex25_srcs ${avx_ex25_srcs} sqrtps_divps_sse.s vsqrtps_vdivps_avx.s) +set(avx_ex25_ass sqrtps_divps_sse.s vsqrtps_vdivps_avx.s) elseif(MSVC) -set(avx_ex25_srcs ${avx_ex25_srcs} sqrtps_divps_sse.asm vsqrtps_vdivps_avx.asm) +set(avx_ex25_ass sqrtps_divps_sse.asm vsqrtps_vdivps_avx.asm) endif() -add_executable(avx_ex25_tests ${avx_ex25_srcs}) - +add_executable(avx_ex25_tests ex25_test.cpp sqrtps_divps_sse.c vsqrtps_vdivps_avx.c ${avx_ex25_ass}) target_link_libraries(avx_ex25_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex25_bench ex25_bench.cpp ${avx_ex25_ass}) + target_link_libraries(avx_ex25_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex25_test COMMAND avx_ex25_tests) diff --git a/chap15/ex25/ex25_bench.cpp b/chap15/ex25/ex25_bench.cpp new file mode 100644 index 0000000..3352e95 --- /dev/null +++ b/chap15/ex25/ex25_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by 
Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "sqrtps_divps_sse.h" +#include "vsqrtps_vdivps_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = (i + 1) * 1.0f; +} + +static void BM_sqrtps_divps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + sqrtps_divps_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_sqrtps_vdivps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + vsqrtps_vdivps_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_sqrtps_divps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_sqrtps_vdivps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) 
+ ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex25/ex25_test.cpp b/chap15/ex25/ex25_test.cpp index f77ec34..4db54c3 100644 --- a/chap15/ex25/ex25_test.cpp +++ b/chap15/ex25/ex25_test.cpp @@ -30,7 +30,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = (i + 1) * 1.0f; diff --git a/chap15/ex25/sqrtps_divps_sse.s b/chap15/ex25/sqrtps_divps_sse.s index 9002ca4..5e957aa 100644 --- a/chap15/ex25/sqrtps_divps_sse.s +++ b/chap15/ex25/sqrtps_divps_sse.s @@ -30,7 +30,7 @@ _sqrtps_divps_sse: sqrtps_divps_sse: push rbx - + mov rax, rdi mov rbx, rsi mov rcx, rdx @@ -48,3 +48,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex25/vsqrtps_vdivps_avx.s b/chap15/ex25/vsqrtps_vdivps_avx.s index c84a136..a5072b3 100644 --- a/chap15/ex25/vsqrtps_vdivps_avx.s +++ b/chap15/ex25/vsqrtps_vdivps_avx.s @@ -49,3 +49,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex26/CMakeLists.txt b/chap15/ex26/CMakeLists.txt index d478f60..aa9d87a 100644 --- a/chap15/ex26/CMakeLists.txt +++ b/chap15/ex26/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex26_srcs ex26_test.cpp rsqrtps_sse.c vrsqrtps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex26_srcs ${avx_ex26_srcs} rsqrtps_sse.s vrsqrtps_avx.s) +set(avx_ex26_ass rsqrtps_sse.s vrsqrtps_avx.s) elseif(MSVC) -set(avx_ex26_srcs ${avx_ex26_srcs} rsqrtps_sse.asm vrsqrtps_avx.asm) +set(avx_ex26_ass rsqrtps_sse.asm vrsqrtps_avx.asm) endif() -add_executable(avx_ex26_tests ${avx_ex26_srcs}) +add_executable(avx_ex26_tests ex26_test.cpp rsqrtps_sse.c vrsqrtps_avx.c ${avx_ex26_ass}) + +IF( benchmark_FOUND 
) + add_executable(avx_ex26_bench ex26_bench.cpp ${avx_ex26_ass}) + target_link_libraries(avx_ex26_bench benchmark::benchmark) +ENDIF() target_link_libraries(avx_ex26_tests gtest_main) add_test(NAME avx_ex26_test COMMAND avx_ex26_tests) diff --git a/chap15/ex26/ex26_bench.cpp b/chap15/ex26/ex26_bench.cpp new file mode 100644 index 0000000..dd08146 --- /dev/null +++ b/chap15/ex26/ex26_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "rsqrtps_sse.h" +#include "vrsqrtps_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = (i + 1) * 1.0f; +} + +static void BM_rsqrtps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + rsqrtps_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_vrsqrtps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + vrsqrtps_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_rsqrtps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vrsqrtps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex26/ex26_test.cpp b/chap15/ex26/ex26_test.cpp index 52040cc..b641a70 100644 --- a/chap15/ex26/ex26_test.cpp +++ b/chap15/ex26/ex26_test.cpp @@ -30,7 +30,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = (i + 1) * 1.0f; diff --git a/chap15/ex26/rsqrtps_sse.s b/chap15/ex26/rsqrtps_sse.s index 3d0dc88..f3cf3a7 100644 --- a/chap15/ex26/rsqrtps_sse.s +++ b/chap15/ex26/rsqrtps_sse.s @@ -46,3 +46,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex26/vrsqrtps_avx.s b/chap15/ex26/vrsqrtps_avx.s index 230a300..da97072 100644 --- a/chap15/ex26/vrsqrtps_avx.s +++ b/chap15/ex26/vrsqrtps_avx.s @@ -47,3 +47,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex27/CMakeLists.txt b/chap15/ex27/CMakeLists.txt index 904e88d..1aa1921 100644 --- a/chap15/ex27/CMakeLists.txt +++ b/chap15/ex27/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex27_srcs ex27_test.cpp rsqrtps_newt_sse.c vrsqrtps_newt_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex27_srcs ${avx_ex27_srcs} rsqrtps_newt_sse.s vrsqrtps_newt_avx.s) +set(avx_ex27_ass rsqrtps_newt_sse.s vrsqrtps_newt_avx.s) elseif(MSVC) -set(avx_ex27_srcs ${avx_ex27_srcs} rsqrtps_newt_sse.asm vrsqrtps_newt_avx.asm) +set(avx_ex27_ass rsqrtps_newt_sse.asm vrsqrtps_newt_avx.asm) endif() -add_executable(avx_ex27_tests ${avx_ex27_srcs}) - +add_executable(avx_ex27_tests ex27_test.cpp rsqrtps_newt_sse.c vrsqrtps_newt_avx.c ${avx_ex27_ass}) target_link_libraries(avx_ex27_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex27_bench ex27_bench.cpp ${avx_ex27_ass}) + target_link_libraries(avx_ex27_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex27_test COMMAND avx_ex27_tests) diff --git a/chap15/ex27/ex27_bench.cpp b/chap15/ex27/ex27_bench.cpp new file mode 100644 index 0000000..c3867fe --- /dev/null +++ b/chap15/ex27/ex27_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "rsqrtps_newt_sse.h" +#include "vrsqrtps_newt_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = (i + 1) * 1.0f; +} + +static void BM_rsqrtps_newt_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + rsqrtps_newt_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_vrsqrtps_newt_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + vrsqrtps_newt_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_rsqrtps_newt_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vrsqrtps_newt_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex27/ex27_test.cpp b/chap15/ex27/ex27_test.cpp index 820e714..ebfd047 100644 --- a/chap15/ex27/ex27_test.cpp +++ b/chap15/ex27/ex27_test.cpp @@ -30,7 +30,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] 
__attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = (i + 1) * 1.0f; diff --git a/chap15/ex27/rsqrtps_newt_sse.s b/chap15/ex27/rsqrtps_newt_sse.s index aca775a..a2375e7 100644 --- a/chap15/ex27/rsqrtps_newt_sse.s +++ b/chap15/ex27/rsqrtps_newt_sse.s @@ -63,3 +63,7 @@ minus_half: three: .float 3.0, 3.0, 3.0, 3.0 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex27/vrsqrtps_newt_avx.s b/chap15/ex27/vrsqrtps_newt_avx.s index 4d1d3f8..2cbebe8 100644 --- a/chap15/ex27/vrsqrtps_newt_avx.s +++ b/chap15/ex27/vrsqrtps_newt_avx.s @@ -65,4 +65,7 @@ half: three: .float 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0 - + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex28/CMakeLists.txt b/chap15/ex28/CMakeLists.txt index 0824387..07d6261 100644 --- a/chap15/ex28/CMakeLists.txt +++ b/chap15/ex28/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex28_srcs ex28_test.cpp sqrtps_sse.c vsqrtps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex28_srcs ${avx_ex28_srcs} sqrtps_sse.s vsqrtps_avx.s) +set(avx_ex28_ass sqrtps_sse.s vsqrtps_avx.s) elseif(MSVC) -set(avx_ex28_srcs ${avx_ex28_srcs} sqrtps_sse.asm vsqrtps_avx.asm) +set(avx_ex28_ass sqrtps_sse.asm vsqrtps_avx.asm) endif() -add_executable(avx_ex28_tests ${avx_ex28_srcs}) +add_executable(avx_ex28_tests ex28_test.cpp sqrtps_sse.c vsqrtps_avx.c ${avx_ex28_ass}) + +IF( benchmark_FOUND ) + add_executable(avx_ex28_bench ex28_bench.cpp ${avx_ex28_ass}) + target_link_libraries(avx_ex28_bench benchmark::benchmark) +ENDIF() target_link_libraries(avx_ex28_tests gtest_main) add_test(NAME avx_ex28_test COMMAND avx_ex28_tests) diff --git a/chap15/ex28/ex28_bench.cpp b/chap15/ex28/ex28_bench.cpp new file mode 100644 index 0000000..1ade01a --- /dev/null +++ 
b/chap15/ex28/ex28_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "sqrtps_sse.h" +#include "vsqrtps_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = (i + 2) * 1.0f; +} + +static void BM_sqrtps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + sqrtps_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_vsqrtps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + vsqrtps_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_sqrtps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vsqrtps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + 
->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex28/ex28_test.cpp b/chap15/ex28/ex28_test.cpp index 4b55c88..33efd0b 100644 --- a/chap15/ex28/ex28_test.cpp +++ b/chap15/ex28/ex28_test.cpp @@ -30,7 +30,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex28/sqrtps_sse.s b/chap15/ex28/sqrtps_sse.s index f7bef3a..07c4373 100644 --- a/chap15/ex28/sqrtps_sse.s +++ b/chap15/ex28/sqrtps_sse.s @@ -47,3 +47,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex28/vsqrtps_avx.s b/chap15/ex28/vsqrtps_avx.s index 9d272ab..5104956 100644 --- a/chap15/ex28/vsqrtps_avx.s +++ b/chap15/ex28/vsqrtps_avx.s @@ -50,3 +50,6 @@ loop1: pop rbx ret +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex29/CMakeLists.txt b/chap15/ex29/CMakeLists.txt index 18accb4..86653e8 100644 --- a/chap15/ex29/CMakeLists.txt +++ b/chap15/ex29/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex29_srcs ex29_test.cpp sqrt_rsqrtps_sse.c sqrt_vrsqrtps_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex29_srcs ${avx_ex29_srcs} sqrt_rsqrtps_sse.s sqrt_vrsqrtps_avx.s) +set(avx_ex29_ass sqrt_rsqrtps_sse.s sqrt_vrsqrtps_avx.s) elseif(MSVC) -set(avx_ex29_srcs ${avx_ex29_srcs} sqrt_rsqrtps_sse.asm sqrt_vrsqrtps_avx.asm) +set(avx_ex29_ass sqrt_rsqrtps_sse.asm sqrt_vrsqrtps_avx.asm) endif() -add_executable(avx_ex29_tests ${avx_ex29_srcs}) - +add_executable(avx_ex29_tests ex29_test.cpp sqrt_rsqrtps_sse.c sqrt_vrsqrtps_avx.c ${avx_ex29_ass}) target_link_libraries(avx_ex29_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex29_bench 
ex29_bench.cpp ${avx_ex29_ass}) + target_link_libraries(avx_ex29_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex29_test COMMAND avx_ex29_tests) diff --git a/chap15/ex29/ex29_bench.cpp b/chap15/ex29/ex29_bench.cpp new file mode 100644 index 0000000..367e64e --- /dev/null +++ b/chap15/ex29/ex29_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "sqrt_rsqrtps_sse.h" +#include "sqrt_vrsqrtps_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = i * 1.0f; +} + +static void BM_sqrt_rsqrtps_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + sqrt_rsqrtps_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_sqrt_vrsqrtps_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + sqrt_vrsqrtps_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_sqrt_rsqrtps_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_sqrt_vrsqrtps_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex29/ex29_test.cpp b/chap15/ex29/ex29_test.cpp index 50cc6dc..1a6ca11 100644 --- a/chap15/ex29/ex29_test.cpp +++ b/chap15/ex29/ex29_test.cpp @@ -30,7 +30,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex29/sqrt_rsqrtps_sse.s b/chap15/ex29/sqrt_rsqrtps_sse.s index 8b1b981..afd1266 100644 --- a/chap15/ex29/sqrt_rsqrtps_sse.s +++ b/chap15/ex29/sqrt_rsqrtps_sse.s @@ -51,3 +51,7 @@ loop1: pop rbx ret 
+ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex29/sqrt_vrsqrtps_avx.s b/chap15/ex29/sqrt_vrsqrtps_avx.s index e90bda0..461d070 100644 --- a/chap15/ex29/sqrt_vrsqrtps_avx.s +++ b/chap15/ex29/sqrt_vrsqrtps_avx.s @@ -52,3 +52,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex3/CMakeLists.txt b/chap15/ex3/CMakeLists.txt index 87a4085..5a86297 100644 --- a/chap15/ex3/CMakeLists.txt +++ b/chap15/ex3/CMakeLists.txt @@ -1,9 +1,15 @@ -set(avx_ex3_src ex3_test.cpp poly_sse.c poly_avx_128.c poly_avx_256.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex3_src ${avx_ex3_src} poly_sse.s poly_avx_128.s poly_avx_256.s) +set(avx_ex3_ass poly_sse.s poly_avx_128.s poly_avx_256.s) elseif(MSVC) -set(avx_ex3_src ${avx_ex3_src} poly_sse.asm poly_avx_128.asm poly_avx_256.asm) +set(avx_ex3_ass poly_sse.asm poly_avx_128.asm poly_avx_256.asm) endif() -add_executable(avx_ex3_tests ${avx_ex3_src}) + +add_executable(avx_ex3_tests ex3_test.cpp poly_sse.c poly_avx_128.c poly_avx_256.c ${avx_ex3_ass}) target_link_libraries(avx_ex3_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex3_bench ex3_bench.cpp ${avx_ex3_ass}) + target_link_libraries(avx_ex3_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex3_test COMMAND avx_ex3_tests) diff --git a/chap15/ex3/ex3_bench.cpp b/chap15/ex3/ex3_bench.cpp new file mode 100644 index 0000000..7455012 --- /dev/null +++ b/chap15/ex3/ex3_bench.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "poly_avx_128.h" +#include "poly_avx_256.h" +#include "poly_sse.h" + +static void init_sources(float *in, float *out, int len) +{ + for (int i = 0; i < len; i++) { + in[i] = (float)i / 4.0f; + out[i] = 0.0f; + } +} + +static void BM_poly_sse(benchmark::State &state) +{ + int len = state.range(0); + float *in = (float *)_mm_malloc(len * sizeof(float), 16); + float *out = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(in, out, len); + + for (auto _ : state) { + poly_sse(in, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +static void BM_poly_avx_128(benchmark::State &state) +{ + int len = state.range(0); + float *in = (float *)_mm_malloc(len * sizeof(float), 32); + float *out = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(in, out, len); + + for (auto _ : state) { + poly_avx_128(in, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +static void BM_poly_avx_256(benchmark::State &state) +{ + int len = state.range(0); + float *in = (float *)_mm_malloc(len * sizeof(float), 32); + float *out = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(in, out, len); + + for (auto _ : state) { + poly_avx_256(in, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * 
int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +BENCHMARK(BM_poly_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_poly_avx_128) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_poly_avx_256) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex3/ex3_test.cpp b/chap15/ex3/ex3_test.cpp index 46c5e53..db1aa97 100644 --- a/chap15/ex3/ex3_test.cpp +++ b/chap15/ex3/ex3_test.cpp @@ -30,7 +30,8 @@ __declspec(align(32)) static float out[MAX_SIZE]; static float in[MAX_SIZE] __attribute__((aligned(32))); static float out[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() + +static void init_sources() { for (int i = 0; i < MAX_SIZE; i++) { in[i] = (float)i / 4.0f; @@ -48,6 +49,7 @@ TEST(avx_3, poly_sse) ASSERT_FLOAT_EQ(sq + cb + in[i], out[i]); } + ASSERT_EQ(poly_sse_check(in, out, 0), false); ASSERT_EQ(poly_sse_check(in, out, 3), false); ASSERT_EQ(poly_sse_check(NULL, out, MAX_SIZE), false); ASSERT_EQ(poly_sse_check(in, NULL, MAX_SIZE), false); @@ -62,6 +64,7 @@ TEST(avx_3, poly_avx_128) float cb = sq * in[i]; ASSERT_FLOAT_EQ(sq + cb + in[i], out[i]); } + ASSERT_EQ(poly_avx_128_check(in, out, 0), false); ASSERT_EQ(poly_avx_128_check(in, out, 7), false); ASSERT_EQ(poly_avx_128_check(in, NULL, MAX_SIZE), false); ASSERT_EQ(poly_avx_128_check(NULL, out, MAX_SIZE), false); @@ -76,6 +79,7 @@ TEST(avx_3, poly_avx_256) float cb = sq * in[i]; ASSERT_FLOAT_EQ(sq + cb + in[i], out[i]); } + ASSERT_EQ(poly_avx_256_check(in, out, 0), false); ASSERT_EQ(poly_avx_256_check(in, out, 7), false); ASSERT_EQ(poly_avx_256_check(in, NULL, MAX_SIZE), false); ASSERT_EQ(poly_avx_256_check(NULL, out, MAX_SIZE), false); diff --git a/chap15/ex3/poly_avx_128.asm 
b/chap15/ex3/poly_avx_128.asm index c7a345b..24a1213 100644 --- a/chap15/ex3/poly_avx_128.asm +++ b/chap15/ex3/poly_avx_128.asm @@ -15,20 +15,22 @@ ; .globl poly_avx_128 - ; void poly_avx_128(float *in, float *out, size_t len); + ; void poly_avx_128(float *in, float *out, int32_t len); ; On entry: ; rcx = in ; rdx = out - ; r8 = len + ; r8d = len .code poly_avx_128 PROC public push rbx + mov rax, rcx ; mov rax, pA mov rbx, rdx ; mov rbx, pB -; movsxd r8, edx ; movsxd r8, len + movsxd r8, r8d ; movsxd r8, len + sub r8, 4 loop1: ; Load A vmovups xmm0, [rax+r8*4] diff --git a/chap15/ex3/poly_avx_128.c b/chap15/ex3/poly_avx_128.c index ededb94..706cc0e 100644 --- a/chap15/ex3/poly_avx_128.c +++ b/chap15/ex3/poly_avx_128.c @@ -25,10 +25,10 @@ bool poly_avx_128_check(float *in, float *out, int32_t len) return false; /* - * len must be > 0 and a multiple of 4. + * len must be >= 4 and a multiple of 4. */ - if (len <= 0 || (len % 4) != 0) + if (len < 4 || (len % 4) != 0) return false; poly_avx_128(in, out, len); diff --git a/chap15/ex3/poly_avx_128.s b/chap15/ex3/poly_avx_128.s index 5c6aa68..fd981d3 100644 --- a/chap15/ex3/poly_avx_128.s +++ b/chap15/ex3/poly_avx_128.s @@ -18,11 +18,11 @@ .globl _poly_avx_128 .globl poly_avx_128 - # void poly_avx_128(float *in, float *out, size_t len); + # void poly_avx_128(float *in, float *out, int32_t len); # On entry: # rdi = in # rsi = out - # rdx = len + # edx = len .text _poly_avx_128: @@ -33,6 +33,7 @@ poly_avx_128: mov rax, rdi # mov rax, pA mov rbx, rsi # mov rbx, pB movsxd r8, edx # movsxd r8, len + sub r8, 4 loop1: # Load A vmovups xmm0, [rax+r8*4] @@ -51,3 +52,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex3/poly_avx_256.asm b/chap15/ex3/poly_avx_256.asm index cc7af24..08cab8f 100644 --- a/chap15/ex3/poly_avx_256.asm +++ b/chap15/ex3/poly_avx_256.asm @@ -15,11 +15,11 @@ ; .globl poly_avx_256 - ; void poly_avx_256(float *in, float *out, 
size_t len); + ; void poly_avx_256(float *in, float *out, int32_t len); ; On entry: ; rcx = in ; rdx = out - ; r8 = len + ; r8d = len .code poly_avx_256 PROC public @@ -28,7 +28,8 @@ poly_avx_256 PROC public mov rax, rcx ; mov rax, pA mov rbx, rdx ; mov rbx, pB -; movsxd r8, edx ; movsxd r8, len + movsxd r8, r8d ; movsxd r8, len + sub r8, 8 loop1: ; Load A vmovups ymm0, [rax+r8*4] diff --git a/chap15/ex3/poly_avx_256.c b/chap15/ex3/poly_avx_256.c index d7b1d92..ad25b20 100644 --- a/chap15/ex3/poly_avx_256.c +++ b/chap15/ex3/poly_avx_256.c @@ -25,10 +25,10 @@ bool poly_avx_256_check(float *in, float *out, int32_t len) return false; /* - * len must be > 0 and a multiple of 8. + * len must be >= 8 and a multiple of 8. */ - if (len <= 0 || (len % 8) != 0) + if (len < 8 || (len % 8) != 0) return false; poly_avx_256(in, out, len); diff --git a/chap15/ex3/poly_avx_256.s b/chap15/ex3/poly_avx_256.s index 1bb2dd7..0522d16 100644 --- a/chap15/ex3/poly_avx_256.s +++ b/chap15/ex3/poly_avx_256.s @@ -18,11 +18,11 @@ .globl _poly_avx_256 .globl poly_avx_256 - # void poly_avx_256(float *in, float *out, size_t len); + # void poly_avx_256(float *in, float *out, int32_t len); # On entry: # rdi = in # rsi = out - # rdx = len + # edx = len .text _poly_avx_256: @@ -33,6 +33,7 @@ poly_avx_256: mov rax, rdi # mov rax, pA mov rbx, rsi # mov rbx, pB movsxd r8, edx # movsxd r8, len + sub r8, 8 loop1: # Load A vmovups ymm0, [rax+r8*4] @@ -52,3 +53,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex3/poly_sse.asm b/chap15/ex3/poly_sse.asm index e4c8ee2..4cc413c 100644 --- a/chap15/ex3/poly_sse.asm +++ b/chap15/ex3/poly_sse.asm @@ -15,11 +15,11 @@ ;.globl poly_sse - ; void poly_sse(float *in, float *out, size_t len); + ; void poly_sse(float *in, float *out, int32_t len); ; On entry: ; rcx = in ; rdx = out - ; r8 = len + ; r8d = len .code poly_sse PROC public @@ -28,7 +28,8 @@ poly_sse PROC public 
mov rax, rcx ; mov rax, pA mov rbx, rdx ; mov rbx, pB -; movsxd r8, edx ; movsxd r8, len + movsxd r8, r8d ; movsxd r8, len + sub r8, 4 loop1: ; Load A movups xmm0, [rax+r8*4] diff --git a/chap15/ex3/poly_sse.c b/chap15/ex3/poly_sse.c index 710ba4f..9cd4fb1 100644 --- a/chap15/ex3/poly_sse.c +++ b/chap15/ex3/poly_sse.c @@ -25,10 +25,10 @@ bool poly_sse_check(float *in, float *out, int32_t len) return false; /* - * len must be > 0 and a multiple of 4. + * len must be >= 4 and a multiple of 4. */ - if (len <= 0 || (len % 4) != 0) + if (len < 4 || (len % 4) != 0) return false; poly_sse(in, out, len); diff --git a/chap15/ex3/poly_sse.s b/chap15/ex3/poly_sse.s index 0b53366..09a2671 100644 --- a/chap15/ex3/poly_sse.s +++ b/chap15/ex3/poly_sse.s @@ -18,11 +18,11 @@ .globl _poly_sse .globl poly_sse - # void poly_sse(float *in, float *out, size_t len); + # void poly_sse(float *in, float *out, int32_t len); # On entry: # rdi = in # rsi = out - # rdx = len + # edx = len .text _poly_sse: @@ -33,6 +33,7 @@ poly_sse: mov rax, rdi # mov rax, pA mov rbx, rsi # mov rbx, pB movsxd r8, edx # movsxd r8, len + sub r8, 4 loop1: # Load A movups xmm0, [rax+r8*4] @@ -55,3 +56,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex30/CMakeLists.txt b/chap15/ex30/CMakeLists.txt index 3fbbe3b..6433bd6 100644 --- a/chap15/ex30/CMakeLists.txt +++ b/chap15/ex30/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex30_srcs ex30_test.cpp sqrt_rsqrtps_taylor_sse.c sqrt_vrsqrtps_taylor_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex30_srcs ${avx_ex30_srcs} sqrt_rsqrtps_taylor_sse.s sqrt_vrsqrtps_taylor_avx.s) +set(avx_ex30_ass sqrt_rsqrtps_taylor_sse.s sqrt_vrsqrtps_taylor_avx.s) elseif(MSVC) -set(avx_ex30_srcs ${avx_ex30_srcs} sqrt_rsqrtps_taylor_sse.asm sqrt_vrsqrtps_taylor_avx.asm) +set(avx_ex30_ass sqrt_rsqrtps_taylor_sse.asm 
sqrt_vrsqrtps_taylor_avx.asm) endif() -add_executable(avx_ex30_tests ${avx_ex30_srcs}) - +add_executable(avx_ex30_tests ex30_test.cpp sqrt_rsqrtps_taylor_sse.c sqrt_vrsqrtps_taylor_avx.c ${avx_ex30_ass}) target_link_libraries(avx_ex30_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex30_bench ex30_bench.cpp ${avx_ex30_ass}) + target_link_libraries(avx_ex30_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex30_test COMMAND avx_ex30_tests) diff --git a/chap15/ex30/ex30_bench.cpp b/chap15/ex30/ex30_bench.cpp new file mode 100644 index 0000000..ead387f --- /dev/null +++ b/chap15/ex30/ex30_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "sqrt_rsqrtps_taylor_sse.h" +#include "sqrt_vrsqrtps_taylor_avx.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = i * 1.0f; +} + +static void BM_sqrt_rsqrtps_taylor_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + sqrt_rsqrtps_taylor_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_sqrt_vrsqrtps_taylor_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + sqrt_vrsqrtps_taylor_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_sqrt_rsqrtps_taylor_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_sqrt_vrsqrtps_taylor_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex30/sqrt_rsqrtps_taylor_sse.s b/chap15/ex30/sqrt_rsqrtps_taylor_sse.s index ba08fe7..eb53132 100644 --- a/chap15/ex30/sqrt_rsqrtps_taylor_sse.s +++ b/chap15/ex30/sqrt_rsqrtps_taylor_sse.s @@ -69,3 +69,7 @@ minus_half: three: .float 3.0, 3.0, 3.0, 3.0 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s b/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s index d68bac7..6685014 100644 --- a/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s +++ 
b/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s @@ -68,3 +68,6 @@ minus_half: three: .float 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex31/CMakeLists.txt b/chap15/ex31/CMakeLists.txt index 6182609..88b716a 100644 --- a/chap15/ex31/CMakeLists.txt +++ b/chap15/ex31/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex31_srcs ex31_test.cpp subsum_avx.c subsum_sse.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex31_srcs ${avx_ex31_srcs} subsum_avx.s subsum_sse.s) +set(avx_ex31_ass subsum_avx.s subsum_sse.s) elseif(MSVC) -set(avx_ex31_srcs ${avx_ex31_srcs} subsum_avx.asm subsum_sse.asm) +set(avx_ex31_ass subsum_avx.asm subsum_sse.asm) endif() -add_executable(avx_ex31_tests ${avx_ex31_srcs}) - +add_executable(avx_ex31_tests ex31_test.cpp subsum_avx.c subsum_sse.c ${avx_ex31_ass}) target_link_libraries(avx_ex31_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex31_bench ex31_bench.cpp ${avx_ex31_ass}) + target_link_libraries(avx_ex31_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex31_test COMMAND avx_ex31_tests) diff --git a/chap15/ex31/ex31_bench.cpp b/chap15/ex31/ex31_bench.cpp new file mode 100644 index 0000000..d332aac --- /dev/null +++ b/chap15/ex31/ex31_bench.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "subsum_avx.h" +#include "subsum_sse.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = i * 1.0f; +} + +static void BM_subsum_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + subsum_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_subsum_avx(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + subsum_avx(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_subsum_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_subsum_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex31/ex31_test.cpp b/chap15/ex31/ex31_test.cpp index 16fb7f2..abe28ba 100644 --- a/chap15/ex31/ex31_test.cpp +++ b/chap15/ex31/ex31_test.cpp @@ -28,7 +28,7 @@ static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void 
init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex31/subsum_avx.s b/chap15/ex31/subsum_avx.s index eea7726..e3418d0 100644 --- a/chap15/ex31/subsum_avx.s +++ b/chap15/ex31/subsum_avx.s @@ -59,3 +59,7 @@ loop1: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex31/subsum_sse.s b/chap15/ex31/subsum_sse.s index f08fbed..677a309 100644 --- a/chap15/ex31/subsum_sse.s +++ b/chap15/ex31/subsum_sse.s @@ -58,3 +58,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex34/CMakeLists.txt b/chap15/ex34/CMakeLists.txt index aabca9a..16bcd8b 100644 --- a/chap15/ex34/CMakeLists.txt +++ b/chap15/ex34/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex34_srcs ex34_test.cpp singlep.c halfp.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex34_srcs ${avx_ex34_srcs} singlep.s halfp.s) +set(avx_ex34_ass singlep.s halfp.s) elseif(MSVC) -set(avx_ex34_srcs ${avx_ex34_srcs} singlep.asm halfp.asm) +set(avx_ex34_ass singlep.asm halfp.asm) endif() -add_executable(avx_ex34_tests ${avx_ex34_srcs}) - +add_executable(avx_ex34_tests ex34_test.cpp singlep.c halfp.c ${avx_ex34_ass}) target_link_libraries(avx_ex34_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex34_bench ex34_bench.cpp ${avx_ex34_ass}) + target_link_libraries(avx_ex34_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex34_test COMMAND avx_ex34_tests) diff --git a/chap15/ex34/ex34_bench.cpp b/chap15/ex34/ex34_bench.cpp new file mode 100644 index 0000000..9137232 --- /dev/null +++ b/chap15/ex34/ex34_bench.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "halfp.h" +#include "singlep.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = i * 1.0f; +} + +static void init_sources_half(float *x, __m128i *xh, int len) +{ + init_sources(x, len); + + for (int i = 0; i < len / 8; i++) { + __m256 a = _mm256_loadu_ps(&x[i * 8]); + __m128i ah = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT); + _mm_store_si128(&xh[i], ah); + } +} + +static void BM_singlep(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc((len + 8) * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + singlep(x, y, len - 32); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_halfp(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + __m128i *xh = + (__m128i *)_mm_malloc(((len + 1) * sizeof(__m128i)) / 8, 32); + __m128i *yh = (__m128i *)_mm_malloc((len * sizeof(__m128i)) / 8, 32); + + init_sources_half(x, xh, len); + + for (auto _ : state) { + halfp(xh, yh, len - 16); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*xh))); + + _mm_free(yh); + _mm_free(xh); + _mm_free(x); +} + +BENCHMARK(BM_singlep) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + 
->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18) + ->Arg(1 << 20); +BENCHMARK(BM_halfp) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18) + ->Arg(1 << 20); +BENCHMARK_MAIN(); diff --git a/chap15/ex34/ex34_test.cpp b/chap15/ex34/ex34_test.cpp index 33b2524..0084bd4 100644 --- a/chap15/ex34/ex34_test.cpp +++ b/chap15/ex34/ex34_test.cpp @@ -21,10 +21,10 @@ const int MAX_SIZE = 24; /* Must divisible by 8 */ #ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher -__declspec(align(32)) static float x[MAX_SIZE]; +__declspec(align(32)) static float x[MAX_SIZE + 8]; __declspec(align(32)) static float y[MAX_SIZE]; -__declspec(align(16)) static __m128i xh[MAX_SIZE / 8]; +__declspec(align(16)) static __m128i xh[(MAX_SIZE / 8) + 1]; __declspec(align(16)) static __m128i yh[MAX_SIZE / 8]; #else static float x[MAX_SIZE] __attribute__((aligned(32))); @@ -34,7 +34,7 @@ static __m128i xh[MAX_SIZE / 8] __attribute__((aligned(16))); static __m128i yh[MAX_SIZE / 8] __attribute__((aligned(16))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex34/halfp.c b/chap15/ex34/halfp.c index 216b9eb..30200af 100644 --- a/chap15/ex34/halfp.c +++ b/chap15/ex34/halfp.c @@ -18,7 +18,9 @@ bool halfp_check(__m128i *x, __m128i *y, uint64_t len) { /* - * x and y must be non-NULL. + * x and y must be non-NULL. x must be 16 bytes larger than y. + * These additional bytes aren't used but are required by the algorithm. + * The number of valid half floats in y is len - 2. 
*/ if (!x || !y) diff --git a/chap15/ex34/halfp.s b/chap15/ex34/halfp.s index 27c268b..a005ca7 100644 --- a/chap15/ex34/halfp.s +++ b/chap15/ex34/halfp.s @@ -56,3 +56,7 @@ loop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex34/singlep.c b/chap15/ex34/singlep.c index fdf38d8..d70a4c3 100644 --- a/chap15/ex34/singlep.c +++ b/chap15/ex34/singlep.c @@ -18,7 +18,10 @@ bool singlep_check(float *x, float *y, uint64_t len) { /* - * x and y must be non-NULL and 32 byte aligned. + * x and y must be non-NULL and 32 byte aligned. x must be + * 32 bytes larger than y. These additional bytes aren't used + * but are required by the algorithm. The number of valid + * floats in y is len - 2. */ if (!x || !y) diff --git a/chap15/ex34/singlep.s b/chap15/ex34/singlep.s index 405c6aa..51e9721 100644 --- a/chap15/ex34/singlep.s +++ b/chap15/ex34/singlep.s @@ -54,3 +54,7 @@ loop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex35/CMakeLists.txt b/chap15/ex35/CMakeLists.txt index 838a93e..814dd95 100644 --- a/chap15/ex35/CMakeLists.txt +++ b/chap15/ex35/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex35_srcs ex35_test.cpp fp_mul_add.c fp_fma.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex35_srcs ${avx_ex35_srcs} fp_mul_add.s fp_fma.s) +set(avx_ex35_ass fp_mul_add.s fp_fma.s) elseif(MSVC) -set(avx_ex35_srcs ${avx_ex35_srcs} fp_mul_add.asm fp_fma.asm) +set(avx_ex35_ass fp_mul_add.asm fp_fma.asm) endif() -add_executable(avx_ex35_tests ${avx_ex35_srcs}) - +add_executable(avx_ex35_tests ex35_test.cpp fp_mul_add.c fp_fma.c ${avx_ex35_ass}) target_link_libraries(avx_ex35_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex35_bench ex35_bench.cpp ${avx_ex35_ass}) + target_link_libraries(avx_ex35_bench 
benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex35_test COMMAND avx_ex35_tests) diff --git a/chap15/ex35/ex35_bench.cpp b/chap15/ex35/ex35_bench.cpp new file mode 100644 index 0000000..95a2778 --- /dev/null +++ b/chap15/ex35/ex35_bench.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "fp_fma.h" +#include "fp_mul_add.h" + +const int MAX_SIZE = 8; + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(32)) static float a[MAX_SIZE]; +__declspec(align(32)) static float c1[MAX_SIZE]; +__declspec(align(32)) static float c2[MAX_SIZE]; +#else +static float a[MAX_SIZE] __attribute__((aligned(32))); +static float c1[MAX_SIZE] __attribute__((aligned(32))); +static float c2[MAX_SIZE] __attribute__((aligned(32))); +#endif + +static void init_sources() +{ + for (int i = 0; i < MAX_SIZE; i++) { + a[i] = i * 1.0f; + c1[i] = i * 2.0f; + c2[i] = i * 4.0f; + } +} + +static void BM_fp_mul_add(benchmark::State &state) +{ + int iters = state.range(0); + + init_sources(); + + for (auto _ : state) { + fp_mul_add(a, c1, c2, iters); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(MAX_SIZE) * int64_t(sizeof(*a) * 3)); +} + +static void BM_fp_fma(benchmark::State &state) +{ + int iters = state.range(0); + + init_sources(); + + for (auto _ : 
state) { + fp_fma(a, c1, c2, iters); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(MAX_SIZE) * int64_t(sizeof(*a) * 3)); +} + +BENCHMARK(BM_fp_mul_add) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_fp_fma) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex35/ex35_test.cpp b/chap15/ex35/ex35_test.cpp index ac352c3..7256e67 100644 --- a/chap15/ex35/ex35_test.cpp +++ b/chap15/ex35/ex35_test.cpp @@ -25,7 +25,7 @@ static float a[8]; static float c1[8]; static float c2[8]; -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { a[i] = i * 1.0f; diff --git a/chap15/ex35/fp_fma.s b/chap15/ex35/fp_fma.s index 2af8cfe..82e2b5f 100644 --- a/chap15/ex35/fp_fma.s +++ b/chap15/ex35/fp_fma.s @@ -48,3 +48,7 @@ loop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex35/fp_mul_add.s b/chap15/ex35/fp_mul_add.s index 4853e38..d222466 100644 --- a/chap15/ex35/fp_mul_add.s +++ b/chap15/ex35/fp_mul_add.s @@ -49,3 +49,7 @@ loop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex36/CMakeLists.txt b/chap15/ex36/CMakeLists.txt index 809a035..b6b322f 100644 --- a/chap15/ex36/CMakeLists.txt +++ b/chap15/ex36/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex36_srcs ex36_test.cpp no_unroll_reduce.c unroll_reduce.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex36_srcs ${avx_ex36_srcs} no_unroll_reduce.s unroll_reduce.s) +set(avx_ex36_ass no_unroll_reduce.s unroll_reduce.s) elseif(MSVC) -set(avx_ex36_srcs ${avx_ex36_srcs} no_unroll_reduce.asm unroll_reduce.asm) +set(avx_ex36_ass 
no_unroll_reduce.asm unroll_reduce.asm) endif() -add_executable(avx_ex36_tests ${avx_ex36_srcs}) - +add_executable(avx_ex36_tests ex36_test.cpp no_unroll_reduce.c unroll_reduce.c ${avx_ex36_ass}) target_link_libraries(avx_ex36_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex36_bench ex36_bench.cpp ${avx_ex36_ass}) + target_link_libraries(avx_ex36_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex36_test COMMAND avx_ex36_tests) diff --git a/chap15/ex36/ex36_bench.cpp b/chap15/ex36/ex36_bench.cpp new file mode 100644 index 0000000..4831782 --- /dev/null +++ b/chap15/ex36/ex36_bench.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "no_unroll_reduce.h" +#include "unroll_reduce.h" + +static void init_sources(float *a, int len) +{ + for (int i = 0; i < len; i++) + a[i] = i + 1.0f; +} + +static void BM_no_unroll_reduce(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + (void)no_unroll_reduce(x, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(x); +} + +static void BM_unroll_reduce(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + (void)unroll_reduce(x, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(x); +} + +BENCHMARK(BM_no_unroll_reduce) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_unroll_reduce) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex36/ex36_test.cpp b/chap15/ex36/ex36_test.cpp index d6a37b6..3419d8b 100644 --- a/chap15/ex36/ex36_test.cpp +++ b/chap15/ex36/ex36_test.cpp @@ -22,7 +22,7 @@ const int MAX_SIZE = 4096; static float a[MAX_SIZE]; -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) a[i] = i + 1.0f; diff --git a/chap15/ex36/no_unroll_reduce.s b/chap15/ex36/no_unroll_reduce.s index b2bca2d..3106635 100644 --- a/chap15/ex36/no_unroll_reduce.s +++ b/chap15/ex36/no_unroll_reduce.s @@ -57,3 +57,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex36/unroll_reduce.s b/chap15/ex36/unroll_reduce.s index f67850d..150fe61 100644 --- a/chap15/ex36/unroll_reduce.s +++ 
b/chap15/ex36/unroll_reduce.s @@ -79,3 +79,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex41/CMakeLists.txt b/chap15/ex41/CMakeLists.txt index 2f91528..a78ebcf 100644 --- a/chap15/ex41/CMakeLists.txt +++ b/chap15/ex41/CMakeLists.txt @@ -6,3 +6,14 @@ target_include_directories(avx_ex41_tests ../ex42 ) add_test(NAME avx_ex41_test COMMAND avx_ex41_tests) + +IF( benchmark_FOUND ) + add_executable(avx_ex41_bench ex41_bench.cpp i64toa_avx2.c) + target_link_libraries(avx_ex41_bench benchmark::benchmark) + target_include_directories(avx_ex41_bench + PUBLIC + ../ex40 + ../ex42 +) +ENDIF() + diff --git a/chap15/ex41/ex41_bench.cpp b/chap15/ex41/ex41_bench.cpp new file mode 100644 index 0000000..9a69d73 --- /dev/null +++ b/chap15/ex41/ex41_bench.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "i64toa_avx2.h" + +#include + +static void BM_sprintf_itoa(benchmark::State &state) +{ +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher + __declspec(align(32)) char buf[128]; +#else + char buf[128] __attribute__((aligned(32))); +#endif + int64_t num; + + for (auto _ : state) { + num = 1; + for (size_t i = 0; i < 62; i++) { + sprintf(buf, "%" PRId64, num); + num *= 2; + } + } + + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(sizeof(int64_t) * 62)); +} + +static void BM_i64toa_avx2i(benchmark::State &state) +{ +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher + __declspec(align(32)) char buf[128]; +#else + char buf[128] __attribute__((aligned(32))); +#endif + int64_t num; + + for (auto _ : state) { + num = 1; + for (size_t i = 0; i < 62; i++) { + i64toa_avx2i(num, buf); + num *= 2; + } + } + + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(sizeof(int64_t) * 62)); +} + +BENCHMARK(BM_sprintf_itoa); +BENCHMARK(BM_i64toa_avx2i); +BENCHMARK_MAIN(); diff --git a/chap15/ex45/ex45_test.cpp b/chap15/ex45/ex45_test.cpp index 7d8b072..f129622 100644 --- a/chap15/ex45/ex45_test.cpp +++ b/chap15/ex45/ex45_test.cpp @@ -25,7 +25,7 @@ static uint32_t in[MAX_SIZE]; static uint32_t indices[MAX_SIZE]; static uint32_t out[MAX_SIZE]; -void init_sources() +static void init_sources() { for (uint32_t i = 0; i < MAX_SIZE; i++) { in[i] = i + 1; diff --git a/chap15/ex45/vpgatherd_soft.s b/chap15/ex45/vpgatherd_soft.s index e56c8f1..2ab825e 100644 --- a/chap15/ex45/vpgatherd_soft.s +++ b/chap15/ex45/vpgatherd_soft.s @@ -49,5 +49,7 @@ vpgatherd_soft8: vzeroupper ret - +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex46/CMakeLists.txt b/chap15/ex46/CMakeLists.txt index 77cc91f..7d87a31 100644 --- a/chap15/ex46/CMakeLists.txt +++ b/chap15/ex46/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex46_srcs ex46_test.cpp scalar.c 
avx2_vpgatherd.c avx_vinsrt.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex46_srcs ${avx_ex46_srcs} scalar.s avx2_vpgatherd.s avx_vinsrt.s) +set(avx_ex46_ass scalar.s avx2_vpgatherd.s avx_vinsrt.s) elseif(MSVC) -set(avx_ex46_srcs ${avx_ex46_srcs} scalar.asm avx2_vpgatherd.asm avx_vinsrt.asm) +set(avx_ex46_ass scalar.asm avx2_vpgatherd.asm avx_vinsrt.asm) endif() -add_executable(avx_ex46_tests ${avx_ex46_srcs}) - +add_executable(avx_ex46_tests ex46_test.cpp scalar.c avx2_vpgatherd.c avx_vinsrt.c ${avx_ex46_ass}) target_link_libraries(avx_ex46_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex46_bench ex46_bench.cpp ${avx_ex46_ass}) + target_link_libraries(avx_ex46_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex46_test COMMAND avx_ex46_tests) diff --git a/chap15/ex46/avx2_vpgatherd.s b/chap15/ex46/avx2_vpgatherd.s index 8dc508e..d665bc0 100644 --- a/chap15/ex46/avx2_vpgatherd.s +++ b/chap15/ex46/avx2_vpgatherd.s @@ -75,5 +75,7 @@ cplx_offset: .quad 0x700000005 .quad 0xB00000009 .quad 0xF0000000D - +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex46/avx_vinsrt.s b/chap15/ex46/avx_vinsrt.s index 554c066..c7a2e0a 100644 --- a/chap15/ex46/avx_vinsrt.s +++ b/chap15/ex46/avx_vinsrt.s @@ -63,3 +63,6 @@ loop: vzeroupper ret +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex46/ex46_bench.cpp b/chap15/ex46/ex46_bench.cpp new file mode 100644 index 0000000..9c9c9a0 --- /dev/null +++ b/chap15/ex46/ex46_bench.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "avx2_vpgatherd.h" +#include "avx_vinsrt.h" +#include "scalar.h" + +static void init_sources(complex_num *aos, int len) +{ + for (int i = 0; i < len; i++) { + aos[i].real = (float)i; + aos[i].imaginary = (float)i + 1; + } +} + +static void BM_scalar(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)malloc(sizeof(*aos) * len); + float *soa_real = (float *)malloc(sizeof(float) * len); + float *soa_imaginary = (float *)malloc(sizeof(float) * len); + + init_sources(aos, len); + for (auto _ : state) { + scalar(len, aos, soa_imaginary, soa_real); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + free(soa_imaginary); + free(soa_real); + free(aos); +} + +static void BM_avx_vinsrt(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)_mm_malloc(sizeof(*aos) * len, 32); + float *soa_real = (float *)_mm_malloc(sizeof(float) * len, 32); + float *soa_imaginary = (float *)_mm_malloc(sizeof(float) * len, 32); + + init_sources(aos, len); + for (auto _ : state) { + avx_vinsrt(len, aos, soa_imaginary, soa_real); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +static void BM_avx2_vpgatherd(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num 
*)_mm_malloc(sizeof(*aos) * len, 32); + float *soa_real = (float *)_mm_malloc(sizeof(float) * len, 32); + float *soa_imaginary = (float *)_mm_malloc(sizeof(float) * len, 32); + + init_sources(aos, len); + for (auto _ : state) { + avx2_vpgatherd(len, aos, soa_imaginary, soa_real); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +BENCHMARK(BM_scalar) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx_vinsrt) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx2_vpgatherd) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex46/ex46_test.cpp b/chap15/ex46/ex46_test.cpp index ae9109d..cabec8d 100644 --- a/chap15/ex46/ex46_test.cpp +++ b/chap15/ex46/ex46_test.cpp @@ -25,7 +25,7 @@ static complex_num aos[MAX_SIZE]; static float soa_real[MAX_SIZE]; static float soa_imaginary[MAX_SIZE]; -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { aos[i].real = (float)i; diff --git a/chap15/ex46/scalar.s b/chap15/ex46/scalar.s index 99cb151..04d8fb4 100644 --- a/chap15/ex46/scalar.s +++ b/chap15/ex46/scalar.s @@ -50,3 +50,7 @@ loop: jl loop ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex47/CMakeLists.txt b/chap15/ex47/CMakeLists.txt index 58731e1..54bd540 100644 --- a/chap15/ex47/CMakeLists.txt +++ b/chap15/ex47/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex47_srcs ex47_test.cpp avx2_gatherpd.c avx_vinsert.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex47_srcs ${avx_ex47_srcs} avx2_gatherpd.s avx_vinsert.s) +set(avx_ex47_ass avx2_gatherpd.s 
avx_vinsert.s) elseif(MSVC) -set(avx_ex47_srcs ${avx_ex47_srcs} avx2_gatherpd.asm avx_vinsert.asm) +set(avx_ex47_ass avx2_gatherpd.asm avx_vinsert.asm) endif() -add_executable(avx_ex47_tests ${avx_ex47_srcs}) - +add_executable(avx_ex47_tests ex47_test.cpp avx2_gatherpd.c avx_vinsert.c ${avx_ex47_ass}) target_link_libraries(avx_ex47_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex47_bench ex47_bench.cpp ${avx_ex47_ass}) + target_link_libraries(avx_ex47_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex47_test COMMAND avx_ex47_tests) diff --git a/chap15/ex47/avx2_gatherpd.asm b/chap15/ex47/avx2_gatherpd.asm index 5916251..1549ffa 100644 --- a/chap15/ex47/avx2_gatherpd.asm +++ b/chap15/ex47/avx2_gatherpd.asm @@ -27,16 +27,18 @@ .code avx2_gatherpd PROC public - sub rsp, 120 + sub rsp, 152 vmovaps xmmword ptr[rsp], xmm6 vmovaps xmmword ptr[rsp+16], xmm7 - vmovaps xmmword ptr[rsp+32], xmm9 - vmovaps xmmword ptr[rsp+48], xmm10 - vmovaps xmmword ptr[rsp+64], xmm11 - vmovaps xmmword ptr[rsp+80], xmm13 - vmovaps xmmword ptr[rsp+96], xmm14 + vmovaps xmmword ptr[rsp+32], xmm8 + vmovaps xmmword ptr[rsp+48], xmm9 + vmovaps xmmword ptr[rsp+64], xmm10 + vmovaps xmmword ptr[rsp+80], xmm11 + vmovaps xmmword ptr[rsp+96], xmm12 + vmovaps xmmword ptr[rsp+112], xmm13 + vmovaps xmmword ptr[rsp+128], xmm14 - mov r10, qword ptr[rsp+40+120] + mov r10, qword ptr[rsp+40+152] mov eax, 80000000h movd xmm0, eax mov eax, 1 @@ -76,12 +78,14 @@ loop_start: vzeroupper vmovaps xmm6, xmmword ptr[rsp] vmovaps xmm7, xmmword ptr[rsp+16] - vmovaps xmm9, xmmword ptr[rsp+32] - vmovaps xmm10, xmmword ptr[rsp+48] - vmovaps xmm11, xmmword ptr[rsp+64] - vmovaps xmm13, xmmword ptr[rsp+80] - vmovaps xmm14, xmmword ptr[rsp+96] - add rsp, 120 + vmovaps xmm8, xmmword ptr[rsp+32] + vmovaps xmm9, xmmword ptr[rsp+48] + vmovaps xmm10, xmmword ptr[rsp+64] + vmovaps xmm11, xmmword ptr[rsp+80] + vmovaps xmm12, xmmword ptr[rsp+96] + vmovaps xmm13, xmmword ptr[rsp+112] + vmovaps xmm14, xmmword 
ptr[rsp+128] + add rsp, 152 ret avx2_gatherpd ENDP end \ No newline at end of file diff --git a/chap15/ex47/avx2_gatherpd.s b/chap15/ex47/avx2_gatherpd.s index f787232..69ac7be 100644 --- a/chap15/ex47/avx2_gatherpd.s +++ b/chap15/ex47/avx2_gatherpd.s @@ -70,3 +70,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex47/avx_vinsert.asm b/chap15/ex47/avx_vinsert.asm index a216d1d..83da5bb 100644 --- a/chap15/ex47/avx_vinsert.asm +++ b/chap15/ex47/avx_vinsert.asm @@ -22,13 +22,20 @@ ; rdx = index_buffer ; r8 = imaginary_buffer ; r9 = real_buffer - ; [rsp+48] = complex_buffer + ; [rsp+48] = complex_buffer .code avx_vinsert PROC public push rbx mov rbx, qword ptr[rsp+48] + sub rsp, 96 + vmovaps xmmword ptr[rsp], xmm6 + vmovaps xmmword ptr[rsp+16], xmm7 + vmovaps xmmword ptr[rsp+32], xmm8 + vmovaps xmmword ptr[rsp+48], xmm9 + vmovaps xmmword ptr[rsp+64], xmm10 + vmovaps xmmword ptr[rsp+80], xmm11 xor rax, rax loop_start: @@ -71,6 +78,13 @@ loop_start: jl loop_start vzeroupper + vmovaps xmm6, xmmword ptr[rsp] + vmovaps xmm7, xmmword ptr[rsp+16] + vmovaps xmm8, xmmword ptr[rsp+32] + vmovaps xmm9, xmmword ptr[rsp+48] + vmovaps xmm10, xmmword ptr[rsp+64] + vmovaps xmm11, xmmword ptr[rsp+80] + add rsp, 96 pop rbx ret avx_vinsert ENDP diff --git a/chap15/ex47/avx_vinsert.s b/chap15/ex47/avx_vinsert.s index 4b936c0..1362b23 100644 --- a/chap15/ex47/avx_vinsert.s +++ b/chap15/ex47/avx_vinsert.s @@ -80,3 +80,6 @@ loop: vzeroupper ret +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex47/ex47_bench.cpp b/chap15/ex47/ex47_bench.cpp new file mode 100644 index 0000000..aa0146e --- /dev/null +++ b/chap15/ex47/ex47_bench.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include + +#include "avx2_gatherpd.h" +#include "avx_vinsert.h" + +static void init_sources(complex_num *aos, uint32_t *indices, int len) +{ + for (int i = 0; i < len; i++) { + indices[i] = ((uint32_t)len - (i + 1)); + aos[i].real = (double)i; + aos[i].imaginary = (double)i + 1; + } +} + +static void BM_avx2_gatherpd(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)_mm_malloc(sizeof(*aos) * len, 32); + double *soa_real = (double *)_mm_malloc(sizeof(double) * len, 32); + double *soa_imaginary = (double *)_mm_malloc(sizeof(double) * len, 32); + uint32_t *indices = (uint32_t *)_mm_malloc(sizeof(uint32_t) * len, 32); + + init_sources(aos, indices, len); + for (auto _ : state) { + avx2_gatherpd(len, indices, soa_imaginary, soa_real, aos); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(indices); + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +static void BM_avx_vinsert(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)_mm_malloc(sizeof(*aos) * len, 32); + double *soa_real = (double *)_mm_malloc(sizeof(double) * len, 32); + double *soa_imaginary = (double *)_mm_malloc(sizeof(double) * len, 32); + uint32_t *indices = (uint32_t *)_mm_malloc(sizeof(uint32_t) * len, 32); + + init_sources(aos, indices, len); + for (auto _ : state) { + avx_vinsert(len, indices, 
soa_imaginary, soa_real, aos); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(indices); + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +BENCHMARK(BM_avx2_gatherpd) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx_vinsert) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex47/ex47_test.cpp b/chap15/ex47/ex47_test.cpp index 1e4bbf9..734e58c 100644 --- a/chap15/ex47/ex47_test.cpp +++ b/chap15/ex47/ex47_test.cpp @@ -27,7 +27,7 @@ static double soa_real[MAX_SIZE]; static double soa_imaginary[MAX_SIZE]; static uint32_t indices[MAX_SIZE]; -void init_sources() +static void init_sources() { for (uint32_t i = 0; i < MAX_SIZE; i++) { indices[i] = MAX_SIZE - (i + 1); diff --git a/chap15/ex48/CMakeLists.txt b/chap15/ex48/CMakeLists.txt index fe1bdf6..6b17ba2 100644 --- a/chap15/ex48/CMakeLists.txt +++ b/chap15/ex48/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex48_srcs ex48_test.cpp mmx_min_max.c avx2_min_max.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex48_srcs ${avx_ex48_srcs} mmx_min_max.s avx2_min_max.s) +set(avx_ex48_ass mmx_min_max.s avx2_min_max.s) elseif(MSVC) -set(avx_ex48_srcs ${avx_ex48_srcs} mmx_min_max.asm avx2_min_max.asm) +set(avx_ex48_ass mmx_min_max.asm avx2_min_max.asm) endif() -add_executable(avx_ex48_tests ${avx_ex48_srcs}) - +add_executable(avx_ex48_tests ex48_test.cpp mmx_min_max.c avx2_min_max.c ${avx_ex48_ass}) target_link_libraries(avx_ex48_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex48_bench ex48_bench.cpp ${avx_ex48_ass}) + target_link_libraries(avx_ex48_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex48_test COMMAND avx_ex48_tests) diff --git a/chap15/ex48/avx2_min_max.s 
b/chap15/ex48/avx2_min_max.s index dc8e984..1e56b1d 100644 --- a/chap15/ex48/avx2_min_max.s +++ b/chap15/ex48/avx2_min_max.s @@ -81,3 +81,6 @@ end: ret +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex48/ex48_bench.cpp b/chap15/ex48/ex48_bench.cpp new file mode 100644 index 0000000..87ed71b --- /dev/null +++ b/chap15/ex48/ex48_bench.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include +#include + +#include "avx2_min_max.h" +#include "mmx_min_max.h" + +static void init_sources(int16_t *in, int len) +{ + for (int i = 0; i < len; i++) + in[i] = (int16_t)(i - (len / 2)); + + for (int i = 0; i < len; i++) { + int x = rand() % len; + int y = rand() % len; + int16_t tmp = in[x]; + in[x] = in[y]; + in[y] = tmp; + } +} + +static void BM_mmx_min_max(benchmark::State &state) +{ + min_max res; + int len = state.range(0); + int16_t *in = (int16_t *)_mm_malloc(sizeof(*in) * len, 32); + + init_sources(in, len); + for (auto _ : state) { + mmx_min_max(in, &res, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in))); + + _mm_free(in); +} + +static void BM_avx2_min_max(benchmark::State &state) +{ + min_max res; + int len = state.range(0); + int16_t *in = (int16_t *)_mm_malloc(sizeof(*in) * len, 32); + + init_sources(in, len); + for (auto _ : state) { + avx2_min_max(in, &res, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in))); + + _mm_free(in); +} + +BENCHMARK(BM_mmx_min_max) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx2_min_max) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex48/ex48_test.cpp b/chap15/ex48/ex48_test.cpp index 68092d3..7dfa120 100644 --- a/chap15/ex48/ex48_test.cpp +++ b/chap15/ex48/ex48_test.cpp @@ -24,7 +24,7 @@ const int MAX_SIZE = 4096; static int16_t in[MAX_SIZE]; -void init_sources() +static void init_sources() { for (int16_t i = 0; i < MAX_SIZE; i++) in[i] = i - (MAX_SIZE / 2); diff --git a/chap15/ex48/mmx_min_max.s b/chap15/ex48/mmx_min_max.s index 4f42fad..ee5657b 100644 --- a/chap15/ex48/mmx_min_max.s +++ b/chap15/ex48/mmx_min_max.s @@ -72,3 +72,7 @@ end: pop rbx ret + +#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex6/CMakeLists.txt b/chap15/ex6/CMakeLists.txt index 1a45c9b..57ae8f5 100644 --- a/chap15/ex6/CMakeLists.txt +++ b/chap15/ex6/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex6_srcs ex6_test.cpp complex_conv_sse.c complex_conv_avx_stride.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex6_srcs ${avx_ex6_srcs} complex_conv_sse.s complex_conv_avx_stride.s) +set(avx_ex6_ass complex_conv_sse.s complex_conv_avx_stride.s) elseif(MSVC) -set(avx_ex6_srcs ${avx_ex6_srcs} complex_conv_sse.asm complex_conv_avx_stride.asm) +set(avx_ex6_ass complex_conv_sse.asm complex_conv_avx_stride.asm) endif() -add_executable(avx_ex6_tests ${avx_ex6_srcs}) +add_executable(avx_ex6_tests ex6_test.cpp complex_conv_sse.c complex_conv_avx_stride.c ${avx_ex6_ass}) target_link_libraries(avx_ex6_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex6_bench ex6_bench.cpp ${avx_ex6_ass}) + target_link_libraries(avx_ex6_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex6_test COMMAND avx_ex6_tests) diff --git a/chap15/ex6/complex_conv_avx_stride.s b/chap15/ex6/complex_conv_avx_stride.s index e93c222..8e131b3 100644 --- a/chap15/ex6/complex_conv_avx_stride.s +++ b/chap15/ex6/complex_conv_avx_stride.s @@ -57,3 +57,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex6/complex_conv_sse.s b/chap15/ex6/complex_conv_sse.s index e37f9fb..62df65c 100644 --- a/chap15/ex6/complex_conv_sse.s +++ b/chap15/ex6/complex_conv_sse.s @@ -57,3 +57,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex6/ex6_bench.cpp b/chap15/ex6/ex6_bench.cpp new file mode 100644 index 0000000..9c337a3 --- /dev/null +++ b/chap15/ex6/ex6_bench.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (C)
2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "complex_conv_avx_stride.h" +#include "complex_conv_sse.h" + +static void init_sources(complex_num *aos, int len) +{ + for (int i = 0; i < len; i++) { + aos[i].real = (float)i; + aos[i].imaginary = (float)i + 1; + } +} + +static void BM_complex_conv_sse(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)_mm_malloc(len * sizeof(*aos), 16); + float *real = (float *)_mm_malloc(len * sizeof(float), 16); + float *imag = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(aos, len); + + for (auto _ : state) { + complex_conv_sse(aos, real, imag, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(imag); + _mm_free(real); + _mm_free(aos); +} + +static void BM_complex_conv_avx_stride(benchmark::State &state) +{ + int len = state.range(0); + complex_num *aos = (complex_num *)_mm_malloc(len * sizeof(*aos), 32); + float *real = (float *)_mm_malloc(len * sizeof(float), 32); + float *imag = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(aos, len); + + for (auto _ : state) { + complex_conv_avx_stride(aos, real, imag, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(imag); + 
_mm_free(real); + _mm_free(aos); +} + +BENCHMARK(BM_complex_conv_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_complex_conv_avx_stride) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex7/CMakeLists.txt b/chap15/ex7/CMakeLists.txt index 3c1e400..6668b11 100644 --- a/chap15/ex7/CMakeLists.txt +++ b/chap15/ex7/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex7_srcs ex7_test.cpp median_sse.c median_avx_overlap.c median_avx_vperm.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex7_srcs ${avx_ex7_srcs} median_sse.s median_avx_overlap.s median_avx_vperm.s) +set(avx_ex7_ass median_sse.s median_avx_overlap.s median_avx_vperm.s) elseif(MSVC) -set(avx_ex7_srcs ${avx_ex7_srcs} median_sse.asm median_avx_overlap.asm median_avx_vperm.asm) +set(avx_ex7_ass median_sse.asm median_avx_overlap.asm median_avx_vperm.asm) endif() -add_executable(avx_ex7_tests ${avx_ex7_srcs}) +add_executable(avx_ex7_tests ex7_test.cpp median_sse.c median_avx_overlap.c median_avx_vperm.c ${avx_ex7_ass}) target_link_libraries(avx_ex7_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex7_bench ex7_bench.cpp ${avx_ex7_ass}) + target_link_libraries(avx_ex7_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex7_test COMMAND avx_ex7_tests) diff --git a/chap15/ex7/ex7_bench.cpp b/chap15/ex7/ex7_bench.cpp new file mode 100644 index 0000000..aed9a02 --- /dev/null +++ b/chap15/ex7/ex7_bench.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "median_avx_overlap.h" +#include "median_avx_vperm.h" +#include "median_sse.h" + +static void init_sources(float *x, int len) +{ + for (int i = 0; i < len; i++) + x[i] = i * 1.0f; +} + +static void BM_median_sse(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 16); + float *y = (float *)_mm_malloc(len * sizeof(float), 16); + + init_sources(x, len); + + for (auto _ : state) { + median_sse(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_median_avx_overlap(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + median_avx_overlap(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + _mm_free(y); + _mm_free(x); +} + +static void BM_median_avx_vperm(benchmark::State &state) +{ + int len = state.range(0); + float *x = (float *)_mm_malloc(len * sizeof(float), 32); + float *y = (float *)_mm_malloc(len * sizeof(float), 32); + + init_sources(x, len); + + for (auto _ : state) { + median_avx_vperm(x, y, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*x))); + + 
_mm_free(y); + _mm_free(x); +} + +BENCHMARK(BM_median_sse) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_median_avx_overlap) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_median_avx_vperm) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex7/ex7_test.cpp b/chap15/ex7/ex7_test.cpp index 7d0882c..11149f2 100644 --- a/chap15/ex7/ex7_test.cpp +++ b/chap15/ex7/ex7_test.cpp @@ -28,7 +28,7 @@ __declspec(align(32)) static float y[MAX_SIZE]; static float x[MAX_SIZE] __attribute__((aligned(32))); static float y[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { x[i] = i * 1.0f; diff --git a/chap15/ex7/median_avx_overlap.s b/chap15/ex7/median_avx_overlap.s index 7f4ec00..1588a92 100644 --- a/chap15/ex7/median_avx_overlap.s +++ b/chap15/ex7/median_avx_overlap.s @@ -57,3 +57,7 @@ loop_start: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex7/median_avx_vperm.s b/chap15/ex7/median_avx_vperm.s index b3210f0..1530bd5 100644 --- a/chap15/ex7/median_avx_vperm.s +++ b/chap15/ex7/median_avx_vperm.s @@ -59,3 +59,7 @@ loop_start: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex7/median_sse.s b/chap15/ex7/median_sse.s index f8688f5..ba7171c 100644 --- a/chap15/ex7/median_sse.s +++ b/chap15/ex7/median_sse.s @@ -60,3 +60,7 @@ loop_start: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex8/CMakeLists.txt b/chap15/ex8/CMakeLists.txt index 77d6295..099e3a6 100644 --- 
a/chap15/ex8/CMakeLists.txt +++ b/chap15/ex8/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx_ex8_srcs ex8_test.cpp gather_scalar.c gather_vinsert.c gather_vinsert_vshufps.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex8_srcs ${avx_ex8_srcs} gather_scalar.s gather_vinsert.s gather_vinsert_vshufps.s) +set(avx_ex8_ass gather_scalar.s gather_vinsert.s gather_vinsert_vshufps.s) elseif(MSVC) -set(avx_ex8_srcs ${avx_ex8_srcs} gather_scalar.asm gather_vinsert.asm gather_vinsert_vshufps.asm) +set(avx_ex8_ass gather_scalar.asm gather_vinsert.asm gather_vinsert_vshufps.asm) endif() -add_executable(avx_ex8_tests ${avx_ex8_srcs}) - +add_executable(avx_ex8_tests ex8_test.cpp gather_scalar.c gather_vinsert.c gather_vinsert_vshufps.c ${avx_ex8_ass}) target_link_libraries(avx_ex8_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex8_bench ex8_bench.cpp ${avx_ex8_ass}) + target_link_libraries(avx_ex8_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex8_test COMMAND avx_ex8_tests) diff --git a/chap15/ex8/ex8_bench.cpp b/chap15/ex8/ex8_bench.cpp new file mode 100644 index 0000000..43ee956 --- /dev/null +++ b/chap15/ex8/ex8_bench.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "gather_scalar.h" +#include "gather_vinsert.h" +#include "gather_vinsert_vshufps.h" + +static void init_sources(int32_t *in, uint32_t *indices, int len) +{ + for (int i = 0; i < len; i++) { + in[i] = i - (len / 2); + indices[i] = i & 1 ? i - 1 : i + 1; + } +} + +static void BM_gather_scalar(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)malloc(len * sizeof(*in)); + uint32_t *indices = (uint32_t *)malloc(len * sizeof(*indices)); + int32_t *out = (int32_t *)malloc(len * sizeof(*out)); + + init_sources(in, indices, len); + for (auto _ : state) { + gather_scalar(in, out, indices, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in) + sizeof(*indices))); + + free(out); + free(indices); + free(in); +} + +static void BM_gather_vinsert(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)_mm_malloc(len * sizeof(*in), 32); + uint32_t *indices = (uint32_t *)_mm_malloc(len * sizeof(*indices), 32); + int32_t *out = (int32_t *)_mm_malloc(len * sizeof(*out), 32); + + init_sources(in, indices, len); + for (auto _ : state) { + gather_vinsert(in, out, indices, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in) + sizeof(*indices))); + + _mm_free(out); + _mm_free(indices); + _mm_free(in); +} + +static void BM_gather_vinsert_vshufps(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)_mm_malloc(len * sizeof(*in), 32); + uint32_t *indices = (uint32_t *)_mm_malloc(len * sizeof(*indices), 32); + int32_t *out = (int32_t *)_mm_malloc(len * sizeof(*out), 32); + + init_sources(in, indices, len); + for (auto _ : state) { + gather_vinsert_vshufps(in, out, indices, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in) + sizeof(*indices))); + + _mm_free(out); + _mm_free(indices); + _mm_free(in); +} + 
+BENCHMARK(BM_gather_scalar) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_gather_vinsert) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_gather_vinsert_vshufps) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex8/ex8_test.cpp b/chap15/ex8/ex8_test.cpp index 3a6f140..9986de7 100644 --- a/chap15/ex8/ex8_test.cpp +++ b/chap15/ex8/ex8_test.cpp @@ -31,7 +31,8 @@ static int32_t in[MAX_SIZE] __attribute__((aligned(32))); static int32_t out[MAX_SIZE] __attribute__((aligned(32))); static uint32_t indices[MAX_SIZE] __attribute__((aligned(32))); #endif -void init_sources() + +static void init_sources() { for (int i = 0; i < MAX_SIZE; i++) { in[i] = i - (MAX_SIZE / 2); diff --git a/chap15/ex8/gather_scalar.s b/chap15/ex8/gather_scalar.s index 74d4de4..ae598a3 100644 --- a/chap15/ex8/gather_scalar.s +++ b/chap15/ex8/gather_scalar.s @@ -76,3 +76,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex8/gather_vinsert.s b/chap15/ex8/gather_vinsert.s index c696577..8e18170 100644 --- a/chap15/ex8/gather_vinsert.s +++ b/chap15/ex8/gather_vinsert.s @@ -68,3 +68,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex8/gather_vinsert_vshufps.s b/chap15/ex8/gather_vinsert_vshufps.s index 571ce27..575af41 100644 --- a/chap15/ex8/gather_vinsert_vshufps.s +++ b/chap15/ex8/gather_vinsert_vshufps.s @@ -70,3 +70,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex9/CMakeLists.txt b/chap15/ex9/CMakeLists.txt index 3854670..4aef62c 100644 --- 
a/chap15/ex9/CMakeLists.txt +++ b/chap15/ex9/CMakeLists.txt @@ -1,9 +1,14 @@ -set(avx_ex9_srcs ex9_test.cpp scatter_scalar.c scatter_avx.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx_ex9_srcs ${avx_ex9_srcs} scatter_scalar.s scatter_avx.s) +set(avx_ex9_ass scatter_scalar.s scatter_avx.s) elseif(MSVC) -set(avx_ex9_srcs ${avx_ex9_srcs} scatter_scalar.asm scatter_avx.asm) +set(avx_ex9_ass scatter_scalar.asm scatter_avx.asm) endif() -add_executable(avx_ex9_tests ${avx_ex9_srcs}) +add_executable(avx_ex9_tests ex9_test.cpp scatter_scalar.c scatter_avx.c ${avx_ex9_ass}) target_link_libraries(avx_ex9_tests gtest_main) + +IF( benchmark_FOUND ) + add_executable(avx_ex9_bench ex9_bench.cpp ${avx_ex9_ass}) + target_link_libraries(avx_ex9_bench benchmark::benchmark) +ENDIF() + add_test(NAME avx_ex9_test COMMAND avx_ex9_tests) diff --git a/chap15/ex9/ex9_bench.cpp b/chap15/ex9/ex9_bench.cpp new file mode 100644 index 0000000..774a174 --- /dev/null +++ b/chap15/ex9/ex9_bench.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2022 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "scatter_avx.h" +#include "scatter_scalar.h" + +static void init_sources(int32_t *in, uint32_t *indices, int len) +{ + for (int i = 0; i < len; i++) { + in[i] = i - (len / 2); + indices[i] = i & 1 ? i - 1 : i + 1; + } +} + +static void BM_scatter_scalar(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)malloc(len * sizeof(*in)); + uint32_t *indices = (uint32_t *)malloc(len * sizeof(*indices)); + int32_t *out = (int32_t *)malloc(len * sizeof(*out)); + + init_sources(in, indices, len); + for (auto _ : state) { + scatter_scalar(in, out, indices, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in) + sizeof(*indices))); + + free(out); + free(indices); + free(in); +} + +static void BM_scatter_avx(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)_mm_malloc(len * sizeof(*in), 32); + uint32_t *indices = (uint32_t *)_mm_malloc(len * sizeof(*indices), 32); + int32_t *out = (int32_t *)_mm_malloc(len * sizeof(*out), 32); + + init_sources(in, indices, len); + for (auto _ : state) { + scatter_avx(in, out, indices, len); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in) + sizeof(*indices))); + + _mm_free(out); + _mm_free(indices); + _mm_free(in); +} + +BENCHMARK(BM_scatter_scalar) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_scatter_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap15/ex9/ex9_test.cpp b/chap15/ex9/ex9_test.cpp index 2e805c5..4aef99f 100644 --- a/chap15/ex9/ex9_test.cpp +++ b/chap15/ex9/ex9_test.cpp @@ -30,7 +30,8 @@ static int32_t in[MAX_SIZE] __attribute__((aligned(32))); static int32_t out[MAX_SIZE] __attribute__((aligned(32))); static uint32_t indices[MAX_SIZE] 
__attribute__((aligned(32))); #endif -void init_sources() + +static void init_sources() { for (int i = 0; i < MAX_SIZE; i++) { in[i] = i - (MAX_SIZE / 2); diff --git a/chap15/ex9/scatter_avx.s b/chap15/ex9/scatter_avx.s index d2e914e..323f905 100644 --- a/chap15/ex9/scatter_avx.s +++ b/chap15/ex9/scatter_avx.s @@ -71,3 +71,7 @@ loop1: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap15/ex9/scatter_scalar.s b/chap15/ex9/scatter_scalar.s index 710c2fe..15ece71 100644 --- a/chap15/ex9/scatter_scalar.s +++ b/chap15/ex9/scatter_scalar.s @@ -71,3 +71,7 @@ loop1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex1/CMakeLists.txt b/chap18/ex1/CMakeLists.txt index ccac34d..232b648 100644 --- a/chap18/ex1/CMakeLists.txt +++ b/chap18/ex1/CMakeLists.txt @@ -1,6 +1,13 @@ if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) set_property(SOURCE transform_avx512.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") endif() -add_executable(avx512_ex1_tests ex1_test.cpp transform_avx.c transform_avx512.c) +set(avx512_ex1_srcs transform_avx.c transform_avx512.c) +add_executable(avx512_ex1_tests ex1_test.cpp ${avx512_ex1_srcs}) target_link_libraries(avx512_ex1_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex1_bench ex1_bench.cpp ${avx512_ex1_srcs}) + target_link_libraries(avx512_ex1_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex1_test COMMAND avx512_ex1_tests) diff --git a/chap18/ex1/ex1_bench.cpp b/chap18/ex1/ex1_bench.cpp new file mode 100644 index 0000000..6a29e08 --- /dev/null +++ b/chap18/ex1/ex1_bench.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is 
hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "optimisation_common.h" + +#include "transform_avx.h" +#include "transform_avx512.h" + +static void BM_transform_avx(benchmark::State &state) +{ + int len = state.range(0); + // Dynamic memory allocation with 32byte + // alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32); + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + for (auto _ : state) { + transform_avx(sin_teta, cos_teta, pInVector, pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +static void BM_transform_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + // Dynamic memory allocation with 64byte + // alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 64); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 64); + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + for (auto _ : state) { + transform_avx512(sin_teta, cos_teta, pInVector, pOutVector, + len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * 
int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +BENCHMARK(BM_transform_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_transform_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex1/ex1_test.cpp b/chap18/ex1/ex1_test.cpp index 4235052..fa95b6a 100644 --- a/chap18/ex1/ex1_test.cpp +++ b/chap18/ex1/ex1_test.cpp @@ -41,15 +41,14 @@ TEST(avx512_1, transform_avx) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i + 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ( @@ -94,15 +93,14 @@ TEST(avx512_1, transform_avx512) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i + 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - ASSERT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - ASSERT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ( diff --git 
a/chap18/ex10/CMakeLists.txt b/chap18/ex10/CMakeLists.txt index 5b0b9be..b8c6617 100644 --- a/chap18/ex10/CMakeLists.txt +++ b/chap18/ex10/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex10_srcs ex10_test.cpp scalar_compress.c avx_compress.c avx2_compress.c avx512_compress.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex10_srcs ${avx512_ex10_srcs} scalar_compress.s avx_compress.s avx2_compress.s avx512_compress.s) +set(avx512_ex10_ass scalar_compress.s avx_compress.s avx2_compress.s avx512_compress.s) elseif(MSVC) -set(avx512_ex10_srcs ${avx512_ex10_srcs} scalar_compress.asm avx_compress.asm avx2_compress.asm avx512_compress.asm) +set(avx512_ex10_ass scalar_compress.asm avx_compress.asm avx2_compress.asm avx512_compress.asm) endif() -add_executable(avx512_ex10_tests ${avx512_ex10_srcs}) - +add_executable(avx512_ex10_tests ex10_test.cpp scalar_compress.c avx_compress.c avx2_compress.c avx512_compress.c ${avx512_ex10_ass}) target_link_libraries(avx512_ex10_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex10_bench ex10_bench.cpp ${avx512_ex10_ass}) + target_link_libraries(avx512_ex10_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex10_test COMMAND avx512_ex10_tests) diff --git a/chap18/ex10/avx2_compress.s b/chap18/ex10/avx2_compress.s index 97e9ea1..68f5288 100644 --- a/chap18/ex10/avx2_compress.s +++ b/chap18/ex10/avx2_compress.s @@ -338,3 +338,7 @@ write_mask: .int 0x80000000, 0x80000000, 0x80000000, 0x80000000 .int 0x00000000, 0x00000000, 0x00000000, 0x00000000 .int 0x00000000, 0x00000000, 0x00000000, 0x00000000 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex10/avx512_compress.s b/chap18/ex10/avx512_compress.s index a3f2faa..2bb6dbb 100644 --- a/chap18/ex10/avx512_compress.s +++ b/chap18/ex10/avx512_compress.s @@ -59,3 +59,7 @@ mainloop: 
vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex10/avx_compress.s b/chap18/ex10/avx_compress.s index dc19b03..70f55e3 100644 --- a/chap18/ex10/avx_compress.s +++ b/chap18/ex10/avx_compress.s @@ -98,3 +98,6 @@ write_mask: .int 0x80000000, 0x80000000, 0x80000000, 0x80000000 .int 0x00000000, 0x00000000, 0x00000000, 0x00000000 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex10/ex10_bench.cpp b/chap18/ex10/ex10_bench.cpp new file mode 100644 index 0000000..315120d --- /dev/null +++ b/chap18/ex10/ex10_bench.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "avx2_compress.h" +#include "avx512_compress.h" +#include "avx_compress.h" +#include "optimisation_common.h" +#include "scalar_compress.h" + +static void init_sources(uint32_t *in, int len) +{ + for (int i = 0; i < len; i++) + in[i] = i & 3; +} + +static void BM_scalar_compress(benchmark::State &state) +{ + int len = state.range(0); + + uint32_t *in = (uint32_t *)malloc(len * sizeof(*in)); + uint32_t *out = (uint32_t *)malloc(len * sizeof(*in)); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)scalar_compress(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + free(out); + free(in); +} + +static void BM_AVX_compress(benchmark::State &state) +{ + int len = state.range(0); + uint32_t *in = (uint32_t *)_mm_malloc(len * sizeof(*in), 32); + uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*in), 32); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)avx_compress(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +static void BM_AVX2_compress(benchmark::State &state) +{ + int len = state.range(0); + uint32_t *in = (uint32_t *)_mm_malloc(len * sizeof(*in), 32); + uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*in), 32); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)avx2_compress(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +static void BM_AVX512_compress(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *in = (uint32_t *)_mm_malloc(len * sizeof(*in), 64); + 
uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*in), 64); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)avx512_compress(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +BENCHMARK(BM_scalar_compress) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_AVX_compress) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_AVX2_compress) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_AVX512_compress) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex10/scalar_compress.s b/chap18/ex10/scalar_compress.s index 4b806f6..70563e7 100644 --- a/chap18/ex10/scalar_compress.s +++ b/chap18/ex10/scalar_compress.s @@ -51,3 +51,7 @@ m1: mov rax, r10 ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex11/CMakeLists.txt b/chap18/ex11/CMakeLists.txt index 89cb241..e5a7d74 100644 --- a/chap18/ex11/CMakeLists.txt +++ b/chap18/ex11/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex11_srcs ex11_test.cpp expand_scalar.c expand_avx2.c expand_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex11_srcs ${avx512_ex11_srcs} expand_scalar.s expand_avx2.s expand_avx512.s) +set(avx512_ex11_ass expand_scalar.s expand_avx2.s expand_avx512.s) elseif(MSVC) -set(avx512_ex11_srcs ${avx512_ex11_srcs} expand_scalar.asm expand_avx2.asm expand_avx512.asm) +set(avx512_ex11_ass expand_scalar.asm expand_avx2.asm expand_avx512.asm) endif() 
-add_executable(avx512_ex11_tests ${avx512_ex11_srcs}) - +add_executable(avx512_ex11_tests ex11_test.cpp expand_scalar.c expand_avx2.c expand_avx512.c ${avx512_ex11_ass}) target_link_libraries(avx512_ex11_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex11_bench ex11_bench.cpp ${avx512_ex11_ass}) + target_link_libraries(avx512_ex11_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex11_test COMMAND avx512_ex11_tests) diff --git a/chap18/ex11/ex11_bench.cpp b/chap18/ex11/ex11_bench.cpp new file mode 100644 index 0000000..3bd4e6a --- /dev/null +++ b/chap18/ex11/ex11_bench.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "expand_avx2.h" +#include "expand_avx512.h" +#include "expand_scalar.h" +#include "optimisation_common.h" + +static void init_sources(int32_t *in, int len) +{ + for (int i = 0; i < len; i++) + in[i] = i & 3; +} + +static void BM_scalar_expand(benchmark::State &state) +{ + int len = state.range(0); + + int32_t *in = (int32_t *)malloc(len * sizeof(*in)); + int32_t *out = (int32_t *)malloc(len * sizeof(*in)); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)expand_scalar(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + free(out); + free(in); +} + +static void BM_AVX2_expand(benchmark::State &state) +{ + int len = state.range(0); + int32_t *in = (int32_t *)_mm_malloc(len * sizeof(*in), 32); + int32_t *out = (int32_t *)_mm_malloc(len * sizeof(*in), 32); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)expand_avx2(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +static void BM_AVX512_expand(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + int32_t *in = (int32_t *)_mm_malloc(len * sizeof(*in), 64); + int32_t *out = (int32_t *)_mm_malloc(len * sizeof(*in), 64); + + init_sources(in, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)expand_avx512(out, in, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +BENCHMARK(BM_scalar_expand) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_AVX2_expand) + ->Arg(1 << 6) + ->Arg(1 << 8) 
+ ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_AVX512_expand) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex11/expand_avx2.s b/chap18/ex11/expand_avx2.s index 1d2d1b9..bb8d0fa 100644 --- a/chap18/ex11/expand_avx2.s +++ b/chap18/ex11/expand_avx2.s @@ -326,3 +326,7 @@ shuf2: .int 0, 0, 1, 2, 3, 4, 5, 6 .int 0, 1, 2, 3, 4, 5, 6, 7 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/chap18/ex11/expand_avx512.s b/chap18/ex11/expand_avx512.s index 1f6ba72..8b55605 100644 --- a/chap18/ex11/expand_avx512.s +++ b/chap18/ex11/expand_avx512.s @@ -55,3 +55,7 @@ mainloop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex11/expand_scalar.s b/chap18/ex11/expand_scalar.s index c6f7806..1255a70 100644 --- a/chap18/ex11/expand_scalar.s +++ b/chap18/ex11/expand_scalar.s @@ -48,3 +48,8 @@ m1: jne mainloop ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/chap18/ex12/CMakeLists.txt b/chap18/ex12/CMakeLists.txt index e103dd3..c9482a7 100644 --- a/chap18/ex12/CMakeLists.txt +++ b/chap18/ex12/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex12_srcs ex12_test.cpp) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex12_srcs ${avx512_ex12_srcs} ternary_avx2.s ternary_avx512.s ternary_vpternlog.s) +set(avx512_ex12_ass ternary_avx2.s ternary_avx512.s ternary_vpternlog.s) elseif(MSVC) -set(avx512_ex12_srcs ${avx512_ex12_srcs} ternary_avx2.asm ternary_avx512.asm ternary_vpternlog.asm) +set(avx512_ex12_ass ternary_avx2.asm ternary_avx512.asm ternary_vpternlog.asm) endif() -add_executable(avx512_ex12_tests ${avx512_ex12_srcs}) - +add_executable(avx512_ex12_tests 
ex12_test.cpp ${avx512_ex12_ass}) target_link_libraries(avx512_ex12_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex12_bench ex12_bench.cpp ${avx512_ex12_ass}) + target_link_libraries(avx512_ex12_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex12_test COMMAND avx512_ex12_tests) diff --git a/chap18/ex12/ex12_bench.cpp b/chap18/ex12/ex12_bench.cpp new file mode 100644 index 0000000..21da2a3 --- /dev/null +++ b/chap18/ex12/ex12_bench.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "ternary_avx2.h" +#include "ternary_avx512.h" +#include "ternary_vpternlog.h" + +static void init_sources(uint32_t *a, uint32_t *b, uint32_t *c, size_t len) +{ + for (size_t i = 0; i < len; i++) { + a[i] = std::rand() & 1; + b[i] = std::rand() & 1; + c[i] = std::rand() & 1; + } +} + +static void BM_avx2_ternary(benchmark::State &state) +{ + int len = state.range(0); + + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(*a), 32); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(*a), 32); + uint32_t *c = (uint32_t *)_mm_malloc(len * sizeof(*a), 32); + uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*a), 32); + + init_sources(a, b, c, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)ternary_avx2(out, a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(a[0]) * 3)); + + _mm_free(out); + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +static void BM_avx512_ternary(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *c = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + + init_sources(a, b, c, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)ternary_avx512(out, a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(a[0]) * 3)); + + _mm_free(out); + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +static void BM_vpternlog_ternary(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *a 
= (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *c = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + uint32_t *out = (uint32_t *)_mm_malloc(len * sizeof(*a), 64); + + init_sources(a, b, c, len); + memset(out, 0, sizeof(*out) * len); + + for (auto _ : state) { + (void)ternary_vpternlog(out, a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(a[0]) * 3)); + + _mm_free(out); + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_avx2_ternary) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx512_ternary) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_vpternlog_ternary) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex12/ternary_avx2.s b/chap18/ex12/ternary_avx2.s index d0cee6f..1f8c2a4 100644 --- a/chap18/ex12/ternary_avx2.s +++ b/chap18/ex12/ternary_avx2.s @@ -67,4 +67,6 @@ mainloop: vzeroupper ret - +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex12/ternary_avx512.s b/chap18/ex12/ternary_avx512.s index ec44501..58f03d2 100644 --- a/chap18/ex12/ternary_avx512.s +++ b/chap18/ex12/ternary_avx512.s @@ -66,4 +66,6 @@ mainloop: vzeroupper ret - +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex12/ternary_vpternlog.s b/chap18/ex12/ternary_vpternlog.s index d35a3a9..c4c053a 100644 --- a/chap18/ex12/ternary_vpternlog.s +++ b/chap18/ex12/ternary_vpternlog.s @@ -58,4 +58,6 @@ mainloop: vzeroupper ret - +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex13/CMakeLists.txt 
b/chap18/ex13/CMakeLists.txt index 7977cbb..92f7762 100644 --- a/chap18/ex13/CMakeLists.txt +++ b/chap18/ex13/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex13_srcs ex13_test.cpp transpose_scalar.c transpose_avx2.c transpose_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex13_srcs ${avx512_ex13_srcs} transpose_scalar.s transpose_avx2.s transpose_avx512.s) +set(avx512_ex13_ass transpose_scalar.s transpose_avx2.s transpose_avx512.s) elseif(MSVC) -set(avx512_ex13_srcs ${avx512_ex13_srcs} transpose_scalar.asm transpose_avx2.asm transpose_avx512.asm) +set(avx512_ex13_ass transpose_scalar.asm transpose_avx2.asm transpose_avx512.asm) endif() -add_executable(avx512_ex13_tests ${avx512_ex13_srcs}) - +add_executable(avx512_ex13_tests ex13_test.cpp transpose_scalar.c transpose_avx2.c transpose_avx512.c ${avx512_ex13_ass}) target_link_libraries(avx512_ex13_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex13_bench ex13_bench.cpp ${avx512_ex13_ass}) + target_link_libraries(avx512_ex13_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex13_test COMMAND avx512_ex13_tests) diff --git a/chap18/ex13/ex13_bench.cpp b/chap18/ex13/ex13_bench.cpp new file mode 100644 index 0000000..e51086b --- /dev/null +++ b/chap18/ex13/ex13_bench.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "transpose_avx2.h" +#include "transpose_avx512.h" +#include "transpose_scalar.h" + +const size_t MATRIX_W = 8; +const size_t MATRIX_H = 8; +const size_t MATRIX_COUNT = 50; +const size_t TOTAL_ELEMENTS = MATRIX_W * MATRIX_H * MATRIX_COUNT; + +static void init_sources(uint16_t *in) +{ + uint16_t counter = 0; + + for (size_t i = 0; i < MATRIX_COUNT; i++) + for (size_t j = 0; j < MATRIX_H; j++) + for (size_t k = 0; k < MATRIX_W; k++) { + in[counter] = counter; + counter++; + } +} + +static void BM_scalar_transpose(benchmark::State &state) +{ + int len = state.range(0); + + uint16_t *in = (uint16_t *)malloc(TOTAL_ELEMENTS * sizeof(*in)); + uint16_t *out = (uint16_t *)malloc(TOTAL_ELEMENTS * sizeof(*out)); + + init_sources(in); + memset(out, 0, TOTAL_ELEMENTS * sizeof(*out)); + + for (auto _ : state) { + for (int i = 0; i < len; i++) + (void)transpose_scalar(out, in); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(TOTAL_ELEMENTS * sizeof(in[0]))); + + free(out); + free(in); +} + +static void BM_avx2_transpose(benchmark::State &state) +{ + int len = state.range(0); + + uint16_t *in = (uint16_t *)_mm_malloc(TOTAL_ELEMENTS * sizeof(*in), 32); + uint16_t *out = + (uint16_t *)_mm_malloc(TOTAL_ELEMENTS * sizeof(*out), 32); + + init_sources(in); + memset(out, 0, TOTAL_ELEMENTS * sizeof(*out)); + + for (auto _ : state) { + for (int i = 0; i < len; i++) + (void)transpose_avx2(out, in); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(TOTAL_ELEMENTS * sizeof(in[0]))); + + _mm_free(out); + 
_mm_free(in); +} + +static void BM_avx512_transpose(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint16_t *in = (uint16_t *)_mm_malloc(TOTAL_ELEMENTS * sizeof(*in), 64); + uint16_t *out = + (uint16_t *)_mm_malloc(TOTAL_ELEMENTS * sizeof(*out), 64); + + init_sources(in); + memset(out, 0, TOTAL_ELEMENTS * sizeof(*out)); + + for (auto _ : state) { + for (int i = 0; i < len; i++) + (void)transpose_avx512(out, in); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(TOTAL_ELEMENTS * sizeof(in[0]))); + + _mm_free(out); + _mm_free(in); +} + +BENCHMARK(BM_scalar_transpose) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx2_transpose) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx512_transpose) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex13/transpose_avx2.s b/chap18/ex13/transpose_avx2.s index 16f3b70..0c654bd 100644 --- a/chap18/ex13/transpose_avx2.s +++ b/chap18/ex13/transpose_avx2.s @@ -65,3 +65,7 @@ matrix_loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex13/transpose_avx512.s b/chap18/ex13/transpose_avx512.s index 163481b..740e00e 100644 --- a/chap18/ex13/transpose_avx512.s +++ b/chap18/ex13/transpose_avx512.s @@ -63,3 +63,7 @@ permMaskBuffer: .short 5, 13, 21, 29, 37, 45, 53, 61 .short 6, 14, 22, 30, 38, 46, 54, 62 .short 7, 15, 23, 31, 39, 47, 55, 63 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex13/transpose_scalar.s b/chap18/ex13/transpose_scalar.s index d270162..e77fa42 
100644 --- a/chap18/ex13/transpose_scalar.s +++ b/chap18/ex13/transpose_scalar.s @@ -60,3 +60,7 @@ innerloop: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex14/CMakeLists.txt b/chap18/ex14/CMakeLists.txt index ca25280..5163a4a 100644 --- a/chap18/ex14/CMakeLists.txt +++ b/chap18/ex14/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex14_srcs ex14_test.cpp register_broadcast.c memory_broadcast.c embedded_broadcast.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex14_srcs ${avx512_ex14_srcs} register_broadcast.s memory_broadcast.s embedded_broadcast.s) +set(avx512_ex14_ass register_broadcast.s memory_broadcast.s embedded_broadcast.s) elseif(MSVC) -set(avx512_ex14_srcs ${avx512_ex14_srcs} register_broadcast.asm memory_broadcast.asm embedded_broadcast.asm) +set(avx512_ex14_ass register_broadcast.asm memory_broadcast.asm embedded_broadcast.asm) endif() -add_executable(avx512_ex14_tests ${avx512_ex14_srcs}) - +add_executable(avx512_ex14_tests ex14_test.cpp register_broadcast.c memory_broadcast.c embedded_broadcast.c ${avx512_ex14_ass}) target_link_libraries(avx512_ex14_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex14_bench ex14_bench.cpp ${avx512_ex14_ass}) + target_link_libraries(avx512_ex14_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex14_test COMMAND avx512_ex14_tests) diff --git a/chap18/ex14/embedded_broadcast.s b/chap18/ex14/embedded_broadcast.s index 8db1728..1b85b13 100644 --- a/chap18/ex14/embedded_broadcast.s +++ b/chap18/ex14/embedded_broadcast.s @@ -47,3 +47,7 @@ loop: vmovdqu32 [rsi], zmm2 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex14/ex14_bench.cpp b/chap18/ex14/ex14_bench.cpp new file mode 100644 index 0000000..11df3fb --- /dev/null 
+++ b/chap18/ex14/ex14_bench.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "embedded_broadcast.h" +#include "memory_broadcast.h" +#include "register_broadcast.h" + +#ifdef _MSC_VER +__declspec(align(64)) static uint32_t indices[16]; +__declspec(align(64)) static uint32_t input[16]; +__declspec(align(64)) static uint32_t output[16]; +#else +static uint32_t indices[16] __attribute__((aligned(64))); +static uint32_t input[16] __attribute__((aligned(64))); +static uint32_t output[16] __attribute__((aligned(64))); +#endif + +static void init_sources(uint32_t *broadcast_values, int len) +{ + for (uint32_t i = 0; i < 16; i++) + indices[i] = 15 - i; + for (uint32_t i = 0; i < (uint32_t)len; i++) + broadcast_values[i] = i + 1; + for (uint32_t i = 0; i < 16; i++) { + input[i] = i; + output[i] = 0; + } +} + +static void BM_register_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *broadcast = (uint32_t *)malloc(len * sizeof(*broadcast)); + + init_sources(broadcast, len); + + for (auto _ : state) { + register_broadcast(input, output, (uint64_t)len, broadcast, + indices); + } + 
state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(broadcast[0]))); + + free(broadcast); +} + +static void BM_memory_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *broadcast = (uint32_t *)malloc(len * sizeof(*broadcast)); + + init_sources(broadcast, len); + + for (auto _ : state) { + memory_broadcast(input, output, (uint64_t)len, broadcast, + indices); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(broadcast[0]))); + + free(broadcast); +} + +static void BM_embedded_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *broadcast = (uint32_t *)malloc(len * sizeof(*broadcast)); + + init_sources(broadcast, len); + + for (auto _ : state) { + embedded_broadcast(input, output, (uint64_t)len, broadcast, + indices); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(broadcast[0]))); + + free(broadcast); +} + +BENCHMARK(BM_register_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_memory_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_embedded_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex14/memory_broadcast.s b/chap18/ex14/memory_broadcast.s index 1cbcaf0..dac16b6 100644 --- a/chap18/ex14/memory_broadcast.s +++ b/chap18/ex14/memory_broadcast.s @@ -48,3 +48,7 @@ loop: vmovdqu32 [rsi], zmm2 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section 
.note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex14/register_broadcast.s b/chap18/ex14/register_broadcast.s index 5d6b65d..c039f6b 100644 --- a/chap18/ex14/register_broadcast.s +++ b/chap18/ex14/register_broadcast.s @@ -49,3 +49,7 @@ loop: vmovdqu32 [rsi], zmm2 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex15/CMakeLists.txt b/chap18/ex15/CMakeLists.txt index bdb14ed..42cc869 100644 --- a/chap18/ex15/CMakeLists.txt +++ b/chap18/ex15/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex15_srcs ex15_test.cpp register_broadcast.c memory_broadcast.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex15_srcs ${avx512_ex15_srcs} register_broadcast.s memory_broadcast.s) +set(avx512_ex15_ass register_broadcast.s memory_broadcast.s) elseif(MSVC) -set(avx512_ex15_srcs ${avx512_ex15_srcs} register_broadcast.asm memory_broadcast.asm) +set(avx512_ex15_ass register_broadcast.asm memory_broadcast.asm) endif() -add_executable(avx512_ex15_tests ${avx512_ex15_srcs}) - +add_executable(avx512_ex15_tests ex15_test.cpp register_broadcast.c memory_broadcast.c ${avx512_ex15_ass}) target_link_libraries(avx512_ex15_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex15_bench ex15_bench.cpp ${avx512_ex15_ass}) + target_link_libraries(avx512_ex15_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex15_test COMMAND avx512_ex15_tests) diff --git a/chap18/ex15/ex15_bench.cpp b/chap18/ex15/ex15_bench.cpp new file mode 100644 index 0000000..9b0ae32 --- /dev/null +++ b/chap18/ex15/ex15_bench.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "optimisation_common.h" + +#include "memory_broadcast.h" +#include "register_broadcast.h" + +const size_t MAX_OUTPUTS = 32; + +#ifdef _MSC_VER +__declspec(align(64)) static uint16_t indices[MAX_OUTPUTS]; +__declspec(align(64)) static uint16_t input[MAX_OUTPUTS]; +__declspec(align(64)) static uint16_t output[MAX_OUTPUTS]; +#else +static uint16_t indices[MAX_OUTPUTS] __attribute__((aligned(64))); +static uint16_t input[MAX_OUTPUTS] __attribute__((aligned(64))); +static uint16_t output[MAX_OUTPUTS] __attribute__((aligned(64))); +#endif + +static void init_sources() +{ + for (size_t i = 0; i < MAX_OUTPUTS; i++) { + indices[i] = (uint16_t)((MAX_OUTPUTS - 1) - i); + input[i] = (uint16_t)i; + output[i] = (uint16_t)0; + } +} + +static void BM_Register_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *broadcast_values = + (uint32_t *)malloc(sizeof(*broadcast_values) * len); + + init_sources(); + + for (int i = 0; i < len; i++) + broadcast_values[i] = (uint16_t)(i + 1); + + for (auto _ : state) { + register_broadcast(input, output, len, broadcast_values, + indices); + } + + free(broadcast_values); +} + +static void BM_Memory_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = 
state.range(0); + uint16_t *broadcast_values_16 = + (uint16_t *)malloc(sizeof(*broadcast_values_16) * len); + + init_sources(); + + for (int i = 0; i < len; i++) + broadcast_values_16[i] = (uint16_t)(i + 1); + + for (auto _ : state) { + memory_broadcast(input, output, len, broadcast_values_16, + indices); + } + + free(broadcast_values_16); +} + +BENCHMARK(BM_Register_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_Memory_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex15/memory_broadcast.s b/chap18/ex15/memory_broadcast.s index 436547f..03b1a08 100644 --- a/chap18/ex15/memory_broadcast.s +++ b/chap18/ex15/memory_broadcast.s @@ -48,3 +48,7 @@ loop: vmovdqu32 [rsi], zmm2 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex15/register_broadcast.s b/chap18/ex15/register_broadcast.s index 75342d4..4d5b981 100644 --- a/chap18/ex15/register_broadcast.s +++ b/chap18/ex15/register_broadcast.s @@ -49,3 +49,7 @@ loop: vmovdqu32 [rsi], zmm2 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex16/CMakeLists.txt b/chap18/ex16/CMakeLists.txt index 046c6c6..96ff7d6 100644 --- a/chap18/ex16/CMakeLists.txt +++ b/chap18/ex16/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex16_srcs ex16_test.cpp embedded_rounding.c manual_rounding.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex16_srcs ${avx512_ex16_srcs} embedded_rounding.s manual_rounding.s) +set(avx512_ex16_ass embedded_rounding.s manual_rounding.s) elseif(MSVC) -set(avx512_ex16_srcs ${avx512_ex16_srcs} embedded_rounding.asm manual_rounding.asm) +set(avx512_ex16_ass 
embedded_rounding.asm manual_rounding.asm) endif() -add_executable(avx512_ex16_tests ${avx512_ex16_srcs}) +add_executable(avx512_ex16_tests ex16_test.cpp embedded_rounding.c manual_rounding.c ${avx512_ex16_ass}) target_link_libraries(avx512_ex16_tests gtest_main optimisation_common) add_test(NAME avx512_ex16_test COMMAND avx512_ex16_tests) diff --git a/chap18/ex16/embedded_rounding.s b/chap18/ex16/embedded_rounding.s index 90ef5b0..5f30c3a 100644 --- a/chap18/ex16/embedded_rounding.s +++ b/chap18/ex16/embedded_rounding.s @@ -41,3 +41,7 @@ embedded_rounding: vmovups [rdx], zmm7 vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex16/manual_rounding.s b/chap18/ex16/manual_rounding.s index 2ae87ec..11ee9df 100644 --- a/chap18/ex16/manual_rounding.s +++ b/chap18/ex16/manual_rounding.s @@ -60,3 +60,7 @@ manual_rounding: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex17/CMakeLists.txt b/chap18/ex17/CMakeLists.txt index e9f7c18..ddb988c 100644 --- a/chap18/ex17/CMakeLists.txt +++ b/chap18/ex17/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex17_tests ex17_test.cpp scalar_scatter.c software_scatter.c hardware_scatter.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex17_tests ${avx512_ex17_tests} scalar_scatter.s software_scatter.s hardware_scatter.s) +set(avx512_ex17_ass scalar_scatter.s software_scatter.s hardware_scatter.s) elseif(MSVC) -set(avx512_ex17_tests ${avx512_ex17_tests} scalar_scatter.asm software_scatter.asm hardware_scatter.asm) +set(avx512_ex17_ass scalar_scatter.asm software_scatter.asm hardware_scatter.asm) endif() -add_executable(avx512_ex17_tests ${avx512_ex17_tests}) - +add_executable(avx512_ex17_tests ex17_test.cpp scalar_scatter.c software_scatter.c hardware_scatter.c ${avx512_ex17_ass}) 
target_link_libraries(avx512_ex17_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex17_bench ex17_bench.cpp ${avx512_ex17_ass}) + target_link_libraries(avx512_ex17_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex17_test COMMAND avx512_ex17_tests) diff --git a/chap18/ex17/ex17_bench.cpp b/chap18/ex17/ex17_bench.cpp new file mode 100644 index 0000000..4ce1d9c --- /dev/null +++ b/chap18/ex17/ex17_bench.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "hardware_scatter.h" +#include "scalar_scatter.h" +#include "software_scatter.h" + +static void init_sources(uint64_t *input, uint32_t *indices, int len) +{ + for (int i = 0; i < len; i++) { + indices[i] = (uint32_t)i * 4; + input[i] = (uint64_t)rand(); + } +} + +static void BM_scatter_scalar(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint64_t *input = (uint64_t *)malloc(len * sizeof(*input)); + uint32_t *indices = (uint32_t *)malloc(len * sizeof(*indices)); + float *output = (float *)malloc(len * sizeof(*output) * 4); + + init_sources(input, indices, len); + + for (auto _ : state) { + scalar_scatter(input, indices, len * 4, output); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input[0]))); + + free(output); + free(indices); + free(input); +} + +static void BM_scatter_software(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint64_t *input = (uint64_t *)_mm_malloc(len * sizeof(*input), 64); + uint32_t *indices = (uint32_t *)_mm_malloc(len * sizeof(*indices), 64); + float *output = (float *)_mm_malloc(len * sizeof(*output) * 4, 64); + + init_sources(input, indices, len); + + for (auto _ : state) { + software_scatter(input, indices, len * 4, output); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input[0]))); + + _mm_free(output); + _mm_free(indices); + _mm_free(input); +} + +static void BM_scatter_hardware(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint64_t *input = (uint64_t *)_mm_malloc(len * sizeof(*input), 64); + 
uint32_t *indices = (uint32_t *)_mm_malloc(len * sizeof(*indices), 64); + float *output = (float *)_mm_malloc(len * sizeof(*output) * 4, 64); + + init_sources(input, indices, len); + + for (auto _ : state) { + hardware_scatter(input, indices, len * 4, output); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input[0]))); + + _mm_free(output); + _mm_free(indices); + _mm_free(input); +} + +BENCHMARK(BM_scatter_scalar) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_scatter_software) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_scatter_hardware) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex17/hardware_scatter.s b/chap18/ex17/hardware_scatter.s index 2d61c23..fe830ba 100644 --- a/chap18/ex17/hardware_scatter.s +++ b/chap18/ex17/hardware_scatter.s @@ -53,3 +53,7 @@ mainloop: vzeroupper pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex17/scalar_scatter.s b/chap18/ex17/scalar_scatter.s index a5764d7..a9644ac 100644 --- a/chap18/ex17/scalar_scatter.s +++ b/chap18/ex17/scalar_scatter.s @@ -49,3 +49,7 @@ mainloop: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex17/software_scatter.s b/chap18/ex17/software_scatter.s index 6ad76f8..3c39de2 100644 --- a/chap18/ex17/software_scatter.s +++ b/chap18/ex17/software_scatter.s @@ -108,3 +108,7 @@ mainloop: .quad 0x0000000400000003 .quad 0x0000000600000005 .quad 0x0000000800000007 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex18/CMakeLists.txt b/chap18/ex18/CMakeLists.txt index 
d7cf3ee..ecbfab4 100644 --- a/chap18/ex18/CMakeLists.txt +++ b/chap18/ex18/CMakeLists.txt @@ -1,11 +1,15 @@ -set(avx512_ex18_srcs ex18_test.cpp qword_avx2_intrinsics.c qword_avx2.c qword_avx512_intrinsics.c qword_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex18_srcs ${avx512_ex18_srcs} qword_avx2_ass.s qword_avx512_ass.s) +set(avx512_ex18_ass qword_avx2_ass.s qword_avx512_ass.s) set_property(SOURCE qword_avx512_intrinsics.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f" "-mavx512dq") elseif(MSVC) -set(avx512_ex18_srcs ${avx512_ex18_srcs} qword_avx2_ass.asm qword_avx512_ass.asm) +set(avx512_ex18_ass qword_avx2_ass.asm qword_avx512_ass.asm) endif() -add_executable(avx512_ex18_tests ${avx512_ex18_srcs}) - +add_executable(avx512_ex18_tests ex18_test.cpp qword_avx2_intrinsics.c qword_avx2.c qword_avx512_intrinsics.c qword_avx512.c ${avx512_ex18_ass}) target_link_libraries(avx512_ex18_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex18_bench ex18_bench.cpp qword_avx2_intrinsics.c qword_avx512_intrinsics.c ${avx512_ex18_ass}) + target_link_libraries(avx512_ex18_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex18_test COMMAND avx512_ex18_tests) diff --git a/chap18/ex18/ex18_bench.cpp b/chap18/ex18/ex18_bench.cpp new file mode 100644 index 0000000..ac50682 --- /dev/null +++ b/chap18/ex18/ex18_bench.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "optimisation_common.h" + +#include "qword_avx2.h" +#include "qword_avx512.h" + +static void init_sources(int64_t *input_a, int64_t *input_b, int len) +{ + for (int i = 0; i < len; i++) { + input_a[i] = rand() - (RAND_MAX / 2); + input_b[i] = rand() - (RAND_MAX / 2); + } +} + +static void BM_qword_avx2_instrinsics(benchmark::State &state) +{ + int len = state.range(0); + + int64_t *input_a = (int64_t *)_mm_malloc(len * sizeof(*input_a), 32); + int64_t *input_b = (int64_t *)_mm_malloc(len * sizeof(*input_b), 32); + int64_t *output = (int64_t *)_mm_malloc(len * sizeof(*output), 32); + + init_sources(input_a, input_b, len); + + for (auto _ : state) { + qword_avx2_intrinsics(input_a, input_b, output, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input_a[0]) * 2)); + + _mm_free(output); + _mm_free(input_b); + _mm_free(input_a); +} + +static void BM_qword_avx2_ass(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + int64_t *input_a = (int64_t *)_mm_malloc(len * sizeof(*input_a), 32); + int64_t *input_b = (int64_t *)_mm_malloc(len * sizeof(*input_b), 32); + int64_t *output = (int64_t *)_mm_malloc(len * sizeof(*output), 32); + + init_sources(input_a, input_b, len); + + for (auto _ : state) { + qword_avx2_ass(input_a, input_b, output, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input_a[0]) * 2)); + + _mm_free(output); + _mm_free(input_b); + _mm_free(input_a); +} + +static void 
BM_qword_avx512_instrinsics(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + int64_t *input_a = (int64_t *)_mm_malloc(len * sizeof(*input_a), 64); + int64_t *input_b = (int64_t *)_mm_malloc(len * sizeof(*input_b), 64); + int64_t *output = (int64_t *)_mm_malloc(len * sizeof(*output), 64); + + init_sources(input_a, input_b, len); + + for (auto _ : state) { + qword_avx512_intrinsics(input_a, input_b, output, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input_a[0]) * 2)); + + _mm_free(output); + _mm_free(input_b); + _mm_free(input_a); +} + +static void BM_qword_avx512_ass(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + int64_t *input_a = (int64_t *)_mm_malloc(len * sizeof(*input_a), 64); + int64_t *input_b = (int64_t *)_mm_malloc(len * sizeof(*input_b), 64); + int64_t *output = (int64_t *)_mm_malloc(len * sizeof(*output), 64); + + init_sources(input_a, input_b, len); + + for (auto _ : state) { + qword_avx512_ass(input_a, input_b, output, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(input_a[0]) * 2)); + + _mm_free(output); + _mm_free(input_b); + _mm_free(input_a); +} + +BENCHMARK(BM_qword_avx2_instrinsics) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_qword_avx2_ass) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_qword_avx512_instrinsics) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_qword_avx512_ass) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + 
->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex18/ex18_test.cpp b/chap18/ex18/ex18_test.cpp index 6f7909f..3e95ac7 100644 --- a/chap18/ex18/ex18_test.cpp +++ b/chap18/ex18/ex18_test.cpp @@ -52,9 +52,6 @@ static void init_sources() TEST(avx512_18, qword_avx2_instrinics) { - if (!supports_avx512_skx()) - GTEST_SKIP_("AVX-512 not supported, skipping test"); - init_sources(); memset(output, 0, MAX_SIZE * sizeof(int64_t)); diff --git a/chap18/ex18/qword_avx2_ass.s b/chap18/ex18/qword_avx2_ass.s index 1f3ae15..2b5d9d2 100644 --- a/chap18/ex18/qword_avx2_ass.s +++ b/chap18/ex18/qword_avx2_ass.s @@ -160,3 +160,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex18/qword_avx512_ass.s b/chap18/ex18/qword_avx512_ass.s index 51100c9..4df8a0a 100644 --- a/chap18/ex18/qword_avx512_ass.s +++ b/chap18/ex18/qword_avx512_ass.s @@ -72,3 +72,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex19/CMakeLists.txt b/chap18/ex19/CMakeLists.txt index cc6d29a..7334667 100644 --- a/chap18/ex19/CMakeLists.txt +++ b/chap18/ex19/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex19_srcs ex19_test.cpp scalar_histogram.c avx512_histogram.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex19_srcs ${avx512_ex19_srcs} scalar_histogram.s avx512_histogram.s) +set(avx512_ex19_ass scalar_histogram.s avx512_histogram.s) elseif(MSVC) -set(avx512_ex19_srcs ${avx512_ex19_srcs} scalar_histogram.asm avx512_histogram.asm) +set(avx512_ex19_ass scalar_histogram.asm avx512_histogram.asm) endif() -add_executable(avx512_ex19_tests ${avx512_ex19_srcs}) - +add_executable(avx512_ex19_tests ex19_test.cpp scalar_histogram.c avx512_histogram.c ${avx512_ex19_ass}) target_link_libraries(avx512_ex19_tests gtest_main 
optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex19_bench ex19_bench.cpp ${avx512_ex19_ass}) + target_link_libraries(avx512_ex19_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex19_test COMMAND avx512_ex19_tests) diff --git a/chap18/ex19/avx512_histogram.s b/chap18/ex19/avx512_histogram.s index 579c2ab..c68c06f 100644 --- a/chap18/ex19/avx512_histogram.s +++ b/chap18/ex19/avx512_histogram.s @@ -83,3 +83,6 @@ update: ret +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex19/ex19_bench.cpp b/chap18/ex19/ex19_bench.cpp new file mode 100644 index 0000000..bfd4f60 --- /dev/null +++ b/chap18/ex19/ex19_bench.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "avx512_histogram.h" +#include "scalar_histogram.h" + +const size_t MAX_BINS = 32; + +#ifdef _MSC_VER +__declspec(align(64)) static uint32_t histogram[MAX_BINS]; +#else +static uint32_t histogram[MAX_BINS] __attribute__((aligned(64))); +#endif + +static void init_sources(int32_t *inputs, int len) +{ + for (int i = 0; i < len; i++) + inputs[i] = rand(); + memset(histogram, 0, sizeof(histogram)); +} + +static void BM_scalar_histogram(benchmark::State &state) +{ + int len = state.range(0); + + int32_t *inputs = (int32_t *)malloc(len * sizeof(*inputs)); + + init_sources(inputs, len); + + for (auto _ : state) { + scalar_histogram(inputs, histogram, len, MAX_BINS); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(inputs[0]))); + + free(inputs); +} + +static void BM_avx512_histogram(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + int32_t *inputs = (int32_t *)_mm_malloc(len * sizeof(*inputs), 64); + + init_sources(inputs, len); + + for (auto _ : state) { + avx512_histogram(inputs, histogram, len, MAX_BINS); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(inputs[0]))); + + _mm_free(inputs); +} + +BENCHMARK(BM_scalar_histogram) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx512_histogram) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex19/scalar_histogram.s b/chap18/ex19/scalar_histogram.s index 3c29cfa..a4b5c9c 100644 --- a/chap18/ex19/scalar_histogram.s +++ b/chap18/ex19/scalar_histogram.s @@ -61,3 +61,7 @@ histogram_loop: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex2/CMakeLists.txt b/chap18/ex2/CMakeLists.txt index 5b6dcdc..88ac055 100644 --- a/chap18/ex2/CMakeLists.txt +++ b/chap18/ex2/CMakeLists.txt @@ -1,10 +1,15 @@ -set(avx512_ex2_srcs ex2_test.cpp transform_avx.c transform_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex2_srcs ${avx512_ex2_srcs} transform_avx.s transform_avx512.s) +set(avx512_ex2_ass transform_avx.s transform_avx512.s) elseif(MSVC) -set(avx512_ex2_srcs ${avx512_ex2_srcs} transform_avx.asm transform_avx512.asm) +set(avx512_ex2_ass transform_avx.asm transform_avx512.asm) endif() -add_executable(avx512_ex2_tests ${avx512_ex2_srcs}) + +add_executable(avx512_ex2_tests ex2_test.cpp transform_avx.c transform_avx512.c ${avx512_ex2_ass}) target_link_libraries(avx512_ex2_tests gtest_main optimisation_common) +IF( benchmark_FOUND ) + add_executable(avx512_ex2_bench ex2_bench.cpp ${avx512_ex2_ass}) + target_link_libraries(avx512_ex2_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex2_test COMMAND avx512_ex2_tests) diff --git a/chap18/ex2/ex2_bench.cpp b/chap18/ex2/ex2_bench.cpp new file mode 100644 index 0000000..9d30921 --- /dev/null +++ b/chap18/ex2/ex2_bench.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "optimisation_common.h" + +#include "transform_avx.h" +#include "transform_avx512.h" + +static void BM_transform_avx(benchmark::State &state) +{ + int len = state.range(0); + // Dynamic memory allocation with 32byte alignment + float *pInVector = (float *)_mm_malloc(len * sizeof(float), 32); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 32); + + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + // clang-format off + + //Static memory allocation of 8 floats with 32byte alignments +#ifdef _MSC_VER + __declspec(align(32)) float cos_sin_teta_vec[8] = { +#else + float cos_sin_teta_vec[8] __attribute__((aligned(32))) = { +#endif + cos_teta, sin_teta, cos_teta, sin_teta, + cos_teta, sin_teta, cos_teta, sin_teta + }; +#ifdef _MSC_VER + __declspec(align(32)) float sin_cos_teta_vec[8] = { +#else + float sin_cos_teta_vec[8] __attribute__((aligned(32))) = { +#endif + sin_teta, cos_teta, sin_teta, cos_teta, + sin_teta, cos_teta, sin_teta, cos_teta + }; + + // clang-format on + + for (auto _ : state) { + transform_avx(cos_sin_teta_vec, sin_cos_teta_vec, pInVector, + pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +static void BM_transform_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + // Dynamic memory allocation with 64byte alignment + float *pInVector = 
(float *)_mm_malloc(len * sizeof(float), 64); + float *pOutVector = (float *)_mm_malloc(len * sizeof(float), 64); + + // init data + for (int i = 0; i < len; i++) + pInVector[i] = 1; + + float cos_teta = 0.8660254037; + float sin_teta = 0.5; + + // clang-format off + + //Static memory allocation of 16 floats with 64byte alignments +#ifdef _MSC_VER + __declspec(align(64)) float cos_sin_teta_vec[16] = { +#else + float cos_sin_teta_vec[16] __attribute__((aligned(64))) = { +#endif + cos_teta, sin_teta, cos_teta, sin_teta, + cos_teta, sin_teta, cos_teta, sin_teta, + cos_teta, sin_teta, cos_teta, sin_teta, + cos_teta, sin_teta, cos_teta, sin_teta + }; +#ifdef _MSC_VER + __declspec(align(64)) float sin_cos_teta_vec[16] = { +#else + float sin_cos_teta_vec[16] __attribute__((aligned(64))) = { +#endif + sin_teta, cos_teta, sin_teta, cos_teta, + sin_teta, cos_teta, sin_teta, cos_teta, + sin_teta, cos_teta, sin_teta, cos_teta, + sin_teta, cos_teta, sin_teta, cos_teta + }; + + // clang-format on + + for (auto _ : state) { + transform_avx512(cos_sin_teta_vec, sin_cos_teta_vec, pInVector, + pOutVector, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(pInVector[0]))); + + _mm_free(pInVector); + _mm_free(pOutVector); +} + +BENCHMARK(BM_transform_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_transform_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex2/ex2_test.cpp b/chap18/ex2/ex2_test.cpp index d287e51..4bb819e 100644 --- a/chap18/ex2/ex2_test.cpp +++ b/chap18/ex2/ex2_test.cpp @@ -64,15 +64,14 @@ TEST(avx512_2, transform_avx) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i + 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - EXPECT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - 
float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - EXPECT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ(transform_avx_check(NULL, sin_cos_teta_vec, pInVector, @@ -148,15 +147,14 @@ TEST(avx512_2, transform_avx512) true); for (int i = 0; i < len; i += 2) { - if (i & 1) { - float cosx = pInVector[i + 1] * cos_teta; - float sinx = pInVector[i + 1] * sin_teta; - EXPECT_FLOAT_EQ(sinx + cosx, pOutVector[i]); - } else { - float cosx = pInVector[i] * cos_teta; - float sinx = pInVector[i] * sin_teta; - EXPECT_FLOAT_EQ(cosx - sinx, pOutVector[i]); - } + // Assert X' + float cosx = pInVector[i] * cos_teta; + float siny = pInVector[i + 1] * sin_teta; + ASSERT_FLOAT_EQ(cosx - siny, pOutVector[i]); + // Assert Y' + float sinx = pInVector[i] * sin_teta; + float cosy = pInVector[i + 1] * cos_teta; + ASSERT_FLOAT_EQ(sinx + cosy, pOutVector[i + 1]); } ASSERT_EQ(transform_avx512_check(NULL, sin_cos_teta_vec, pInVector, diff --git a/chap18/ex2/transform_avx.s b/chap18/ex2/transform_avx.s index 230cc2d..8902003 100644 --- a/chap18/ex2/transform_avx.s +++ b/chap18/ex2/transform_avx.s @@ -63,3 +63,7 @@ loop1: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex2/transform_avx512.s b/chap18/ex2/transform_avx512.s index 962f8ba..3fd5231 100644 --- a/chap18/ex2/transform_avx512.s +++ b/chap18/ex2/transform_avx512.s @@ -62,3 +62,7 @@ loop1: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex20/CMakeLists.txt b/chap18/ex20/CMakeLists.txt index f6f30f0..bb70881 100644 --- a/chap18/ex20/CMakeLists.txt +++ 
b/chap18/ex20/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex20_srcs ex20_test.cpp scalar_vector_dp.c avx512_vector_dp.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex20_srcs ${avx512_ex20_srcs} scalar_vector_dp.s avx512_vector_dp.s) +set(avx512_ex20_ass scalar_vector_dp.s avx512_vector_dp.s) elseif(MSVC) -set(avx512_ex20_srcs ${avx512_ex20_srcs} scalar_vector_dp.asm avx512_vector_dp.asm) +set(avx512_ex20_ass scalar_vector_dp.asm avx512_vector_dp.asm) endif() -add_executable(avx512_ex20_tests ${avx512_ex20_srcs}) - +add_executable(avx512_ex20_tests ex20_test.cpp init_sparse.cpp scalar_vector_dp.c avx512_vector_dp.c ${avx512_ex20_ass}) target_link_libraries(avx512_ex20_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex20_bench ex20_bench.cpp init_sparse.cpp ${avx512_ex20_ass}) + target_link_libraries(avx512_ex20_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex20_test COMMAND avx512_ex20_tests) diff --git a/chap18/ex20/avx512_vector_dp.s b/chap18/ex20/avx512_vector_dp.s index f536032..f693eef 100644 --- a/chap18/ex20/avx512_vector_dp.s +++ b/chap18/ex20/avx512_vector_dp.s @@ -186,3 +186,6 @@ upconvert_control: .quad 0x0000000000000006 .quad 0x0000000000000007 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex20/ex20_bench.cpp b/chap18/ex20/ex20_bench.cpp new file mode 100644 index 0000000..10d48f2 --- /dev/null +++ b/chap18/ex20/ex20_bench.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "avx512_vector_dp.h" +#include "init_sparse.h" +#include "scalar_vector_dp.h" + +static void BM_scalar_vector_dp(benchmark::State &state) +{ + int len = state.range(0); + + uint32_t *a_index = (uint32_t *)malloc(len * 4 * sizeof(*a_index)); + double *a_values = (double *)malloc(len * sizeof(*a_values)); + uint32_t *b_index = (uint32_t *)malloc(len * 4 * sizeof(*a_index)); + double *b_values = (double *)malloc(len * sizeof(*b_values)); + + init_sparse(a_index, a_values, b_index, b_values, len); + + for (auto _ : state) { + scalar_vector_dp(a_index, a_values, b_index, b_values, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(a_values[0]) * 2)); + + free(b_values); + free(b_index); + free(a_values); + free(a_index); +} + +static void BM_avx512_vector_dp(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint32_t *a_index = + (uint32_t *)_mm_malloc(len * 4 * sizeof(*a_index), 64); + double *a_values = (double *)_mm_malloc(len * sizeof(*a_values), 64); + uint32_t *b_index = + (uint32_t *)_mm_malloc(len * 4 * sizeof(*a_index), 64); + double *b_values = (double *)_mm_malloc(len * sizeof(*b_values), 64); + + init_sparse(a_index, a_values, b_index, b_values, len); + + for (auto _ : state) { + avx512_vector_dp(a_index, a_values, b_index, b_values, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(a_values[0]) * 2)); + + _mm_free(b_values); + 
_mm_free(b_index); + _mm_free(a_values); + _mm_free(a_index); +} + +BENCHMARK(BM_scalar_vector_dp) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_avx512_vector_dp) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex20/ex20_test.cpp b/chap18/ex20/ex20_test.cpp index b65f052..fa025fa 100644 --- a/chap18/ex20/ex20_test.cpp +++ b/chap18/ex20/ex20_test.cpp @@ -13,11 +13,10 @@ * PERFORMANCE OF THIS SOFTWARE. */ -#include - #include "gtest/gtest.h" #include "avx512_vector_dp.h" +#include "init_sparse.h" #include "optimisation_common.h" #include "scalar_vector_dp.h" @@ -59,36 +58,7 @@ static void compute_ref_sum() static void init_sources() { - for (uint32_t i = 0; i < MAX_SIZE; i++) { - a_index[i] = i; - b_index[i] = i; - } - - for (size_t i = 0; i < MAX_ELS; i++) { - a_values[i] = (((double)rand()) / RAND_MAX) - 0.5; - b_values[i] = (((double)rand()) / RAND_MAX) - 0.5; - } - - for (size_t i = 0; i < MAX_SIZE; i++) { - size_t a = rand() % MAX_SIZE; - size_t b = rand() % MAX_SIZE; - uint32_t tmp; - - tmp = a_index[a]; - a_index[a] = a_index[b]; - a_index[b] = tmp; - - a = rand() % MAX_SIZE; - b = rand() % MAX_SIZE; - - tmp = b_index[a]; - b_index[a] = b_index[b]; - b_index[b] = tmp; - } - - std::sort(&a_index[0], &a_index[MAX_ELS]); - std::sort(&b_index[0], &b_index[MAX_ELS]); - + init_sparse(a_index, a_values, b_index, b_values, MAX_ELS); compute_ref_sum(); } @@ -98,9 +68,6 @@ TEST(avx512_20, scalar_vector_dp) srand(0); - if (!supports_avx512_skx()) - GTEST_SKIP_("AVX-512 not supported, skipping test"); - init_sources(); sum = 0.0; diff --git a/chap18/ex20/init_sparse.cpp b/chap18/ex20/init_sparse.cpp new file mode 100644 index 0000000..2a0db07 --- /dev/null +++ b/chap18/ex20/init_sparse.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, 
copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include "init_sparse.h" +#include + +void init_sparse(uint32_t *a_index, double *a_values, uint32_t *b_index, + double *b_values, size_t len) +{ + size_t max_size = len * 4; + + for (size_t i = 0; i < max_size; i++) { + a_index[i] = (uint32_t)i; + b_index[i] = (uint32_t)i; + } + + for (size_t i = 0; i < len; i++) { + a_values[i] = (((double)rand()) / RAND_MAX) - 0.5; + b_values[i] = (((double)rand()) / RAND_MAX) - 0.5; + } + + for (size_t i = 0; i < max_size; i++) { + size_t a = rand() % max_size; + size_t b = rand() % max_size; + uint32_t tmp; + + tmp = a_index[a]; + a_index[a] = a_index[b]; + a_index[b] = tmp; + + a = rand() % max_size; + b = rand() % max_size; + + tmp = b_index[a]; + b_index[a] = b_index[b]; + b_index[b] = tmp; + } + + std::sort(&a_index[0], &a_index[len]); + std::sort(&b_index[0], &b_index[len]); +} diff --git a/chap18/ex20/init_sparse.h b/chap18/ex20/init_sparse.h new file mode 100644 index 0000000..d9f2621 --- /dev/null +++ b/chap18/ex20/init_sparse.h @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef INIT_SPARSE_H__ +#define INIT_SPARSE_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void init_sparse(uint32_t *a_index, double *a_values, uint32_t *b_index, + double *b_values, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/chap18/ex20/scalar_vector_dp.s b/chap18/ex20/scalar_vector_dp.s index 904d5c5..93579f3 100644 --- a/chap18/ex20/scalar_vector_dp.s +++ b/chap18/ex20/scalar_vector_dp.s @@ -83,3 +83,7 @@ end: movsd xmm0, xmm4 ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex21/CMakeLists.txt b/chap18/ex21/CMakeLists.txt index 76b55d4..a28718c 100644 --- a/chap18/ex21/CMakeLists.txt +++ b/chap18/ex21/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex21_srcs ex21_test.cpp lookup_novbmi.c lookup_vbmi.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex21_srcs ${avx512_ex21_srcs} lookup_novbmi.s lookup_vbmi.s) +set(avx512_ex21_ass lookup_novbmi.s lookup_vbmi.s) elseif(MSVC) -set(avx512_ex21_srcs ${avx512_ex21_srcs} lookup_novbmi.asm lookup_vbmi.asm) +set(avx512_ex21_ass lookup_novbmi.asm lookup_vbmi.asm) endif() -add_executable(avx512_ex21_tests ${avx512_ex21_srcs}) - +add_executable(avx512_ex21_tests ex21_test.cpp lookup_novbmi.c lookup_vbmi.c ${avx512_ex21_ass}) target_link_libraries(avx512_ex21_tests gtest_main optimisation_common) + +IF( 
benchmark_FOUND ) + add_executable(avx512_ex21_bench ex21_bench.cpp ${avx512_ex21_ass}) + target_link_libraries(avx512_ex21_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex21_test COMMAND avx512_ex21_tests) diff --git a/chap18/ex21/ex21_bench.cpp b/chap18/ex21/ex21_bench.cpp new file mode 100644 index 0000000..7eb9969 --- /dev/null +++ b/chap18/ex21/ex21_bench.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "optimisation_common.h" + +#include "lookup_novbmi.h" +#include "lookup_vbmi.h" + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(64)) static unsigned char b[64]; +#else +static uint8_t b[64] __attribute__((aligned(64))); +#endif + +static void init_sources(uint8_t *a, uint8_t *out, int len) +{ + for (int i = 0; i < len; i++) { + a[i] = static_cast(i % 255); + out[i] = static_cast(0); + } + for (size_t i = 0; i < 64; i++) { + b[i] = static_cast(63 - i); + } +} + +static void BM_lookup_novbmi(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint8_t *a = (uint8_t *)_mm_malloc(len, 64); + uint8_t *out = (uint8_t *)_mm_malloc(len, 64); + + init_sources(a, out, len); + + for (auto _ : state) { + lookup_novbmi(a, b, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len)); + + _mm_free(out); + _mm_free(a); +} + +static void BM_lookup_vbmi(benchmark::State &state) +{ + if (!supports_avx512_icl()) { + state.SkipWithError("VBMI not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint8_t *a = (uint8_t *)_mm_malloc(len, 64); + uint8_t *out = (uint8_t *)_mm_malloc(len, 64); + + init_sources(a, out, len); + + for (auto _ : state) { + lookup_vbmi(a, b, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len)); + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_lookup_novbmi) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_lookup_vbmi) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex21/ex21_test.cpp b/chap18/ex21/ex21_test.cpp index 136423d..62ce92e 100644 --- a/chap18/ex21/ex21_test.cpp +++ 
b/chap18/ex21/ex21_test.cpp @@ -33,7 +33,7 @@ static unsigned char out[MAX_SIZE] __attribute__((aligned(64))); static unsigned char c_out[MAX_SIZE] __attribute__((aligned(64))); #endif -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { a[i] = static_cast(i % 255); @@ -45,8 +45,8 @@ void init_sources() } } -void lookup(unsigned char *in_bytes, unsigned char *out_bytes, - unsigned char *dictionary_bytes, int numOfElements) +static void lookup(unsigned char *in_bytes, unsigned char *out_bytes, + unsigned char *dictionary_bytes, int numOfElements) { for (int i = 0; i < numOfElements; i++) { out_bytes[i] = dictionary_bytes[in_bytes[i] & 63]; diff --git a/chap18/ex21/lookup_novbmi.s b/chap18/ex21/lookup_novbmi.s index 0502f42..c6b524b 100644 --- a/chap18/ex21/lookup_novbmi.s +++ b/chap18/ex21/lookup_novbmi.s @@ -50,3 +50,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex21/lookup_vbmi.s b/chap18/ex21/lookup_vbmi.s index fa47163..fe55407 100644 --- a/chap18/ex21/lookup_vbmi.s +++ b/chap18/ex21/lookup_vbmi.s @@ -46,3 +46,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex22/CMakeLists.txt b/chap18/ex22/CMakeLists.txt index 9d2b73f..984470b 100644 --- a/chap18/ex22/CMakeLists.txt +++ b/chap18/ex22/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex22_srcs ex22_test.cpp lookup128_novbmi.c lookup128_vbmi.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex22_srcs ${avx512_ex22_srcs} lookup128_novbmi.s lookup128_vbmi.s) +set(avx512_ex22_ass lookup128_novbmi.s lookup128_vbmi.s) elseif(MSVC) -set(avx512_ex22_srcs ${avx512_ex22_srcs} lookup128_novbmi.asm lookup128_vbmi.asm) +set(avx512_ex22_ass lookup128_novbmi.asm lookup128_vbmi.asm) endif() -add_executable(avx512_ex22_tests 
${avx512_ex22_srcs}) - +add_executable(avx512_ex22_tests ex22_test.cpp lookup128_novbmi.c lookup128_vbmi.c ${avx512_ex22_ass}) target_link_libraries(avx512_ex22_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex22_bench ex22_bench.cpp ${avx512_ex22_ass}) + target_link_libraries(avx512_ex22_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex22_test COMMAND avx512_ex22_tests) diff --git a/chap18/ex22/ex22_bench.cpp b/chap18/ex22/ex22_bench.cpp new file mode 100644 index 0000000..a4db3dd --- /dev/null +++ b/chap18/ex22/ex22_bench.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "optimisation_common.h" + +#include "lookup128_novbmi.h" +#include "lookup128_vbmi.h" + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(64)) static unsigned char b[128]; +#else +static uint8_t b[128] __attribute__((aligned(64))); +#endif + +static void init_sources(uint8_t *a, uint8_t *out, int len) +{ + for (int i = 0; i < len; i++) { + a[i] = static_cast(i % 255); + out[i] = static_cast(0); + } + for (size_t i = 0; i < 128; i++) { + b[i] = static_cast(127 - i); + } +} + +static void BM_lookup128_novbmi(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint8_t *a = (uint8_t *)_mm_malloc(len, 64); + uint8_t *out = (uint8_t *)_mm_malloc(len, 64); + + init_sources(a, out, len); + + for (auto _ : state) { + lookup128_novbmi(a, b, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len)); + + _mm_free(out); + _mm_free(a); +} + +static void BM_lookup128_vbmi(benchmark::State &state) +{ + if (!supports_avx512_icl()) { + state.SkipWithError("VBMI not supported, skipping test"); + return; + } + + int len = state.range(0); + + uint8_t *a = (uint8_t *)_mm_malloc(len, 64); + uint8_t *out = (uint8_t *)_mm_malloc(len, 64); + + init_sources(a, out, len); + + for (auto _ : state) { + lookup128_vbmi(a, b, out, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len)); + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_lookup128_novbmi) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_lookup128_vbmi) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex22/lookup128_novbmi.s b/chap18/ex22/lookup128_novbmi.s index 3e4efbf..96b8ec2 100644 --- a/chap18/ex22/lookup128_novbmi.s +++ 
b/chap18/ex22/lookup128_novbmi.s @@ -66,3 +66,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex22/lookup128_vbmi.s b/chap18/ex22/lookup128_vbmi.s index 5ebf8e6..518e8fb 100644 --- a/chap18/ex22/lookup128_vbmi.s +++ b/chap18/ex22/lookup128_vbmi.s @@ -46,3 +46,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex23/CMakeLists.txt b/chap18/ex23/CMakeLists.txt index 7586ea8..ecaa32e 100644 --- a/chap18/ex23/CMakeLists.txt +++ b/chap18/ex23/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex23_srcs ex23_test.cpp decompress_novbmi.c decompress_vbmi.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex23_srcs ${avx512_ex23_srcs} decompress_novbmi.s decompress_vbmi.s) +set(avx512_ex23_ass decompress_novbmi.s decompress_vbmi.s) elseif(MSVC) -set(avx512_ex23_srcs ${avx512_ex23_srcs} decompress_novbmi.asm decompress_vbmi.asm) +set(avx512_ex23_ass decompress_novbmi.asm decompress_vbmi.asm) endif() -add_executable(avx512_ex23_tests ${avx512_ex23_srcs}) - +add_executable(avx512_ex23_tests ex23_test.cpp decompress_novbmi.c decompress_vbmi.c ${avx512_ex23_ass}) target_link_libraries(avx512_ex23_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex23_bench ex23_bench.cpp ${avx512_ex23_ass}) + target_link_libraries(avx512_ex23_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex23_test COMMAND avx512_ex23_tests) diff --git a/chap18/ex23/decompress_novbmi.s b/chap18/ex23/decompress_novbmi.s index 31e8f1b..684a889 100644 --- a/chap18/ex23/decompress_novbmi.s +++ b/chap18/ex23/decompress_novbmi.s @@ -71,3 +71,7 @@ loop: jb loop ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex23/decompress_vbmi.s 
b/chap18/ex23/decompress_vbmi.s index 3100f47..a2f206a 100644 --- a/chap18/ex23/decompress_vbmi.s +++ b/chap18/ex23/decompress_vbmi.s @@ -73,3 +73,8 @@ multishift_ctrl: .byte 0, 5, 10,15,20,25,30,35 .byte 0, 5, 10,15,20,25,30,35 .byte 0, 5, 10,15,20,25,30,35 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + diff --git a/chap18/ex23/ex23_bench.cpp b/chap18/ex23/ex23_bench.cpp new file mode 100644 index 0000000..4639aff --- /dev/null +++ b/chap18/ex23/ex23_bench.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "decompress_novbmi.h" +#include "decompress_vbmi.h" + +static void init_sources(uint8_t *a, uint8_t *out, int max_input_size, int len) +{ + for (int i = 0; i < max_input_size; i++) { + a[i] = static_cast(i % 255); + } + + memset(out, 0, len); +} + +static void BM_decompress_novbmi(benchmark::State &state) +{ + int len = state.range(0); + int input_size = (len / 40) * 40; + int max_input_size = input_size + 24; + + uint8_t *a = (uint8_t *)malloc(max_input_size); + uint8_t *out = (uint8_t *)malloc(len); + + init_sources(a, out, max_input_size, len); + + for (auto _ : state) { + decompress_novbmi(len, out, a); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(input_size)); + + free(out); + free(a); +} + +static void BM_decompress_vbmi(benchmark::State &state) +{ + if (!supports_avx512_icl()) { + state.SkipWithError("VBMI not supported, skipping test"); + return; + } + + int len = state.range(0); + int input_size = (len / 40) * 40; + int max_input_size = input_size + 24; + + uint8_t *a = (uint8_t *)_mm_malloc(max_input_size, 64); + uint8_t *out = (uint8_t *)_mm_malloc(len, 64); + + init_sources(a, out, max_input_size, len); + + for (auto _ : state) { + decompress_vbmi(out, a, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(input_size)); + + _mm_free(out); + _mm_free(a); +} + +BENCHMARK(BM_decompress_novbmi) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_decompress_vbmi) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex24/CMakeLists.txt b/chap18/ex24/CMakeLists.txt index b2b992b..795aa38 100644 --- a/chap18/ex24/CMakeLists.txt +++ b/chap18/ex24/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex24_srcs ex24_test.cpp) if(CMAKE_CXX_COMPILER_ID MATCHES 
Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex24_srcs ${avx512_ex24_srcs} only_256bit.s both_256_512bit.s) +set(avx512_ex24_ass only_256bit.s both_256_512bit.s) elseif(MSVC) -set(avx512_ex24_srcs ${avx512_ex24_srcs} only_256bit.asm both_256_512bit.asm) +set(avx512_ex24_ass only_256bit.asm both_256_512bit.asm) endif() -add_executable(avx512_ex24_tests ${avx512_ex24_srcs}) - +add_executable(avx512_ex24_tests ex24_test.cpp ${avx512_ex24_ass}) target_link_libraries(avx512_ex24_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex24_bench ex24_bench.cpp ${avx512_ex24_ass}) + target_link_libraries(avx512_ex24_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex24_test COMMAND avx512_ex24_tests) diff --git a/chap18/ex24/both_256_512bit.s b/chap18/ex24/both_256_512bit.s index 1e4bbac..9eddc26 100644 --- a/chap18/ex24/both_256_512bit.s +++ b/chap18/ex24/both_256_512bit.s @@ -57,3 +57,7 @@ Loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex24/ex24_bench.cpp b/chap18/ex24/ex24_bench.cpp new file mode 100644 index 0000000..4707d34 --- /dev/null +++ b/chap18/ex24/ex24_bench.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "both_256_512bit.h" +#include "only_256bit.h" + +static void BM_only_256bit(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + uint64_t len = static_cast(state.range(0)); + for (auto _ : state) { + only_256bit(len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(256)); +} + +static void BM_both_256_512bit(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + uint64_t len = static_cast(state.range(0)); + for (auto _ : state) { + both_256_512bit(len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(256)); +} + +BENCHMARK(BM_only_256bit) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_both_256_512bit) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex24/only_256bit.s b/chap18/ex24/only_256bit.s index e042784..606b476 100644 --- a/chap18/ex24/only_256bit.s +++ b/chap18/ex24/only_256bit.s @@ -57,3 +57,7 @@ Loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex25/fma_only_tpt.s b/chap18/ex25/fma_only_tpt.s index 4f09011..1248f24 100644 --- a/chap18/ex25/fma_only_tpt.s +++ b/chap18/ex25/fma_only_tpt.s @@ -64,3 +64,7 @@ loop1: .p2align 6 one_vec: .double 1, 1, 1, 1, 1, 1, 1, 1 
+ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex25/fma_shuffle_tpt.s b/chap18/ex25/fma_shuffle_tpt.s index 1c6c1b3..b59e24b 100644 --- a/chap18/ex25/fma_shuffle_tpt.s +++ b/chap18/ex25/fma_shuffle_tpt.s @@ -91,3 +91,7 @@ one_vec: .double 1, 1, 1, 1, 1, 1, 1, 1 shuf_vec: .4byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex26/CMakeLists.txt b/chap18/ex26/CMakeLists.txt index aafe891..3db6eae 100644 --- a/chap18/ex26/CMakeLists.txt +++ b/chap18/ex26/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex26_srcs ex26_test.cpp g2s_vpgatherdd.c g2s_vpermi2d.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex26_srcs ${avx512_ex26_srcs} g2s_vpgatherdd.s g2s_vpermi2d.s) +set(avx512_ex26_ass g2s_vpgatherdd.s g2s_vpermi2d.s) elseif(MSVC) -set(avx512_ex26_srcs ${avx512_ex26_srcs} g2s_vpgatherdd.asm g2s_vpermi2d.asm) +set(avx512_ex26_ass g2s_vpgatherdd.asm g2s_vpermi2d.asm) endif() -add_executable(avx512_ex26_tests ${avx512_ex26_srcs}) - +add_executable(avx512_ex26_tests ex26_test.cpp g2s_vpgatherdd.c g2s_vpermi2d.c ${avx512_ex26_ass}) target_link_libraries(avx512_ex26_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex26_bench ex26_bench.cpp ${avx512_ex26_ass}) + target_link_libraries(avx512_ex26_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex26_test COMMAND avx512_ex26_tests) diff --git a/chap18/ex26/ex26_bench.cpp b/chap18/ex26/ex26_bench.cpp new file mode 100644 index 0000000..e4628db --- /dev/null +++ b/chap18/ex26/ex26_bench.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "g2s_vpermi2d.h" +#include "g2s_vpgatherdd.h" + +static void init_sources(complex_num *aos, float *soa_real, + float *soa_imaginary, int len) +{ + for (int i = 0; i < len; i++) { + aos[i].real = (float)i; + aos[i].imaginary = (float)i + 1; + soa_real[i] = 0.0; + soa_imaginary[i] = 0.0; + } +} + +static void BM_g2s_vpgatherdd(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + complex_num *aos = + reinterpret_cast(_mm_malloc(sizeof(*aos) * len, 64)); + float *soa_real = + reinterpret_cast(_mm_malloc(sizeof(*soa_real) * len, 64)); + float *soa_imaginary = reinterpret_cast( + _mm_malloc(sizeof(*soa_imaginary) * len, 64)); + + init_sources(aos, soa_real, soa_imaginary, len); + + for (auto _ : state) { + g2s_vpgatherdd(len, aos, soa_imaginary, soa_real); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +static void BM_g2s_vpermi2d(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + complex_num *aos = + reinterpret_cast(_mm_malloc(sizeof(*aos) * len, 64)); + float *soa_real = + 
reinterpret_cast(_mm_malloc(sizeof(*soa_real) * len, 64)); + float *soa_imaginary = reinterpret_cast( + _mm_malloc(sizeof(*soa_imaginary) * len, 64)); + + init_sources(aos, soa_real, soa_imaginary, len); + + for (auto _ : state) { + g2s_vpermi2d(len, aos, soa_imaginary, soa_real); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +BENCHMARK(BM_g2s_vpgatherdd) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_g2s_vpermi2d) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex26/ex26_test.cpp b/chap18/ex26/ex26_test.cpp index 3852311..6ead02a 100644 --- a/chap18/ex26/ex26_test.cpp +++ b/chap18/ex26/ex26_test.cpp @@ -25,7 +25,7 @@ static complex_num aos[MAX_SIZE]; static float soa_real[MAX_SIZE]; static float soa_imaginary[MAX_SIZE]; -void init_sources() +static void init_sources() { for (size_t i = 0; i < MAX_SIZE; i++) { aos[i].real = (float)i; diff --git a/chap18/ex26/g2s_vpermi2d.s b/chap18/ex26/g2s_vpermi2d.s index 69f7d64..28942d0 100644 --- a/chap18/ex26/g2s_vpermi2d.s +++ b/chap18/ex26/g2s_vpermi2d.s @@ -69,3 +69,7 @@ gather_imag_index: .4byte 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 gather_real_index: .4byte 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex26/g2s_vpgatherdd.s b/chap18/ex26/g2s_vpgatherdd.s index 9acc897..2e693f8 100644 --- a/chap18/ex26/g2s_vpgatherdd.s +++ b/chap18/ex26/g2s_vpgatherdd.s @@ -83,3 +83,7 @@ gather_imag_index: .4byte 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 gather_real_index: .4byte 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + +#if defined(__linux__) && 
defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex27/CMakeLists.txt b/chap18/ex27/CMakeLists.txt index c21f581..4a9d191 100644 --- a/chap18/ex27/CMakeLists.txt +++ b/chap18/ex27/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex27_srcs ex27_test.cpp s2s_vscatterdps.c s2s_verpmi2d.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex27_srcs ${avx512_ex27_srcs} s2s_vscatterdps.s s2s_vpermi2d.s) +set(avx512_ex27_ass s2s_vscatterdps.s s2s_vpermi2d.s) elseif(MSVC) -set(avx512_ex27_srcs ${avx512_ex27_srcs} s2s_vscatterdps.asm s2s_vpermi2d.asm) +set(avx512_ex27_ass s2s_vscatterdps.asm s2s_vpermi2d.asm) endif() -add_executable(avx512_ex27_tests ${avx512_ex27_srcs}) - +add_executable(avx512_ex27_tests ex27_test.cpp s2s_vscatterdps.c s2s_verpmi2d.c ${avx512_ex27_ass}) target_link_libraries(avx512_ex27_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex27_bench ex27_bench.cpp ${avx512_ex27_ass}) + target_link_libraries(avx512_ex27_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex27_test COMMAND avx512_ex27_tests) diff --git a/chap18/ex27/ex27_bench.cpp b/chap18/ex27/ex27_bench.cpp new file mode 100644 index 0000000..3ceef79 --- /dev/null +++ b/chap18/ex27/ex27_bench.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "s2s_vpermi2d.h" +#include "s2s_vscatterdps.h" + +static void init_sources(complex_num *aos, float *soa_real, + float *soa_imaginary, int len) +{ + for (int i = 0; i < len; i++) { + soa_real[i] = (float)i; + soa_imaginary[i] = (float)i + 1; + aos[i].real = 0.0; + aos[i].imaginary = 0.0; + } +} + +static void BM_s2s_vscatterdps(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + complex_num *aos = + reinterpret_cast(_mm_malloc(sizeof(*aos) * len, 64)); + float *soa_real = + reinterpret_cast(_mm_malloc(sizeof(*soa_real) * len, 64)); + float *soa_imaginary = reinterpret_cast( + _mm_malloc(sizeof(*soa_imaginary) * len, 64)); + + init_sources(aos, soa_real, soa_imaginary, len); + + for (auto _ : state) { + s2s_vscatterdps(len, soa_imaginary, soa_real, aos); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +static void BM_s2s_vpermi2d(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + complex_num *aos = + reinterpret_cast(_mm_malloc(sizeof(*aos) * len, 64)); + float *soa_real = + reinterpret_cast(_mm_malloc(sizeof(*soa_real) * len, 64)); + float *soa_imaginary = reinterpret_cast( + _mm_malloc(sizeof(*soa_imaginary) * len, 64)); + + init_sources(aos, soa_real, soa_imaginary, len); 
+ + for (auto _ : state) { + s2s_vpermi2d(len, soa_imaginary, soa_real, aos); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*aos))); + + _mm_free(soa_imaginary); + _mm_free(soa_real); + _mm_free(aos); +} + +BENCHMARK(BM_s2s_vscatterdps) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_s2s_vpermi2d) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex27/s2s_vpermi2d.s b/chap18/ex27/s2s_vpermi2d.s index 248e075..903c734 100644 --- a/chap18/ex27/s2s_vpermi2d.s +++ b/chap18/ex27/s2s_vpermi2d.s @@ -65,3 +65,7 @@ first_half: .4byte 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 second_half: .4byte 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex27/s2s_vscatterdps.s b/chap18/ex27/s2s_vscatterdps.s index a4fab76..f07a0a5 100644 --- a/chap18/ex27/s2s_vscatterdps.s +++ b/chap18/ex27/s2s_vscatterdps.s @@ -65,3 +65,7 @@ gather_imag_index: .4byte 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 gather_real_index: .4byte 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex28/CMakeLists.txt b/chap18/ex28/CMakeLists.txt index 3d00574..54378b3 100644 --- a/chap18/ex28/CMakeLists.txt +++ b/chap18/ex28/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex28_srcs ex28_test.cpp adj_vpgatherpd.c adj_load_masked_broadcast.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex28_srcs ${avx512_ex28_srcs} adj_vpgatherpd.s adj_load_masked_broadcast.s) +set(avx512_ex28_ass adj_vpgatherpd.s adj_load_masked_broadcast.s) 
elseif(MSVC) -set(avx512_ex28_srcs ${avx512_ex28_srcs} adj_vpgatherpd.asm adj_load_masked_broadcast.asm) +set(avx512_ex28_ass adj_vpgatherpd.asm adj_load_masked_broadcast.asm) endif() -add_executable(avx512_ex28_tests ${avx512_ex28_srcs}) - +add_executable(avx512_ex28_tests ex28_test.cpp adj_vpgatherpd.c adj_load_masked_broadcast.c ${avx512_ex28_ass}) target_link_libraries(avx512_ex28_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex28_bench ex28_bench.cpp ${avx512_ex28_ass}) + target_link_libraries(avx512_ex28_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex28_test COMMAND avx512_ex28_tests) diff --git a/chap18/ex28/adj_load_masked_broadcast.s b/chap18/ex28/adj_load_masked_broadcast.s index 575deee..de61293 100644 --- a/chap18/ex28/adj_load_masked_broadcast.s +++ b/chap18/ex28/adj_load_masked_broadcast.s @@ -56,3 +56,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex28/adj_vpgatherpd.s b/chap18/ex28/adj_vpgatherpd.s index 0df792a..7aa8a6c 100644 --- a/chap18/ex28/adj_vpgatherpd.s +++ b/chap18/ex28/adj_vpgatherpd.s @@ -73,3 +73,7 @@ index_inc: .4byte 0, 8, 16, 24, 0, 8, 16, 24 index_scale: .4byte 32, 32, 32, 32, 32, 32, 32, 32 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex28/ex28_bench.cpp b/chap18/ex28/ex28_bench.cpp new file mode 100644 index 0000000..d18c2fd --- /dev/null +++ b/chap18/ex28/ex28_bench.cpp @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "optimisation_common.h" + +#include "adj_load_masked_broadcast.h" +#include "adj_vpgatherpd.h" +#include "elem_struct.h" + +static void init_sources(elem_struct_t *in, int32_t *indices, int len) +{ + for (int32_t i = 0; i < len; i++) { + for (size_t j = 0; j < 4; j++) + in[i].var[j] = ((double)rand()) / RAND_MAX; + indices[i] = i; + } + + for (int i = 0; i < len; i++) { + size_t a = rand() % len; + size_t b = rand() % len; + int32_t tmp = indices[a]; + indices[a] = indices[b]; + indices[b] = tmp; + } +} + +static void BM_adj_vpgatherpd(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + elem_struct_t *in = + reinterpret_cast(malloc(sizeof(*in) * len)); + double *out = + reinterpret_cast(_mm_malloc(4 * sizeof(*out) * len, 64)); + int32_t *indices = + reinterpret_cast(_mm_malloc(sizeof(*indices) * len, 64)); + + init_sources(in, indices, len); + + for (auto _ : state) { + adj_vpgatherpd(len, indices, in, out); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in))); + + _mm_free(indices); + _mm_free(out); + free(in); +} + +static void BM_adj_load_masked_broadcast(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + elem_struct_t *in = + reinterpret_cast(malloc(sizeof(*in) * len)); + double *out = + reinterpret_cast(_mm_malloc(4 * sizeof(*out) * len, 64)); + int32_t *indices = + 
reinterpret_cast(_mm_malloc(sizeof(*indices) * len, 64)); + + init_sources(in, indices, len); + + for (auto _ : state) { + adj_load_masked_broadcast(len, indices, in, out); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*in))); + + _mm_free(indices); + _mm_free(out); + free(in); +} + +BENCHMARK(BM_adj_vpgatherpd) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_adj_load_masked_broadcast) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex29/CMakeLists.txt b/chap18/ex29/CMakeLists.txt index e900f62..e557ca2 100644 --- a/chap18/ex29/CMakeLists.txt +++ b/chap18/ex29/CMakeLists.txt @@ -1,10 +1,13 @@ -set(avx512_ex29_srcs ex29_test.cpp saxpy_512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex29_srcs ${avx512_ex29_srcs} saxpy_512.s) +set(avx512_ex29_ass saxpy_512.s) elseif(MSVC) -set(avx512_ex29_srcs ${avx512_ex29_srcs} saxpy_512.asm) +set(avx512_ex29_ass saxpy_512.asm) endif() -add_executable(avx512_ex29_tests ${avx512_ex29_srcs}) - +add_executable(avx512_ex29_tests ex29_test.cpp saxpy_512.c ${avx512_ex29_ass}) target_link_libraries(avx512_ex29_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex29_bench ex29_bench.cpp ${avx512_ex29_ass}) + target_link_libraries(avx512_ex29_bench benchmark::benchmark optimisation_common) +ENDIF() add_test(NAME avx512_ex29_test COMMAND avx512_ex29_tests) diff --git a/chap18/ex29/ex29_bench.cpp b/chap18/ex29/ex29_bench.cpp new file mode 100644 index 0000000..7078568 --- /dev/null +++ b/chap18/ex29/ex29_bench.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or 
without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "optimisation_common.h" + +#include "saxpy_512.h" + +static void init_sources(float *src, float *src2, int len) +{ + for (int32_t i = 0; i < len; i++) { + src[i] = 2.0f * i; + src2[i] = 3.0f * i; + } +} + +static void BM_saxpy512_aligned(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + float *src = + reinterpret_cast(_mm_malloc(sizeof(*src) * len, 64)); + float *src2 = + reinterpret_cast(_mm_malloc(sizeof(*src2) * len, 64)); + float *dest = + reinterpret_cast(_mm_malloc(sizeof(*dest) * len, 64)); + + init_sources(src, src2, len); + + for (auto _ : state) { + saxpy_512(src, src2, len, dest, 10.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*src)) * int64_t(2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src); +} + +static void BM_saxpy512_unaligned_dest(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + float *src = + reinterpret_cast(_mm_malloc(sizeof(*src) * len, 64)); + float *src2 = + reinterpret_cast(_mm_malloc(sizeof(*src2) * len, 64)); + float *dest = reinterpret_cast( + _mm_malloc(sizeof(*dest) * (len + 1), 64)); + + init_sources(src, src2, len); + + for (auto _ : 
state) { + saxpy_512(src, src2, len, &dest[1], 10.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*src)) * int64_t(2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src); +} + +static void BM_saxpy512_unaligned(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + float *src = + reinterpret_cast(_mm_malloc(sizeof(*src) * (len + 1), 64)); + float *src2 = reinterpret_cast( + _mm_malloc(sizeof(*src2) * (len + 1), 64)); + float *dest = reinterpret_cast( + _mm_malloc(sizeof(*dest) * (len + 1), 64)); + + init_sources(&src[1], &src2[1], len); + + for (auto _ : state) { + saxpy_512(&src[1], &src2[1], len, &dest[1], 10.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*src)) * int64_t(2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src); +} + +static void BM_saxpy512_unaligned_src(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = static_cast(state.range(0)); + float *src = + reinterpret_cast(_mm_malloc(sizeof(*src) * len, 64)); + float *src2 = + reinterpret_cast(_mm_malloc(sizeof(*src2) * len, 64)); + float *dest = reinterpret_cast( + _mm_malloc(sizeof(*dest) * (len + 1), 64)); + + init_sources(&src[1], src2, len); + + for (auto _ : state) { + saxpy_512(&src[1], src2, len, dest, 10.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * + int64_t(sizeof(*src)) * int64_t(2)); + + _mm_free(dest); + _mm_free(src2); + _mm_free(src); +} + +BENCHMARK(BM_saxpy512_aligned) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy512_unaligned_dest) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 13) + 
->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy512_unaligned) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_saxpy512_unaligned_src) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 13) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex29/saxpy_512.s b/chap18/ex29/saxpy_512.s index 2173891..bcfe46c 100644 --- a/chap18/ex29/saxpy_512.s +++ b/chap18/ex29/saxpy_512.s @@ -64,3 +64,7 @@ mainloop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex3/CMakeLists.txt b/chap18/ex3/CMakeLists.txt index 68f28fb..b550688 100644 --- a/chap18/ex3/CMakeLists.txt +++ b/chap18/ex3/CMakeLists.txt @@ -1,6 +1,13 @@ if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) set_property(SOURCE mul_blend_avx512.c APPEND PROPERTY COMPILE_OPTIONS "-mavx512f") endif() -add_executable(avx512_ex3_tests ex3_test.cpp mul_blend_avx.c mul_blend_avx512.c) +set(avx512_ex3_srcs mul_blend_avx.c mul_blend_avx512.c) + +add_executable(avx512_ex3_tests ex3_test.cpp ${avx512_ex3_srcs}) target_link_libraries(avx512_ex3_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex3_bench ex3_bench.cpp ${avx512_ex3_srcs}) + target_link_libraries(avx512_ex3_bench benchmark::benchmark optimisation_common) +ENDIF() add_test(NAME avx512_ex3_test COMMAND avx512_ex3_tests) diff --git a/chap18/ex3/ex3_bench.cpp b/chap18/ex3/ex3_bench.cpp new file mode 100644 index 0000000..f1f5687 --- /dev/null +++ b/chap18/ex3/ex3_bench.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "mul_blend_avx.h" +#include "mul_blend_avx512.h" +#include "optimisation_common.h" + +static void init_sources(int len, double *a, double *b, double *c) +{ + for (int i = 0; i < len; i++) { + a[i] = (float)((i & 1) ? i : 0); + b[i] = (float)i + 1; + c[i] = 0.0; + } +} + +static void BM_blend_avx(benchmark::State &state) +{ + int len = state.range(0); + double *a = (double *)_mm_malloc(len * sizeof(double), 32); + double *b = (double *)_mm_malloc(len * sizeof(double), 32); + double *c = (double *)_mm_malloc(len * sizeof(double), 32); + + init_sources(len, a, b, c); + for (auto _ : state) { + mul_blend_avx_check(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * 2 * + int64_t(sizeof(a[0]))); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +static void BM_blend_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + double *a = (double *)_mm_malloc(len * sizeof(double), 64); + double *b = (double *)_mm_malloc(len * sizeof(double), 64); + double *c = (double *)_mm_malloc(len * sizeof(double), 64); + + init_sources(len, a, b, c); + for (auto _ : state) { + mul_blend_avx512(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * 2 * + int64_t(sizeof(a[0]))); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + 
+BENCHMARK(BM_blend_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_blend_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex30/CMakeLists.txt b/chap18/ex30/CMakeLists.txt index 260f2b0..f826004 100644 --- a/chap18/ex30/CMakeLists.txt +++ b/chap18/ex30/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex30_srcs ex30_test.cpp single_div_24.c single_div_23.c single_div_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex30_srcs ${avx512_ex30_srcs} single_div_24.s single_div_23.s single_div_14.s) +set(avx512_ex30_ass single_div_24.s single_div_23.s single_div_14.s) elseif(MSVC) -set(avx512_ex30_srcs ${avx512_ex30_srcs} single_div_24.asm single_div_23.asm single_div_14.asm) +set(avx512_ex30_ass single_div_24.asm single_div_23.asm single_div_14.asm) endif() -add_executable(avx512_ex30_tests ${avx512_ex30_srcs}) +add_executable(avx512_ex30_tests ex30_test.cpp single_div_24.c single_div_23.c single_div_14.c ${avx512_ex30_ass}) target_link_libraries(avx512_ex30_tests gtest_main optimisation_common) add_test(NAME avx512_ex30_test COMMAND avx512_ex30_tests) diff --git a/chap18/ex30/single_div_14.s b/chap18/ex30/single_div_14.s index ef99336..6c9d44d 100644 --- a/chap18/ex30/single_div_14.s +++ b/chap18/ex30/single_div_14.s @@ -40,3 +40,7 @@ single_div_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex30/single_div_23.s b/chap18/ex30/single_div_23.s index 17f7b69..e203414 100644 --- a/chap18/ex30/single_div_23.s +++ b/chap18/ex30/single_div_23.s @@ -43,3 +43,7 @@ single_div_23: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git 
a/chap18/ex30/single_div_24.s b/chap18/ex30/single_div_24.s index b7b409a..107c164 100644 --- a/chap18/ex30/single_div_24.s +++ b/chap18/ex30/single_div_24.s @@ -37,3 +37,7 @@ single_div_24: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex31/CMakeLists.txt b/chap18/ex31/CMakeLists.txt index 960d5a9..158df77 100644 --- a/chap18/ex31/CMakeLists.txt +++ b/chap18/ex31/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex31_srcs ex31_test.cpp single_rcps_22.c single_rcps_23.c single_rcps_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex31_srcs ${avx512_ex31_srcs} single_rcps_22.s single_rcps_23.s single_rcps_14.s) +set(avx512_ex31_ass single_rcps_22.s single_rcps_23.s single_rcps_14.s) elseif(MSVC) -set(avx512_ex31_srcs ${avx512_ex31_srcs} single_rcps_22.asm single_rcps_23.asm single_rcps_14.asm) +set(avx512_ex31_ass single_rcps_22.asm single_rcps_23.asm single_rcps_14.asm) endif() -add_executable(avx512_ex31_tests ${avx512_ex31_srcs}) +add_executable(avx512_ex31_tests ex31_test.cpp single_rcps_22.c single_rcps_23.c single_rcps_14.c ${avx512_ex31_ass}) target_link_libraries(avx512_ex31_tests gtest_main optimisation_common) add_test(NAME avx512_ex31_test COMMAND avx512_ex31_tests) diff --git a/chap18/ex31/single_rcps_14.s b/chap18/ex31/single_rcps_14.s index d2d0df9..7706fa9 100644 --- a/chap18/ex31/single_rcps_14.s +++ b/chap18/ex31/single_rcps_14.s @@ -37,3 +37,7 @@ single_rcps_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex31/single_rcps_22.s b/chap18/ex31/single_rcps_22.s index 351de7c..dd6b171 100644 --- a/chap18/ex31/single_rcps_22.s +++ b/chap18/ex31/single_rcps_22.s @@ -44,3 +44,7 @@ single_rcps_22: .p2align 2 one: .float 1.0 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/chap18/ex31/single_rcps_23.s b/chap18/ex31/single_rcps_23.s index 0ca126c..06a837d 100644 --- a/chap18/ex31/single_rcps_23.s +++ b/chap18/ex31/single_rcps_23.s @@ -49,3 +49,7 @@ single_rcps_23: .p2align 2 half: .float 0.5 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex32/CMakeLists.txt b/chap18/ex32/CMakeLists.txt index ce6f7ea..8f68eca 100644 --- a/chap18/ex32/CMakeLists.txt +++ b/chap18/ex32/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex32_srcs ex32_test.cpp single_sqrt_24.c single_sqrt_23.c single_sqrt_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex32_srcs ${avx512_ex32_srcs} single_sqrt_24.s single_sqrt_23.s single_sqrt_14.s) +set(avx512_ex32_ass single_sqrt_24.s single_sqrt_23.s single_sqrt_14.s) elseif(MSVC) -set(avx512_ex32_srcs ${avx512_ex32_srcs} single_sqrt_24.asm single_sqrt_23.asm single_sqrt_14.asm) +set(avx512_ex32_ass single_sqrt_24.asm single_sqrt_23.asm single_sqrt_14.asm) endif() -add_executable(avx512_ex32_tests ${avx512_ex32_srcs}) +add_executable(avx512_ex32_tests ex32_test.cpp single_sqrt_24.c single_sqrt_23.c single_sqrt_14.c ${avx512_ex32_ass}) target_link_libraries(avx512_ex32_tests gtest_main optimisation_common) add_test(NAME avx512_ex32_test COMMAND avx512_ex32_tests) diff --git a/chap18/ex32/single_sqrt_14.s b/chap18/ex32/single_sqrt_14.s index 7e2fb82..6a95c9d 100644 --- a/chap18/ex32/single_sqrt_14.s +++ b/chap18/ex32/single_sqrt_14.s @@ -40,3 +40,7 @@ single_sqrt_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex32/single_sqrt_23.s b/chap18/ex32/single_sqrt_23.s index 5939687..6d1d7d6 100644 --- a/chap18/ex32/single_sqrt_23.s +++ b/chap18/ex32/single_sqrt_23.s @@ -49,3 +49,7 @@ single_sqrt_23: .p2align 2 half: .float 0.5 + +#if defined(__linux__) && defined(__ELF__) 
+.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex32/single_sqrt_24.s b/chap18/ex32/single_sqrt_24.s index 0da12a2..0b90879 100644 --- a/chap18/ex32/single_sqrt_24.s +++ b/chap18/ex32/single_sqrt_24.s @@ -37,3 +37,7 @@ single_sqrt_24: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex33/CMakeLists.txt b/chap18/ex33/CMakeLists.txt index fe719d6..268e375 100644 --- a/chap18/ex33/CMakeLists.txt +++ b/chap18/ex33/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex33_srcs ex33_test.cpp double_div_53.c double_div_52.c double_div_26.c double_div_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex33_srcs ${avx512_ex33_srcs} double_div_53.s double_div_52.s double_div_26.s double_div_14.s) +set(avx512_ex33_ass double_div_53.s double_div_52.s double_div_26.s double_div_14.s) elseif(MSVC) -set(avx512_ex33_srcs ${avx512_ex33_srcs} double_div_53.asm double_div_52.asm double_div_26.asm double_div_14.asm) +set(avx512_ex33_ass double_div_53.asm double_div_52.asm double_div_26.asm double_div_14.asm) endif() -add_executable(avx512_ex33_tests ${avx512_ex33_srcs}) +add_executable(avx512_ex33_tests ex33_test.cpp double_div_53.c double_div_52.c double_div_26.c double_div_14.c ${avx512_ex33_ass}) target_link_libraries(avx512_ex33_tests gtest_main optimisation_common) add_test(NAME avx512_ex33_test COMMAND avx512_ex33_tests) diff --git a/chap18/ex33/double_div_14.s b/chap18/ex33/double_div_14.s index cefa8a9..ab18b44 100644 --- a/chap18/ex33/double_div_14.s +++ b/chap18/ex33/double_div_14.s @@ -40,3 +40,7 @@ double_div_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex33/double_div_26.s b/chap18/ex33/double_div_26.s index aaf8356..deb49cf 100644 --- a/chap18/ex33/double_div_26.s +++ b/chap18/ex33/double_div_26.s @@ -44,3 +44,7 
@@ double_div_26: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex33/double_div_52.s b/chap18/ex33/double_div_52.s index e739bb5..e643c87 100644 --- a/chap18/ex33/double_div_52.s +++ b/chap18/ex33/double_div_52.s @@ -54,3 +54,7 @@ double_div_52: .p2align 3 one: .double 1.0 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex33/double_div_53.s b/chap18/ex33/double_div_53.s index 3e4b221..1fc90ef 100644 --- a/chap18/ex33/double_div_53.s +++ b/chap18/ex33/double_div_53.s @@ -39,3 +39,7 @@ double_div_53: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex34/CMakeLists.txt b/chap18/ex34/CMakeLists.txt index d112f51..8610cad 100644 --- a/chap18/ex34/CMakeLists.txt +++ b/chap18/ex34/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex34_srcs ex34_test.cpp double_rsqrt_52.c double_rsqrt_51.c double_rsqrt_50.c double_rsqrt_26.c double_rsqrt_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex34_srcs ${avx512_ex34_srcs} double_rsqrt_52.s double_rsqrt_51.s double_rsqrt_50.s double_rsqrt_26.s double_rsqrt_14.s) +set(avx512_ex34_ass double_rsqrt_52.s double_rsqrt_51.s double_rsqrt_50.s double_rsqrt_26.s double_rsqrt_14.s) elseif(MSVC) -set(avx512_ex34_srcs ${avx512_ex34_srcs} double_rsqrt_52.asm double_rsqrt_51.asm double_rsqrt_50.asm double_rsqrt_26.asm double_rsqrt_14.asm) +set(avx512_ex34_ass double_rsqrt_52.asm double_rsqrt_51.asm double_rsqrt_50.asm double_rsqrt_26.asm double_rsqrt_14.asm) endif() -add_executable(avx512_ex34_tests ${avx512_ex34_srcs}) +add_executable(avx512_ex34_tests ex34_test.cpp double_rsqrt_52.c double_rsqrt_51.c double_rsqrt_50.c double_rsqrt_26.c double_rsqrt_14.c ${avx512_ex34_ass}) target_link_libraries(avx512_ex34_tests gtest_main 
optimisation_common) add_test(NAME avx512_ex34_test COMMAND avx512_ex34_tests) diff --git a/chap18/ex34/double_rsqrt_14.s b/chap18/ex34/double_rsqrt_14.s index 1c4ff04..0a58870 100644 --- a/chap18/ex34/double_rsqrt_14.s +++ b/chap18/ex34/double_rsqrt_14.s @@ -37,3 +37,7 @@ double_rsqrt_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex34/double_rsqrt_26.s b/chap18/ex34/double_rsqrt_26.s index a9e1cc6..b344dfc 100644 --- a/chap18/ex34/double_rsqrt_26.s +++ b/chap18/ex34/double_rsqrt_26.s @@ -48,3 +48,6 @@ double_rsqrt_26: half: .double 0.5 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex34/double_rsqrt_50.s b/chap18/ex34/double_rsqrt_50.s index dff738b..107208c 100644 --- a/chap18/ex34/double_rsqrt_50.s +++ b/chap18/ex34/double_rsqrt_50.s @@ -91,3 +91,6 @@ dc3: .quad 0x3FD4000005E80001 .quad 0x3FD4000005E80001 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex34/double_rsqrt_51.s b/chap18/ex34/double_rsqrt_51.s index afea132..b3e7c77 100644 --- a/chap18/ex34/double_rsqrt_51.s +++ b/chap18/ex34/double_rsqrt_51.s @@ -44,3 +44,7 @@ double_rsqrt_51: .p2align 3 one: .double 1.0 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex34/double_rsqrt_52.s b/chap18/ex34/double_rsqrt_52.s index ef1193a..dfc70e6 100644 --- a/chap18/ex34/double_rsqrt_52.s +++ b/chap18/ex34/double_rsqrt_52.s @@ -93,3 +93,6 @@ dc3: .quad 0x3FD4000005E80001 .quad 0x3FD4000005E80001 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex35/CMakeLists.txt b/chap18/ex35/CMakeLists.txt index 823309e..0fc982c 100644 --- a/chap18/ex35/CMakeLists.txt +++ b/chap18/ex35/CMakeLists.txt @@ -1,10 +1,9 @@ -set(avx512_ex35_srcs ex35_test.cpp double_sqrt_52.c double_sqrt_53.c 
double_sqrt_26.c double_sqrt_14.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex35_srcs ${avx512_ex35_srcs} double_sqrt_52.s double_sqrt_53.s double_sqrt_26.s double_sqrt_14.s) +set(avx512_ex35_ass double_sqrt_52.s double_sqrt_53.s double_sqrt_26.s double_sqrt_14.s) elseif(MSVC) -set(avx512_ex35_srcs ${avx512_ex35_srcs} double_sqrt_52.asm double_sqrt_53.asm double_sqrt_26.asm double_sqrt_14.asm) +set(avx512_ex35_ass double_sqrt_52.asm double_sqrt_53.asm double_sqrt_26.asm double_sqrt_14.asm) endif() -add_executable(avx512_ex35_tests ${avx512_ex35_srcs}) +add_executable(avx512_ex35_tests ex35_test.cpp double_sqrt_52.c double_sqrt_53.c double_sqrt_26.c double_sqrt_14.c ${avx512_ex35_ass}) target_link_libraries(avx512_ex35_tests gtest_main optimisation_common) add_test(NAME avx512_ex35_test COMMAND avx512_ex35_tests) diff --git a/chap18/ex35/double_sqrt_14.s b/chap18/ex35/double_sqrt_14.s index 677448f..7f2ffe2 100644 --- a/chap18/ex35/double_sqrt_14.s +++ b/chap18/ex35/double_sqrt_14.s @@ -40,3 +40,7 @@ double_sqrt_14: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex35/double_sqrt_26.s b/chap18/ex35/double_sqrt_26.s index 541a2f0..d1d0f03 100644 --- a/chap18/ex35/double_sqrt_26.s +++ b/chap18/ex35/double_sqrt_26.s @@ -56,3 +56,7 @@ half: .quad 0x3FE0000000000000 .quad 0x3FE0000000000000 .quad 0x3FE0000000000000 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex35/double_sqrt_52.s b/chap18/ex35/double_sqrt_52.s index 5b762b3..d5aeceb 100644 --- a/chap18/ex35/double_sqrt_52.s +++ b/chap18/ex35/double_sqrt_52.s @@ -53,3 +53,7 @@ double_sqrt_52: .p2align 3 half: .double 0.5 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex35/double_sqrt_53.s b/chap18/ex35/double_sqrt_53.s 
index 0b2b5a6..c8a4002 100644 --- a/chap18/ex35/double_sqrt_53.s +++ b/chap18/ex35/double_sqrt_53.s @@ -37,3 +37,7 @@ double_sqrt_53: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex4/CMakeLists.txt b/chap18/ex4/CMakeLists.txt index 4b0a8f8..e2fe3b6 100644 --- a/chap18/ex4/CMakeLists.txt +++ b/chap18/ex4/CMakeLists.txt @@ -1,10 +1,15 @@ -set(avx512_ex4_srcs ex4_test.cpp mul_blend_avx.c mul_blend_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex4_srcs ${avx512_ex4_srcs} mul_blend_avx.s mul_blend_avx512.s) +set(avx512_ex4_ass mul_blend_avx.s mul_blend_avx512.s) elseif(MSVC) -set(avx512_ex4_srcs ${avx512_ex4_srcs} mul_blend_avx.asm mul_blend_avx512.asm) +set(avx512_ex4_ass mul_blend_avx.asm mul_blend_avx512.asm) endif() -add_executable(avx512_ex4_tests ${avx512_ex4_srcs}) +add_executable(avx512_ex4_tests ex4_test.cpp mul_blend_avx.c mul_blend_avx512.c ${avx512_ex4_ass}) target_link_libraries(avx512_ex4_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex4_bench ex4_bench.cpp ${avx512_ex4_ass}) + target_link_libraries(avx512_ex4_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex4_test COMMAND avx512_ex4_tests) diff --git a/chap18/ex4/ex4_bench.cpp b/chap18/ex4/ex4_bench.cpp new file mode 100644 index 0000000..a0359f2 --- /dev/null +++ b/chap18/ex4/ex4_bench.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "mul_blend_avx.h" +#include "mul_blend_avx512.h" +#include "optimisation_common.h" + +static void init_sources(int len, double *a, double *b, double *c) +{ + for (int i = 0; i < len; i++) { + a[i] = (float)((i & 1) ? i : 0); + b[i] = (float)i + 1; + c[i] = 0.0; + } +} + +static void BM_mul_blend_avx(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + double *a = (double *)_mm_malloc(len * sizeof(double), 32); + double *b = (double *)_mm_malloc(len * sizeof(double), 32); + double *c = (double *)_mm_malloc(len * sizeof(double), 32); + + init_sources(len, a, b, c); + for (auto _ : state) { + mul_blend_avx(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * 2 * + int64_t(sizeof(a[0]))); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +static void BM_mul_blend_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + double *a = (double *)_mm_malloc(len * sizeof(double), 64); + double *b = (double *)_mm_malloc(len * sizeof(double), 64); + double *c = (double *)_mm_malloc(len * sizeof(double), 64); + + init_sources(len, a, b, c); + for (auto _ : state) { + mul_blend_avx512(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * int64_t(len) * 2 * + int64_t(sizeof(a[0]))); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_mul_blend_avx) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 
12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_mul_blend_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex4/mul_blend_avx.s b/chap18/ex4/mul_blend_avx.s index 2a2d69c..243149b 100644 --- a/chap18/ex4/mul_blend_avx.s +++ b/chap18/ex4/mul_blend_avx.s @@ -104,3 +104,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex4/mul_blend_avx512.s b/chap18/ex4/mul_blend_avx512.s index 8e53f3b..bce4684 100644 --- a/chap18/ex4/mul_blend_avx512.s +++ b/chap18/ex4/mul_blend_avx512.s @@ -71,3 +71,7 @@ loop: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex6/CMakeLists.txt b/chap18/ex6/CMakeLists.txt index fbcccd1..6c0824c 100644 --- a/chap18/ex6/CMakeLists.txt +++ b/chap18/ex6/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex6_srcs ex6_test.cpp mask_avx512.c blend_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex6_srcs ${avx512_ex6_srcs} mask_avx512.s blend_avx512.s) +set(avx512_ex6_ass mask_avx512.s blend_avx512.s) elseif(MSVC) -set(avx512_ex6_srcs ${avx512_ex6_srcs} mask_avx512.asm blend_avx512.asm) +set(avx512_ex6_ass mask_avx512.asm blend_avx512.asm) endif() -add_executable(avx512_ex6_tests ${avx512_ex6_srcs}) +add_executable(avx512_ex6_tests ex6_test.cpp mask_avx512.c blend_avx512.c ${avx512_ex6_ass}) target_link_libraries(avx512_ex6_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex6_bench ex6_bench.cpp ${avx512_ex6_ass}) + target_link_libraries(avx512_ex6_bench benchmark::benchmark optimisation_common) +ENDIF() add_test(NAME avx512_ex6_test COMMAND avx512_ex6_tests) diff --git a/chap18/ex6/blend_avx512.s b/chap18/ex6/blend_avx512.s index 
c957ff1..5d2e4d6 100644 --- a/chap18/ex6/blend_avx512.s +++ b/chap18/ex6/blend_avx512.s @@ -55,3 +55,7 @@ mainloop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex6/ex6_bench.cpp b/chap18/ex6/ex6_bench.cpp new file mode 100644 index 0000000..549062c --- /dev/null +++ b/chap18/ex6/ex6_bench.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "blend_avx512.h" +#include "mask_avx512.h" +#include "optimisation_common.h" + +static void init_sources(int len, uint32_t *a, uint32_t *b, uint32_t *c) +{ + for (int i = 0; i < len; i++) { + b[i] = i; + a[i] = (uint32_t)i & 1; + c[i] = 0; + } +} + +static void BM_mask_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *c = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + + init_sources(len, a, b, c); + + for (auto _ : state) { + mask_avx512(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(a[0]) * 3)); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +static void BM_blend_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *c = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + + init_sources(len, a, b, c); + + for (auto _ : state) { + blend_avx512(a, b, c, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(a[0]) * 3)); + + _mm_free(c); + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_mask_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_blend_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex6/mask_avx512.s b/chap18/ex6/mask_avx512.s index d1a1d38..431cb2d 100644 --- a/chap18/ex6/mask_avx512.s +++ 
b/chap18/ex6/mask_avx512.s @@ -54,3 +54,7 @@ mainloop: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex7/CMakeLists.txt b/chap18/ex7/CMakeLists.txt index 7cbefe6..9088430 100644 --- a/chap18/ex7/CMakeLists.txt +++ b/chap18/ex7/CMakeLists.txt @@ -1,10 +1,15 @@ -set(avx512_ex7_srcs ex7_test.cpp mask_avx512.c blend_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex7_srcs ${avx512_ex7_srcs} mask_avx512.s blend_avx512.s) +set(avx512_ex7_ass mask_avx512.s blend_avx512.s) elseif(MSVC) -set(avx512_ex7_srcs ${avx512_ex7_srcs} mask_avx512.asm blend_avx512.asm) +set(avx512_ex7_ass mask_avx512.asm blend_avx512.asm) endif() -add_executable(avx512_ex7_tests ${avx512_ex7_srcs}) +add_executable(avx512_ex7_tests ex7_test.cpp mask_avx512.c blend_avx512.c ${avx512_ex7_ass}) target_link_libraries(avx512_ex7_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex7_bench ex7_bench.cpp ${avx512_ex7_ass}) + target_link_libraries(avx512_ex7_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex7_test COMMAND avx512_ex7_tests) diff --git a/chap18/ex7/blend_avx512.s b/chap18/ex7/blend_avx512.s index 70dd962..1c7f531 100644 --- a/chap18/ex7/blend_avx512.s +++ b/chap18/ex7/blend_avx512.s @@ -46,3 +46,7 @@ loop1: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex7/ex7_bench.cpp b/chap18/ex7/ex7_bench.cpp new file mode 100644 index 0000000..1a5bce1 --- /dev/null +++ b/chap18/ex7/ex7_bench.cpp @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "blend_avx512.h" +#include "mask_avx512.h" +#include "optimisation_common.h" + +static void init_sources(int len, uint32_t *a, uint32_t *b) +{ + for (int i = 0; i < len; i++) { + a[i] = (i & 1) ? i + 1 : i; + b[i] = i; + } +} + +static void BM_mask_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + + init_sources(len, a, b); + + for (auto _ : state) { + mask_avx512(a, b, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * 2 * sizeof(a[0]))); + + _mm_free(b); + _mm_free(a); +} + +static void BM_blend_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + + init_sources(len, a, b); + + for (auto _ : state) { + blend_avx512(a, b, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * 2 * sizeof(a[0]))); + + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_mask_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 
<< 16) + ->Arg(1 << 18); +BENCHMARK(BM_blend_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex7/mask_avx512.s b/chap18/ex7/mask_avx512.s index 079acaa..60a1272 100644 --- a/chap18/ex7/mask_avx512.s +++ b/chap18/ex7/mask_avx512.s @@ -47,3 +47,7 @@ loop1: pop rbx vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex8/CMakeLists.txt b/chap18/ex8/CMakeLists.txt index f5c2c8f..bd0cbd9 100644 --- a/chap18/ex8/CMakeLists.txt +++ b/chap18/ex8/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex8_srcs ex8_test.cpp mce_scalar.c mce_avx2.c mce_avx512.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex8_srcs ${avx512_ex8_srcs} mce_scalar.s mce_avx2.s mce_avx512.s) +set(avx512_ex8_ass mce_scalar.s mce_avx2.s mce_avx512.s) elseif(MSVC) -set(avx512_ex8_srcs ${avx512_ex8_srcs} mce_scalar.asm mce_avx2.asm mce_avx512.asm) +set(avx512_ex8_ass mce_scalar.asm mce_avx2.asm mce_avx512.asm) endif() -add_executable(avx512_ex8_tests ${avx512_ex8_srcs}) - +add_executable(avx512_ex8_tests ex8_test.cpp mce_scalar.c mce_avx2.c mce_avx512.c ${avx512_ex8_ass}) target_link_libraries(avx512_ex8_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex8_bench ex8_bench.cpp ${avx512_ex8_ass}) + target_link_libraries(avx512_ex8_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex8_test COMMAND avx512_ex8_tests) diff --git a/chap18/ex8/ex8_bench.cpp b/chap18/ex8/ex8_bench.cpp new file mode 100644 index 0000000..af1f4aa --- /dev/null +++ b/chap18/ex8/ex8_bench.cpp @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "mce_avx2.h" +#include "mce_avx512.h" +#include "mce_scalar.h" +#include "optimisation_common.h" + +static void init_sources(int len, uint32_t *a, uint32_t *b) +{ + for (int i = 0; i < len; i++) { + b[i] = i & 0xff; + a[i] = 0; + } +} + +static void BM_mce_scalar(benchmark::State &state) +{ + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 16); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 16); + + init_sources(len, a, b); + + for (auto _ : state) { + mce_scalar(a, b, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(b[0]))); + + _mm_free(b); + _mm_free(a); +} + +static void BM_mce_avx2(benchmark::State &state) +{ + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 32); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 32); + + init_sources(len, a, b); + + for (auto _ : state) { + mce_avx2(a, b, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(b[0]))); + + _mm_free(b); + _mm_free(a); +} + +static void BM_mce_avx512(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + uint32_t *a = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + uint32_t *b = (uint32_t *)_mm_malloc(len * sizeof(uint32_t), 64); + + init_sources(len, 
a, b); + + for (auto _ : state) { + mce_avx512(a, b, len); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(b[0]))); + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_mce_scalar) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_mce_avx2) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_mce_avx512) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex8/mce_avx2.s b/chap18/ex8/mce_avx2.s index 0f1cc9f..e7fe700 100644 --- a/chap18/ex8/mce_avx2.s +++ b/chap18/ex8/mce_avx2.s @@ -66,3 +66,6 @@ five: three: .int 3 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex8/mce_avx512.s b/chap18/ex8/mce_avx512.s index 04c1813..f975d05 100644 --- a/chap18/ex8/mce_avx512.s +++ b/chap18/ex8/mce_avx512.s @@ -64,3 +64,6 @@ five: three: .int 3 +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex8/mce_scalar.s b/chap18/ex8/mce_scalar.s index 32864f8..043333b 100644 --- a/chap18/ex8/mce_scalar.s +++ b/chap18/ex8/mce_scalar.s @@ -53,3 +53,7 @@ label1: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap18/ex9/CMakeLists.txt b/chap18/ex9/CMakeLists.txt index ea66386..f482379 100644 --- a/chap18/ex9/CMakeLists.txt +++ b/chap18/ex9/CMakeLists.txt @@ -1,10 +1,14 @@ -set(avx512_ex9_srcs ex9_test.cpp no_peeling.c peeling.c) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(avx512_ex9_srcs ${avx512_ex9_srcs} no_peeling.s peeling.s) +set(avx512_ex9_ass no_peeling.s peeling.s) elseif(MSVC) -set(avx512_ex9_srcs ${avx512_ex9_srcs} 
no_peeling.asm peeling.asm) +set(avx512_ex9_ass no_peeling.asm peeling.asm) endif() -add_executable(avx512_ex9_tests ${avx512_ex9_srcs}) - +add_executable(avx512_ex9_tests ex9_test.cpp no_peeling.c peeling.c ${avx512_ex9_ass}) target_link_libraries(avx512_ex9_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(avx512_ex9_bench ex9_bench.cpp ${avx512_ex9_ass}) + target_link_libraries(avx512_ex9_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME avx512_ex9_test COMMAND avx512_ex9_tests) diff --git a/chap18/ex9/ex9_bench.cpp b/chap18/ex9/ex9_bench.cpp new file mode 100644 index 0000000..c8d77e4 --- /dev/null +++ b/chap18/ex9/ex9_bench.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include + +#include "no_peeling.h" +#include "optimisation_common.h" +#include "peeling.h" + +static void init_sources(int len, float *a, float *b) +{ + for (int i = 0; i < len; i++) { + b[i] = static_cast(i); + a[i] = 0.0; + } +} + +static void BM_no_peeling(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + float *a = (float *)_mm_malloc((len + 8) * sizeof(float), 64); + float *b = (float *)_mm_malloc((len + 8) * sizeof(float), 64); + + init_sources(len, a + 8, b + 8); + + for (auto _ : state) { + no_peeling(a + 8, b + 8, len, 2.0, 1.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(b[0]))); + + _mm_free(b); + _mm_free(a); +} + +static void BM_peeling(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping test"); + return; + } + + int len = state.range(0); + float *a = (float *)_mm_malloc((len + 8) * sizeof(float), 64); + float *b = (float *)_mm_malloc((len + 8) * sizeof(float), 64); + + init_sources(len, a + 8, b + 8); + + for (auto _ : state) { + peel(a + 8, b + 8, len, 2.0, 1.0); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(len * sizeof(b[0]))); + _mm_free(b); + _mm_free(a); +} + +BENCHMARK(BM_no_peeling) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK(BM_peeling) + ->Arg(1 << 6) + ->Arg(1 << 8) + ->Arg(1 << 10) + ->Arg(1 << 12) + ->Arg(1 << 14) + ->Arg(1 << 16) + ->Arg(1 << 18); +BENCHMARK_MAIN(); diff --git a/chap18/ex9/no_peeling.s b/chap18/ex9/no_peeling.s index bc00e3f..e9491a7 100644 --- a/chap18/ex9/no_peeling.s +++ b/chap18/ex9/no_peeling.s @@ -71,3 +71,7 @@ end: indices: .int 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits 
+#endif diff --git a/chap18/ex9/peeling.s b/chap18/ex9/peeling.s index c1dc2a2..fc72f48 100644 --- a/chap18/ex9/peeling.s +++ b/chap18/ex9/peeling.s @@ -88,3 +88,7 @@ end: indices: .int 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap8/ex1/CMakeLists.txt b/chap8/ex1/CMakeLists.txt index c5b6bfe..707661c 100644 --- a/chap8/ex1/CMakeLists.txt +++ b/chap8/ex1/CMakeLists.txt @@ -1,9 +1,15 @@ -set(vnni_ex1_srcs ex1_test.cpp) +set(vnni_ex1_srcs ) if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang) -set(vnni_ex1_srcs ${vnni_ex1_srcs} dotprod_vnni.s dotprod_novnni.s) +set(vnni_ex1_ass dotprod_vnni.s dotprod_novnni.s) elseif(MSVC) -set(vnni_ex1_srcs ${vnni_ex1_srcs} dotprod_vnni.asm dotprod_novnni.asm) +set(vnni_ex1_ass dotprod_vnni.asm dotprod_novnni.asm) endif() -add_executable(vnni_ex1_tests ${vnni_ex1_srcs}) +add_executable(vnni_ex1_tests ex1_test.cpp ${vnni_ex1_ass}) target_link_libraries(vnni_ex1_tests gtest_main optimisation_common) + +IF( benchmark_FOUND ) + add_executable(vnni_ex1_bench ex1_bench.cpp ${vnni_ex1_ass}) + target_link_libraries(vnni_ex1_bench benchmark::benchmark optimisation_common) +ENDIF() + add_test(NAME vnni_ex1_test COMMAND vnni_ex1_tests) diff --git a/chap8/ex1/dotprod_novnni.s b/chap8/ex1/dotprod_novnni.s index 60503db..c43cde8 100644 --- a/chap8/ex1/dotprod_novnni.s +++ b/chap8/ex1/dotprod_novnni.s @@ -137,3 +137,7 @@ inner: .p2align 2 onew: .word 1, 1 + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/chap8/ex1/dotprod_vnni.s b/chap8/ex1/dotprod_vnni.s index 8dde7e5..bb3d1dd 100644 --- a/chap8/ex1/dotprod_vnni.s +++ b/chap8/ex1/dotprod_vnni.s @@ -100,3 +100,7 @@ inner: vzeroupper ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git 
a/chap8/ex1/ex1_bench.cpp b/chap8/ex1/ex1_bench.cpp new file mode 100644 index 0000000..b248180 --- /dev/null +++ b/chap8/ex1/ex1_bench.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (C) 2021 by Intel Corporation + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "dotprod_novnni.h" +#include "dotprod_vnni.h" +#include "optimisation_common.h" + +#define M 64 +#define K 64 +#define N 64 +#define K_PACKED (K / 4) +#define N_PACKED (N * 4) + +#ifdef _MSC_VER // Preferred VS2019 version 16.3 or higher +__declspec(align(64)) static uint8_t lhs[M][K]; +__declspec(align(64)) static int8_t rhs_packed[K_PACKED][N_PACKED]; +__declspec(align(64)) static int32_t res[M][N]; +#else +static uint8_t lhs[M][K] __attribute__((aligned(64))); +static int8_t rhs_packed[K_PACKED][N_PACKED] __attribute__((aligned(64))); +static int32_t res[M][N] __attribute__((aligned(64))); +#endif + +static void init_data() +{ + int8_t counter = 0; + + memset(res, 0, sizeof(res)); + + for (size_t j = 0; j < M; j++) + for (size_t k = 0; k < K; k++) + lhs[j][k] = (counter++) & 127; + + counter = 0; + for (size_t j = 0; j < K_PACKED; j++) + for (size_t k = 0; k < N_PACKED; k++) + rhs_packed[j][k] = (counter++) & 127; +} + +static void BM_dotprod_novnni(benchmark::State &state) +{ + if (!supports_avx512_skx()) { + state.SkipWithError("AVX-512 not supported, skipping 
test"); + return; + } + + init_data(); + for (auto _ : state) { + for (size_t i = 0; i < 16; i++) + dotprod_novnni_4x64x64( + &lhs[i * 4][0], &rhs_packed[0][0], &res[i * 4][0]); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(sizeof(lhs) + sizeof(rhs_packed))); +} + +static void BM_vnni(benchmark::State &state) +{ + if (!supports_avx512_clx()) { + state.SkipWithError("VNNI not supported, skipping test"); + return; + } + + init_data(); + for (auto _ : state) { + for (size_t i = 0; i < 16; i++) + dotprod_vnni_4x64x64(&lhs[i * 4][0], &rhs_packed[0][0], + &res[i * 4][0]); + } + state.SetBytesProcessed(int64_t(state.iterations()) * + int64_t(sizeof(lhs) + sizeof(rhs_packed))); +} + +BENCHMARK(BM_dotprod_novnni); +BENCHMARK(BM_vnni); +BENCHMARK_MAIN(); diff --git a/chap8/ex1/ex1_test.cpp b/chap8/ex1/ex1_test.cpp index 9116463..89e04b4 100644 --- a/chap8/ex1/ex1_test.cpp +++ b/chap8/ex1/ex1_test.cpp @@ -35,7 +35,7 @@ static int8_t rhs_packed[K_PACKED][N_PACKED]; static int32_t res_scalar[M][N]; static int32_t res[M][N]; -void init_data() +static void init_data() { int8_t counter = 0; diff --git a/chap8/ex9/pixel_shuffler.cpp b/chap8/ex9/pixel_shuffler.cpp index d392142..276145c 100644 --- a/chap8/ex9/pixel_shuffler.cpp +++ b/chap8/ex9/pixel_shuffler.cpp @@ -14,6 +14,7 @@ */ #include +#include #include #include "pixel_shuffler.hpp" diff --git a/common/supports_avx512.s b/common/supports_avx512.s index dc9c326..2aa2b02 100644 --- a/common/supports_avx512.s +++ b/common/supports_avx512.s @@ -71,3 +71,7 @@ not_supported: pop rbx ret + +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/verify.sh b/verify.sh index 1b82c8d..d92887c 100755 --- a/verify.sh +++ b/verify.sh @@ -16,4 +16,20 @@ mkdir -p build cd build cmake .. -DENABLE_WERROR=ON make -j + +exstack=0 +if type eu-readelf >/dev/null 2>&1 ; then + for i in `find . 
-executable -type f -name "*_bench" -o -name "*_tests"` ; do + stack=`eu-readelf -l $i | awk '$1 == "GNU_STACK"'` + if [[ "$stack" = *" RWE "* ]]; then + echo "$i has an executable stack." + exstack=1 + fi + done +fi + cd .. + +if [ "$exstack" -eq "1" ]; then + exit 1 +fi