Merge pull request #9 from intel/markdryan/jan-2023

Add AMX, FP16 and other examples
intel · Jan 10, 2023 · 5cd28bc · 5cd28bc
2 parents 61cdceb + a388c3c
commit 5cd28bc
Show file tree

Hide file tree

Showing 215 changed files with 8,117 additions and 89 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
   # This workflow contains a single job called "build"
   build:
     # The type of runner that the job will run on
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-22.04
 
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
@@ -41,4 +41,4 @@ jobs:
           mkdir clang-build
           cd clang-build
           CC=clang CXX=clang++ cmake -DENABLE_WERROR=ON ..
-          make -j
+          make -j 4
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ build
 checkpatch.pl
 const_structs.checkpatch
 spelling.txt
+**/optimisation.tar
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,6 +1,11 @@
 cmake_minimum_required (VERSION 3.16.3)
+if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
+  cmake_policy(SET CMP0135 NEW)
+endif()
 project(optimization C CXX ASM)
 
+include(CheckCXXCompilerFlag)
+
 find_package(benchmark QUIET)
 
 if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
@@ -29,6 +34,7 @@ include_directories("common")
 
 if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
   add_compile_options(-Wall -Winline -pedantic -march=haswell)
+  add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-x$<SEMICOLON>assembler-with-cpp>)
 endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
@@ -42,6 +48,8 @@ endif()
 enable_testing()
 add_subdirectory(common)
 add_subdirectory(chap5/ex15)
+add_subdirectory(chap7/ex3)
+add_subdirectory(chap7/ex4)
 add_subdirectory(chap8/ex1)
 add_subdirectory(chap8/ex2)
 add_subdirectory(chap8/ex4)
@@ -91,6 +99,7 @@ add_subdirectory(chap18/ex1)
 add_subdirectory(chap18/ex2)
 add_subdirectory(chap18/ex3)
 add_subdirectory(chap18/ex4)
+add_subdirectory(chap18/ex5)
 add_subdirectory(chap18/ex6)
 add_subdirectory(chap18/ex7)
 add_subdirectory(chap18/ex8)
@@ -121,3 +130,27 @@ add_subdirectory(chap18/ex32)
 add_subdirectory(chap18/ex33)
 add_subdirectory(chap18/ex34)
 add_subdirectory(chap18/ex35)
+add_subdirectory(chap19/ex1)
+add_subdirectory(chap19/ex2)
+add_subdirectory(chap19/ex3)
+add_subdirectory(chap19/ex4)
+add_subdirectory(chap19/ex5)
+add_subdirectory(chap20/ex4)
+add_subdirectory(chap20/ex5)
+add_subdirectory(chap20/ex6)
+add_subdirectory(chap20/ex7)
+add_subdirectory(chap20/ex8)
+add_subdirectory(chap20/ex10)
+add_subdirectory(chap20/ex14)
+add_subdirectory(chap20/ex16)
+add_subdirectory(chap20/ex17)
+add_subdirectory(chap20/ex18)
+add_subdirectory(chap20/ex19)
+add_subdirectory(chap20/ex20)
+add_subdirectory(chap20/ex21)
+add_subdirectory(chap20/ex22)
+add_subdirectory(chap20/ex23)
+add_subdirectory(chap20/ex24)
+add_subdirectory(chap20/ex25)
+add_subdirectory(chap20/ex27)
+
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
@@ -3,3 +3,6 @@
 - [email protected]
 - [email protected]
 - [email protected]
+- [email protected]
+- [email protected]
+- [email protected]
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ Intel Optimization Manual available here
 is provided for GCC, Clang and MSVC, using the Intel syntax.  Unit tests are
 also provided for each of the samples.
 
-## Building on Linux
+## Building on Linux and macOS
 
 To run the unit tests
 
@@ -16,7 +16,13 @@ To run the unit tests
 4. cmake ..
 5. make && make test
 
-GCC 8.1 or higher is required to build the unit tests.  The unit tests are
+GCC 8.1 (or clang 12 on macOS) or higher is required to build the unit tests.  However,
+many of the newer examples, e.g., those that use AMX or AVX-512 FP16 instructions, require newer
+versions of the compilers to build: GCC 12 or clang 14.  No errors will be reported
+when building, but examples built with toolchains that do not support the instructions
+that they test will simply report an error when run and exit.
+
+The unit tests are
 compiled with -march=haswell and so a fourth-generation Intel® Core™ (Haswell)
 CPU or later is required to run them.  Tests that execute instructions not present
 on fourth-generation Intel® Core™ (Haswell) will be
@@ -34,13 +40,13 @@ The code samples can also be compiled with clang:
 ## Building on Windows
 
 To run the tests on Windows machine-
-Dependency- Visual Studio 2019
+Dependency- Visual Studio 2022
 
 1. go to optimization repo on your local machine.
 2. mkdir bld
 3. cd bld
 4. (inside x64 Native tools command prompt)
-   "cmake -G "Visual Studio 16 2019" .." => this will generate visual studio solution files.
+   "cmake -G "Visual Studio 17 2022" .." => this will generate visual studio solution files.
    open optimization.sln file using visual studio.
 5. To Build- build "ALL_BUILD" project
 6. To Run tests- build "RUN_TESTS" project.
@@ -52,6 +58,10 @@ built using [Google's Benchmark project](https://github.com/google/benchmark).
 If Benchmark is installed and discoverable by CMake, the benchmarks for the code
 samples will be automatically built when you type make.
 
+On Windows, ensure you build the benchmark code with the same build type
+(Release/Debug) as Google's Benchmark to prevent debug level mismatch errors
+while linking.
+
 ## CPU Requirements
 
 The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor

diff --git a/chap15/ex27/rsqrtps_newt_sse.s b/chap15/ex27/rsqrtps_newt_sse.s
@@ -55,7 +55,11 @@ loop1:
 	pop rbx
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 4
 
 minus_half:

diff --git a/chap15/ex27/vrsqrtps_newt_avx.s b/chap15/ex27/vrsqrtps_newt_avx.s
@@ -57,7 +57,11 @@ loop1:
 	pop rbx
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 half:

diff --git a/chap15/ex30/sqrt_rsqrtps_taylor_sse.s b/chap15/ex30/sqrt_rsqrtps_taylor_sse.s
@@ -61,7 +61,11 @@ loop1:
 	pop rbx
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 4
 
 minus_half:

diff --git a/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s b/chap15/ex30/sqrt_vrsqrtps_taylor_avx.s
@@ -59,7 +59,11 @@ loop1:
 	pop rbx
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 minus_half:

diff --git a/chap15/ex46/avx2_vpgatherd.s b/chap15/ex46/avx2_vpgatherd.s
@@ -61,7 +61,11 @@ loop:
 	vzeroupper
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 real_offset:

diff --git a/chap18/ex10/avx2_compress.s b/chap18/ex10/avx2_compress.s
@@ -72,7 +72,11 @@ mainloop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 shuffle_LUT:

diff --git a/chap18/ex10/avx_compress.s b/chap18/ex10/avx_compress.s
@@ -73,7 +73,11 @@ mainloop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 4
 
 shuffle_LUT:

diff --git a/chap18/ex11/expand_avx2.s b/chap18/ex11/expand_avx2.s
@@ -65,7 +65,11 @@ mainloop:
 	vzeroupper
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 shuf2:

diff --git a/chap18/ex13/transpose_avx512.s b/chap18/ex13/transpose_avx512.s
@@ -51,7 +51,11 @@ matrix_loop:
 	vzeroupper
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 permMaskBuffer:

diff --git a/chap18/ex17/software_scatter.s b/chap18/ex17/software_scatter.s
@@ -100,7 +100,11 @@ mainloop:
 	pop rbx
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 	shufMaskP:

diff --git a/chap18/ex18/qword_avx2_intrinsics.c b/chap18/ex18/qword_avx2_intrinsics.c
@@ -24,7 +24,7 @@ void qword_avx2_intrinsics(const int64_t *a, const int64_t *b, int64_t *c,
 
 	for (int i = 0; i < N; i += 32) {
 		__m256i aa, bb, aah, bbh, mul, sum;
-		//#pragma unroll(8)
+		// #pragma unroll(8)
 		for (int j = 0; j < 8; j++) {
 			aa = _mm256_loadu_si256(
 			    (const __m256i *)(a + i + 4 * j));

diff --git a/chap18/ex20/avx512_vector_dp.s b/chap18/ex20/avx512_vector_dp.s
@@ -165,7 +165,11 @@ end:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 all_31s:
 	.quad 0x0000001f0000001f

diff --git a/chap18/ex23/decompress_vbmi.s b/chap18/ex23/decompress_vbmi.s
@@ -51,7 +51,11 @@ loop:
 	vzeroupper
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 permute_ctrl:

diff --git a/chap18/ex25/fma_only_tpt.s b/chap18/ex25/fma_only_tpt.s
@@ -60,7 +60,11 @@ loop1:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 one_vec:
 	.double 1, 1, 1, 1, 1, 1, 1, 1

diff --git a/chap18/ex25/fma_shuffle_tpt.s b/chap18/ex25/fma_shuffle_tpt.s
@@ -85,7 +85,11 @@ loop1:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 one_vec:
 	.double 1, 1, 1, 1, 1, 1, 1, 1

diff --git a/chap18/ex26/g2s_vpermi2d.s b/chap18/ex26/g2s_vpermi2d.s
@@ -62,7 +62,11 @@ loop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 gather_imag_index:

diff --git a/chap18/ex26/g2s_vpgatherdd.s b/chap18/ex26/g2s_vpgatherdd.s
@@ -76,7 +76,11 @@ loop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 gather_imag_index:

diff --git a/chap18/ex27/s2s_vpermi2d.s b/chap18/ex27/s2s_vpermi2d.s
@@ -58,7 +58,11 @@ loop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 first_half:

diff --git a/chap18/ex27/s2s_vscatterdps.s b/chap18/ex27/s2s_vscatterdps.s
@@ -58,7 +58,11 @@ loop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 6
 
 gather_imag_index:

diff --git a/chap18/ex28/adj_vpgatherpd.s b/chap18/ex28/adj_vpgatherpd.s
@@ -66,7 +66,11 @@ loop:
 
 	ret
 
-	.data
+#ifdef __APPLE__
+	.section __TEXT,__const
+#else
+	.section .rodata
+#endif
 	.p2align 5
 
 index_inc: