Skip to content
This repository has been archived by the owner on Jul 16, 2024. It is now read-only.

Commit

Permalink
Merge pull request #9 from intel/markdryan/jan-2023
Browse files Browse the repository at this point in the history
Add AMX, FP16 and other examples
  • Loading branch information
Mark Ryan authored Jan 10, 2023
2 parents 61cdceb + a388c3c commit 5cd28bc
Show file tree
Hide file tree
Showing 215 changed files with 8,117 additions and 89 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
# This workflow contains a single job called "build"
build:
# The type of runner that the job will run on
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04

# Steps represent a sequence of tasks that will be executed as part of the job
steps:
Expand All @@ -41,4 +41,4 @@ jobs:
mkdir clang-build
cd clang-build
CC=clang CXX=clang++ cmake -DENABLE_WERROR=ON ..
make -j
make -j 4
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ build
checkpatch.pl
const_structs.checkpatch
spelling.txt
**/optimisation.tar
33 changes: 33 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
cmake_minimum_required (VERSION 3.16.3)
if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
cmake_policy(SET CMP0135 NEW)
endif()
project(optimization C CXX ASM)

include(CheckCXXCompilerFlag)

find_package(benchmark QUIET)

if (CMAKE_CXX_COMPILER_ID MATCHES MSVC)
Expand Down Expand Up @@ -29,6 +34,7 @@ include_directories("common")

if(CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES GNU OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
add_compile_options(-Wall -Winline -pedantic -march=haswell)
add_compile_options($<$<COMPILE_LANGUAGE:ASM>:-x$<SEMICOLON>assembler-with-cpp>)
endif()

if (CMAKE_CXX_COMPILER_ID MATCHES Clang OR CMAKE_CXX_COMPILER_ID MATCHES AppleClang)
Expand All @@ -42,6 +48,8 @@ endif()
enable_testing()
add_subdirectory(common)
add_subdirectory(chap5/ex15)
add_subdirectory(chap7/ex3)
add_subdirectory(chap7/ex4)
add_subdirectory(chap8/ex1)
add_subdirectory(chap8/ex2)
add_subdirectory(chap8/ex4)
Expand Down Expand Up @@ -91,6 +99,7 @@ add_subdirectory(chap18/ex1)
add_subdirectory(chap18/ex2)
add_subdirectory(chap18/ex3)
add_subdirectory(chap18/ex4)
add_subdirectory(chap18/ex5)
add_subdirectory(chap18/ex6)
add_subdirectory(chap18/ex7)
add_subdirectory(chap18/ex8)
Expand Down Expand Up @@ -121,3 +130,27 @@ add_subdirectory(chap18/ex32)
add_subdirectory(chap18/ex33)
add_subdirectory(chap18/ex34)
add_subdirectory(chap18/ex35)
add_subdirectory(chap19/ex1)
add_subdirectory(chap19/ex2)
add_subdirectory(chap19/ex3)
add_subdirectory(chap19/ex4)
add_subdirectory(chap19/ex5)
add_subdirectory(chap20/ex4)
add_subdirectory(chap20/ex5)
add_subdirectory(chap20/ex6)
add_subdirectory(chap20/ex7)
add_subdirectory(chap20/ex8)
add_subdirectory(chap20/ex10)
add_subdirectory(chap20/ex14)
add_subdirectory(chap20/ex16)
add_subdirectory(chap20/ex17)
add_subdirectory(chap20/ex18)
add_subdirectory(chap20/ex19)
add_subdirectory(chap20/ex20)
add_subdirectory(chap20/ex21)
add_subdirectory(chap20/ex22)
add_subdirectory(chap20/ex23)
add_subdirectory(chap20/ex24)
add_subdirectory(chap20/ex25)
add_subdirectory(chap20/ex27)

3 changes: 3 additions & 0 deletions CONTRIBUTORS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
- [email protected]
18 changes: 14 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Intel Optimization Manual available here
is provided for GCC, Clang and MSVC, using the Intel syntax. Unit tests are
also provided for each of the samples.

## Building on Linux
## Building on Linux and macOS

To run the unit tests

Expand All @@ -16,7 +16,13 @@ To run the unit tests
4. cmake ..
5. make && make test

GCC 8.1 or higher is required to build the unit tests. The unit tests are
GCC 8.1 (or clang 12 on macOS) or higher is required to build the unit tests. However,
many of the newer examples, e.g., those that use AMX or AVX-512 FP16 instructions, require newer
versions of the compilers to build: GCC 12 or clang 14. No errors will be reported
when building, but examples built with toolchains that do not support the instructions
that they test will simply report an error when run and exit.

The unit tests are
compiled with -march=haswell and so a fourth-generation Intel® Core™ (Haswell)
CPU or later is required to run them. Tests that execute instructions not present
on fourth-generation Intel® Core™ (Haswell) will be
Expand All @@ -34,13 +40,13 @@ The code samples can also be compiled with clang:
## Building on Windows

To run the tests on a Windows machine:
Dependency- Visual Studio 2019
Dependency- Visual Studio 2022

1. go to optimization repo on your local machine.
2. mkdir bld
3. cd bld
4. (inside x64 Native tools command prompt)
"cmake -G "Visual Studio 16 2019" .." => this will generate visual studio solution files.
"cmake -G "Visual Studio 17 2022" .." => this will generate visual studio solution files.
open optimization.sln file using visual studio.
5. To Build- build "ALL_BUILD" project
6. To Run tests- build "RUN_TESTS" project.
Expand All @@ -52,6 +58,10 @@ built using [Google's Benchmark project](https://github.com/google/benchmark).
If Benchmark is installed and discoverable by CMake, the benchmarks for the code
samples will be automatically built when you type make.

On Windows, ensure you build the benchmark code with the same build type
(Release/Debug) as Google's Benchmark to prevent debug level mismatch errors
while linking.

## CPU Requirements

The code samples assume that they are being run on a fourth-generation Intel® Core™ (Haswell) processor
Expand Down
6 changes: 5 additions & 1 deletion chap15/ex27/rsqrtps_newt_sse.s
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,11 @@ loop1:
pop rbx
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 4

minus_half:
Expand Down
6 changes: 5 additions & 1 deletion chap15/ex27/vrsqrtps_newt_avx.s
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ loop1:
pop rbx
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

half:
Expand Down
6 changes: 5 additions & 1 deletion chap15/ex30/sqrt_rsqrtps_taylor_sse.s
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ loop1:
pop rbx
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 4

minus_half:
Expand Down
6 changes: 5 additions & 1 deletion chap15/ex30/sqrt_vrsqrtps_taylor_avx.s
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,11 @@ loop1:
pop rbx
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

minus_half:
Expand Down
6 changes: 5 additions & 1 deletion chap15/ex46/avx2_vpgatherd.s
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ loop:
vzeroupper
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

real_offset:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex10/avx2_compress.s
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ mainloop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

shuffle_LUT:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex10/avx_compress.s
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,11 @@ mainloop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 4

shuffle_LUT:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex11/expand_avx2.s
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,11 @@ mainloop:
vzeroupper
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

shuf2:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex13/transpose_avx512.s
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ matrix_loop:
vzeroupper
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

permMaskBuffer:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex17/software_scatter.s
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ mainloop:
pop rbx
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

shufMaskP:
Expand Down
2 changes: 1 addition & 1 deletion chap18/ex18/qword_avx2_intrinsics.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void qword_avx2_intrinsics(const int64_t *a, const int64_t *b, int64_t *c,

for (int i = 0; i < N; i += 32) {
__m256i aa, bb, aah, bbh, mul, sum;
//#pragma unroll(8)
// #pragma unroll(8)
for (int j = 0; j < 8; j++) {
aa = _mm256_loadu_si256(
(const __m256i *)(a + i + 4 * j));
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex20/avx512_vector_dp.s
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,11 @@ end:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6
all_31s:
.quad 0x0000001f0000001f
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex23/decompress_vbmi.s
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ loop:
vzeroupper
ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

permute_ctrl:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex25/fma_only_tpt.s
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ loop1:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6
one_vec:
.double 1, 1, 1, 1, 1, 1, 1, 1
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex25/fma_shuffle_tpt.s
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,11 @@ loop1:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6
one_vec:
.double 1, 1, 1, 1, 1, 1, 1, 1
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex26/g2s_vpermi2d.s
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,11 @@ loop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

gather_imag_index:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex26/g2s_vpgatherdd.s
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,11 @@ loop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

gather_imag_index:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex27/s2s_vpermi2d.s
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ loop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

first_half:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex27/s2s_vscatterdps.s
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,11 @@ loop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 6

gather_imag_index:
Expand Down
6 changes: 5 additions & 1 deletion chap18/ex28/adj_vpgatherpd.s
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,11 @@ loop:

ret

.data
#ifdef __APPLE__
.section __TEXT,__const
#else
.section .rodata
#endif
.p2align 5

index_inc:
Expand Down
Loading

0 comments on commit 5cd28bc

Please sign in to comment.