Skip to content

Commit

Permalink
build: gemm: add a build option to discard autogen kernels by isa
Browse files Browse the repository at this point in the history
  • Loading branch information
dzarukin authored and luweizhou2016 committed Nov 1, 2023
1 parent e634182 commit 9861b36
Show file tree
Hide file tree
Showing 13 changed files with 460 additions and 335 deletions.
17 changes: 16 additions & 1 deletion cmake/configuring_primitive_list.cmake
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#===============================================================================
# Copyright 2021 Intel Corporation
# Copyright 2021-2023 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -66,6 +66,21 @@ else()
endif()
message(STATUS "Enabled primitive GPU ISA: ${DNNL_ENABLE_PRIMITIVE_GPU_ISA}")

# Translate ONEDNN_ENABLE_GEMM_KERNELS_ISA into the BUILD_GEMM_* variables that
# are consumed by the cmakedefine01 entries of dnnl_config.h.in:
#   ALL          -> BUILD_GEMM_KERNELS_ALL
#   NONE         -> BUILD_GEMM_KERNELS_NONE
#   <ISA_NAME>   -> BUILD_GEMM_<ISA_NAME> (SSE41, AVX2 or AVX512)
# The value is upper-cased once up front so that every branch, not only the
# per-ISA one, accepts lower-case or mixed-case spellings.
string(TOUPPER "${ONEDNN_ENABLE_GEMM_KERNELS_ISA}" gemm_kernels_isa)

if(gemm_kernels_isa STREQUAL "ALL")
    set(BUILD_GEMM_KERNELS_ALL TRUE)
elseif(gemm_kernels_isa STREQUAL "NONE")
    set(BUILD_GEMM_KERNELS_NONE TRUE)
else()
    # Unquoted expansion on purpose: iterate over list elements and skip
    # empty ones, as the option value may be a semicolon-separated list.
    foreach(isa ${gemm_kernels_isa})
        if(NOT "${isa}" MATCHES "^(SSE41|AVX2|AVX512)$")
            message(FATAL_ERROR "Unsupported GeMM kernels ISA: ${isa}")
        endif()
        set(BUILD_GEMM_${isa} TRUE)
    endforeach()
endif()
message(STATUS "Enabled GeMM kernels ISA: ${ONEDNN_ENABLE_GEMM_KERNELS_ISA}")

# When certain primitives or primitive ISA are switched off, some functions may
# become unused which is expected. Switch off warning for unused functions in
# such cases.
Expand Down
2 changes: 2 additions & 0 deletions cmake/dnnl_compat.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ set(COMPAT_CACHE_STRING_VARS
"LIBRARY_NAME"
"ENABLE_WORKLOAD"
"ENABLE_PRIMITIVE"
"ENABLE_PRIMITIVE_CPU_ISA"
"ENABLE_PRIMITIVE_GPU_ISA"
"ARCH_OPT_FLAGS"
"CPU_RUNTIME"
"GPU_RUNTIME"
Expand Down
10 changes: 10 additions & 0 deletions cmake/options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,16 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
- <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
Possible values are: GEN9, GEN11, XELP, XEHP, XEHPG, XEHPC.")

# Build-time selection of the x64 GeMM kernels by ISA. The default "ALL" keeps
# every kernel.
set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
    "Specifies an ISA set of GeMM kernels residing in x64/gemm folder to be
    available at build time. Valid values:
    - ALL (the default). Includes all ISA kernels to be enabled.
    - NONE. Removes all kernels and interfaces.
    - <ISA_NAME>. Enables all ISA up to ISA_NAME included.
    Possible values are: SSE41, AVX2, AVX512. The linear order is
    SSE41 < AVX2 < AVX512 < AMX (or ALL). It means that if the user selects,
    e.g., AVX2 ISA, SSE41 kernels will also be present at build time.")
# Advertise the accepted values to cmake-gui/ccmake. This is only a hint for
# interactive front-ends; it does not restrict what can be passed with -D.
set_property(CACHE ONEDNN_ENABLE_GEMM_KERNELS_ISA PROPERTY STRINGS
    ALL NONE SSE41 AVX2 AVX512)

# =============
# Optimizations
# =============
Expand Down
12 changes: 12 additions & 0 deletions doc/build/build_options.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ oneDNN supports the following build-time options.
| ONEDNN_ENABLE_PRIMITIVE | **ALL**, PRIMITIVE_NAME | Specifies a set of functionality to be available based on primitives |
| ONEDNN_ENABLE_PRIMITIVE_CPU_ISA | **ALL**, CPU_ISA_NAME | Specifies a set of functionality to be available for CPU backend based on CPU ISA |
| ONEDNN_ENABLE_PRIMITIVE_GPU_ISA | **ALL**, GPU_ISA_NAME | Specifies a set of functionality to be available for GPU backend based on GPU ISA |
| ONEDNN_ENABLE_GEMM_KERNELS_ISA | **ALL**, NONE, ISA_NAME | Specifies a set of functionality to be available for GeMM kernels for CPU backend based on ISA |
| ONEDNN_EXPERIMENTAL | ON, **OFF** | Enables [experimental features](@ref dev_guide_experimental) |
| ONEDNN_VERBOSE | **ON**, OFF | Enables [verbose mode](@ref dev_guide_verbose) |
| ONEDNN_AARCH64_USE_ACL | ON, **OFF** | Enables integration with Arm Compute Library for AArch64 builds |
Expand Down Expand Up @@ -109,6 +110,17 @@ always be available. Example that enables XeLP and XeHP set:
-DONEDNN_ENABLE_PRIMITIVE_GPU_ISA=XELP;XEHP
```

#### ONEDNN_ENABLE_GEMM_KERNELS_ISA
This option supports several values: `ALL` (the default), which enables all
ISA kernels from the x64/gemm folder; `NONE`, which disables all kernels and
removes the corresponding interfaces; or one of `SSE41`, `AVX2`, and `AVX512`.
Values are linearly ordered as `SSE41` < `AVX2` < `AVX512`. When an ISA is
specified, the selected ISA and all ISA that are "smaller" will be available.
Example that keeps the SSE41 and AVX2 sets, but removes the AVX512 and AMX
kernels:
```
-DONEDNN_ENABLE_GEMM_KERNELS_ISA=AVX2
```

## CPU Options
Intel Architecture Processors and compatible devices are supported by
oneDNN CPU engine. The CPU engine is built by default but can be disabled
Expand Down
6 changes: 6 additions & 0 deletions include/oneapi/dnnl/dnnl_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -193,4 +193,10 @@
#cmakedefine01 BUILD_XEHP
#cmakedefine01 BUILD_XEHPG
#cmakedefine01 BUILD_XEHPC
// GeMM kernels ISA controls.
// Each line below is a CMake cmakedefine01 directive: configure_file() turns
// it into a definition with value 1 when the CMake variable of the same name
// is set to a true value at configure time, and 0 otherwise.
#cmakedefine01 BUILD_GEMM_KERNELS_ALL
#cmakedefine01 BUILD_GEMM_KERNELS_NONE
#cmakedefine01 BUILD_GEMM_SSE41
#cmakedefine01 BUILD_GEMM_AVX2
#cmakedefine01 BUILD_GEMM_AVX512
#endif
46 changes: 28 additions & 18 deletions src/cpu/gemm/gemm.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2018-2022 Intel Corporation
* Copyright 2018-2023 Intel Corporation
* Copyright 2022 IBM Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -134,13 +134,14 @@ dnnl_status_t extended_sgemm(const char *transa, const char *transb,
}
#endif

#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
if (mayiuse(sse41)) {
float *dummy_ao = nullptr;
float *dummy_bo = nullptr;
return gemm_driver(transa, transb, bias ? "C" : nullptr, M, N, K, alpha,
A, lda, dummy_ao, B, ldb, dummy_bo, beta, C, ldc, bias,
auto status = gemm_driver(transa, transb, bias ? "C" : nullptr, M, N, K,
alpha, A, lda, dummy_ao, B, ldb, dummy_bo, beta, C, ldc, bias,
force_jit_nocopy_gemm);
if (status == status::success) return status;
}
#endif

Expand Down Expand Up @@ -201,10 +202,12 @@ dnnl_status_t gemm_s8x8s32(const char *transa, const char *transb,
LDA, ao, B, LDB, bo, beta, C, LDC, co);
if (status == dnnl_success) return status;

#if DNNL_X64
if (mayiuse(sse41))
return gemm_driver(transa, transb, offsetc, M, N, K, alpha, A, LDA, ao,
B, LDB, bo, beta, C, LDC, co, false);
#if DNNL_X64 && !__BUILD_GEMM_NONE
if (mayiuse(sse41)) {
auto status = gemm_driver(transa, transb, offsetc, M, N, K, alpha, A,
LDA, ao, B, LDB, bo, beta, C, LDC, co, false);
if (status == status::success) return status;
}
#elif DNNL_PPC64
#ifdef __MMA__
int ATflag = (*transa == 'T') || (*transa == 't');
Expand Down Expand Up @@ -237,18 +240,23 @@ dnnl_status_t gemm_s8x8s32(const char *transa, const char *transb,

if (*M == 0 || *N == 0 || *K == 0) return dnnl_success;

#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
bool use_jit = mayiuse(avx512_core);
bool use_s8u8 = true
&& utils::everyone_is(0, *ao, *bo) // so far a requirement
&& IMPLICATION(USE_MKL_IGEMM == 0, mayiuse(sse41));

if (use_jit)
return gemm_driver(transa, transb, offsetc, M, N, K, alpha, A, LDA, ao,
B, LDB, bo, beta, C, LDC, co, false);
else if (use_s8u8)
return simple_gemm_s8s8s32(transa, transb, offsetc, M, N, K, alpha, A,
LDA, ao, B, LDB, bo, beta, C, LDC, co);
if (use_jit) {
auto status = gemm_driver(transa, transb, offsetc, M, N, K, alpha, A,
LDA, ao, B, LDB, bo, beta, C, LDC, co, false);
if (status == status::success) return status;
}

if (use_s8u8) {
auto status = simple_gemm_s8s8s32(transa, transb, offsetc, M, N, K,
alpha, A, LDA, ao, B, LDB, bo, beta, C, LDC, co);
if (status == status::success) return status;
}
#endif

#if DNNL_PPC64
Expand Down Expand Up @@ -285,16 +293,18 @@ dnnl_status_t gemm_bf16bf16f32(const char *transa, const char *transb,
ldb, C, ldc, alpha, beta, false);
if (status != dnnl_success) return status;

#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
char *dummyOffsetC = nullptr;
bfloat16_t *dummy_ao = nullptr;
bfloat16_t *dummy_bo = nullptr;
float *dummy_co = nullptr;

if (mayiuse(avx512_core))
return gemm_driver(transa, transb, dummyOffsetC, M, N, K, alpha,
if (mayiuse(avx512_core)) {
auto status = gemm_driver(transa, transb, dummyOffsetC, M, N, K, alpha,
(const bfloat16_t *)A, lda, dummy_ao, (const bfloat16_t *)B,
ldb, dummy_bo, beta, (float *)C, ldc, dummy_co, false);
if (status == status::success) return status;
}
#elif DNNL_PPC64
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
bool trA = *transa == 't' || *transa == 'T';
Expand Down
15 changes: 14 additions & 1 deletion src/cpu/gemm/gemm.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2018-2022 Intel Corporation
* Copyright 2018-2023 Intel Corporation
* Copyright 2022 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -28,6 +28,19 @@

#if DNNL_X64
#include "cpu/x64/cpu_isa_traits.hpp"

// Kernels ISA section for configuring knobs.
//
// The ISA levels are cumulative: a level is enabled when it is requested
// directly or when any higher level is enabled
// (SSE41 <- AVX2 <- AVX512 <- AMX). AMX kernels are enabled only when all
// kernels are requested (BUILD_GEMM_KERNELS_ALL).
//
// Every expansion is wrapped in parentheses so that the macros compose
// correctly with other operators, e.g. `!__BUILD_GEMM_AVX512` or
// `DNNL_X64 && __BUILD_GEMM_AVX2`. Without the parentheses the `||` inside
// the expansion would bind more loosely than the surrounding `!` or `&&`.
#define __BUILD_GEMM_AMX (BUILD_GEMM_KERNELS_ALL)
#define __BUILD_GEMM_AVX512 (__BUILD_GEMM_AMX || BUILD_GEMM_AVX512)
#define __BUILD_GEMM_AVX2 (__BUILD_GEMM_AVX512 || BUILD_GEMM_AVX2)
#define __BUILD_GEMM_SSE41 (__BUILD_GEMM_AVX2 || BUILD_GEMM_SSE41)
#define __BUILD_GEMM_NONE (BUILD_GEMM_KERNELS_NONE)
#else
// Non-x64 builds: no x64 GeMM kernel level is enabled, and the "NONE" knob is
// not in effect either.
#define __BUILD_GEMM_AMX 0
#define __BUILD_GEMM_AVX512 0
#define __BUILD_GEMM_AVX2 0
#define __BUILD_GEMM_SSE41 0
#define __BUILD_GEMM_NONE 0
#endif

#if DNNL_AARCH64
Expand Down
31 changes: 16 additions & 15 deletions src/cpu/gemm/gemm_pack.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
* Copyright 2020-2023 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,6 +16,7 @@

#include "cpu/platform.hpp"

#include "cpu/gemm/gemm.hpp"
#include "cpu/gemm/gemm_pack.hpp"

#if DNNL_X64
Expand All @@ -27,13 +28,13 @@ namespace impl {
namespace cpu {

bool pack_sgemm_supported() {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::pack_sgemm_supported();
#endif
return false;
}
bool pack_gemm_bf16bf16f32_supported() {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::pack_gemm_bf16bf16f32_supported();
#endif
return false;
Expand All @@ -42,7 +43,7 @@ bool pack_gemm_bf16bf16f32_supported() {
dnnl_status_t sgemm_pack_get_size(const char *identifier, const char *transa,
const char *transb, const dim_t *M, const dim_t *N, const dim_t *K,
const dim_t *lda, const dim_t *ldb, size_t *size, bool *pack) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::sgemm_pack_get_size(
identifier, transa, transb, M, N, K, lda, ldb, size, pack);
#endif
Expand All @@ -53,7 +54,7 @@ dnnl_status_t gemm_bf16bf16f32_pack_get_size(const char *identifier,
const char *transa, const char *transb, const dim_t *M, const dim_t *N,
const dim_t *K, const dim_t *lda, const dim_t *ldb, size_t *size,
bool *pack) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_bf16bf16f32_pack_get_size(
identifier, transa, transb, M, N, K, lda, ldb, size, pack);
#endif
Expand All @@ -64,7 +65,7 @@ dnnl_status_t gemm_s8u8s32_pack_get_size(const char *identifier,
const char *transa, const char *transb, const dim_t *M, const dim_t *N,
const dim_t *K, const dim_t *lda, const dim_t *ldb, size_t *size,
bool *pack) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8u8s32_pack_get_size(
identifier, transa, transb, M, N, K, lda, ldb, size, pack);
#endif
Expand All @@ -75,7 +76,7 @@ dnnl_status_t gemm_s8s8s32_pack_get_size(const char *identifier,
const char *transa, const char *transb, const dim_t *M, const dim_t *N,
const dim_t *K, const dim_t *lda, const dim_t *ldb, size_t *size,
bool *pack) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8s8s32_pack_get_size(
identifier, transa, transb, M, N, K, lda, ldb, size, pack);
#endif
Expand All @@ -85,7 +86,7 @@ dnnl_status_t gemm_s8s8s32_pack_get_size(const char *identifier,
dnnl_status_t sgemm_pack(const char *identifier, const char *transa,
const char *transb, const dim_t *M, const dim_t *N, const dim_t *K,
const dim_t *lda, const dim_t *ldb, const float *src, float *dst) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::sgemm_pack(
identifier, transa, transb, M, N, K, lda, ldb, src, dst);
#endif
Expand All @@ -96,7 +97,7 @@ dnnl_status_t gemm_bf16bf16f32_pack(const char *identifier, const char *transa,
const char *transb, const dim_t *M, const dim_t *N, const dim_t *K,
const dim_t *lda, const dim_t *ldb, const bfloat16_t *src,
bfloat16_t *dst) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_bf16bf16f32_pack(
identifier, transa, transb, M, N, K, lda, ldb, src, dst);
#endif
Expand All @@ -106,7 +107,7 @@ dnnl_status_t gemm_bf16bf16f32_pack(const char *identifier, const char *transa,
dnnl_status_t gemm_s8u8s32_pack(const char *identifier, const char *transa,
const char *transb, const dim_t *M, const dim_t *N, const dim_t *K,
const dim_t *lda, const dim_t *ldb, const void *src, void *dst) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8u8s32_pack(
identifier, transa, transb, M, N, K, lda, ldb, src, dst);
#endif
Expand All @@ -116,7 +117,7 @@ dnnl_status_t gemm_s8u8s32_pack(const char *identifier, const char *transa,
dnnl_status_t gemm_s8s8s32_pack(const char *identifier, const char *transa,
const char *transb, const dim_t *M, const dim_t *N, const dim_t *K,
const dim_t *lda, const dim_t *ldb, const void *src, void *dst) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8s8s32_pack(
identifier, transa, transb, M, N, K, lda, ldb, src, dst);
#endif
Expand All @@ -127,7 +128,7 @@ dnnl_status_t sgemm_compute(const char *transa, const char *transb,
const dim_t *M, const dim_t *N, const dim_t *K, const float *A,
const dim_t *lda, const float *B, const dim_t *ldb, const float *beta,
float *C, const dim_t *ldc) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::sgemm_compute(
transa, transb, M, N, K, A, lda, B, ldb, beta, C, ldc);
#endif
Expand All @@ -138,7 +139,7 @@ dnnl_status_t gemm_bf16bf16f32_compute(const char *transa, const char *transb,
const dim_t *M, const dim_t *N, const dim_t *K, const bfloat16_t *A,
const dim_t *lda, const bfloat16_t *B, const dim_t *ldb,
const float *beta, float *C, const dim_t *ldc) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_bf16bf16f32_compute(
transa, transb, M, N, K, A, lda, B, ldb, beta, C, ldc);
#endif
Expand All @@ -149,7 +150,7 @@ dnnl_status_t gemm_s8u8s32_compute(const char *transa, const char *transb,
const char *offsetc, const dim_t *M, const dim_t *N, const dim_t *K,
const int8_t *A, const dim_t *lda, const uint8_t *B, const dim_t *ldb,
const float *beta, int32_t *C, const dim_t *ldc, const int32_t *co) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8u8s32_compute(
transa, transb, offsetc, M, N, K, A, lda, B, ldb, beta, C, ldc, co);
#endif
Expand All @@ -160,7 +161,7 @@ dnnl_status_t gemm_s8s8s32_compute(const char *transa, const char *transb,
const char *offsetc, const dim_t *M, const dim_t *N, const dim_t *K,
const int8_t *A, const dim_t *lda, const int8_t *B, const dim_t *ldb,
const float *beta, int32_t *C, const dim_t *ldc, const int32_t *co) {
#if DNNL_X64
#if DNNL_X64 && !__BUILD_GEMM_NONE
return x64::gemm_s8s8s32_compute(
transa, transb, offsetc, M, N, K, A, lda, B, ldb, beta, C, ldc, co);
#endif
Expand Down
2 changes: 1 addition & 1 deletion src/cpu/rnn/rnn_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -873,7 +873,7 @@ bool init_conf(rnn_conf_t &rnn, const rnn_desc_t &rd,

rnn.diff_weights_overwrite = rd.flags & rnn_flags::diff_weights_overwrite;

#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL || BUILD_GEMM_KERNELS_NONE
// XXX: Threadpool runtime may use different number of threads at execute
// and create stages. GEMM packed API is not aware of number of threads as
// of now. In order to synchronize all layers, GEMM pack API should be
Expand Down
29 changes: 29 additions & 0 deletions src/cpu/x64/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,35 @@ else()
PROPERTIES COMPILE_FLAGS "${OPT_LEVEL}")
endif()

# Discard GeMM kernel files when requested.
#
# ISA levels are cumulative (SSE41 < AVX2 < AVX512 < AMX), so selecting a level
# removes the kernel sources of every higher level from SOURCES; NONE removes
# the whole gemm/ folder. Any other value (e.g. ALL) leaves SOURCES untouched.
#
# The option value is upper-cased before matching so that this file agrees
# with the configuration step, which upper-cases ISA names before validating
# them; otherwise a lower-case value would be accepted there but would not
# discard any sources here.
string(TOUPPER "${ONEDNN_ENABLE_GEMM_KERNELS_ISA}" GEMM_KERNELS_ISA_UPPER)

# AMX kernels are dropped for every explicit ISA level and for NONE.
if(GEMM_KERNELS_ISA_UPPER MATCHES "^(AVX512|AVX2|SSE41|NONE)$")
    file(GLOB_RECURSE SOURCES_AMX
        "${CMAKE_CURRENT_SOURCE_DIR}/gemm/jit*amx*")
    foreach(amx_file IN LISTS SOURCES_AMX)
        list(REMOVE_ITEM SOURCES "${amx_file}")
    endforeach()
endif()

# AVX512 kernels are dropped for AVX2 and below.
if(GEMM_KERNELS_ISA_UPPER MATCHES "^(AVX2|SSE41|NONE)$")
    file(GLOB_RECURSE SOURCES_AVX512
        "${CMAKE_CURRENT_SOURCE_DIR}/gemm/jit*avx512*")
    foreach(avx512_file IN LISTS SOURCES_AVX512)
        list(REMOVE_ITEM SOURCES "${avx512_file}")
    endforeach()
endif()

# All remaining AVX-family kernels are dropped for SSE41 and NONE.
if(GEMM_KERNELS_ISA_UPPER MATCHES "^(SSE41|NONE)$")
    file(GLOB_RECURSE SOURCES_AVX
        "${CMAKE_CURRENT_SOURCE_DIR}/gemm/jit*avx*")
    foreach(avx_file IN LISTS SOURCES_AVX)
        list(REMOVE_ITEM SOURCES "${avx_file}")
    endforeach()
endif()

# NONE drops everything that is left in the gemm/ folder.
if(GEMM_KERNELS_ISA_UPPER MATCHES "^(NONE)$")
    file(GLOB_RECURSE SOURCES_SSE41
        "${CMAKE_CURRENT_SOURCE_DIR}/gemm/*")
    foreach(sse41_file IN LISTS SOURCES_SSE41)
        list(REMOVE_ITEM SOURCES "${sse41_file}")
    endforeach()
endif()

set(OBJ_LIB ${LIB_PACKAGE_NAME}_cpu_x64)
add_library(${OBJ_LIB} OBJECT ${SOURCES})
set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
Expand Down
Loading

0 comments on commit 9861b36

Please sign in to comment.