Update AVX2 and AVX512 flags (#2167)
Summary:
- Update AVX2 and AVX512 flags to account for nvcc as the front-end compiler

Pull Request resolved: #2167

Reviewed By: spcyppt

Differential Revision: D51681615

Pulled By: q10

fbshipit-source-id: 231aa051f121ff7a5f6aac56f335442bbd312a49
q10 authored and facebook-github-bot committed Nov 30, 2023
1 parent 48120da commit 453c80e
Showing 8 changed files with 49 additions and 15 deletions.
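
For background: nvcc may reject host-only SIMD options such as -mavx2, -mf16c, and -mfma when they appear bare on its command line (issue #2135 reports this under WSL); wrapping them with -Xcompiler forwards them to the host compiler, which is what the new AVX2_FLAGS / AVX512_FLAGS lists in CMakeLists.txt below do in that configuration. A minimal, hypothetical host-side translation unit that needs those flags (the file name and build commands are illustrative only, not part of this repository):

// avx2_smoke_test.cpp: illustrative sketch, not a file in this repository.
//
// Plain host build:               g++ -mavx2 -mfma -c avx2_smoke_test.cpp
// With nvcc as the front end, the host flags are forwarded instead, e.g.:
//                                 nvcc -Xcompiler -mavx2 -Xcompiler -mfma -c avx2_smoke_test.cpp
#include <immintrin.h>
#include <cstdio>

// Multiply-accumulate eight floats at once; requires AVX2/FMA code generation,
// i.e. the -mavx2/-mfma flags must reach the host compiler.
float dot8(const float* a, const float* b) {
  const __m256 va  = _mm256_loadu_ps(a);
  const __m256 vb  = _mm256_loadu_ps(b);
  const __m256 acc = _mm256_fmadd_ps(va, vb, _mm256_setzero_ps());
  float lanes[8];
  _mm256_storeu_ps(lanes, acc);
  float sum = 0.0f;
  for (int i = 0; i < 8; ++i) {
    sum += lanes[i];
  }
  return sum;
}

int main() {
  const float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const float b[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  std::printf("dot8 = %f\n", dot8(a, b));  // expected: 36.000000
  return 0;
}
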
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -347,6 +347,7 @@ build_fbgemm_gpu_package () {
--package_name="${package_name}" \
--python-tag="${python_tag}" \
--plat-name="${plat_name}" \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries

4 changes: 4 additions & 0 deletions .gitignore
@@ -8,6 +8,10 @@
# found in:
# https://github.com/github/gitignore/

# General
.DS_Store
*~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

20 changes: 16 additions & 4 deletions fbgemm_gpu/CMakeLists.txt
@@ -432,10 +432,22 @@ else()
DEPENDS "${optimizer_codegen_dependencies}")
endif()

set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
# NVCC in WSL complains about unknown -mavx options
# https://github.com/pytorch/FBGEMM/issues/2135
set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
endif()

set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
endif()

if(CXX_AVX2_FOUND)
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
@@ -504,13 +516,13 @@ set(fbgemm_sources_avx512
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_sources_avx2}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma")
"${AVX2_FLAGS}")
endif()

if(CXX_AVX512_FOUND)
set_source_files_properties(${fbgemm_sources_avx512}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
"${AVX512_FLAGS}")
endif()

set(fbgemm_sources ${fbgemm_sources_normal})
@@ -618,7 +630,7 @@ endif()
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS
"-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS

@@ -151,7 +151,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vbe_desc }}_kernel(
) {
int32_t T = D_offsets.size(0) - 1;
int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
- int32_t b;
+ [[maybe_unused]] int32_t b;
int32_t t;
const auto total_B = offsets.size(0) - 1;


@@ -66,7 +66,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
}

int32_t t;
- int32_t b;
+ [[maybe_unused]] int32_t b;

{%- if vbe %}
const auto info = reinterpret_cast<const uint32_t*>(&b_t_map[b_t])[0];
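
Aside, for context: in both template kernels above, the index variable b is presumably only assigned and read in some of the generated variants, so strict host compilers can flag it as unused in the others; the C++17 [[maybe_unused]] attribute documents that intent and suppresses the warning without changing behavior. A standalone sketch of the same pattern (hypothetical file, with a preprocessor switch standing in for the code-generation branch):

// maybe_unused_demo.cpp: illustrative sketch, not a file in this repository.
// Compile with: g++ -std=c++17 -Wall -DUSE_VBE maybe_unused_demo.cpp
#include <cstdint>
#include <cstdio>

int32_t flatten(int32_t b_t, int32_t T) {
  // Without [[maybe_unused]], building with -DUSE_VBE triggers
  // -Wunused-variable for `b`, since that branch never touches it.
  [[maybe_unused]] int32_t b;
  int32_t t;
#ifdef USE_VBE
  t = b_t % T;  // `b` intentionally unused in this configuration
  return t;
#else
  b = b_t / T;
  t = b_t % T;
  return b * T + t;
#endif
}

int main() {
  std::printf("flatten(7, 3) = %d\n", flatten(7, 3));
  return 0;
}
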
10 changes: 5 additions & 5 deletions fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu
@@ -130,7 +130,7 @@ __inline__ __device__ void process_all_indices_no_pooling(
const auto total_load_D = static_cast<uint32_t>(smem[params_offset + SAVED_PARAMS::P_total_load_D]);

// Each thread loads a separate weight ptr
- const auto weight_ptrs = reinterpret_cast<const uintptr_t>(&weights[indices[threadIdx.x] * load_D]);
+ const auto weight_ptrs = reinterpret_cast<uintptr_t>(&weights[indices[threadIdx.x] * load_D]);

// Assuming kWarpSize is a multiple of STEP
for (uint32_t l_start = 0; l_start < TOTAL_L; l_start += STEP) {
@@ -332,8 +332,8 @@ __noinline__ __device__ void process_all_indices_small_Ls(
const cache_t* lxu_cache_weights =
reinterpret_cast<const cache_t*>(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]);
SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ?
- reinterpret_cast<const uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
- reinterpret_cast<const uintptr_t>(&weights[indices[l] * load_D]);
+ reinterpret_cast<uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
+ reinterpret_cast<uintptr_t>(&weights[indices[l] * load_D]);
}
if (!std::is_same<emb_t, cache_t>::value) {
cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing);
@@ -558,8 +558,8 @@ __noinline__ __device__ void process_all_indices_large_Ls(
const auto* lxu_cache_weights =
reinterpret_cast<const cache_t*>(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]);
SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ?
- reinterpret_cast<const uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
- reinterpret_cast<const uintptr_t>(&weights[indices[l] * load_D]);
+ reinterpret_cast<uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
+ reinterpret_cast<uintptr_t>(&weights[indices[l] * load_D]);
}
if (!std::is_same<emb_t, cache_t>::value) {
cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing);
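
Aside, for context: reinterpret_cast<const uintptr_t>(...) asks for a const-qualified prvalue of integer type; the top-level const on a cast result is meaningless, and compilers such as GCC report it under -Wignored-qualifiers ("type qualifiers ignored on cast result type"), which is presumably why the casts above now target plain uintptr_t. A standalone sketch (hypothetical file name):

// cast_qualifier_demo.cpp: illustrative sketch, not a file in this repository.
// Compile with: g++ -std=c++17 -Wextra cast_qualifier_demo.cpp
#include <cstdint>
#include <cstdio>

int main() {
  const float weights[4] = {0.5f, 1.5f, 2.5f, 3.5f};

  // The next (commented-out) line shows the old pattern: the cast yields a
  // prvalue, so the const qualifier is ignored and -Wignored-qualifiers warns.
  // const auto addr_noisy = reinterpret_cast<const uintptr_t>(&weights[2]);

  // Equivalent and warning-free: cast to plain uintptr_t.
  const auto addr = reinterpret_cast<uintptr_t>(&weights[2]);

  std::printf("address of weights[2]: %#jx\n", static_cast<uintmax_t>(addr));
  return 0;
}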

22 changes: 19 additions & 3 deletions fbgemm_gpu/setup.py
@@ -26,6 +26,11 @@

def parse_args(argv: List[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="fbgemm_gpu setup")
parser.add_argument(
"--verbose",
action="store_true",
help="Print verbose logs during the build.",
)
parser.add_argument(
"--package_variant",
type=str,
@@ -133,8 +138,6 @@ def set_cuda_environment_variables() -> None:
def cmake_environment_variables(args) -> None:
def _get_cxx11_abi():
try:
import torch

value = int(torch._C._GLIBCXX_USE_CXX11_ABI)
except ImportError:
value = 0
@@ -143,11 +146,22 @@ def _get_cxx11_abi():
torch_root = os.path.dirname(torch.__file__)
os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2)

- cmake_args = [f"-DCMAKE_PREFIX_PATH={torch_root}", _get_cxx11_abi()]
+ cmake_args = [
+     f"-DCMAKE_PREFIX_PATH={torch_root}",
+     _get_cxx11_abi(),
+ ]

if args.verbose:
print("[SETUP.PY] Building in VERBOSE mode ...")
cmake_args.append("-DCMAKE_VERBOSE_MAKEFILE=1")

if args.package_variant == "cpu":
print("[SETUP.PY] Building the CPU-ONLY variant of FBGEMM_GPU ...")
cmake_args.append("-DFBGEMM_CPU_ONLY=ON")

if args.nvml_lib_path:
cmake_args.append(f"-DNVML_LIB_PATH={args.nvml_lib_path}")

return cmake_args


@@ -183,6 +197,7 @@ def extract_variant_version(cls, variant: str) -> str:

if variant == "cpu":
variant_version = "+cpu"

elif variant == "cuda":
set_cuda_environment_variables()
if torch.version.cuda is not None:
@@ -192,6 +207,7 @@ def extract_variant_version(cls, variant: str) -> str:
sys.exit(
"[SETUP.PY] Installed PyTorch variant is not CUDA; cannot determine the CUDA version!"
)

elif variant == "rocm":
if torch.version.hip is not None:
rocm_version = torch.version.hip.split(".")

@@ -104,7 +104,8 @@ void jagged_jagged_elementwise_dense_output_(
x_offset_ptrs.vals[d] = \
x_offsets_contig[d].template data_ptr<index_t>(); \
} \
- const auto func_name = "jagged_jagged_elementwise_dense_output_kernel_"; \
+ [[maybe_unused]] const auto func_name = \
+     "jagged_jagged_elementwise_dense_output_kernel_"; \
jagged_jagged_elementwise_dense_output_kernel_<NUM_JAGGED_DIM, index_t> \
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
MAKE_PTA_WITH_NAME(func_name, x_values, scalar_t, 2, 32), \