Update AVX2 and AVX512 flags (#2167)
Summary:
- Update AVX2 and AVX512 flags to account for nvcc as the front-end compiler

Pull Request resolved: #2167

Reviewed By: spcyppt

Differential Revision: D51681615

Pulled By: q10

fbshipit-source-id: 231aa051f121ff7a5f6aac56f335442bbd312a49
q10 authored and facebook-github-bot committed Nov 30, 2023
1 parent 48120da commit 453c80e
Showing 8 changed files with 49 additions and 15 deletions.
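
For background: nvcc may reject host-only SIMD options such as -mavx2, -mf16c, and -mfma when they appear bare on its command line (issue #2135 reports this under WSL); wrapping them with -Xcompiler forwards them to the host compiler, which is what the new AVX2_FLAGS / AVX512_FLAGS lists in CMakeLists.txt below do in that configuration. A minimal, hypothetical host-side translation unit that needs those flags (the file name and build commands are illustrative only, not part of this repository):

// avx2_smoke_test.cpp: illustrative sketch, not a file in this repository.
//
// Plain host build:               g++ -mavx2 -mfma -c avx2_smoke_test.cpp
// With nvcc as the front end, the host flags are forwarded instead, e.g.:
//                                 nvcc -Xcompiler -mavx2 -Xcompiler -mfma -c avx2_smoke_test.cpp
#include <immintrin.h>
#include <cstdio>

// Multiply-accumulate eight floats at once; requires AVX2/FMA code generation,
// i.e. the -mavx2/-mfma flags must reach the host compiler.
float dot8(const float* a, const float* b) {
  const __m256 va  = _mm256_loadu_ps(a);
  const __m256 vb  = _mm256_loadu_ps(b);
  const __m256 acc = _mm256_fmadd_ps(va, vb, _mm256_setzero_ps());
  float lanes[8];
  _mm256_storeu_ps(lanes, acc);
  float sum = 0.0f;
  for (int i = 0; i < 8; ++i) {
    sum += lanes[i];
  }
  return sum;
}

int main() {
  const float a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  const float b[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  std::printf("dot8 = %f\n", dot8(a, b));  // expected: 36.000000
  return 0;
}
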
1 change: 1 addition & 0 deletions .github/scripts/fbgemm_gpu_build.bash
@@ -347,6 +347,7 @@ build_fbgemm_gpu_package () {
--package_name="${package_name}" \
--python-tag="${python_tag}" \
--plat-name="${plat_name}" \
--verbose \
"${build_args[@]}"

# Run checks on the built libraries

4 changes: 4 additions & 0 deletions .gitignore
@@ -8,6 +8,10 @@
# found in:
# https://github.com/github/gitignore/

# General
.DS_Store
*~

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

20 changes: 16 additions & 4 deletions fbgemm_gpu/CMakeLists.txt
@@ -432,10 +432,22 @@ else()
DEPENDS "${optimizer_codegen_dependencies}")
endif()

set(AVX2_FLAGS "-mavx2;-mf16c;-mfma;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
# NVCC in WSL complains about unknown -mavx options
# https://github.com/pytorch/FBGEMM/issues/2135
set(AVX2_FLAGS "-Xcompiler;-mavx;-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-fopenmp")
endif()

set(AVX512_FLAGS "-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl;-fopenmp")
if(NOT FBGEMM_CPU_ONLY AND WSL_MODE)
set(AVX512_FLAGS "-Xcompiler;-mavx2;-Xcompiler;-mf16c;-Xcompiler;-mfma;-Xcompiler;-mavx512f;-Xcompiler;-mavx512bw;-Xcompiler;-mavx512dq;-Xcompiler;-mavx512vl;-fopenmp")
endif()

if(CXX_AVX2_FOUND)
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${gen_cpu_source_files}
PROPERTIES COMPILE_OPTIONS
@@ -504,13 +516,13 @@ set(fbgemm_sources_avx512
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_sources_avx2}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma")
"${AVX2_FLAGS}")
endif()

if(CXX_AVX512_FOUND)
set_source_files_properties(${fbgemm_sources_avx512}
PROPERTIES COMPILE_OPTIONS
"-mavx2;-mf16c;-mfma;-mavx512f;-mavx512bw;-mavx512dq;-mavx512vl")
"${AVX512_FLAGS}")
endif()

set(fbgemm_sources ${fbgemm_sources_normal})
@@ -618,7 +630,7 @@ endif()
if(CXX_AVX2_FOUND)
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS
"-mavx;-mf16c;-mfma;-mavx2;-fopenmp")
"${AVX2_FLAGS}")
else()
set_source_files_properties(${fbgemm_gpu_sources_static_cpu}
PROPERTIES COMPILE_OPTIONS

@@ -151,7 +151,7 @@ __global__ __launch_bounds__(kMaxThreads) void grad_mean{{ vbe_desc }}_kernel(
) {
int32_t T = D_offsets.size(0) - 1;
int32_t b_t = blockIdx.x * blockDim.y + threadIdx.y;
- int32_t b;
+ [[maybe_unused]] int32_t b;
int32_t t;
const auto total_B = offsets.size(0) - 1;


@@ -66,7 +66,7 @@ __global__ __launch_bounds__(kForwardMaxThreads) void
}

int32_t t;
- int32_t b;
+ [[maybe_unused]] int32_t b;

{%- if vbe %}
const auto info = reinterpret_cast<const uint32_t*>(&b_t_map[b_t])[0];
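
Aside, for context: in both template kernels above, the index variable b is presumably only assigned and read in some of the generated variants, so strict host compilers can flag it as unused in the others; the C++17 [[maybe_unused]] attribute documents that intent and suppresses the warning without changing behavior. A standalone sketch of the same pattern (hypothetical file, with a preprocessor switch standing in for the code-generation branch):

// maybe_unused_demo.cpp: illustrative sketch, not a file in this repository.
// Compile with: g++ -std=c++17 -Wall -DUSE_VBE maybe_unused_demo.cpp
#include <cstdint>
#include <cstdio>

int32_t flatten(int32_t b_t, int32_t T) {
  // Without [[maybe_unused]], building with -DUSE_VBE triggers
  // -Wunused-variable for `b`, since that branch never touches it.
  [[maybe_unused]] int32_t b;
  int32_t t;
#ifdef USE_VBE
  t = b_t % T;  // `b` intentionally unused in this configuration
  return t;
#else
  b = b_t / T;
  t = b_t % T;
  return b * T + t;
#endif
}

int main() {
  std::printf("flatten(7, 3) = %d\n", flatten(7, 3));
  return 0;
}
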
10 changes: 5 additions & 5 deletions fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu
@@ -130,7 +130,7 @@ __inline__ __device__ void process_all_indices_no_pooling(
const auto total_load_D = static_cast<uint32_t>(smem[params_offset + SAVED_PARAMS::P_total_load_D]);

// Each thread loads a separate weight ptr
- const auto weight_ptrs = reinterpret_cast<const uintptr_t>(&weights[indices[threadIdx.x] * load_D]);
+ const auto weight_ptrs = reinterpret_cast<uintptr_t>(&weights[indices[threadIdx.x] * load_D]);

// Assuming kWarpSize is a multiple of STEP
for (uint32_t l_start = 0; l_start < TOTAL_L; l_start += STEP) {
@@ -332,8 +332,8 @@ __noinline__ __device__ void process_all_indices_small_Ls(
const cache_t* lxu_cache_weights =
reinterpret_cast<const cache_t*>(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]);
SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ?
- reinterpret_cast<const uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
- reinterpret_cast<const uintptr_t>(&weights[indices[l] * load_D]);
+ reinterpret_cast<uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
+ reinterpret_cast<uintptr_t>(&weights[indices[l] * load_D]);
}
if (!std::is_same<emb_t, cache_t>::value) {
cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing);
@@ -558,8 +558,8 @@ __noinline__ __device__ void process_all_indices_large_Ls(
const auto* lxu_cache_weights =
reinterpret_cast<const cache_t*>(smem[params_offset + LXU_CACHE_PARAMS::P_lxu_cache_weights]);
SMEM_GENERIC_PTR[threadIdx.x] = cache_idx != kCacheLocationMissing ?
- reinterpret_cast<const uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
- reinterpret_cast<const uintptr_t>(&weights[indices[l] * load_D]);
+ reinterpret_cast<uintptr_t>(&lxu_cache_weights[cache_idx * max_D_cache]) :
+ reinterpret_cast<uintptr_t>(&weights[indices[l] * load_D]);
}
if (!std::is_same<emb_t, cache_t>::value) {
cache_look_up_bits = ballot_sync(cache_idx != kCacheLocationMissing);
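
Aside, for context: reinterpret_cast<const uintptr_t>(...) asks for a const-qualified prvalue of integer type; the top-level const on a cast result is meaningless, and compilers such as GCC report it under -Wignored-qualifiers ("type qualifiers ignored on cast result type"), which is presumably why the casts above now target plain uintptr_t. A standalone sketch (hypothetical file name):

// cast_qualifier_demo.cpp: illustrative sketch, not a file in this repository.
// Compile with: g++ -std=c++17 -Wextra cast_qualifier_demo.cpp
#include <cstdint>
#include <cstdio>

int main() {
  const float weights[4] = {0.5f, 1.5f, 2.5f, 3.5f};

  // The next (commented-out) line shows the old pattern: the cast yields a
  // prvalue, so the const qualifier is ignored and -Wignored-qualifiers warns.
  // const auto addr_noisy = reinterpret_cast<const uintptr_t>(&weights[2]);

  // Equivalent and warning-free: cast to plain uintptr_t.
  const auto addr = reinterpret_cast<uintptr_t>(&weights[2]);

  std::printf("address of weights[2]: %#jx\n", static_cast<uintmax_t>(addr));
  return 0;
}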

22 changes: 19 additions & 3 deletions fbgemm_gpu/setup.py
@@ -26,6 +26,11 @@

def parse_args(argv: List[str]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description="fbgemm_gpu setup")
parser.add_argument(
"--verbose",
action="store_true",
help="Print verbose logs during the build.",
)
parser.add_argument(
"--package_variant",
type=str,
@@ -133,8 +138,6 @@ def set_cuda_environment_variables() -> None:
def cmake_environment_variables(args) -> None:
def _get_cxx11_abi():
try:
import torch

value = int(torch._C._GLIBCXX_USE_CXX11_ABI)
except ImportError:
value = 0
@@ -143,11 +146,22 @@ def _get_cxx11_abi():
torch_root = os.path.dirname(torch.__file__)
os.environ["CMAKE_BUILD_PARALLEL_LEVEL"] = str(os.cpu_count() // 2)

- cmake_args = [f"-DCMAKE_PREFIX_PATH={torch_root}", _get_cxx11_abi()]
+ cmake_args = [
+     f"-DCMAKE_PREFIX_PATH={torch_root}",
+     _get_cxx11_abi(),
+ ]

if args.verbose:
print("[SETUP.PY] Building in VERBOSE mode ...")
cmake_args.append("-DCMAKE_VERBOSE_MAKEFILE=1")

if args.package_variant == "cpu":
print("[SETUP.PY] Building the CPU-ONLY variant of FBGEMM_GPU ...")
cmake_args.append("-DFBGEMM_CPU_ONLY=ON")

if args.nvml_lib_path:
cmake_args.append(f"-DNVML_LIB_PATH={args.nvml_lib_path}")

return cmake_args


@@ -183,6 +197,7 @@ def extract_variant_version(cls, variant: str) -> str:

if variant == "cpu":
variant_version = "+cpu"

elif variant == "cuda":
set_cuda_environment_variables()
if torch.version.cuda is not None:
@@ -192,6 +207,7 @@ def extract_variant_version(cls, variant: str) -> str:
sys.exit(
"[SETUP.PY] Installed PyTorch variant is not CUDA; cannot determine the CUDA version!"
)

elif variant == "rocm":
if torch.version.hip is not None:
rocm_version = torch.version.hip.split(".")

@@ -104,7 +104,8 @@ void jagged_jagged_elementwise_dense_output_(
x_offset_ptrs.vals[d] = \
x_offsets_contig[d].template data_ptr<index_t>(); \
} \
- const auto func_name = "jagged_jagged_elementwise_dense_output_kernel_"; \
+ [[maybe_unused]] const auto func_name = \
+     "jagged_jagged_elementwise_dense_output_kernel_"; \
jagged_jagged_elementwise_dense_output_kernel_<NUM_JAGGED_DIM, index_t> \
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
MAKE_PTA_WITH_NAME(func_name, x_values, scalar_t, 2, 32), \