Use sycl::ext::oneapi::experimental for complex tyes

This works around use of double precision functions/literals in implementations of these functions in MSVC headers, causing failures to offload on Iris Xe for single precision input citing lack of fp64 support by the hardware. Changes include CL/sycl.hpp to sycl/sycl.hpp per SYCL-2020 spec For every CMake target, where add_sycl_to_target is used, we also run target_compile_options( ${target_name} PRIVATE -fysl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda ) Add DPCTL_TARGET_CUDA Boolean cmake option Also DPCTL_SYCL_TARGETS parameter can be used to specify targets to build for. DPCTL_TARGET_CUDA could be set via cmake option, or via environment variable, e.g. ``` $ DPCTL_TARGET_CUDA=1 python scripts/build_locally.py --verbose ``` This calls `target_compile_options` to set sycl-targets for targets needing SYCL
IntelPython · Nov 16, 2023 · 6d3be5d · 6d3be5d
1 parent f772888
commit 6d3be5d
Show file tree

Hide file tree

Showing 160 changed files with 6,871 additions and 218 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -17,9 +17,30 @@ option(DPCTL_GENERATE_COVERAGE
     "Build dpctl with coverage instrumentation"
     OFF
 )
+option(DPCTL_TARGET_CUDA
+    "Build DPCTL to target CUDA devices"
+    OFF
+)
 
 find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
+set(_dpctl_sycl_targets)
+if ("x${DPCTL_SYCL_TARGETS}" STREQUAL "x")
+   if(DPCTL_TARGET_CUDA)
+      set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+   else()
+      if(DEFINED ENV{DPCTL_TARGET_CUDA})
+          set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+      endif()
+   endif()
+else()
+   set(_dpctl_sycl_targets ${DPCTL_SYCL_TARGETS})
+endif()
+
+if(_dpctl_sycl_targets)
+   message(STATUS "Compiling for -fsycl-targets=${_dpctl_sycl_targets}")
+endif()
+
 add_subdirectory(libsyclinterface)
 
 file(GLOB _dpctl_capi_headers dpctl/apis/include/*.h*)

diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
@@ -143,7 +143,20 @@ function(build_dpctl_ext _trgt _src _dest)
     add_custom_target(${_cythonize_trgt} DEPENDS ${_src})
     Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
     if (BUILD_DPCTL_EXT_SYCL)
-      add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+	if(_dpctl_sycl_targets)
+	    # make fat binary
+            target_compile_options(
+                ${_trgt}
+                PRIVATE
+                -fsycl-targets=${_dpctl_sycl_targets}
+            )
+	    target_link_options(
+	        ${_trgt}
+	        PRIVATE
+	        -fsycl-targets=${_dpctl_sycl_targets}
+	    )
+	endif()
     endif()
     target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR})
     add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt})

diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp
@@ -33,7 +33,7 @@
 #include "Python.h"
 #include "syclinterface/dpctl_data_types.h"
 #include "syclinterface/dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef,
                                 PyObject **obj_array,

diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp
@@ -26,10 +26,10 @@
 #pragma once
 
 #include "dpctl_capi.h"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <memory>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 

diff --git a/dpctl/sycl.pxd b/dpctl/sycl.pxd
@@ -20,7 +20,7 @@
 from . cimport _backend as dpctl_backend
 
 
-cdef extern from "CL/sycl.hpp" namespace "sycl":
+cdef extern from "sycl/sycl.hpp" namespace "sycl":
     cdef cppclass queue "sycl::queue":
         pass
 

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
@@ -188,12 +188,20 @@ foreach(_src_fn ${_no_fast_math_sources})
   )
 endforeach()
 if (UNIX)
-  set_source_files_properties(
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/abs.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions/sqrt.cpp
-    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
+  set(_compiler_definitions "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES;SYCL_EXT_ONEAPI_COMPLEX")
+else()
+  set(_compiler_definitions "SYCL_EXT_ONEAPI_COMPLEX")
 endif()
 
+foreach(_src_fn ${_elementwise_sources})
+  get_source_file_property(_cmpl_options_defs ${_src_fn} COMPILE_DEFINITIONS)
+  set(_combined_options_defs ${_cmpl_options_defs} "${_compiler_definitions}")
+  set_source_files_properties(
+     ${_src_fn}
+     PROPERTIES COMPILE_DEFINITIONS "${_combined_options_defs}"
+  )
+endforeach()
+
 set(_linker_options "LINKER:${DPCTL_LDFLAGS}")
 foreach(python_module_name ${_py_trgts})
     target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
@@ -209,6 +217,19 @@ foreach(python_module_name ${_py_trgts})
         ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/
     )
     target_link_options(${python_module_name} PRIVATE ${_linker_options})
+    if(_dpctl_sycl_targets)
+        # make fat binary
+        target_compile_options(
+            ${python_module_name}
+            PRIVATE
+            -fsycl-targets=${_dpctl_sycl_targets}
+        )
+        target_link_options(
+            ${python_module_name}
+            PRIVATE
+            -fsycl-targets=${_dpctl_sycl_targets}
+        )
+    endif()
     add_dependencies(${python_module_name} _dpctl4pybind11_deps)
     install(TARGETS ${python_module_name} DESTINATION "dpctl/tensor")
 endforeach()
diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <array>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 

diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 

diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
@@ -24,7 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <complex>
 #include <cstdint>

diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
@@ -27,9 +27,9 @@
 #include "utils/offset_utils.hpp"
 #include "utils/strided_iters.hpp"
 #include "utils/type_utils.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {

diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -23,10 +23,10 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -23,12 +23,13 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,6 +50,7 @@ namespace abs
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -120,7 +122,7 @@ template <typename argT, typename resT> struct AbsFunctor
         }
         else {
 #ifdef USE_STD_ABS_FOR_COMPLEX_TYPES
-            return std::abs(z);
+            return exprm_ns::abs(exprm_ns::complex<realT>(z));
 #else
             return std::hypot(std::real(z), std::imag(z));
 #endif

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -23,10 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace acos
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -103,18 +105,21 @@ template <typename argT, typename resT> struct AcosFunctor
             constexpr realT r_eps =
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                argT log_in = std::log(in);
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in =
+                    exprm_ns::log(exprm_ns::complex<realT>(in));
 
-                const realT wx = std::real(log_in);
-                const realT wy = std::imag(log_in);
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
 
                 realT ry = wx + std::log(realT(2));
                 return resT{rx, (std::signbit(y)) ? ry : -ry};
             }
 
             /* ordinary cases */
-            return std::acos(in);
+            return exprm_ns::acos(
+                exprm_ns::complex<realT>(in)); // std::acos(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -23,10 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace acosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -110,15 +112,18 @@ template <typename argT, typename resT> struct AcoshFunctor
              * For large x or y including acos(+-Inf + I*+-Inf)
              */
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                const realT wx = std::real(std::log(in));
-                const realT wy = std::imag(std::log(in));
+                using sycl_complexT = typename exprm_ns::complex<realT>;
+                const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in));
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
                 realT ry = wx + std::log(realT(2));
                 acos_in = resT{rx, (std::signbit(y)) ? ry : -ry};
             }
             else {
                 /* ordinary cases */
-                acos_in = std::acos(in);
+                acos_in = exprm_ns::acos(
+                    exprm_ns::complex<realT>(in)); // std::acos(in);
             }
 
             /* Now we calculate acosh(z) */

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -24,9 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -49,6 +50,7 @@ namespace add
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT> struct AddFunctor
 {
@@ -60,7 +62,31 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        return in1 + in2;
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+            using rT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           !tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + in2;
+        }
+        else if constexpr (!tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using rT2 = typename argT2::value_type;
+
+            return in1 + exprm_ns::complex<rT2>(in2);
+        }
+        else {
+            return in1 + in2;
+        }
     }
 
     template <int vec_sz>