From c9cc505ff201f32ed68be45dc1596f5eb75cf2dc Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Tue, 17 Oct 2023 01:32:44 -0500
Subject: [PATCH 01/19] Tweaked test_intel_device_info

---
 dpctl/tests/test_utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index 1aab7fd7e7..05b2dc7890 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -21,7 +21,6 @@
 
 import dpctl
 import dpctl.utils
-from dpctl.enum_types import backend_type
 
 
 def test_get_execution_queue_input_validation():
@@ -132,9 +131,7 @@ def test_intel_device_info():
         pytest.skip("Default device could not be created")
     descr = dpctl.utils.intel_device_info(d)
     assert isinstance(descr, dict)
-    assert ("device_id" in descr) or (
-        not d.has_aspect_cpu and not d.backend == backend_type.level_zero
-    )
+    assert ("device_id" in descr) or not descr
     allowed_names = [
         "device_id",
         "gpu_slices",

From 2d2f235bd49aff9ff186811996a794ba52cdbd41 Mon Sep 17 00:00:00 2001
From: ndgrigorian <46709016+ndgrigorian@users.noreply.github.com>
Date: Tue, 17 Oct 2023 08:48:13 -0700
Subject: [PATCH 02/19] Elementwise functions cbrt, exp2, copysign, and rsqrt
 (#1443)

* Implements dpctl.tensor.cbrt

* Implements copysign and exp2 elementwise funcs

* Adds tests for cbrt, copysign, exp2

* Implements rsqrt and tests for rsqrt

* Modified tests for cbrt, copysign, and rsqrt

Now test more type combinations/output types
---
 dpctl/tensor/__init__.py                      |   8 +
 dpctl/tensor/_elementwise_funcs.py            | 113 ++++++++
 .../kernels/elementwise_functions/cbrt.hpp    | 172 ++++++++++++
 .../elementwise_functions/copysign.hpp        | 215 +++++++++++++++
 .../kernels/elementwise_functions/exp2.hpp    | 229 +++++++++++++++
 .../kernels/elementwise_functions/rsqrt.hpp   | 179 ++++++++++++
 .../source/elementwise_functions.cpp          | 261 +++++++++++++++++-
 dpctl/tests/elementwise/test_cbrt.py          |  79 ++++++
 dpctl/tests/elementwise/test_copysign.py      | 111 ++++++++
 dpctl/tests/elementwise/test_exp2.py          | 168 +++++++++++
 dpctl/tests/elementwise/test_rsqrt.py         |  74 +++++
 11 files changed, 1608 insertions(+), 1 deletion(-)
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
 create mode 100644 dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
 create mode 100644 dpctl/tests/elementwise/test_cbrt.py
 create mode 100644 dpctl/tests/elementwise/test_copysign.py
 create mode 100644 dpctl/tests/elementwise/test_exp2.py
 create mode 100644 dpctl/tests/elementwise/test_rsqrt.py

diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 3473d5cde5..bab31379b7 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -110,13 +110,16 @@
     bitwise_or,
     bitwise_right_shift,
     bitwise_xor,
+    cbrt,
     ceil,
     conj,
+    copysign,
     cos,
     cosh,
     divide,
     equal,
     exp,
+    exp2,
     expm1,
     floor,
     floor_divide,
@@ -149,6 +152,7 @@
     real,
     remainder,
     round,
+    rsqrt,
     sign,
     signbit,
     sin,
@@ -314,4 +318,8 @@
     "argmax",
     "argmin",
     "prod",
+    "cbrt",
+    "exp2",
+    "copysign",
+    "rsqrt",
 ]
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
index 259443f8e3..24ae7fa8cf 100644
--- a/dpctl/tensor/_elementwise_funcs.py
+++ b/dpctl/tensor/_elementwise_funcs.py
@@ -1761,3 +1761,116 @@
 hypot = BinaryElementwiseFunc(
     "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_
 )
+
+
+# U37: ==== CBRT        (x)
+_cbrt_docstring_ = """
+cbrt(x, out=None, order='K')
+
+Computes the cube-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_ndarray:
+        An array containing the element-wise cube-root.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+cbrt = UnaryElementwiseFunc(
+    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
+)
+
+
+# U38: ==== EXP2        (x)
+_exp2_docstring_ = """
+exp2(x, out=None, order='K')
+
+Computes the base-2 exponential for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_ndarray:
+        An array containing the element-wise base-2 exponentials.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp2 = UnaryElementwiseFunc(
+    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
+)
+
+
+# B25: ==== COPYSIGN    (x1, x2)
+_copysign_docstring_ = """
+copysign(x1, x2, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real floating-point data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real floating-point data
+        type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_ndarray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+
+
+# U39: ==== RSQRT        (x)
+_rsqrt_docstring_ = """
+rsqrt(x, out=None, order='K')
+
+Computes the reciprocal square-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array must have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_ndarray:
+        An array containing the element-wise reciprocal square-root.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+rsqrt = UnaryElementwiseFunc(
+    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
+)
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
new file mode 100644
index 0000000000..1d4aa65002
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -0,0 +1,172 @@
+//=== cbrt.hpp -   Unary function CBRT                   ------  *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of CBRT(x)
+/// function that computes the cube root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace cbrt
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT> struct CbrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support subgroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::cbrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using CbrtContigFunctor = elementwise_common::
+    UnaryContigFunctor<argTy, resTy, CbrtFunctor<argTy, resTy>, vec_sz, n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CbrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CbrtFunctor<argTy, resTy>>;
+
+template <typename T> struct CbrtOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class cbrt_contig_kernel;
+
+template <typename argTy>
+sycl::event cbrt_contig_impl(sycl::queue &exec_q,
+                             size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct CbrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename CbrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct CbrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::cbrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CbrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class cbrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+cbrt_strided_impl(sycl::queue &exec_q,
+                  size_t nelems,
+                  int nd,
+                  const py::ssize_t *shape_and_strides,
+                  const char *arg_p,
+                  py::ssize_t arg_offset,
+                  char *res_p,
+                  py::ssize_t res_offset,
+                  const std::vector<sycl::event> &depends,
+                  const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct CbrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename CbrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace cbrt
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
new file mode 100644
index 0000000000..b1997d06b4
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -0,0 +1,215 @@
+//=== copysign.hpp -   Binary function COPYSIGN         ------  *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace copysign
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::copysign(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+    operator()(const sycl::vec<argT1, vec_sz> &in1,
+               const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = sycl::copysign(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using CopysignContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            CopysignFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    CopysignFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct CopysignOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class copysign_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event copysign_contig_impl(sycl::queue &exec_q,
+                                 size_t nelems,
+                                 const char *arg1_p,
+                                 py::ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 py::ssize_t arg2_offset,
+                                 char *res_p,
+                                 py::ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
+        copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2> struct CopysignTypeMapFactory
+{
+    /*! @brief get typeid for output type of copysign(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+copysign_strided_impl(sycl::queue &exec_q,
+                      size_t nelems,
+                      int nd,
+                      const py::ssize_t *shape_and_strides,
+                      const char *arg1_p,
+                      py::ssize_t arg1_offset,
+                      const char *arg2_p,
+                      py::ssize_t arg2_offset,
+                      char *res_p,
+                      py::ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace copysign
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
new file mode 100644
index 0000000000..67ee23df48
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -0,0 +1,229 @@
+//=== exp2.hpp -   Unary function EXP2                     ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP2(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace exp2
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT> struct Exp2Functor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support subgroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            const argT tmp = in * std::log(realT(2));
+
+            constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(tmp);
+            const realT y = std::imag(tmp);
+            if (std::isfinite(x)) {
+                if (std::isfinite(y)) {
+                    return std::exp(tmp);
+                }
+                else {
+                    return resT{q_nan, q_nan};
+                }
+            }
+            else if (std::isnan(x)) {
+                /* x is nan */
+                if (y == realT(0)) {
+                    return resT{in};
+                }
+                else {
+                    return resT{x, q_nan};
+                }
+            }
+            else {
+                if (!std::signbit(x)) { /* x is +inf */
+                    if (y == realT(0)) {
+                        return resT{x, y};
+                    }
+                    else if (std::isfinite(y)) {
+                        return resT{x * std::cos(y), x * std::sin(y)};
+                    }
+                    else {
+                        /* x = +inf, y = +-inf || nan */
+                        return resT{x, q_nan};
+                    }
+                }
+                else { /* x is -inf */
+                    if (std::isfinite(y)) {
+                        realT exp_x = std::exp(x);
+                        return resT{exp_x * std::cos(y), exp_x * std::sin(y)};
+                    }
+                    else {
+                        /* x = -inf, y = +-inf || nan */
+                        return resT{0, 0};
+                    }
+                }
+            }
+        }
+        else {
+            return sycl::exp2(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using Exp2ContigFunctor = elementwise_common::
+    UnaryContigFunctor<argTy, resTy, Exp2Functor<argTy, resTy>, vec_sz, n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Exp2StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Exp2Functor<argTy, resTy>>;
+
+template <typename T> struct Exp2OutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class exp2_contig_kernel;
+
+template <typename argTy>
+sycl::event exp2_contig_impl(sycl::queue &exec_q,
+                             size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct Exp2ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename Exp2OutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct Exp2TypeMapFactory
+{
+    /*! @brief get typeid for output type of std::exp2(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Exp2OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class exp2_strided_kernel;
+
+template <typename argTy>
+sycl::event
+exp2_strided_impl(sycl::queue &exec_q,
+                  size_t nelems,
+                  int nd,
+                  const py::ssize_t *shape_and_strides,
+                  const char *arg_p,
+                  py::ssize_t arg_offset,
+                  char *res_p,
+                  py::ssize_t res_offset,
+                  const std::vector<sycl::event> &depends,
+                  const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct Exp2StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename Exp2OutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace exp2
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
new file mode 100644
index 0000000000..de51b31c30
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,179 @@
+//=== rsqrt.hpp -   Unary function RSQRT                   ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RSQRT(x)
+/// function that computes the reciprocal square root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace rsqrt
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT> struct RsqrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support subgroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::rsqrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using RsqrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RsqrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using RsqrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, RsqrtFunctor<argTy, resTy>>;
+
+template <typename T> struct RsqrtOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class rsqrt_contig_kernel;
+
+template <typename argTy>
+sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
+                              size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct RsqrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename RsqrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct RsqrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::rsqrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RsqrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class rsqrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+rsqrt_strided_impl(sycl::queue &exec_q,
+                   size_t nelems,
+                   int nd,
+                   const py::ssize_t *shape_and_strides,
+                   const char *arg_p,
+                   py::ssize_t arg_offset,
+                   char *res_p,
+                   py::ssize_t res_offset,
+                   const std::vector<sycl::event> &depends,
+                   const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct RsqrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename RsqrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace rsqrt
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index 3cca479a3f..043cac0cd2 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -48,12 +48,15 @@
 #include "kernels/elementwise_functions/bitwise_or.hpp"
 #include "kernels/elementwise_functions/bitwise_right_shift.hpp"
 #include "kernels/elementwise_functions/bitwise_xor.hpp"
+#include "kernels/elementwise_functions/cbrt.hpp"
 #include "kernels/elementwise_functions/ceil.hpp"
 #include "kernels/elementwise_functions/conj.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
 #include "kernels/elementwise_functions/cos.hpp"
 #include "kernels/elementwise_functions/cosh.hpp"
 #include "kernels/elementwise_functions/equal.hpp"
 #include "kernels/elementwise_functions/exp.hpp"
+#include "kernels/elementwise_functions/exp2.hpp"
 #include "kernels/elementwise_functions/expm1.hpp"
 #include "kernels/elementwise_functions/floor.hpp"
 #include "kernels/elementwise_functions/floor_divide.hpp"
@@ -86,6 +89,7 @@
 #include "kernels/elementwise_functions/real.hpp"
 #include "kernels/elementwise_functions/remainder.hpp"
 #include "kernels/elementwise_functions/round.hpp"
+#include "kernels/elementwise_functions/rsqrt.hpp"
 #include "kernels/elementwise_functions/sign.hpp"
 #include "kernels/elementwise_functions/signbit.hpp"
 #include "kernels/elementwise_functions/sin.hpp"
@@ -2749,7 +2753,6 @@ void populate_trunc_dispatch_vectors(void)
 } // namespace impl
 
 // B24:  ==== HYPOT    (x1, x2)
-
 namespace impl
 {
 namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
@@ -2788,6 +2791,151 @@ void populate_hypot_dispatch_tables(void)
 
 } // namespace impl
 
+// U37: ==== CBRT        (x)
+namespace impl
+{
+
+namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
+
+static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
+static int cbrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cbrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cbrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cbrt_fn_ns;
+
+    using fn_ns::CbrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
+
+    using fn_ns::CbrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
+
+    using fn_ns::CbrtTypeMapFactory;
+    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
+}
+
+} // namespace impl
+
+// B25:  ==== COPYSIGN    (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+// U38: ==== EXP2           (x)
+namespace impl
+{
+
+namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
+
+static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
+static int exp2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    exp2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = exp2_fn_ns;
+
+    using fn_ns::Exp2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
+
+    using fn_ns::Exp2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
+
+    using fn_ns::Exp2TypeMapFactory;
+    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
+}
+
+} // namespace impl
+
+// U39: ==== RSQRT        (x)
+namespace impl
+{
+
+namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
+
+static unary_contig_impl_fn_ptr_t
+    rsqrt_contig_dispatch_vector[td_ns::num_types];
+static int rsqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    rsqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_rsqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = rsqrt_fn_ns;
+
+    using fn_ns::RsqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
+
+    using fn_ns::RsqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
+
+    using fn_ns::RsqrtTypeMapFactory;
+    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
+}
+
+} // namespace impl
+
 // ==========================================================================================
 // //
 
@@ -4889,6 +5037,117 @@ void init_elementwise_functions(py::module_ m)
               py::arg("depends") = py::list());
         m.def("_hypot_result_type", hypot_result_type_pyapi, "");
     }
+
+    // U37: ==== CBRT        (x)
+    {
+        impl::populate_cbrt_dispatch_vectors();
+        using impl::cbrt_contig_dispatch_vector;
+        using impl::cbrt_output_typeid_vector;
+        using impl::cbrt_strided_dispatch_vector;
+
+        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cbrt_output_typeid_vector,
+                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
+        };
+        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
+        };
+        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
+    }
+
+    // B25: ==== COPYSIGN       (x1, x2)
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
+        using impl::copysign_output_id_table;
+        using impl::copysign_strided_dispatch_table;
+
+        auto copysign_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
+                                  const dpctl::tensor::usm_ndarray &src2,
+                                  const dpctl::tensor::usm_ndarray &dst,
+                                  sycl::queue &exec_q,
+                                  const std::vector<sycl::event> &depends =
+                                      {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, copysign_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                copysign_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                copysign_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig row and
+                // c-contig matrix with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               copysign_output_id_table);
+        };
+        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
+    }
+
+    // U38: ==== EXP2      (x)
+    {
+        impl::populate_exp2_dispatch_vectors();
+        using impl::exp2_contig_dispatch_vector;
+        using impl::exp2_output_typeid_vector;
+        using impl::exp2_strided_dispatch_vector;
+
+        auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, exp2_output_typeid_vector,
+                exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
+        };
+        m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
+        };
+        m.def("_exp2_result_type", exp2_result_type_pyapi);
+    }
+
+    // U39: ==== RSQRT        (x)
+    {
+        impl::populate_rsqrt_dispatch_vectors();
+        using impl::rsqrt_contig_dispatch_vector;
+        using impl::rsqrt_output_typeid_vector;
+        using impl::rsqrt_strided_dispatch_vector;
+
+        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
+                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
+        };
+        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              rsqrt_output_typeid_vector);
+        };
+        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
+    }
 }
 
 } // namespace py_internal
diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py
new file mode 100644
index 0000000000..b06a8d19cf
--- /dev/null
+++ b/dpctl/tests/elementwise/test_cbrt.py
@@ -0,0 +1,79 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_cbrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.cbrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_cbrt_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.cbrt(X)
+    expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    tol = dpt.finfo(dpt.float32).resolution
+
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py
new file mode 100644
index 0000000000..26a285343c
--- /dev/null
+++ b/dpctl/tests/elementwise/test_copysign.py
@@ -0,0 +1,111 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import ctypes
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_copysign_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.copysign(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.copysign(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _real_fp_dtypes)
+def test_copysign_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.copysign(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.copysign(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dt", _real_fp_dtypes)
+def test_copysign(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(100, dtype=dt, sycl_queue=q)
+    x[1::2] *= -1
+    y = dpt.ones(100, dtype=dt, sycl_queue=q)
+    y[::2] *= -1
+    res = dpt.copysign(x, y)
+    expected = dpt.negative(x)
+    tol = dpt.finfo(dt).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_copysign_special_values():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4")
+    y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4")
+    res = dpt.copysign(x1, y1)
+    assert dpt.all(dpt.signbit(res))
+    x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4")
+    res = dpt.copysign(x2, y1)
+    assert dpt.all(dpt.signbit(res))
+    y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4")
+    res = dpt.copysign(x2, y2)
+    assert not dpt.any(dpt.signbit(res))
+    res = dpt.copysign(x1, y2)
+    assert not dpt.any(dpt.signbit(res))
diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py
new file mode 100644
index 0000000000..d4bef1efab
--- /dev/null
+++ b/dpctl/tests/elementwise/test_exp2.py
@@ -0,0 +1,168 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.exp2(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_exp2_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    Y = dpt.exp2(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4))
+    expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.exp2(U, order=ord)
+            expected_Y = np.exp2(dpt.asnumpy(U))
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_exp2_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4")
+
+    tol = dpt.finfo(X.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
+
+    # special cases for complex variant
+    num_finite = 1.0
+    vals = [
+        complex(0.0, 0.0),
+        complex(num_finite, dpt.inf),
+        complex(num_finite, dpt.nan),
+        complex(dpt.inf, 0.0),
+        complex(-dpt.inf, num_finite),
+        complex(dpt.inf, num_finite),
+        complex(-dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, 0.0),
+        complex(dpt.nan, num_finite),
+        complex(dpt.nan, dpt.nan),
+    ]
+    X = dpt.asarray(vals, dtype=dpt.complex64)
+    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
+    c_nan = complex(np.nan, np.nan)
+    res = np.asarray(
+        [
+            complex(1.0, 0.0),
+            c_nan,
+            c_nan,
+            complex(np.inf, 0.0),
+            0.0,
+            np.inf * cis_1,
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(np.nan, 0.0),
+            c_nan,
+            c_nan,
+        ],
+        dtype=np.complex64,
+    )
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(invalid="ignore"):
+        assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py
new file mode 100644
index 0000000000..ef9378ade2
--- /dev/null
+++ b/dpctl/tests/elementwise/test_rsqrt.py
@@ -0,0 +1,74 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_rsqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.rsqrt(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+def test_rsqrt_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.rsqrt(x)
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
+    )
+    assert dpt.allclose(res, expected, equal_nan=True)

From 4f5ee5e5463e058576daf5e6316d5c7074368783 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 20 Sep 2023 08:31:15 -0500
Subject: [PATCH 03/19] Use sycl::ext::oneapi::experimental for complex
 trig/trigh and inverses

Use the sycl_complex extension to implement complex-valued trigonometric and
hyperbolic functions and their inverses.

This works around the use of double-precision functions/literals in the MSVC
headers' implementations of these functions, which caused failures to offload
on Iris Xe for single-precision input, citing lack of fp64 support by the
hardware.
---
 dpctl/tensor/CMakeLists.txt                                | 2 +-
 .../include/kernels/elementwise_functions/acos.hpp         | 5 ++++-
 .../include/kernels/elementwise_functions/acosh.hpp        | 5 ++++-
 .../include/kernels/elementwise_functions/asin.hpp         | 5 ++++-
 .../include/kernels/elementwise_functions/asinh.hpp        | 5 ++++-
 .../include/kernels/elementwise_functions/atan.hpp         | 5 ++++-
 .../include/kernels/elementwise_functions/atanh.hpp        | 5 ++++-
 .../include/kernels/elementwise_functions/cos.hpp          | 5 ++++-
 .../include/kernels/elementwise_functions/cosh.hpp         | 5 ++++-
 .../include/kernels/elementwise_functions/sin.hpp          | 7 +++++--
 .../include/kernels/elementwise_functions/sinh.hpp         | 2 ++
 .../include/kernels/elementwise_functions/tan.hpp          | 4 +++-
 .../include/kernels/elementwise_functions/tanh.hpp         | 5 ++++-
 13 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index aba009411d..1aa1206b51 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -77,7 +77,7 @@ endforeach()
 if (UNIX)
   set_source_files_properties(
     ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
-    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES")
+    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES;SYCL_EXT_ONEAPI_COMPLEX")
 endif()
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index ac1d597c93..e8d7d9eb17 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace acos
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -114,7 +116,8 @@ template <typename argT, typename resT> struct AcosFunctor
             }
 
             /* ordinary cases */
-            return std::acos(in);
+            return cmplx_ns::acos(
+                cmplx_ns::complex<realT>(in)); // std::acos(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 484b0da8a6..4288e6259f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace acosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -118,7 +120,8 @@ template <typename argT, typename resT> struct AcoshFunctor
             }
             else {
                 /* ordinary cases */
-                acos_in = std::acos(in);
+                acos_in = cmplx_ns::acos(
+                    cmplx_ns::complex<realT>(in)); // std::acos(in);
             }
 
             /* Now we calculate acosh(z) */
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 8b960dd30d..ed9418d69f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace asin
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -134,7 +136,8 @@ template <typename argT, typename resT> struct AsinFunctor
                 return resT{asinh_im, asinh_re};
             }
             /* ordinary cases */
-            return std::asin(in);
+            return cmplx_ns::asin(
+                cmplx_ns::complex<realT>(in)); // std::asin(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 271a861cfe..7712a902e9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace asinh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -115,7 +117,8 @@ template <typename argT, typename resT> struct AsinhFunctor
             }
 
             /* ordinary cases */
-            return std::asinh(in);
+            return cmplx_ns::asinh(
+                cmplx_ns::complex<realT>(in)); // std::asinh(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index f1dcce2831..8a7b67a5ac 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -28,6 +28,7 @@
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace atan
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -126,7 +128,8 @@ template <typename argT, typename resT> struct AtanFunctor
                 return resT{atanh_im, atanh_re};
             }
             /* ordinary cases */
-            return std::atan(in);
+            return cmplx_ns::atan(
+                cmplx_ns::complex<realT>(in)); // std::atan(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 56432d7808..15b09ee952 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -28,6 +28,7 @@
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace atanh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -119,7 +121,8 @@ template <typename argT, typename resT> struct AtanhFunctor
                 return resT{res_re, res_im};
             }
             /* ordinary cases */
-            return std::atanh(in);
+            return cmplx_ns::atanh(
+                cmplx_ns::complex<realT>(in)); // std::atanh(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index f7c66d5f68..f78c67a4cf 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace cos
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,7 +83,8 @@ template <typename argT, typename resT> struct CosFunctor
              * real and imaginary parts of input are finite.
              */
             if (in_re_finite && in_im_finite) {
-                return std::cos(in);
+                return cmplx_ns::cos(
+                    cmplx_ns::complex<realT>(in)); // std::cos(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index fbcc7e40f9..e1af41c331 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace cosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,7 +83,8 @@ template <typename argT, typename resT> struct CoshFunctor
              * real and imaginary parts of input are finite.
              */
             if (xfinite && yfinite) {
-                return std::cosh(in);
+                return cmplx_ns::cosh(
+                    cmplx_ns::complex<realT>(in)); // std::cosh(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index b9f03e6234..0f35aa306e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -23,10 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace sin
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -79,7 +81,8 @@ template <typename argT, typename resT> struct SinFunctor
              * real and imaginary parts of input are finite.
              */
             if (in_re_finite && in_im_finite) {
-                return std::sin(in);
+                return cmplx_ns::sin(
+                    cmplx_ns::complex<realT>(in)); // std::sin(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index 3a8d05d774..55fdc44531 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -27,6 +27,7 @@
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace sinh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index 45f931b7f4..d01a3e906a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -28,6 +28,7 @@
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace tan
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -118,7 +120,7 @@ template <typename argT, typename resT> struct TanFunctor
                 return resT{q_nan, q_nan};
             }
             /* ordinary cases */
-            return std::tan(in);
+            return cmplx_ns::tan(cmplx_ns::complex<realT>(in)); // std::tan(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index ef943319b2..237d8d2618 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -29,6 +29,7 @@
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,6 +50,7 @@ namespace tanh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace cmplx_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -112,7 +114,8 @@ template <typename argT, typename resT> struct TanhFunctor
                 return resT{q_nan, q_nan};
             }
             /* ordinary cases */
-            return std::tanh(in);
+            return cmplx_ns::tanh(
+                cmplx_ns::complex<realT>(in)); // std::tanh(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||

From 1d5fdcecc927e0e0d68acec53e87f71c2c0a1eb0 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 20 Sep 2023 09:33:45 -0500
Subject: [PATCH 04/19] Set SYCL_EXT_ONEAPI_COMPLEX on Windows as well

---
 dpctl/tensor/CMakeLists.txt | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 1aa1206b51..15bc58f57b 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -75,10 +75,15 @@ foreach(_src_fn ${_no_fast_math_sources})
   )
 endforeach()
 if (UNIX)
-  set_source_files_properties(
-    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
-    PROPERTIES COMPILE_DEFINITIONS "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES;SYCL_EXT_ONEAPI_COMPLEX")
+  set(_compiler_definitions "USE_STD_ABS_FOR_COMPLEX_TYPES;USE_STD_SQRT_FOR_COMPLEX_TYPES;SYCL_EXT_ONEAPI_COMPLEX")
+else()
+  set(_compiler_definitions "SYCL_EXT_ONEAPI_COMPLEX")
 endif()
+set_source_files_properties(
+    ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp
+    PROPERTIES COMPILE_DEFINITIONS "${_compiler_definitions}"
+)
+
 target_compile_options(${python_module_name} PRIVATE -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PRIVATE -fsycl-device-code-split=per_kernel)
 if(UNIX)

From 8df47452020eafabe007860f4768b3512d4ee023 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 22 Sep 2023 13:25:26 -0500
Subject: [PATCH 05/19] Use sycl_complex in add, conj

---
 .../kernels/elementwise_functions/add.hpp     | 28 ++++++++++++++++++-
 .../kernels/elementwise_functions/conj.hpp    |  6 +++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index df6797845f..060665504e 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -27,6 +27,7 @@
 #include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -49,6 +50,7 @@ namespace add
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT> struct AddFunctor
 {
@@ -60,7 +62,31 @@ template <typename argT1, typename argT2, typename resT> struct AddFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        return in1 + in2;
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+            using rT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + exprm_ns::complex<rT2>(in2);
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           !tu_ns::is_complex<argT2>::value)
+        {
+            using rT1 = typename argT1::value_type;
+
+            return exprm_ns::complex<rT1>(in1) + in2;
+        }
+        else if constexpr (!tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using rT2 = typename argT2::value_type;
+
+            return in1 + exprm_ns::complex<rT2>(in2);
+        }
+        else {
+            return in1 + in2;
+        }
     }
 
     template <int vec_sz>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index 3b0a1584de..0b9f0c0460 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -29,6 +29,7 @@
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,6 +50,7 @@ namespace conj
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -68,7 +70,9 @@ template <typename argT, typename resT> struct ConjFunctor
     resT operator()(const argT &in) const
     {
         if constexpr (is_complex<argT>::value) {
-            return std::conj(in);
+            using rT = typename argT::value_type;
+
+            return exprm_ns::conj(exprm_ns::complex<rT>(in)); // std::conj(in);
         }
         else {
             if constexpr (!std::is_same_v<argT, bool>)

From ef2563d155ae1e081e14e30f9c0144aab8878798 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Mon, 25 Sep 2023 14:16:17 -0500
Subject: [PATCH 06/19] More transitions to experimental complex

---
 .../kernels/elementwise_functions/abs.hpp       |  6 ++++--
 .../kernels/elementwise_functions/acos.hpp      |  8 ++++----
 .../kernels/elementwise_functions/acosh.hpp     |  2 +-
 .../kernels/elementwise_functions/add.hpp       |  2 +-
 .../kernels/elementwise_functions/asin.hpp      |  8 ++++----
 .../kernels/elementwise_functions/asinh.hpp     |  8 ++++----
 .../kernels/elementwise_functions/atan.hpp      |  8 ++++----
 .../kernels/elementwise_functions/atan2.hpp     |  2 +-
 .../kernels/elementwise_functions/atanh.hpp     |  8 ++++----
 .../elementwise_functions/bitwise_and.hpp       |  2 +-
 .../elementwise_functions/bitwise_invert.hpp    |  2 +-
 .../bitwise_left_shift.hpp                      |  2 +-
 .../elementwise_functions/bitwise_or.hpp        |  2 +-
 .../bitwise_right_shift.hpp                     |  2 +-
 .../elementwise_functions/bitwise_xor.hpp       |  2 +-
 .../kernels/elementwise_functions/ceil.hpp      |  2 +-
 .../kernels/elementwise_functions/common.hpp    |  2 +-
 .../elementwise_functions/common_inplace.hpp    |  2 +-
 .../kernels/elementwise_functions/conj.hpp      |  2 +-
 .../kernels/elementwise_functions/cos.hpp       |  8 ++++----
 .../kernels/elementwise_functions/cosh.hpp      |  8 ++++----
 .../kernels/elementwise_functions/equal.hpp     | 17 +++++++++++++++--
 .../kernels/elementwise_functions/exp.hpp       |  7 +++++--
 .../kernels/elementwise_functions/expm1.hpp     |  2 +-
 .../kernels/elementwise_functions/floor.hpp     |  2 +-
 .../elementwise_functions/floor_divide.hpp      |  2 +-
 .../kernels/elementwise_functions/greater.hpp   |  2 +-
 .../elementwise_functions/greater_equal.hpp     |  2 +-
 .../kernels/elementwise_functions/hypot.hpp     |  2 +-
 .../kernels/elementwise_functions/imag.hpp      |  2 +-
 .../kernels/elementwise_functions/isfinite.hpp  |  3 ++-
 .../kernels/elementwise_functions/isinf.hpp     |  2 +-
 .../kernels/elementwise_functions/isnan.hpp     |  2 +-
 .../elementwise_functions/less_equal.hpp        |  2 +-
 34 files changed, 77 insertions(+), 58 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
index bcf6a28040..ab321ad356 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp
@@ -23,12 +23,13 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,6 +50,7 @@ namespace abs
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -120,7 +122,7 @@ template <typename argT, typename resT> struct AbsFunctor
         }
         else {
 #ifdef USE_STD_ABS_FOR_COMPLEX_TYPES
-            return std::abs(z);
+            return exprm_ns::abs(exprm_ns::complex<realT>(z));
 #else
             return std::hypot(std::real(z), std::imag(z));
 #endif
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index e8d7d9eb17..28968de761 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,7 +48,7 @@ namespace acos
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -116,8 +116,8 @@ template <typename argT, typename resT> struct AcosFunctor
             }
 
             /* ordinary cases */
-            return cmplx_ns::acos(
-                cmplx_ns::complex<realT>(in)); // std::acos(in);
+            return exprm_ns::acos(
+                exprm_ns::complex<realT>(in)); // std::acos(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 4288e6259f..7323579df5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
index 060665504e..0ed1710833 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index ed9418d69f..595d75eeca 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,7 +48,7 @@ namespace asin
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -136,8 +136,8 @@ template <typename argT, typename resT> struct AsinFunctor
                 return resT{asinh_im, asinh_re};
             }
             /* ordinary cases */
-            return cmplx_ns::asin(
-                cmplx_ns::complex<realT>(in)); // std::asin(in);
+            return exprm_ns::asin(
+                exprm_ns::complex<realT>(in)); // std::asin(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 7712a902e9..d126afba87 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,7 +48,7 @@ namespace asinh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -117,8 +117,8 @@ template <typename argT, typename resT> struct AsinhFunctor
             }
 
             /* ordinary cases */
-            return cmplx_ns::asinh(
-                cmplx_ns::complex<realT>(in)); // std::asinh(in);
+            return exprm_ns::asinh(
+                exprm_ns::complex<realT>(in)); // std::asinh(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
index 8a7b67a5ac..df8bba538b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp
@@ -23,12 +23,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,7 +49,7 @@ namespace atan
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -128,8 +128,8 @@ template <typename argT, typename resT> struct AtanFunctor
                 return resT{atanh_im, atanh_re};
             }
             /* ordinary cases */
-            return cmplx_ns::atan(
-                cmplx_ns::complex<realT>(in)); // std::atan(in);
+            return exprm_ns::atan(
+                exprm_ns::complex<realT>(in)); // std::atan(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
index 765c0fe0c3..8df1667312 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
index 15b09ee952..d6a4b06ac3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp
@@ -23,12 +23,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -49,7 +49,7 @@ namespace atanh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -121,8 +121,8 @@ template <typename argT, typename resT> struct AtanhFunctor
                 return resT{res_re, res_im};
             }
             /* ordinary cases */
-            return cmplx_ns::atanh(
-                cmplx_ns::complex<realT>(in)); // std::atanh(in);
+            return exprm_ns::atanh(
+                exprm_ns::complex<realT>(in)); // std::atanh(in);
         }
         else {
             static_assert(std::is_floating_point_v<argT> ||
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
index 016b3a05d3..85bb603bd3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp
@@ -23,9 +23,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
index 9ce56be966..8337396427 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
index 4ae04f97de..79b767e678 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
index 65f25dd296..18173f02b1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp
@@ -23,9 +23,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
index 9442d4f6b7..354b2a5cd9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
index 2b0ab09dca..c9cdc77701 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp
@@ -23,9 +23,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
index 76fa80c287..0059064ec1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
index c0a94be341..5dc4728a65 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
index 614c7f4092..c4f893a532 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
index 0b9f0c0460..6977e3a747 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp
@@ -24,12 +24,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
index f78c67a4cf..bdc1acc1fe 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,7 +48,7 @@ namespace cos
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -83,8 +83,8 @@ template <typename argT, typename resT> struct CosFunctor
              * real and imaginary parts of input are finite.
              */
             if (in_re_finite && in_im_finite) {
-                return cmplx_ns::cos(
-                    cmplx_ns::complex<realT>(in)); // std::cos(in);
+                return exprm_ns::cos(
+                    exprm_ns::complex<realT>(in)); // std::cos(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
index e1af41c331..7093d2a2a3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,7 +48,7 @@ namespace cosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -83,8 +83,8 @@ template <typename argT, typename resT> struct CoshFunctor
              * real and imaginary parts of input are finite.
              */
             if (xfinite && yfinite) {
-                return cmplx_ns::cosh(
-                    cmplx_ns::complex<realT>(in)); // std::cosh(in);
+                return exprm_ns::cosh(
+                    exprm_ns::complex<realT>(in)); // std::cosh(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
index cd726f72ea..6d68861396 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp
@@ -24,9 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -48,6 +49,7 @@ namespace equal
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 {
@@ -62,7 +64,18 @@ template <typename argT1, typename argT2, typename resT> struct EqualFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        return (in1 == in2);
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) ==
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return (in1 == in2);
+        }
     }
 
     template <int vec_sz>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
index 003de44c27..453eb05c52 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp
@@ -23,10 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace exp
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -73,7 +75,8 @@ template <typename argT, typename resT> struct ExpFunctor
             const realT y = std::imag(in);
             if (std::isfinite(x)) {
                 if (std::isfinite(y)) {
-                    return std::exp(in);
+                    return exprm_ns::exp(
+                        exprm_ns::complex<realT>(in)); // std::exp(in);
                 }
                 else {
                     return resT{q_nan, q_nan};
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
index 3f6a73b6d3..f5204e87b3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp
@@ -24,11 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
index e675407d0b..88a20dafe0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
index 025d7e8bc4..b8d528bfc1 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
index 2a151ce737..e01360efa7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
index 5704336990..f017b7f150 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
index a369c54f24..fd19d29c0b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
index 64da603037..bb1ff2ebcb 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp
@@ -24,11 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
index 1d8f177e40..1554f905b7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -46,6 +46,7 @@ namespace isfinite
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
index d9afdb9317..2720385614 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
index b5051ab833..15551e295a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
index 47e2301fe7..f9f6729968 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"

From 0717bbe5274805f152668e85e9c57543c473f8ea Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 27 Sep 2023 13:10:47 -0500
Subject: [PATCH 07/19] More files changed to use sycl_complex

---
 .../include/kernels/elementwise_functions/less.hpp   |  2 +-
 .../include/kernels/elementwise_functions/log.hpp    | 12 ++++++++++--
 .../include/kernels/elementwise_functions/log10.hpp  |  8 ++++++--
 .../include/kernels/elementwise_functions/log1p.hpp  |  2 +-
 .../include/kernels/elementwise_functions/log2.hpp   |  8 ++++++--
 .../kernels/elementwise_functions/logaddexp.hpp      |  2 +-
 .../kernels/elementwise_functions/logical_and.hpp    |  2 +-
 .../kernels/elementwise_functions/logical_not.hpp    |  2 +-
 .../kernels/elementwise_functions/logical_or.hpp     |  2 +-
 .../kernels/elementwise_functions/logical_xor.hpp    |  2 +-
 10 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
index c33d6d7c10..02c7a0d95a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
index abcc899fc0..ff37d87157 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp
@@ -23,10 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -47,6 +48,7 @@ namespace log
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -65,7 +67,13 @@ template <typename argT, typename resT> struct LogFunctor
 
     resT operator()(const argT &in) const
     {
-        return std::log(in);
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+            return exprm_ns::log(exprm_ns::complex<realT>(in)); // std::log(in);
+        }
+        else {
+            return std::log(in);
+        }
     }
 };
 
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
index afcf8aa085..88dabcaabe 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp
@@ -24,10 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace log10
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
@@ -70,7 +72,9 @@ template <typename argT, typename resT> struct Log10Functor
     {
         if constexpr (is_complex<argT>::value) {
             using realT = typename argT::value_type;
-            return (std::log(in) / std::log(realT{10}));
+            // return (std::log(in) / std::log(realT{10}));
+            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
+                   std::log(realT{10});
         }
         else {
             return std::log10(in);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
index 6d7a56ccf5..11e3fb3f9f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
index 533d0120df..57d7dcaf31 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp
@@ -24,10 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace log2
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
@@ -70,7 +72,9 @@ template <typename argT, typename resT> struct Log2Functor
     {
         if constexpr (is_complex<argT>::value) {
             using realT = typename argT::value_type;
-            return std::log(in) / std::log(realT{2});
+            // std::log(in) / std::log(realT{2});
+            return exprm_ns::log(exprm_ns::complex<realT>(in)) /
+                   std::log(realT{2});
         }
         else {
             return std::log2(in);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
index 90b7997a37..bc0f26cdee 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp
@@ -25,10 +25,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
index 10e4e0cbff..988d1ed380 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
index 78bacbe686..826af2ee37 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
index bfb1288870..333951e6b5 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
index 44d361cfc1..ce4bde9e6b 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"

From c5f26eb5781d8c8cf6d5049af4edfbae100055e5 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 29 Sep 2023 09:43:28 -0500
Subject: [PATCH 08/19] Use oneapi extension for complexes for remaining
 elementwise functions

Use functions from the sycl::ext::oneapi::experimental namespace to
implement evaluation on data of complex types.
---
 .../kernels/elementwise_functions/maximum.hpp |  2 +-
 .../kernels/elementwise_functions/minimum.hpp |  2 +-
 .../elementwise_functions/multiply.hpp        | 17 ++++++++--
 .../elementwise_functions/negative.hpp        |  2 +-
 .../elementwise_functions/not_equal.hpp       |  2 +-
 .../elementwise_functions/positive.hpp        |  2 +-
 .../kernels/elementwise_functions/pow.hpp     | 13 +++++++-
 .../kernels/elementwise_functions/proj.hpp    |  2 +-
 .../kernels/elementwise_functions/real.hpp    |  2 +-
 .../elementwise_functions/remainder.hpp       |  2 +-
 .../kernels/elementwise_functions/round.hpp   |  2 +-
 .../kernels/elementwise_functions/sign.hpp    | 23 ++++++++------
 .../kernels/elementwise_functions/signbit.hpp |  2 +-
 .../kernels/elementwise_functions/sinh.hpp    |  2 +-
 .../kernels/elementwise_functions/sqrt.hpp    |  9 ++++--
 .../kernels/elementwise_functions/square.hpp  | 15 +++++++--
 .../elementwise_functions/subtract.hpp        |  2 +-
 .../kernels/elementwise_functions/tan.hpp     |  2 +-
 .../kernels/elementwise_functions/tanh.hpp    |  2 +-
 .../elementwise_functions/true_divide.hpp     | 31 +++++++++++++++++--
 .../kernels/elementwise_functions/trunc.hpp   |  2 +-
 21 files changed, 105 insertions(+), 33 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
index 324f3f5ad2..8a1990ba7d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
index 9a7ec72e56..fb3490ee19 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
index c316279a76..612ad78360 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp
@@ -24,9 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -49,6 +50,7 @@ namespace multiply
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 {
@@ -60,7 +62,18 @@ template <typename argT1, typename argT2, typename resT> struct MultiplyFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        return in1 * in2;
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) *
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return in1 * in2;
+        }
     }
 
     template <int vec_sz>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
index cbeeb60b7c..bc28aafad7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
index 88e077b402..faeab82580 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
index cbeba2e91d..b3e109c76c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
index ba9241b8db..b02a8d4126 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp
@@ -24,10 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -49,6 +50,7 @@ namespace pow
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT> struct PowFunctor
 {
@@ -83,6 +85,15 @@ template <typename argT1, typename argT2, typename resT> struct PowFunctor
             }
             return res;
         }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::pow(exprm_ns::complex<realT1>(in1),
+                                 exprm_ns::complex<realT2>(in2));
+        }
         else {
             return std::pow(in1, in2);
         }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
index dcaa4b0f5f..92f5ffa729 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp
@@ -24,12 +24,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
index 294b796e96..6a7580d548 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp
@@ -24,11 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
index 6cd306a900..fcf2775ef3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp
@@ -25,9 +25,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
index 84d8fb7252..547d31b392 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
index fc3d44dcfa..162db394de 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp
@@ -23,11 +23,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace sign
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
@@ -61,38 +63,41 @@ template <typename argT, typename resT> struct SignFunctor
         std::disjunction<is_complex<resT>, is_complex<argT>>>;
     using supports_sg_loadstore = std::false_type;
 
-    resT operator()(const argT &x) const
+    resT operator()(const argT &in) const
     {
         if constexpr (std::is_integral_v<argT>) {
             if constexpr (std::is_unsigned_v<argT>) {
-                return resT(0 < x);
+                return resT(0 < in);
             }
             else {
-                return sign<argT>(x);
+                return sign_impl<argT>(in);
             }
         }
         else {
             if constexpr (is_complex<argT>::value) {
-                if (x == argT(0)) {
+                using realT = typename argT::value_type;
+
+                if (in == argT(0)) {
                     return resT(0);
                 }
                 else {
-                    return (x / std::abs(x));
+                    auto z = exprm_ns::complex<realT>(in);
+                    return (z / exprm_ns::abs(z));
                 }
             }
             else {
-                if (std::isnan(x)) {
+                if (std::isnan(in)) {
                     return std::numeric_limits<resT>::quiet_NaN();
                 }
                 else {
-                    return sign<argT>(x);
+                    return sign_impl<argT>(in);
                 }
             }
         }
     }
 
 private:
-    template <typename T> T sign(const T &v) const
+    template <typename T> T sign_impl(const T &v) const
     {
         return (T(0) < v) - (v < T(0));
     }
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
index 0f509f7950..3e961c466d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index 55fdc44531..af887431d7 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
index efa580d70e..b638e4a55f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp
@@ -24,12 +24,13 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -50,6 +51,7 @@ namespace sqrt
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -74,7 +76,10 @@ template <typename argT, typename resT> struct SqrtFunctor
             // #else
             //             return std::sqrt(in);
             // #endif
-            return csqrt(in);
+            using realT = typename argT::value_type;
+
+            // return csqrt(in);
+            return exprm_ns::sqrt(exprm_ns::complex<realT>(in));
         }
         else {
             return std::sqrt(in);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
index 6b5f372c3d..2c37ce87d9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp
@@ -24,10 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace square
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 using dpctl::tensor::type_utils::vec_cast;
@@ -68,7 +70,16 @@ template <typename argT, typename resT> struct SquareFunctor
 
     resT operator()(const argT &in) const
     {
-        return in * in;
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            auto z = exprm_ns::complex<realT>(in);
+
+            return z * z;
+        }
+        else {
+            return in * in;
+        }
     }
 
     template <int vec_sz>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
index 3eb8420933..332f52930d 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
index d01a3e906a..1f97b59054 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp
@@ -23,12 +23,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
index 237d8d2618..453ce17b54 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp
@@ -24,12 +24,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index 138f7a3f91..fcbb6adc08 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -24,9 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
@@ -48,6 +49,7 @@ namespace true_divide
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
 namespace tu_ns = dpctl::tensor::type_utils;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 template <typename argT1, typename argT2, typename resT>
 struct TrueDivideFunctor
@@ -60,7 +62,32 @@ struct TrueDivideFunctor
 
     resT operator()(const argT1 &in1, const argT2 &in2) const
     {
-        return in1 / in2;
+        if constexpr (tu_ns::is_complex<argT1>::value &&
+                      tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+            using realT2 = typename argT2::value_type;
+
+            return exprm_ns::complex<realT1>(in1) /
+                   exprm_ns::complex<realT2>(in2);
+        }
+        else if constexpr (tu_ns::is_complex<argT1>::value &&
+                           !tu_ns::is_complex<argT2>::value)
+        {
+            using realT1 = typename argT1::value_type;
+
+            return exprm_ns::complex<realT1>(in1) / in2;
+        }
+        else if constexpr (!tu_ns::is_complex<argT1>::value &&
+                           tu_ns::is_complex<argT2>::value)
+        {
+            using realT2 = typename argT2::value_type;
+
+            return in1 / exprm_ns::complex<realT2>(in2);
+        }
+        else {
+            return in1 / in2;
+        }
     }
 
     template <int vec_sz>
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
index 33e942dd6a..0e08d966e9 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"

From f6c3e56a714d0915abecfe8b74f2a15fc7e6e88a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 29 Sep 2023 09:53:24 -0500
Subject: [PATCH 09/19] Change include CL/sycl.hpp to sycl/sycl.hpp per
 SYCL-2020 spec

---
 dpctl/tensor/libtensor/include/kernels/accumulators.hpp        | 2 +-
 .../libtensor/include/kernels/boolean_advanced_indexing.hpp    | 2 +-
 dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp  | 2 +-
 dpctl/tensor/libtensor/include/kernels/constructors.hpp        | 2 +-
 dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp       | 2 +-
 .../libtensor/include/kernels/integer_advanced_indexing.hpp    | 2 +-
 dpctl/tensor/libtensor/include/kernels/reductions.hpp          | 3 +--
 dpctl/tensor/libtensor/include/kernels/repeat.hpp              | 2 +-
 dpctl/tensor/libtensor/include/kernels/where.hpp               | 2 +-
 dpctl/tensor/libtensor/include/utils/offset_utils.hpp          | 2 +-
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp            | 2 +-
 dpctl/tensor/libtensor/include/utils/type_dispatch.hpp         | 2 +-
 dpctl/tensor/libtensor/include/utils/type_utils.hpp            | 2 +-
 13 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
index 110010706c..40ddc49e77 100644
--- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp
@@ -23,11 +23,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <array>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
index 968459fb68..522baadc6d 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
@@ -23,10 +23,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
index 9736b2c2a3..61fb0f6ba0 100644
--- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
@@ -24,7 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include <complex>
 #include <cstdint>
diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
index 8870e26ac2..c28033d23d 100644
--- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp
@@ -27,9 +27,9 @@
 #include "utils/offset_utils.hpp"
 #include "utils/strided_iters.hpp"
 #include "utils/type_utils.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
index 0db1f071a1..9d1c788626 100644
--- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
@@ -23,10 +23,10 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
index 6acf0a9f50..769774f4dd 100644
--- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp
@@ -23,11 +23,11 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
index 7cb97cd4f9..f9b3c5bcd0 100644
--- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -23,11 +23,10 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
-#include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 #include <utility>
 #include <vector>
diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp
index 1f2335fc6c..05b57a8cda 100644
--- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp
@@ -23,11 +23,11 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp
index fc9546a9a8..9558603d5e 100644
--- a/dpctl/tensor/libtensor/include/kernels/where.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/where.hpp
@@ -27,11 +27,11 @@
 #include "pybind11/stl.h"
 #include "utils/offset_utils.hpp"
 #include "utils/type_utils.hpp"
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
index 29517ce2c5..523620737b 100644
--- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp
@@ -26,9 +26,9 @@
 
 #pragma once
 
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <tuple>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 0d4240c516..31d3d874a0 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -23,10 +23,10 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <cstddef>
 #include <type_traits>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 #include "math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
index afc458169e..af031a963b 100644
--- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp
@@ -25,8 +25,8 @@
 #pragma once
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/include/utils/type_utils.hpp b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
index 4ea17ac730..a50e5159e4 100644
--- a/dpctl/tensor/libtensor/include/utils/type_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/type_utils.hpp
@@ -23,9 +23,9 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <complex>
 #include <exception>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 namespace dpctl

From 3b9d81d3b350d7cf625d57e7938869b7b9e5b2f9 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Fri, 29 Sep 2023 09:54:05 -0500
Subject: [PATCH 10/19] Change include CL/sycl.hpp to sycl/sycl.hpp per
 SYCL-2020 spec

---
 dpctl/tensor/libtensor/source/accumulators.cpp                | 2 +-
 dpctl/tensor/libtensor/source/accumulators.hpp                | 2 +-
 dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp   | 2 +-
 dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp   | 2 +-
 dpctl/tensor/libtensor/source/boolean_reductions.cpp          | 2 +-
 dpctl/tensor/libtensor/source/boolean_reductions.hpp          | 2 +-
 dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp    | 2 +-
 dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp    | 2 +-
 dpctl/tensor/libtensor/source/copy_for_reshape.cpp            | 2 +-
 dpctl/tensor/libtensor/source/copy_for_reshape.hpp            | 2 +-
 dpctl/tensor/libtensor/source/copy_for_roll.cpp               | 2 +-
 dpctl/tensor/libtensor/source/copy_for_roll.hpp               | 2 +-
 .../libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp  | 2 +-
 .../libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp  | 2 +-
 dpctl/tensor/libtensor/source/device_support_queries.cpp      | 2 +-
 dpctl/tensor/libtensor/source/device_support_queries.hpp      | 2 +-
 dpctl/tensor/libtensor/source/elementwise_functions.cpp       | 2 +-
 dpctl/tensor/libtensor/source/elementwise_functions.hpp       | 2 +-
 dpctl/tensor/libtensor/source/eye_ctor.cpp                    | 2 +-
 dpctl/tensor/libtensor/source/eye_ctor.hpp                    | 2 +-
 dpctl/tensor/libtensor/source/full_ctor.cpp                   | 2 +-
 dpctl/tensor/libtensor/source/full_ctor.hpp                   | 2 +-
 dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp   | 2 +-
 dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp   | 2 +-
 dpctl/tensor/libtensor/source/linear_sequences.cpp            | 2 +-
 dpctl/tensor/libtensor/source/linear_sequences.hpp            | 2 +-
 dpctl/tensor/libtensor/source/reduction_over_axis.hpp         | 4 +++-
 dpctl/tensor/libtensor/source/repeat.cpp                      | 2 +-
 dpctl/tensor/libtensor/source/repeat.hpp                      | 2 +-
 dpctl/tensor/libtensor/source/tensor_py.cpp                   | 2 +-
 dpctl/tensor/libtensor/source/triul_ctor.cpp                  | 2 +-
 dpctl/tensor/libtensor/source/triul_ctor.hpp                  | 2 +-
 dpctl/tensor/libtensor/source/where.cpp                       | 2 +-
 dpctl/tensor/libtensor/source/where.hpp                       | 2 +-
 34 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/dpctl/tensor/libtensor/source/accumulators.cpp b/dpctl/tensor/libtensor/source/accumulators.cpp
index 40f4424ef9..0a2ce69f69 100644
--- a/dpctl/tensor/libtensor/source/accumulators.cpp
+++ b/dpctl/tensor/libtensor/source/accumulators.cpp
@@ -23,11 +23,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/accumulators.hpp b/dpctl/tensor/libtensor/source/accumulators.hpp
index 4979eab54f..ba40c38e1d 100644
--- a/dpctl/tensor/libtensor/source/accumulators.hpp
+++ b/dpctl/tensor/libtensor/source/accumulators.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
index ff7b32d0f7..903e1b5536 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.cpp
@@ -24,11 +24,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
index 26f1c6a646..8347d9f687 100644
--- a/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/source/boolean_advanced_indexing.hpp
@@ -24,7 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.cpp b/dpctl/tensor/libtensor/source/boolean_reductions.cpp
index 5f3c1f5e51..32deab6da9 100644
--- a/dpctl/tensor/libtensor/source/boolean_reductions.cpp
+++ b/dpctl/tensor/libtensor/source/boolean_reductions.cpp
@@ -24,8 +24,8 @@
 /// dpctl.tensor.all and dpctl.tensor.any
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.hpp b/dpctl/tensor/libtensor/source/boolean_reductions.hpp
index 5a0d5d381a..4d59463f8b 100644
--- a/dpctl/tensor/libtensor/source/boolean_reductions.hpp
+++ b/dpctl/tensor/libtensor/source/boolean_reductions.hpp
@@ -25,11 +25,11 @@
 
 #pragma once
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
index 290ab88fe8..51ddd81312 100644
--- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
+++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.cpp
@@ -22,7 +22,6 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
@@ -30,6 +29,7 @@
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <thread>
 #include <type_traits>
 #include <utility>
diff --git a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
index c2161f1ba6..c8196b416a 100644
--- a/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
+++ b/dpctl/tensor/libtensor/source/copy_and_cast_usm_to_usm.hpp
@@ -23,7 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp
index c9ab58528a..235878b820 100644
--- a/dpctl/tensor/libtensor/source/copy_for_reshape.cpp
+++ b/dpctl/tensor/libtensor/source/copy_for_reshape.cpp
@@ -22,7 +22,7 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp
index 2f25a68480..cd4ca68ff0 100644
--- a/dpctl/tensor/libtensor/source/copy_for_reshape.hpp
+++ b/dpctl/tensor/libtensor/source/copy_for_reshape.hpp
@@ -23,7 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.cpp b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
index cc319e6e08..ab36f543af 100644
--- a/dpctl/tensor/libtensor/source/copy_for_roll.cpp
+++ b/dpctl/tensor/libtensor/source/copy_for_roll.cpp
@@ -22,7 +22,7 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_for_roll.hpp b/dpctl/tensor/libtensor/source/copy_for_roll.hpp
index 38e84b9c6a..357d821eff 100644
--- a/dpctl/tensor/libtensor/source/copy_for_roll.hpp
+++ b/dpctl/tensor/libtensor/source/copy_for_roll.hpp
@@ -23,7 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
index bb367a42b9..f644522c18 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.cpp
@@ -22,8 +22,8 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <algorithm>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 #include "dpctl4pybind11.hpp"
diff --git a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
index 3f1833ec99..247a5d7314 100644
--- a/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
+++ b/dpctl/tensor/libtensor/source/copy_numpy_ndarray_into_usm_ndarray.hpp
@@ -23,7 +23,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 #include "dpctl4pybind11.hpp"
diff --git a/dpctl/tensor/libtensor/source/device_support_queries.cpp b/dpctl/tensor/libtensor/source/device_support_queries.cpp
index 9f793cb00a..cb0dbc02a5 100644
--- a/dpctl/tensor/libtensor/source/device_support_queries.cpp
+++ b/dpctl/tensor/libtensor/source/device_support_queries.cpp
@@ -25,9 +25,9 @@
 #include <string>
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/source/device_support_queries.hpp b/dpctl/tensor/libtensor/source/device_support_queries.hpp
index 3367f8bfc2..efffd4ac93 100644
--- a/dpctl/tensor/libtensor/source/device_support_queries.hpp
+++ b/dpctl/tensor/libtensor/source/device_support_queries.hpp
@@ -26,9 +26,9 @@
 #include <string>
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index 043cac0cd2..9ab7c0807c 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -24,10 +24,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 #include "elementwise_functions.hpp"
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
index 523e4259c3..666e34773c 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
@@ -26,10 +26,10 @@
 #pragma once
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/eye_ctor.cpp b/dpctl/tensor/libtensor/source/eye_ctor.cpp
index 5d7657d047..c768a5e395 100644
--- a/dpctl/tensor/libtensor/source/eye_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/eye_ctor.cpp
@@ -22,7 +22,7 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/eye_ctor.hpp b/dpctl/tensor/libtensor/source/eye_ctor.hpp
index 4307e0f3b2..58249f08d7 100644
--- a/dpctl/tensor/libtensor/source/eye_ctor.hpp
+++ b/dpctl/tensor/libtensor/source/eye_ctor.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp b/dpctl/tensor/libtensor/source/full_ctor.cpp
index 085bdcaf2a..c8004bfae8 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.cpp
@@ -23,10 +23,10 @@
 //===--------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/full_ctor.hpp b/dpctl/tensor/libtensor/source/full_ctor.hpp
index 3894babf1f..66456f9a7f 100644
--- a/dpctl/tensor/libtensor/source/full_ctor.hpp
+++ b/dpctl/tensor/libtensor/source/full_ctor.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
index a17a229fc1..0fd3d2615d 100644
--- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
+++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.cpp
@@ -23,13 +23,13 @@
 /// dpctl.tensor.put
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 #include "dpctl4pybind11.hpp"
diff --git a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
index f845f7d23b..011fe670a9 100644
--- a/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
+++ b/dpctl/tensor/libtensor/source/integer_advanced_indexing.hpp
@@ -24,7 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp
index 34db93de12..72d292df5f 100644
--- a/dpctl/tensor/libtensor/source/linear_sequences.cpp
+++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp
@@ -23,10 +23,10 @@
 //===--------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/linear_sequences.hpp b/dpctl/tensor/libtensor/source/linear_sequences.hpp
index 61e613b45f..fd13677680 100644
--- a/dpctl/tensor/libtensor/source/linear_sequences.hpp
+++ b/dpctl/tensor/libtensor/source/linear_sequences.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
index 1a9cb6f5e7..46ef02f064 100644
--- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
@@ -25,10 +25,12 @@
 
 #pragma once
 
-#include <CL/sycl.hpp>
 #include <algorithm>
+#include <cstddef>
 #include <cstdint>
 #include <type_traits>
+#include <string>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/repeat.cpp b/dpctl/tensor/libtensor/source/repeat.cpp
index f3a20cbbaa..fe11684ab9 100644
--- a/dpctl/tensor/libtensor/source/repeat.cpp
+++ b/dpctl/tensor/libtensor/source/repeat.cpp
@@ -23,11 +23,11 @@
 //===--------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <cstdint>
 #include <limits>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/repeat.hpp b/dpctl/tensor/libtensor/source/repeat.hpp
index 65ace36516..2d37aa33e9 100644
--- a/dpctl/tensor/libtensor/source/repeat.hpp
+++ b/dpctl/tensor/libtensor/source/repeat.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 0e8b4236b6..8bab9c8bb8 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -22,12 +22,12 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===----------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <algorithm>
 #include <complex>
 #include <cstdint>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <thread>
 #include <type_traits>
 #include <utility>
diff --git a/dpctl/tensor/libtensor/source/triul_ctor.cpp b/dpctl/tensor/libtensor/source/triul_ctor.cpp
index 40dd5cf48a..03fcd2994c 100644
--- a/dpctl/tensor/libtensor/source/triul_ctor.cpp
+++ b/dpctl/tensor/libtensor/source/triul_ctor.cpp
@@ -22,7 +22,7 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/triul_ctor.hpp b/dpctl/tensor/libtensor/source/triul_ctor.hpp
index 9e7053c638..de80d20407 100644
--- a/dpctl/tensor/libtensor/source/triul_ctor.hpp
+++ b/dpctl/tensor/libtensor/source/triul_ctor.hpp
@@ -23,7 +23,7 @@
 //===--------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tensor/libtensor/source/where.cpp b/dpctl/tensor/libtensor/source/where.cpp
index ed782bda34..e533fd2ee1 100644
--- a/dpctl/tensor/libtensor/source/where.cpp
+++ b/dpctl/tensor/libtensor/source/where.cpp
@@ -24,12 +24,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "dpctl4pybind11.hpp"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <cstdint>
 #include <pybind11/complex.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 #include "kernels/where.hpp"
diff --git a/dpctl/tensor/libtensor/source/where.hpp b/dpctl/tensor/libtensor/source/where.hpp
index 6fe6527080..2ca3b39e02 100644
--- a/dpctl/tensor/libtensor/source/where.hpp
+++ b/dpctl/tensor/libtensor/source/where.hpp
@@ -24,7 +24,7 @@
 //===----------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 

From 44abcb4b8d1146c52bbc87e017d9a487dded88cf Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 11 Oct 2023 02:07:19 -0500
Subject: [PATCH 11/19] Use experimental::complex for in-place division

---
 .../elementwise_functions/true_divide.hpp     | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
index fcbb6adc08..742a9c1a80 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp
@@ -407,7 +407,24 @@ template <typename argT, typename resT> struct TrueDivideInplaceFunctor
 
     void operator()(resT &res, const argT &in)
     {
-        res /= in;
+        if constexpr (tu_ns::is_complex<resT>::value) {
+            using res_rT = typename resT::value_type;
+            if constexpr (tu_ns::is_complex<argT>::value) {
+                using arg_rT = typename argT::value_type;
+
+                auto res1 = exprm_ns::complex<res_rT>(res);
+                res1 /= exprm_ns::complex<arg_rT>(in);
+                res = res1;
+            }
+            else {
+                auto res1 = exprm_ns::complex<res_rT>(res);
+                res1 /= in;
+                res = res1;
+            }
+        }
+        else {
+            res /= in;
+        }
     }
 
     template <int vec_sz>

From 23aeec6833fec091931b4dbe6f9362676d3f2f37 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sat, 14 Oct 2023 13:47:57 -0500
Subject: [PATCH 12/19] include "CL/sycl.hpp" -> include "sycl/sycl.hpp"

---
 dpctl/_host_task_util.hpp                             | 2 +-
 dpctl/apis/include/dpctl4pybind11.hpp                 | 2 +-
 dpctl/sycl.pxd                                        | 2 +-
 dpctl/tensor/libtensor/source/reduction_over_axis.cpp | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp
index 8db17594fd..308b85a249 100644
--- a/dpctl/_host_task_util.hpp
+++ b/dpctl/_host_task_util.hpp
@@ -31,7 +31,7 @@
 
 #include "Python.h"
 #include "syclinterface/dpctl_data_types.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 int async_dec_ref(DPCTLSyclQueueRef QRef,
                   PyObject **obj_array,
diff --git a/dpctl/apis/include/dpctl4pybind11.hpp b/dpctl/apis/include/dpctl4pybind11.hpp
index f68826af48..10ee4602c3 100644
--- a/dpctl/apis/include/dpctl4pybind11.hpp
+++ b/dpctl/apis/include/dpctl4pybind11.hpp
@@ -26,10 +26,10 @@
 #pragma once
 
 #include "dpctl_capi.h"
-#include <CL/sycl.hpp>
 #include <complex>
 #include <memory>
 #include <pybind11/pybind11.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/sycl.pxd b/dpctl/sycl.pxd
index 918f476298..0318868ef8 100644
--- a/dpctl/sycl.pxd
+++ b/dpctl/sycl.pxd
@@ -20,7 +20,7 @@
 from . cimport _backend as dpctl_backend
 
 
-cdef extern from "CL/sycl.hpp" namespace "sycl":
+cdef extern from "sycl/sycl.hpp" namespace "sycl":
     cdef cppclass queue "sycl::queue":
         pass
 
diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp
index c67fcd5ba3..00e4a0a076 100644
--- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp
+++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp
@@ -22,8 +22,8 @@
 /// This file defines functions of dpctl.tensor._tensor_impl extensions
 //===--------------------------------------------------------------------===//
 
-#include <CL/sycl.hpp>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 

From 66ba04e17edcd02b0bd131e096076785a5bf7648 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Sat, 14 Oct 2023 15:23:49 -0500
Subject: [PATCH 13/19] include CL/sycl.hpp -> include sycl/sycl.hpp

---
 .../helper/include/dpctl_error_handlers.h          |  2 +-
 .../helper/include/dpctl_utils_helper.h            |  2 +-
 .../include/dpctl_device_selection.hpp             |  2 +-
 .../include/dpctl_sycl_type_casters.hpp            |  2 +-
 libsyclinterface/source/dpctl_device_selection.cpp |  2 +-
 .../source/dpctl_sycl_context_interface.cpp        |  2 +-
 .../source/dpctl_sycl_device_interface.cpp         |  2 +-
 .../source/dpctl_sycl_device_manager.cpp           |  2 +-
 .../dpctl_sycl_device_selector_interface.cpp       |  2 +-
 .../source/dpctl_sycl_event_interface.cpp          |  2 +-
 .../source/dpctl_sycl_kernel_bundle_interface.cpp  | 14 +++-----------
 .../source/dpctl_sycl_kernel_interface.cpp         |  2 +-
 .../source/dpctl_sycl_platform_interface.cpp       |  2 +-
 .../source/dpctl_sycl_platform_manager.cpp         |  2 +-
 .../source/dpctl_sycl_queue_interface.cpp          |  2 +-
 .../source/dpctl_sycl_queue_manager.cpp            |  2 +-
 .../source/dpctl_sycl_usm_interface.cpp            |  2 +-
 libsyclinterface/tests/test_helper.cpp             |  2 +-
 .../tests/test_sycl_context_interface.cpp          |  2 +-
 .../tests/test_sycl_device_aspects.cpp             |  2 +-
 .../tests/test_sycl_device_interface.cpp           |  2 +-
 .../tests/test_sycl_device_invalid_filters.cpp     |  2 +-
 .../tests/test_sycl_device_selector_interface.cpp  |  2 +-
 .../tests/test_sycl_device_subdevices.cpp          |  2 +-
 .../tests/test_sycl_event_interface.cpp            |  2 +-
 .../tests/test_sycl_kernel_bundle_interface.cpp    |  2 +-
 .../tests/test_sycl_kernel_interface.cpp           |  2 +-
 .../tests/test_sycl_platform_interface.cpp         |  2 +-
 .../tests/test_sycl_platform_invalid_filters.cpp   |  2 +-
 .../tests/test_sycl_queue_interface.cpp            |  2 +-
 libsyclinterface/tests/test_sycl_queue_manager.cpp |  2 +-
 libsyclinterface/tests/test_sycl_queue_submit.cpp  |  2 +-
 libsyclinterface/tests/test_sycl_usm_interface.cpp |  2 +-
 33 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/libsyclinterface/helper/include/dpctl_error_handlers.h b/libsyclinterface/helper/include/dpctl_error_handlers.h
index 2947dd1d5e..5c7c12f1a4 100644
--- a/libsyclinterface/helper/include/dpctl_error_handlers.h
+++ b/libsyclinterface/helper/include/dpctl_error_handlers.h
@@ -27,7 +27,7 @@
 
 #include "Support/DllExport.h"
 #include "dpctl_error_handler_type.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 /*!
  * @brief Functor class used by DPCTL to handle SYCL asynchronous errors.
diff --git a/libsyclinterface/helper/include/dpctl_utils_helper.h b/libsyclinterface/helper/include/dpctl_utils_helper.h
index 9ed29514df..f7f484a32a 100644
--- a/libsyclinterface/helper/include/dpctl_utils_helper.h
+++ b/libsyclinterface/helper/include/dpctl_utils_helper.h
@@ -26,7 +26,7 @@
 
 #include "Support/DllExport.h"
 #include "dpctl_sycl_enum_types.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 /*!
  * @brief Converts a sycl::info::device_type input value to a string.
diff --git a/libsyclinterface/include/dpctl_device_selection.hpp b/libsyclinterface/include/dpctl_device_selection.hpp
index 9da0072ab1..605078586c 100644
--- a/libsyclinterface/include/dpctl_device_selection.hpp
+++ b/libsyclinterface/include/dpctl_device_selection.hpp
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "Support/DllExport.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 namespace dpctl
 {
diff --git a/libsyclinterface/include/dpctl_sycl_type_casters.hpp b/libsyclinterface/include/dpctl_sycl_type_casters.hpp
index 470165afdd..107fc43ff4 100644
--- a/libsyclinterface/include/dpctl_sycl_type_casters.hpp
+++ b/libsyclinterface/include/dpctl_sycl_type_casters.hpp
@@ -30,7 +30,7 @@
 
 #include "dpctl_device_selection.hpp"
 #include "dpctl_sycl_types.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 namespace dpctl::syclinterface
diff --git a/libsyclinterface/source/dpctl_device_selection.cpp b/libsyclinterface/source/dpctl_device_selection.cpp
index 7203bc3b1a..299ca5be41 100644
--- a/libsyclinterface/source/dpctl_device_selection.cpp
+++ b/libsyclinterface/source/dpctl_device_selection.cpp
@@ -27,7 +27,7 @@
 
 #include "dpctl_device_selection.hpp"
 #include "Config/dpctl_config.h"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 namespace
 {
diff --git a/libsyclinterface/source/dpctl_sycl_context_interface.cpp b/libsyclinterface/source/dpctl_sycl_context_interface.cpp
index a19286a779..ab9923652c 100644
--- a/libsyclinterface/source/dpctl_sycl_context_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_context_interface.cpp
@@ -28,7 +28,7 @@
 #include "Config/dpctl_config.h"
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/libsyclinterface/source/dpctl_sycl_device_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
index b5a97013c2..e30a690cfb 100644
--- a/libsyclinterface/source/dpctl_sycl_device_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_interface.cpp
@@ -32,9 +32,9 @@
 #include "dpctl_sycl_device_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp> /* SYCL headers   */
 #include <algorithm>
 #include <cstring>
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <utility>
 #include <vector>
 
diff --git a/libsyclinterface/source/dpctl_sycl_device_manager.cpp b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
index 0eb71df412..f36f5db21e 100644
--- a/libsyclinterface/source/dpctl_sycl_device_manager.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_manager.cpp
@@ -29,10 +29,10 @@
 #include "dpctl_sycl_enum_types.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>           /* SYCL headers   */
 #include <Config/dpctl_config.h> /* Config */
 #include <iomanip>
 #include <iostream>
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <unordered_map>
 #include <utility>
 #include <vector>
diff --git a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
index 9753c32613..834e9a57a2 100644
--- a/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_device_selector_interface.cpp
@@ -28,7 +28,7 @@
 #include "dpctl_device_selection.hpp"
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp> /* SYCL headers   */
+#include <sycl/sycl.hpp> /* SYCL headers   */
 
 using namespace sycl;
 
diff --git a/libsyclinterface/source/dpctl_sycl_event_interface.cpp b/libsyclinterface/source/dpctl_sycl_event_interface.cpp
index 3f872f4493..7a109faca9 100644
--- a/libsyclinterface/source/dpctl_sycl_event_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_event_interface.cpp
@@ -29,7 +29,7 @@
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp> /* SYCL headers   */
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <vector>
 
 using namespace sycl;
diff --git a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
index 201c8172e3..d32f278c07 100644
--- a/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_kernel_bundle_interface.cpp
@@ -30,14 +30,10 @@
 #include "dpctl_dynamic_lib_helper.h"
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/cl.h>     /* OpenCL headers     */
-#include <CL/sycl.hpp> /* Sycl headers       */
-#if __has_include(<sycl/backend/opencl.hpp>)
-#include <sycl/backend/opencl.hpp>
-#else
-#include <CL/sycl/backend/opencl.hpp>
-#endif
+#include <CL/cl.h> /* OpenCL headers     */
 #include <sstream>
+#include <sycl/backend/opencl.hpp>
+#include <sycl/sycl.hpp> /* Sycl headers       */
 #include <utility>
 
 #ifdef DPCTL_ENABLE_L0_PROGRAM_CREATION
@@ -45,11 +41,7 @@
 // not reorder the includes.
 // clang-format off
 #include "ze_api.h" /* Level Zero headers */
-#if __has_include(<sycl/ext/oneapi/backend/level_zero.hpp>)
 #include <sycl/ext/oneapi/backend/level_zero.hpp>
-#else
-#include <CL/sycl/backend/level_zero.hpp>
-#endif
 // clang-format on
 #endif
 
diff --git a/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp b/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp
index 8a5af3f179..abd7f9a443 100644
--- a/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_kernel_interface.cpp
@@ -29,8 +29,8 @@
 #include "dpctl_error_handlers.h"
 #include "dpctl_string_utils.hpp"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp> /* Sycl headers */
 #include <cstdint>
+#include <sycl/sycl.hpp> /* Sycl headers */
 
 using namespace sycl;
 
diff --git a/libsyclinterface/source/dpctl_sycl_platform_interface.cpp b/libsyclinterface/source/dpctl_sycl_platform_interface.cpp
index fb0fbd6bd2..409b600355 100644
--- a/libsyclinterface/source/dpctl_sycl_platform_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_platform_interface.cpp
@@ -31,11 +31,11 @@
 #include "dpctl_string_utils.hpp"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <iomanip>
 #include <iostream>
 #include <set>
 #include <sstream>
+#include <sycl/sycl.hpp>
 #include <utility>
 #include <vector>
 
diff --git a/libsyclinterface/source/dpctl_sycl_platform_manager.cpp b/libsyclinterface/source/dpctl_sycl_platform_manager.cpp
index 6717b48c6f..f01f7a76a5 100644
--- a/libsyclinterface/source/dpctl_sycl_platform_manager.cpp
+++ b/libsyclinterface/source/dpctl_sycl_platform_manager.cpp
@@ -31,11 +31,11 @@
 #include "dpctl_sycl_platform_interface.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <iomanip>
 #include <iostream>
 #include <set>
 #include <sstream>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
index 4903b888ff..3eec8934a8 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp
@@ -31,9 +31,9 @@
 #include "dpctl_sycl_device_interface.h"
 #include "dpctl_sycl_device_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp> /* SYCL headers   */
 #include <exception>
 #include <stdexcept>
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <utility>
 
 using namespace sycl;
diff --git a/libsyclinterface/source/dpctl_sycl_queue_manager.cpp b/libsyclinterface/source/dpctl_sycl_queue_manager.cpp
index 54e97c0efa..651689e105 100644
--- a/libsyclinterface/source/dpctl_sycl_queue_manager.cpp
+++ b/libsyclinterface/source/dpctl_sycl_queue_manager.cpp
@@ -28,7 +28,7 @@
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_device_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp> /* SYCL headers   */
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <vector>
 
 using namespace sycl;
diff --git a/libsyclinterface/source/dpctl_sycl_usm_interface.cpp b/libsyclinterface/source/dpctl_sycl_usm_interface.cpp
index 2ebae9801e..b993ee32a8 100644
--- a/libsyclinterface/source/dpctl_sycl_usm_interface.cpp
+++ b/libsyclinterface/source/dpctl_sycl_usm_interface.cpp
@@ -29,7 +29,7 @@
 #include "dpctl_error_handlers.h"
 #include "dpctl_sycl_device_interface.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp> /* SYCL headers   */
+#include <sycl/sycl.hpp> /* SYCL headers   */
 #include <utility>
 
 using namespace sycl;
diff --git a/libsyclinterface/tests/test_helper.cpp b/libsyclinterface/tests/test_helper.cpp
index ea529cbf24..467274849d 100644
--- a/libsyclinterface/tests/test_helper.cpp
+++ b/libsyclinterface/tests/test_helper.cpp
@@ -26,9 +26,9 @@
 
 #include "Config/dpctl_config.h"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
 #include <string>
+#include <sycl/sycl.hpp>
 
 struct TestHelperFns : public ::testing::Test
 {
diff --git a/libsyclinterface/tests/test_sycl_context_interface.cpp b/libsyclinterface/tests/test_sycl_context_interface.cpp
index 36b2ff6e97..75fbbe7cb2 100644
--- a/libsyclinterface/tests/test_sycl_context_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_context_interface.cpp
@@ -29,8 +29,8 @@
 #include "dpctl_sycl_device_interface.h"
 #include "dpctl_sycl_device_selector_interface.h"
 #include "dpctl_sycl_types.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 using namespace sycl;
diff --git a/libsyclinterface/tests/test_sycl_device_aspects.cpp b/libsyclinterface/tests/test_sycl_device_aspects.cpp
index 9019d7f718..e2e42db74d 100644
--- a/libsyclinterface/tests/test_sycl_device_aspects.cpp
+++ b/libsyclinterface/tests/test_sycl_device_aspects.cpp
@@ -30,8 +30,8 @@
 #include "dpctl_sycl_enum_types.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 #include <utility>
 
 namespace
diff --git a/libsyclinterface/tests/test_sycl_device_interface.cpp b/libsyclinterface/tests/test_sycl_device_interface.cpp
index dd20c738df..a0544482ff 100644
--- a/libsyclinterface/tests/test_sycl_device_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_device_interface.cpp
@@ -29,8 +29,8 @@
 #include "dpctl_sycl_platform_interface.h"
 #include "dpctl_utils.h"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_device_invalid_filters.cpp b/libsyclinterface/tests/test_sycl_device_invalid_filters.cpp
index c6a722c87a..50cfc6ba67 100644
--- a/libsyclinterface/tests/test_sycl_device_invalid_filters.cpp
+++ b/libsyclinterface/tests/test_sycl_device_invalid_filters.cpp
@@ -25,8 +25,8 @@
 
 #include "dpctl_sycl_device_interface.h"
 #include "dpctl_sycl_device_selector_interface.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_device_selector_interface.cpp b/libsyclinterface/tests/test_sycl_device_selector_interface.cpp
index eff7e4ee41..8e5df58769 100644
--- a/libsyclinterface/tests/test_sycl_device_selector_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_device_selector_interface.cpp
@@ -28,8 +28,8 @@
 #include "dpctl_sycl_device_manager.h"
 #include "dpctl_sycl_device_selector_interface.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_device_subdevices.cpp b/libsyclinterface/tests/test_sycl_device_subdevices.cpp
index cb73359059..f2039c6dcf 100644
--- a/libsyclinterface/tests/test_sycl_device_subdevices.cpp
+++ b/libsyclinterface/tests/test_sycl_device_subdevices.cpp
@@ -32,8 +32,8 @@
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_utils.h"
 #include "dpctl_utils_helper.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 using namespace dpctl::syclinterface;
diff --git a/libsyclinterface/tests/test_sycl_event_interface.cpp b/libsyclinterface/tests/test_sycl_event_interface.cpp
index 0cc11af731..615755ebc3 100644
--- a/libsyclinterface/tests/test_sycl_event_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_event_interface.cpp
@@ -27,8 +27,8 @@
 #include "Config/dpctl_config.h"
 #include "dpctl_sycl_event_interface.h"
 #include "dpctl_sycl_types.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 using namespace sycl;
diff --git a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
index 6383b730a0..c450d6722d 100644
--- a/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_kernel_bundle_interface.cpp
@@ -34,11 +34,11 @@
 #include "dpctl_sycl_kernel_interface.h"
 #include "dpctl_sycl_queue_interface.h"
 #include "dpctl_sycl_queue_manager.h"
-#include <CL/sycl.hpp>
 #include <array>
 #include <filesystem>
 #include <fstream>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_kernel_interface.cpp b/libsyclinterface/tests/test_sycl_kernel_interface.cpp
index 97fba96bc3..d7e7cb4087 100644
--- a/libsyclinterface/tests/test_sycl_kernel_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_kernel_interface.cpp
@@ -33,9 +33,9 @@
 #include "dpctl_sycl_queue_interface.h"
 #include "dpctl_sycl_queue_manager.h"
 #include "dpctl_utils.h"
-#include <CL/sycl.hpp>
 #include <array>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_platform_interface.cpp b/libsyclinterface/tests/test_sycl_platform_interface.cpp
index f04cead0e1..3164aef7ec 100644
--- a/libsyclinterface/tests/test_sycl_platform_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_platform_interface.cpp
@@ -29,8 +29,8 @@
 #include "dpctl_sycl_platform_interface.h"
 #include "dpctl_sycl_platform_manager.h"
 #include "dpctl_utils.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 #include <vector>
 
 using namespace sycl;
diff --git a/libsyclinterface/tests/test_sycl_platform_invalid_filters.cpp b/libsyclinterface/tests/test_sycl_platform_invalid_filters.cpp
index 5499f88430..41f0cc4a53 100644
--- a/libsyclinterface/tests/test_sycl_platform_invalid_filters.cpp
+++ b/libsyclinterface/tests/test_sycl_platform_invalid_filters.cpp
@@ -26,8 +26,8 @@
 
 #include "dpctl_sycl_device_selector_interface.h"
 #include "dpctl_sycl_platform_interface.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 
diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp
index 8d23929d39..db170d8d26 100644
--- a/libsyclinterface/tests/test_sycl_queue_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp
@@ -34,8 +34,8 @@
 #include "dpctl_sycl_queue_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_sycl_usm_interface.h"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 using namespace dpctl::syclinterface;
diff --git a/libsyclinterface/tests/test_sycl_queue_manager.cpp b/libsyclinterface/tests/test_sycl_queue_manager.cpp
index 4f9e84ea20..0fc640f4ab 100644
--- a/libsyclinterface/tests/test_sycl_queue_manager.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_manager.cpp
@@ -30,8 +30,8 @@
 #include "dpctl_sycl_queue_interface.h"
 #include "dpctl_sycl_queue_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
-#include <CL/sycl.hpp>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 #include <thread>
 
 using namespace std;
diff --git a/libsyclinterface/tests/test_sycl_queue_submit.cpp b/libsyclinterface/tests/test_sycl_queue_submit.cpp
index 680314b719..3ef37978d3 100644
--- a/libsyclinterface/tests/test_sycl_queue_submit.cpp
+++ b/libsyclinterface/tests/test_sycl_queue_submit.cpp
@@ -32,10 +32,10 @@
 #include "dpctl_sycl_queue_interface.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_sycl_usm_interface.h"
-#include <CL/sycl.hpp>
 #include <filesystem>
 #include <fstream>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 namespace
 {
diff --git a/libsyclinterface/tests/test_sycl_usm_interface.cpp b/libsyclinterface/tests/test_sycl_usm_interface.cpp
index a6dbb2290a..99f8e52051 100644
--- a/libsyclinterface/tests/test_sycl_usm_interface.cpp
+++ b/libsyclinterface/tests/test_sycl_usm_interface.cpp
@@ -32,9 +32,9 @@
 #include "dpctl_sycl_queue_manager.h"
 #include "dpctl_sycl_type_casters.hpp"
 #include "dpctl_sycl_usm_interface.h"
-#include <CL/sycl.hpp>
 #include <cstring>
 #include <gtest/gtest.h>
+#include <sycl/sycl.hpp>
 
 using namespace sycl;
 

From fd9df2a98e4a6d8b14b56e5dad390a93d9973f26 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 18 Oct 2023 12:07:02 -0700
Subject: [PATCH 14/19] Add target_compile_options setting sycl-targets for
 targets needing SYCL

For every CMake target, where add_sycl_to_target is used, we also run
target_compile_options(
   ${target_name}
   PRIVATE
   -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
)
---
 dpctl/CMakeLists.txt                  | 8 +++++++-
 dpctl/tensor/CMakeLists.txt           | 7 +++++++
 dpctl/utils/CMakeLists.txt            | 6 ++++++
 libsyclinterface/CMakeLists.txt       | 6 ++++++
 libsyclinterface/tests/CMakeLists.txt | 6 ++++++
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index cb872ff45f..604506e2bc 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -143,7 +143,13 @@ function(build_dpctl_ext _trgt _src _dest)
     add_custom_target(${_cythonize_trgt} DEPENDS ${_src})
     Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
     if (BUILD_DPCTL_EXT_SYCL)
-      add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        # make fat binary
+        target_compile_options(
+            ${_trgt}
+            PRIVATE
+            -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+        )
     endif()
     target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR})
     add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt})
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 15bc58f57b..3eabfc433a 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -55,6 +55,13 @@ set(_tensor_impl_sources
 set(python_module_name _tensor_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
+# make fat binary
+target_compile_options(
+    ${python_module_name}
+    PRIVATE
+    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+)
+
 set(_clang_prefix "")
 if (WIN32)
   set(_clang_prefix "/clang:")
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index aadc1c0fe0..8b9225de69 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -21,6 +21,12 @@ pybind11_add_module(${python_module_name} MODULE
   ${_module_src}
 )
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+# make fat binary
+target_compile_options(
+    ${python_module_name}
+    PRIVATE
+    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+)
 target_include_directories(${python_module_name}
     PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/../include
diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt
index 64ec3271b1..04dc14153d 100644
--- a/libsyclinterface/CMakeLists.txt
+++ b/libsyclinterface/CMakeLists.txt
@@ -205,6 +205,12 @@ add_library(DPCTLSyclInterface
     ${helper_sources}
 )
 add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources})
+# make fat binary
+target_compile_options(
+    DPCTLSyclInterface
+    PRIVATE
+    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+)
 
 if(DPCTL_GENERATE_COVERAGE)
     target_link_options(DPCTLSyclInterface
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 472e1787fa..17dff208e5 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -52,6 +52,12 @@ add_sycl_to_target(
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp
 )
+# make fat binary
+target_compile_options(
+    dpctl_c_api_tests
+    PRIVATE
+    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+)
 
 if(DPCTL_GENERATE_COVERAGE)
     target_include_directories(dpctl_c_api_tests

From 9561b6e178e4c08636a16376056fb492e6954d4c Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 18 Oct 2023 13:17:07 -0700
Subject: [PATCH 15/19] Fix compilation error

---
 dpctl/CMakeLists.txt                  | 7 ++++++-
 dpctl/tensor/CMakeLists.txt           | 7 ++++++-
 dpctl/utils/CMakeLists.txt            | 7 ++++++-
 libsyclinterface/CMakeLists.txt       | 7 ++++++-
 libsyclinterface/tests/CMakeLists.txt | 7 ++++++-
 5 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index 604506e2bc..3f81c66972 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -148,8 +148,13 @@ function(build_dpctl_ext _trgt _src _dest)
         target_compile_options(
             ${_trgt}
             PRIVATE
-            -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+            -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
         )
+	target_link_options(
+	    ${_trgt}
+	    PRIVATE
+	    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+	)
     endif()
     target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR})
     add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt})
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 3eabfc433a..ed82f58437 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -59,7 +59,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
 target_compile_options(
     ${python_module_name}
     PRIVATE
-    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+target_link_options(
+    ${python_module_name}
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
 )
 
 set(_clang_prefix "")
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index 8b9225de69..194946e4da 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -25,7 +25,12 @@ add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
 target_compile_options(
     ${python_module_name}
     PRIVATE
-    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+target_link_options(
+    ${python_module_name}
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
 )
 target_include_directories(${python_module_name}
     PRIVATE
diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt
index 04dc14153d..916421d455 100644
--- a/libsyclinterface/CMakeLists.txt
+++ b/libsyclinterface/CMakeLists.txt
@@ -209,7 +209,12 @@ add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources
 target_compile_options(
     DPCTLSyclInterface
     PRIVATE
-    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+target_link_options(
+    DPCTLSyclInterface
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
 )
 
 if(DPCTL_GENERATE_COVERAGE)
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 17dff208e5..e83db384dd 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -56,7 +56,12 @@ add_sycl_to_target(
 target_compile_options(
     dpctl_c_api_tests
     PRIVATE
-    -fsycl-targets=spir64-unknown-unknown,nvptx64-nvidia-cuda
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+target_link_options(
+    dpctl_c_api_tests
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
 )
 
 if(DPCTL_GENERATE_COVERAGE)

From c101748c3e25b229bc57df107473a67908f5a1bf Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 25 Oct 2023 08:03:16 -0700
Subject: [PATCH 16/19] Replace overlooked std::log, std::sinh, std::exp for
 complex types

Replaced them with uses of sycl::ext::oneapi::experimental namespace
functions instead.
---
 .../kernels/elementwise_functions/acos.hpp        |  7 ++++---
 .../kernels/elementwise_functions/acosh.hpp       | 12 +++++++-----
 .../kernels/elementwise_functions/asin.hpp        | 15 ++++++++-------
 .../kernels/elementwise_functions/asinh.hpp       |  9 ++++++---
 .../kernels/elementwise_functions/exp2.hpp        |  6 ++++--
 .../include/kernels/elementwise_functions/sin.hpp |  6 +++---
 .../kernels/elementwise_functions/sinh.hpp        |  4 ++--
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index 28968de761..b64296ec1a 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -105,10 +105,11 @@ template <typename argT, typename resT> struct AcosFunctor
             constexpr realT r_eps =
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                argT log_in = std::log(in);
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in = exprm_ns::log(exprm_ns::complex<realT>(in));
 
-                const realT wx = std::real(log_in);
-                const realT wy = std::imag(log_in);
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
 
                 realT ry = wx + std::log(realT(2));
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 7323579df5..56730a411c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -48,7 +48,7 @@ namespace acosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -112,16 +112,18 @@ template <typename argT, typename resT> struct AcoshFunctor
              * For large x or y including acos(+-Inf + I*+-Inf)
              */
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                const realT wx = std::real(std::log(in));
-                const realT wy = std::imag(std::log(in));
+                using sycl_complexT = typename exprm_ns::complex<realT>;
+                const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in));
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
                 realT ry = wx + std::log(realT(2));
                 acos_in = resT{rx, (std::signbit(y)) ? ry : -ry};
             }
             else {
                 /* ordinary cases */
-                acos_in = cmplx_ns::acos(
-                    cmplx_ns::complex<realT>(in)); // std::acos(in);
+                acos_in = exprm_ns::acos(
+                    exprm_ns::complex<realT>(in)); // std::acos(in);
             }
 
             /* Now we calculate acosh(z) */
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 595d75eeca..5f83c95323 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -119,17 +119,18 @@ template <typename argT, typename resT> struct AsinFunctor
             constexpr realT r_eps =
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                const resT z = {x, y};
+                using sycl_complexT = exprm_ns::complex<realT>;
+                const sycl_complexT z {x, y};
                 realT wx, wy;
                 if (!std::signbit(x)) {
-                    auto log_z = std::log(z);
-                    wx = std::real(log_z) + std::log(realT(2));
-                    wy = std::imag(log_z);
+                    auto log_z = exprm_ns::log(z);
+                    wx = log_z.real() + std::log(realT(2));
+                    wy = log_z.imag();
                 }
                 else {
-                    auto log_mz = std::log(-z);
-                    wx = std::real(log_mz) + std::log(realT(2));
-                    wy = std::imag(log_mz);
+                    auto log_mz = exprm_ns::log(-z);
+                    wx = log_mz.real() + std::log(realT(2));
+                    wy = log_mz.imag();
                 }
                 const realT asinh_re = std::copysign(wx, x);
                 const realT asinh_im = std::copysign(wy, y);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index d126afba87..50ba0d8e86 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -108,9 +108,12 @@ template <typename argT, typename resT> struct AsinhFunctor
                 realT(1) / std::numeric_limits<realT>::epsilon();
 
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                resT log_in = (std::signbit(x)) ? std::log(-in) : std::log(in);
-                realT wx = std::real(log_in) + std::log(realT(2));
-                realT wy = std::imag(log_in);
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in = (std::signbit(x)) ?
+		  exprm_ns::log(sycl_complexT(-in)) :
+		  exprm_ns::log(sycl_complexT(in));
+                realT wx = log_in.real() + std::log(realT(2));
+                realT wy = log_in.imag();
                 const realT res_re = std::copysign(wx, x);
                 const realT res_im = std::copysign(wy, y);
                 return resT{res_re, res_im};
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
index 67ee23df48..b6b2f32e83 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -24,10 +24,11 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
@@ -48,6 +49,7 @@ namespace exp2
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -76,7 +78,7 @@ template <typename argT, typename resT> struct Exp2Functor
             const realT y = std::imag(tmp);
             if (std::isfinite(x)) {
                 if (std::isfinite(y)) {
-                    return std::exp(tmp);
+                    return exprm_ns::exp(exprm_ns::complex<realT>(tmp));
                 }
                 else {
                     return resT{q_nan, q_nan};
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index 0f35aa306e..e1e9e79c57 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -48,7 +48,7 @@ namespace sin
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,8 +81,8 @@ template <typename argT, typename resT> struct SinFunctor
              * real and imaginary parts of input are finite.
              */
             if (in_re_finite && in_im_finite) {
-                return cmplx_ns::sin(
-                    cmplx_ns::complex<realT>(in)); // std::sin(in);
+                return exprm_ns::sin(
+                    exprm_ns::complex<realT>(in)); // std::sin(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index af887431d7..37ac3734e3 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -48,7 +48,7 @@ namespace sinh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,7 +81,7 @@ template <typename argT, typename resT> struct SinhFunctor
              * real and imaginary parts of input are finite.
              */
             if (xfinite && yfinite) {
-                return std::sinh(in);
+	      return exprm_ns::sinh(exprm_ns::complex<realT>(in));
             }
             /*
              * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.

From 0827f3d1509c8c1ddf7fc4146ec829bf7a1f456a Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 25 Oct 2023 08:04:42 -0700
Subject: [PATCH 17/19] Replaced include CL/sycl.hpp with include sycl/sycl.hpp

---
 .../libtensor/include/kernels/elementwise_functions/cbrt.hpp    | 2 +-
 .../include/kernels/elementwise_functions/copysign.hpp          | 2 +-
 .../libtensor/include/kernels/elementwise_functions/rsqrt.hpp   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
index 1d4aa65002..92584f0dfe 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -24,10 +24,10 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
index b1997d06b4..43e06cb281 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -24,9 +24,9 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cstddef>
 #include <cstdint>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "utils/offset_utils.hpp"
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
index de51b31c30..d9e0c33081 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -25,12 +25,12 @@
 //===---------------------------------------------------------------------===//
 
 #pragma once
-#include <CL/sycl.hpp>
 #include <cmath>
 #include <complex>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
+#include <sycl/sycl.hpp>
 #include <type_traits>
 
 #include "kernels/elementwise_functions/common.hpp"

From 5eefdd10cb649103e042dfa3405d4c741a12b349 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 25 Oct 2023 08:05:21 -0700
Subject: [PATCH 18/19] Add DPCTL_TARGET_CUDA Boolean cmake option

The DPCTL_SYCL_TARGETS parameter can also be used to specify the
targets to build for.

DPCTL_TARGET_CUDA can be set via a CMake option or via an environment
variable, e.g.

```
$ DPCTL_TARGET_CUDA=1 python scripts/build_locally.py --verbose
```
---
 CMakeLists.txt                        | 21 +++++++++++++++++++++
 dpctl/CMakeLists.txt                  | 24 +++++++++++++-----------
 dpctl/tensor/CMakeLists.txt           | 24 +++++++++++++-----------
 dpctl/utils/CMakeLists.txt            | 24 +++++++++++++-----------
 libsyclinterface/CMakeLists.txt       | 22 ++++++++++++----------
 libsyclinterface/tests/CMakeLists.txt |  2 ++
 6 files changed, 74 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb53db12ec..adfb4fbddd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,9 +17,30 @@ option(DPCTL_GENERATE_COVERAGE
     "Build dpctl with coverage instrumentation"
     OFF
 )
+option(DPCTL_TARGET_CUDA
+    "Build DPCTL to target CUDA devices"
+    OFF
+)
 
 find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
+# Resolve the -fsycl-targets value consumed by the subdirectory CMakeLists.
+# An explicit DPCTL_SYCL_TARGETS wins; otherwise CUDA is targeted only when
+# the DPCTL_TARGET_CUDA option or environment variable holds a true value
+# (1, ON, YES, TRUE, Y), so that e.g. DPCTL_TARGET_CUDA=0 leaves it disabled.
+set(_dpctl_sycl_targets)
+if ("x${DPCTL_SYCL_TARGETS}" STREQUAL "x")
+   if(DPCTL_TARGET_CUDA OR "$ENV{DPCTL_TARGET_CUDA}")
+      set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+   endif()
+else()
+   set(_dpctl_sycl_targets ${DPCTL_SYCL_TARGETS})
+endif()
+
+if(_dpctl_sycl_targets)
+   message(STATUS "Compiling for -fsycl-targets=${_dpctl_sycl_targets}")
+endif()
+
 add_subdirectory(libsyclinterface)
 
 file(GLOB _dpctl_capi_headers dpctl/apis/include/*.h*)
diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index 3f81c66972..616f270ad3 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -144,17 +144,19 @@ function(build_dpctl_ext _trgt _src _dest)
     Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
     if (BUILD_DPCTL_EXT_SYCL)
         add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
-        # make fat binary
-        target_compile_options(
-            ${_trgt}
-            PRIVATE
-            -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-        )
-	target_link_options(
-	    ${_trgt}
-	    PRIVATE
-	    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-	)
+        if(_dpctl_sycl_targets)
+            # make fat binary
+            target_compile_options(
+                ${_trgt}
+                PRIVATE
+                -fsycl-targets=${_dpctl_sycl_targets}
+            )
+            target_link_options(
+                ${_trgt}
+                PRIVATE
+                -fsycl-targets=${_dpctl_sycl_targets}
+            )
+        endif()
     endif()
     target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR})
     add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt})
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index ed82f58437..0f63e4bdeb 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -55,17 +55,19 @@ set(_tensor_impl_sources
 set(python_module_name _tensor_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
-# make fat binary
-target_compile_options(
-    ${python_module_name}
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
-target_link_options(
-    ${python_module_name}
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
+if(_dpctl_sycl_targets)
+    # make fat binary
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+endif()
 
 set(_clang_prefix "")
 if (WIN32)
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index 194946e4da..e7d3951e5b 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -21,17 +21,19 @@ pybind11_add_module(${python_module_name} MODULE
   ${_module_src}
 )
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
-# make fat binary
-target_compile_options(
-    ${python_module_name}
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
-target_link_options(
-    ${python_module_name}
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
+if(_dpctl_sycl_targets)
+    # make fat binary
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+endif()
 target_include_directories(${python_module_name}
     PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/../include
diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt
index 916421d455..e84959c1c3 100644
--- a/libsyclinterface/CMakeLists.txt
+++ b/libsyclinterface/CMakeLists.txt
@@ -206,16 +206,18 @@ add_library(DPCTLSyclInterface
 )
 add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources})
 # make fat binary
-target_compile_options(
-    DPCTLSyclInterface
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
-target_link_options(
-    DPCTLSyclInterface
-    PRIVATE
-    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
-)
+if(_dpctl_sycl_targets)
+    target_compile_options(
+        DPCTLSyclInterface
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        DPCTLSyclInterface
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+   )
+endif()
 
 if(DPCTL_GENERATE_COVERAGE)
     target_link_options(DPCTLSyclInterface
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index e83db384dd..5a672e312f 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -52,6 +52,7 @@ add_sycl_to_target(
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp
 )
+if (_dpctl_sycl_targets)
 # make fat binary
 target_compile_options(
     dpctl_c_api_tests
@@ -63,6 +64,7 @@ target_link_options(
     PRIVATE
     -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
 )
+endif()
 
 if(DPCTL_GENERATE_COVERAGE)
     target_include_directories(dpctl_c_api_tests

From 986dc6f88b88db98e4fc53ba0741018bc04bd0c8 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <oleksandr.pavlyk@intel.com>
Date: Wed, 25 Oct 2023 09:07:45 -0700
Subject: [PATCH 19/19] clang-format fixes

---
 .../include/kernels/elementwise_functions/acos.hpp          | 3 ++-
 .../include/kernels/elementwise_functions/asin.hpp          | 2 +-
 .../include/kernels/elementwise_functions/asinh.hpp         | 6 +++---
 .../include/kernels/elementwise_functions/sinh.hpp          | 2 +-
 dpctl/tensor/libtensor/include/utils/sycl_utils.hpp         | 2 +-
 dpctl/tensor/libtensor/source/reduction_over_axis.hpp       | 2 +-
 6 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index b64296ec1a..23a87b9d44 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -106,7 +106,8 @@ template <typename argT, typename resT> struct AcosFunctor
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
                 using sycl_complexT = exprm_ns::complex<realT>;
-                sycl_complexT log_in = exprm_ns::log(exprm_ns::complex<realT>(in));
+                sycl_complexT log_in =
+                    exprm_ns::log(exprm_ns::complex<realT>(in));
 
                 const realT wx = log_in.real();
                 const realT wy = log_in.imag();
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 5f83c95323..035480c437 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -120,7 +120,7 @@ template <typename argT, typename resT> struct AsinFunctor
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
                 using sycl_complexT = exprm_ns::complex<realT>;
-                const sycl_complexT z {x, y};
+                const sycl_complexT z{x, y};
                 realT wx, wy;
                 if (!std::signbit(x)) {
                     auto log_z = exprm_ns::log(z);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index 50ba0d8e86..523ca4f01f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -109,9 +109,9 @@ template <typename argT, typename resT> struct AsinhFunctor
 
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
                 using sycl_complexT = exprm_ns::complex<realT>;
-                sycl_complexT log_in = (std::signbit(x)) ?
-		  exprm_ns::log(sycl_complexT(-in)) :
-		  exprm_ns::log(sycl_complexT(in));
+                sycl_complexT log_in = (std::signbit(x))
+                                           ? exprm_ns::log(sycl_complexT(-in))
+                                           : exprm_ns::log(sycl_complexT(in));
                 realT wx = log_in.real() + std::log(realT(2));
                 realT wy = log_in.imag();
                 const realT res_re = std::copysign(wx, x);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index 37ac3734e3..b11c7402d0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -81,7 +81,7 @@ template <typename argT, typename resT> struct SinhFunctor
              * real and imaginary parts of input are finite.
              */
             if (xfinite && yfinite) {
-	      return exprm_ns::sinh(exprm_ns::complex<realT>(in));
+                return exprm_ns::sinh(exprm_ns::complex<realT>(in));
             }
             /*
              * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 31d3d874a0..2f5e74172f 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -25,8 +25,8 @@
 #pragma once
 #include <algorithm>
 #include <cstddef>
-#include <type_traits>
 #include <sycl/sycl.hpp>
+#include <type_traits>
 #include <vector>
 
 #include "math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
index 46ef02f064..e9ccd1d52a 100644
--- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
@@ -28,9 +28,9 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
-#include <type_traits>
 #include <string>
 #include <sycl/sycl.hpp>
+#include <type_traits>
 #include <utility>
 #include <vector>