diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb53db12ec..adfb4fbddd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,9 +17,30 @@ option(DPCTL_GENERATE_COVERAGE
     "Build dpctl with coverage instrumentation"
     OFF
 )
+option(DPCTL_TARGET_CUDA
+    "Build DPCTL to target CUDA devices"
+    OFF
+)
 
 find_package(IntelSYCL REQUIRED PATHS ${CMAKE_SOURCE_DIR}/cmake NO_DEFAULT_PATH)
 
+set(_dpctl_sycl_targets)
+if ("x${DPCTL_SYCL_TARGETS}" STREQUAL "x")
+   if(DPCTL_TARGET_CUDA)
+      set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+   else()
+      if(DEFINED ENV{DPCTL_TARGET_CUDA})
+          set(_dpctl_sycl_targets "nvptx64-nvidia-cuda,spir64-unknown-unknown")
+      endif()
+   endif()
+else()
+   set(_dpctl_sycl_targets ${DPCTL_SYCL_TARGETS})
+endif()
+
+if(_dpctl_sycl_targets)
+   message(STATUS "Compiling for -fsycl-targets=${_dpctl_sycl_targets}")
+endif()
+
 add_subdirectory(libsyclinterface)
 
 file(GLOB _dpctl_capi_headers dpctl/apis/include/*.h*)
diff --git a/dpctl/CMakeLists.txt b/dpctl/CMakeLists.txt
index cb872ff45f..616f270ad3 100644
--- a/dpctl/CMakeLists.txt
+++ b/dpctl/CMakeLists.txt
@@ -143,7 +143,20 @@ function(build_dpctl_ext _trgt _src _dest)
     add_custom_target(${_cythonize_trgt} DEPENDS ${_src})
     Python_add_library(${_trgt} MODULE WITH_SOABI ${_generated_src})
     if (BUILD_DPCTL_EXT_SYCL)
-      add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+        add_sycl_to_target(TARGET ${_trgt} SOURCES ${_generated_src})
+	if(_dpctl_sycl_targets)
+	    # make fat binary
+            target_compile_options(
+                ${_trgt}
+                PRIVATE
+                -fsycl-targets=${_dpctl_sycl_targets}
+            )
+	    target_link_options(
+	        ${_trgt}
+	        PRIVATE
+	        -fsycl-targets=${_dpctl_sycl_targets}
+	    )
+	endif()
     endif()
     target_include_directories(${_trgt} PRIVATE ${NumPy_INCLUDE_DIR} ${DPCTL_INCLUDE_DIR})
     add_dependencies(${_trgt} _build_time_create_dpctl_include_copy ${_cythonize_trgt})
diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt
index 15bc58f57b..0f63e4bdeb 100644
--- a/dpctl/tensor/CMakeLists.txt
+++ b/dpctl/tensor/CMakeLists.txt
@@ -55,6 +55,20 @@ set(_tensor_impl_sources
 set(python_module_name _tensor_impl)
 pybind11_add_module(${python_module_name} MODULE ${_tensor_impl_sources})
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_tensor_impl_sources})
+if(_dpctl_sycl_targets)
+    # make fat binary
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+endif()
+
 set(_clang_prefix "")
 if (WIN32)
   set(_clang_prefix "/clang:")
diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py
index 3473d5cde5..bab31379b7 100644
--- a/dpctl/tensor/__init__.py
+++ b/dpctl/tensor/__init__.py
@@ -110,13 +110,16 @@
     bitwise_or,
     bitwise_right_shift,
     bitwise_xor,
+    cbrt,
     ceil,
     conj,
+    copysign,
     cos,
     cosh,
     divide,
     equal,
     exp,
+    exp2,
     expm1,
     floor,
     floor_divide,
@@ -149,6 +152,7 @@
     real,
     remainder,
     round,
+    rsqrt,
     sign,
     signbit,
     sin,
@@ -314,4 +318,8 @@
     "argmax",
     "argmin",
     "prod",
+    "cbrt",
+    "exp2",
+    "copysign",
+    "rsqrt",
 ]
diff --git a/dpctl/tensor/_elementwise_funcs.py b/dpctl/tensor/_elementwise_funcs.py
index 259443f8e3..24ae7fa8cf 100644
--- a/dpctl/tensor/_elementwise_funcs.py
+++ b/dpctl/tensor/_elementwise_funcs.py
@@ -1761,3 +1761,116 @@
 hypot = BinaryElementwiseFunc(
     "hypot", ti._hypot_result_type, ti._hypot, _hypot_docstring_
 )
+
+
+# U37: ==== CBRT        (x)
+_cbrt_docstring_ = """
+cbrt(x, out=None, order='K')
+
+Computes positive cube-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise positive cube-root.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+cbrt = UnaryElementwiseFunc(
+    "cbrt", ti._cbrt_result_type, ti._cbrt, _cbrt_docstring_
+)
+
+
+# U38: ==== EXP2        (x)
+_exp2_docstring_ = """
+exp2(x, out=None, order='K')
+
+Computes the base-2 exponential for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise base-2 exponentials.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+exp2 = UnaryElementwiseFunc(
+    "exp2", ti._exp2_result_type, ti._exp2, _exp2_docstring_
+)
+
+
+# B25: ==== COPYSIGN    (x1, x2)
+_copysign_docstring_ = """
+copysign(x1, x2, out=None, order='K')
+
+Composes a floating-point value with the magnitude of `x1_i` and the sign of
+`x2_i` for each element of input arrays `x1` and `x2`.
+
+Args:
+    x1 (usm_ndarray):
+        First input array, expected to have a real floating-point data type.
+    x2 (usm_ndarray):
+        Second input array, also expected to have a real floating-point data
+        type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise results. The data type
+        of the returned array is determined by the Type Promotion Rules.
+"""
+copysign = BinaryElementwiseFunc(
+    "copysign",
+    ti._copysign_result_type,
+    ti._copysign,
+    _copysign_docstring_,
+)
+
+
+# U39: ==== RSQRT        (x)
+_rsqrt_docstring_ = """
+rsqrt(x, out=None, order='K')
+
+Computes the reciprocal square-root for each element `x_i` for input array `x`.
+
+Args:
+    x (usm_ndarray):
+        Input array, expected to have a real floating-point data type.
+    out ({None, usm_ndarray}, optional):
+        Output array to populate.
+        Array have the correct shape and the expected data type.
+    order ("C","F","A","K", optional):
+        Memory layout of the newly output array, if parameter `out` is `None`.
+        Default: "K".
+Returns:
+    usm_narray:
+        An array containing the element-wise reciprocal square-root.
+        The data type of the returned array is determined by
+        the Type Promotion Rules.
+"""
+
+rsqrt = UnaryElementwiseFunc(
+    "rsqrt", ti._rsqrt_result_type, ti._rsqrt, _rsqrt_docstring_
+)
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
index 28968de761..23a87b9d44 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp
@@ -105,10 +105,12 @@ template <typename argT, typename resT> struct AcosFunctor
             constexpr realT r_eps =
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                argT log_in = std::log(in);
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in =
+                    exprm_ns::log(exprm_ns::complex<realT>(in));
 
-                const realT wx = std::real(log_in);
-                const realT wy = std::imag(log_in);
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
 
                 realT ry = wx + std::log(realT(2));
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
index 7323579df5..56730a411c 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp
@@ -48,7 +48,7 @@ namespace acosh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -112,16 +112,18 @@ template <typename argT, typename resT> struct AcoshFunctor
              * For large x or y including acos(+-Inf + I*+-Inf)
              */
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                const realT wx = std::real(std::log(in));
-                const realT wy = std::imag(std::log(in));
+                using sycl_complexT = typename exprm_ns::complex<realT>;
+                const sycl_complexT log_in = exprm_ns::log(sycl_complexT(in));
+                const realT wx = log_in.real();
+                const realT wy = log_in.imag();
                 const realT rx = std::abs(wy);
                 realT ry = wx + std::log(realT(2));
                 acos_in = resT{rx, (std::signbit(y)) ? ry : -ry};
             }
             else {
                 /* ordinary cases */
-                acos_in = cmplx_ns::acos(
-                    cmplx_ns::complex<realT>(in)); // std::acos(in);
+                acos_in = exprm_ns::acos(
+                    exprm_ns::complex<realT>(in)); // std::acos(in);
             }
 
             /* Now we calculate acosh(z) */
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
index 595d75eeca..035480c437 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp
@@ -119,17 +119,18 @@ template <typename argT, typename resT> struct AsinFunctor
             constexpr realT r_eps =
                 realT(1) / std::numeric_limits<realT>::epsilon();
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                const resT z = {x, y};
+                using sycl_complexT = exprm_ns::complex<realT>;
+                const sycl_complexT z{x, y};
                 realT wx, wy;
                 if (!std::signbit(x)) {
-                    auto log_z = std::log(z);
-                    wx = std::real(log_z) + std::log(realT(2));
-                    wy = std::imag(log_z);
+                    auto log_z = exprm_ns::log(z);
+                    wx = log_z.real() + std::log(realT(2));
+                    wy = log_z.imag();
                 }
                 else {
-                    auto log_mz = std::log(-z);
-                    wx = std::real(log_mz) + std::log(realT(2));
-                    wy = std::imag(log_mz);
+                    auto log_mz = exprm_ns::log(-z);
+                    wx = log_mz.real() + std::log(realT(2));
+                    wy = log_mz.imag();
                 }
                 const realT asinh_re = std::copysign(wx, x);
                 const realT asinh_im = std::copysign(wy, y);
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
index d126afba87..523ca4f01f 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp
@@ -108,9 +108,12 @@ template <typename argT, typename resT> struct AsinhFunctor
                 realT(1) / std::numeric_limits<realT>::epsilon();
 
             if (std::abs(x) > r_eps || std::abs(y) > r_eps) {
-                resT log_in = (std::signbit(x)) ? std::log(-in) : std::log(in);
-                realT wx = std::real(log_in) + std::log(realT(2));
-                realT wy = std::imag(log_in);
+                using sycl_complexT = exprm_ns::complex<realT>;
+                sycl_complexT log_in = (std::signbit(x))
+                                           ? exprm_ns::log(sycl_complexT(-in))
+                                           : exprm_ns::log(sycl_complexT(in));
+                realT wx = log_in.real() + std::log(realT(2));
+                realT wy = log_in.imag();
                 const realT res_re = std::copysign(wx, x);
                 const realT res_im = std::copysign(wy, y);
                 return resT{res_re, res_im};
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
new file mode 100644
index 0000000000..92584f0dfe
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp
@@ -0,0 +1,172 @@
+//=== cbrt.hpp -   Unary function CBRT                   ------  *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of CBRT(x)
+/// function that compute a square root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace cbrt
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT> struct CbrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::cbrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using CbrtContigFunctor = elementwise_common::
+    UnaryContigFunctor<argTy, resTy, CbrtFunctor<argTy, resTy>, vec_sz, n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using CbrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, CbrtFunctor<argTy, resTy>>;
+
+template <typename T> struct CbrtOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class cbrt_contig_kernel;
+
+template <typename argTy>
+sycl::event cbrt_contig_impl(sycl::queue &exec_q,
+                             size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, CbrtOutputType, CbrtContigFunctor, cbrt_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct CbrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename CbrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct CbrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of std::cbrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CbrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class cbrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+cbrt_strided_impl(sycl::queue &exec_q,
+                  size_t nelems,
+                  int nd,
+                  const py::ssize_t *shape_and_strides,
+                  const char *arg_p,
+                  py::ssize_t arg_offset,
+                  char *res_p,
+                  py::ssize_t res_offset,
+                  const std::vector<sycl::event> &depends,
+                  const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, CbrtOutputType, CbrtStridedFunctor, cbrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct CbrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename CbrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = cbrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace cbrt
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
new file mode 100644
index 0000000000..43e06cb281
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp
@@ -0,0 +1,215 @@
+//=== copysign.hpp -   Binary function COPYSIGN         ------  *-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of COPYSIGN(x1, x2)
+/// function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+
+#include "kernels/elementwise_functions/common.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace copysign
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace tu_ns = dpctl::tensor::type_utils;
+
+template <typename argT1, typename argT2, typename resT> struct CopysignFunctor
+{
+
+    using supports_sg_loadstore = std::true_type;
+    using supports_vec = std::true_type;
+
+    resT operator()(const argT1 &in1, const argT2 &in2) const
+    {
+        return sycl::copysign(in1, in2);
+    }
+
+    template <int vec_sz>
+    sycl::vec<resT, vec_sz>
+    operator()(const sycl::vec<argT1, vec_sz> &in1,
+               const sycl::vec<argT2, vec_sz> &in2) const
+    {
+        auto tmp = sycl::copysign(in1, in2);
+        if constexpr (std::is_same_v<resT,
+                                     typename decltype(tmp)::element_type>) {
+            return tmp;
+        }
+        else {
+            using dpctl::tensor::type_utils::vec_cast;
+
+            return vec_cast<resT, typename decltype(tmp)::element_type, vec_sz>(
+                tmp);
+        }
+    }
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using CopysignContigFunctor =
+    elementwise_common::BinaryContigFunctor<argT1,
+                                            argT2,
+                                            resT,
+                                            CopysignFunctor<argT1, argT2, resT>,
+                                            vec_sz,
+                                            n_vecs>;
+
+template <typename argT1, typename argT2, typename resT, typename IndexerT>
+using CopysignStridedFunctor = elementwise_common::BinaryStridedFunctor<
+    argT1,
+    argT2,
+    resT,
+    IndexerT,
+    CopysignFunctor<argT1, argT2, resT>>;
+
+template <typename T1, typename T2> struct CopysignOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::BinaryTypeMapResultEntry<T1,
+                                        sycl::half,
+                                        T2,
+                                        sycl::half,
+                                        sycl::half>,
+        td_ns::BinaryTypeMapResultEntry<T1, float, T2, float, float>,
+        td_ns::BinaryTypeMapResultEntry<T1, double, T2, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename argT1,
+          typename argT2,
+          typename resT,
+          unsigned int vec_sz,
+          unsigned int n_vecs>
+class copysign_contig_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event copysign_contig_impl(sycl::queue &exec_q,
+                                 size_t nelems,
+                                 const char *arg1_p,
+                                 py::ssize_t arg1_offset,
+                                 const char *arg2_p,
+                                 py::ssize_t arg2_offset,
+                                 char *res_p,
+                                 py::ssize_t res_offset,
+                                 const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::binary_contig_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignContigFunctor,
+        copysign_contig_kernel>(exec_q, nelems, arg1_p, arg1_offset, arg2_p,
+                                arg2_offset, res_p, res_offset, depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_contig_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T1, typename T2> struct CopysignTypeMapFactory
+{
+    /*! @brief get typeid for output type of divide(T1 x, T2 y) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename CopysignOutputType<T1, T2>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename resT, typename IndexerT>
+class copysign_strided_kernel;
+
+template <typename argTy1, typename argTy2>
+sycl::event
+copysign_strided_impl(sycl::queue &exec_q,
+                      size_t nelems,
+                      int nd,
+                      const py::ssize_t *shape_and_strides,
+                      const char *arg1_p,
+                      py::ssize_t arg1_offset,
+                      const char *arg2_p,
+                      py::ssize_t arg2_offset,
+                      char *res_p,
+                      py::ssize_t res_offset,
+                      const std::vector<sycl::event> &depends,
+                      const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::binary_strided_impl<
+        argTy1, argTy2, CopysignOutputType, CopysignStridedFunctor,
+        copysign_strided_kernel>(exec_q, nelems, nd, shape_and_strides, arg1_p,
+                                 arg1_offset, arg2_p, arg2_offset, res_p,
+                                 res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T1, typename T2> struct CopysignStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename CopysignOutputType<T1, T2>::value_type,
+                          void>)
+        {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = copysign_strided_impl<T1, T2>;
+            return fn;
+        }
+    }
+};
+
+} // namespace copysign
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
new file mode 100644
index 0000000000..b6b2f32e83
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp
@@ -0,0 +1,231 @@
+//=== exp2.hpp -   Unary function EXP2                     ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of EXP2(x) function.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <sycl/ext/oneapi/experimental/sycl_complex.hpp>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace exp2
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
+
+using dpctl::tensor::type_utils::is_complex;
+
+template <typename argT, typename resT> struct Exp2Functor
+{
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::negation<
+        std::disjunction<is_complex<resT>, is_complex<argT>>>;
+
+    resT operator()(const argT &in) const
+    {
+        if constexpr (is_complex<argT>::value) {
+            using realT = typename argT::value_type;
+
+            const argT tmp = in * std::log(realT(2));
+
+            constexpr realT q_nan = std::numeric_limits<realT>::quiet_NaN();
+
+            const realT x = std::real(tmp);
+            const realT y = std::imag(tmp);
+            if (std::isfinite(x)) {
+                if (std::isfinite(y)) {
+                    return exprm_ns::exp(exprm_ns::complex<realT>(tmp));
+                }
+                else {
+                    return resT{q_nan, q_nan};
+                }
+            }
+            else if (std::isnan(x)) {
+                /* x is nan */
+                if (y == realT(0)) {
+                    return resT{in};
+                }
+                else {
+                    return resT{x, q_nan};
+                }
+            }
+            else {
+                if (!std::signbit(x)) { /* x is +inf */
+                    if (y == realT(0)) {
+                        return resT{x, y};
+                    }
+                    else if (std::isfinite(y)) {
+                        return resT{x * std::cos(y), x * std::sin(y)};
+                    }
+                    else {
+                        /* x = +inf, y = +-inf || nan */
+                        return resT{x, q_nan};
+                    }
+                }
+                else { /* x is -inf */
+                    if (std::isfinite(y)) {
+                        realT exp_x = std::exp(x);
+                        return resT{exp_x * std::cos(y), exp_x * std::sin(y)};
+                    }
+                    else {
+                        /* x = -inf, y = +-inf || nan */
+                        return resT{0, 0};
+                    }
+                }
+            }
+        }
+        else {
+            return sycl::exp2(in);
+        }
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using Exp2ContigFunctor = elementwise_common::
+    UnaryContigFunctor<argTy, resTy, Exp2Functor<argTy, resTy>, vec_sz, n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using Exp2StridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, Exp2Functor<argTy, resTy>>;
+
+template <typename T> struct Exp2OutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float>,
+        td_ns::TypeMapResultEntry<T, double>,
+        td_ns::TypeMapResultEntry<T, std::complex<float>>,
+        td_ns::TypeMapResultEntry<T, std::complex<double>>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class exp2_contig_kernel;
+
+template <typename argTy>
+sycl::event exp2_contig_impl(sycl::queue &exec_q,
+                             size_t nelems,
+                             const char *arg_p,
+                             char *res_p,
+                             const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, Exp2OutputType, Exp2ContigFunctor, exp2_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct Exp2ContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename Exp2OutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct Exp2TypeMapFactory
+{
+    /*! @brief get typeid for output type of std::exp2(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename Exp2OutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class exp2_strided_kernel;
+
+template <typename argTy>
+sycl::event
+exp2_strided_impl(sycl::queue &exec_q,
+                  size_t nelems,
+                  int nd,
+                  const py::ssize_t *shape_and_strides,
+                  const char *arg_p,
+                  py::ssize_t arg_offset,
+                  char *res_p,
+                  py::ssize_t res_offset,
+                  const std::vector<sycl::event> &depends,
+                  const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, Exp2OutputType, Exp2StridedFunctor, exp2_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct Exp2StridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename Exp2OutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = exp2_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace exp2
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
new file mode 100644
index 0000000000..d9e0c33081
--- /dev/null
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp
@@ -0,0 +1,179 @@
+//=== rsqrt.hpp -   Unary function RSQRT                   ------
+//*-C++-*--/===//
+//
+//                      Data Parallel Control (dpctl)
+//
+// Copyright 2020-2023 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===---------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines kernels for elementwise evaluation of RSQRT(x)
+/// function that computes the reciprocal square root.
+//===---------------------------------------------------------------------===//
+
+#pragma once
+#include <cmath>
+#include <complex>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+#include "kernels/elementwise_functions/common.hpp"
+
+#include "utils/offset_utils.hpp"
+#include "utils/type_dispatch.hpp"
+#include "utils/type_utils.hpp"
+#include <pybind11/pybind11.h>
+
+namespace dpctl
+{
+namespace tensor
+{
+namespace kernels
+{
+namespace rsqrt
+{
+
+namespace py = pybind11;
+namespace td_ns = dpctl::tensor::type_dispatch;
+
+template <typename argT, typename resT> struct RsqrtFunctor
+{
+
+    // is function constant for given argT
+    using is_constant = typename std::false_type;
+    // constant value, if constant
+    // constexpr resT constant_value = resT{};
+    // is function defined for sycl::vec
+    using supports_vec = typename std::false_type;
+    // do both argTy and resTy support sugroup store/load operation
+    using supports_sg_loadstore = typename std::true_type;
+
+    resT operator()(const argT &in) const
+    {
+        return sycl::rsqrt(in);
+    }
+};
+
+template <typename argTy,
+          typename resTy = argTy,
+          unsigned int vec_sz = 4,
+          unsigned int n_vecs = 2>
+using RsqrtContigFunctor =
+    elementwise_common::UnaryContigFunctor<argTy,
+                                           resTy,
+                                           RsqrtFunctor<argTy, resTy>,
+                                           vec_sz,
+                                           n_vecs>;
+
+template <typename argTy, typename resTy, typename IndexerT>
+using RsqrtStridedFunctor = elementwise_common::
+    UnaryStridedFunctor<argTy, resTy, IndexerT, RsqrtFunctor<argTy, resTy>>;
+
+template <typename T> struct RsqrtOutputType
+{
+    using value_type = typename std::disjunction< // disjunction is C++17
+                                                  // feature, supported by DPC++
+        td_ns::TypeMapResultEntry<T, sycl::half, sycl::half>,
+        td_ns::TypeMapResultEntry<T, float, float>,
+        td_ns::TypeMapResultEntry<T, double, double>,
+        td_ns::DefaultResultEntry<void>>::result_type;
+};
+
+template <typename T1, typename T2, unsigned int vec_sz, unsigned int n_vecs>
+class rsqrt_contig_kernel;
+
+template <typename argTy>
+sycl::event rsqrt_contig_impl(sycl::queue &exec_q,
+                              size_t nelems,
+                              const char *arg_p,
+                              char *res_p,
+                              const std::vector<sycl::event> &depends = {})
+{
+    return elementwise_common::unary_contig_impl<
+        argTy, RsqrtOutputType, RsqrtContigFunctor, rsqrt_contig_kernel>(
+        exec_q, nelems, arg_p, res_p, depends);
+}
+
+template <typename fnT, typename T> struct RsqrtContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename RsqrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_contig_impl<T>;
+            return fn;
+        }
+    }
+};
+
+template <typename fnT, typename T> struct RsqrtTypeMapFactory
+{
+    /*! @brief get typeid for output type of sycl::rsqrt(T x) */
+    std::enable_if_t<std::is_same<fnT, int>::value, int> get()
+    {
+        using rT = typename RsqrtOutputType<T>::value_type;
+        return td_ns::GetTypeid<rT>{}.get();
+    }
+};
+
+template <typename T1, typename T2, typename T3> class rsqrt_strided_kernel;
+
+template <typename argTy>
+sycl::event
+rsqrt_strided_impl(sycl::queue &exec_q,
+                   size_t nelems,
+                   int nd,
+                   const py::ssize_t *shape_and_strides,
+                   const char *arg_p,
+                   py::ssize_t arg_offset,
+                   char *res_p,
+                   py::ssize_t res_offset,
+                   const std::vector<sycl::event> &depends,
+                   const std::vector<sycl::event> &additional_depends)
+{
+    return elementwise_common::unary_strided_impl<
+        argTy, RsqrtOutputType, RsqrtStridedFunctor, rsqrt_strided_kernel>(
+        exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
+        res_offset, depends, additional_depends);
+}
+
+template <typename fnT, typename T> struct RsqrtStridedFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<typename RsqrtOutputType<T>::value_type,
+                                     void>) {
+            fnT fn = nullptr;
+            return fn;
+        }
+        else {
+            fnT fn = rsqrt_strided_impl<T>;
+            return fn;
+        }
+    }
+};
+
+} // namespace rsqrt
+} // namespace kernels
+} // namespace tensor
+} // namespace dpctl
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
index 0f35aa306e..e1e9e79c57 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp
@@ -48,7 +48,7 @@ namespace sin
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,8 +81,8 @@ template <typename argT, typename resT> struct SinFunctor
              * real and imaginary parts of input are finite.
              */
             if (in_re_finite && in_im_finite) {
-                return cmplx_ns::sin(
-                    cmplx_ns::complex<realT>(in)); // std::sin(in);
+                return exprm_ns::sin(
+                    exprm_ns::complex<realT>(in)); // std::sin(in);
             }
 
             /*
diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
index af887431d7..b11c7402d0 100644
--- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
+++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp
@@ -48,7 +48,7 @@ namespace sinh
 
 namespace py = pybind11;
 namespace td_ns = dpctl::tensor::type_dispatch;
-namespace cmplx_ns = sycl::ext::oneapi::experimental;
+namespace exprm_ns = sycl::ext::oneapi::experimental;
 
 using dpctl::tensor::type_utils::is_complex;
 
@@ -81,7 +81,7 @@ template <typename argT, typename resT> struct SinhFunctor
              * real and imaginary parts of input are finite.
              */
             if (xfinite && yfinite) {
-                return std::sinh(in);
+                return exprm_ns::sinh(exprm_ns::complex<realT>(in));
             }
             /*
              * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
index 31d3d874a0..2f5e74172f 100644
--- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
+++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp
@@ -25,8 +25,8 @@
 #pragma once
 #include <algorithm>
 #include <cstddef>
-#include <type_traits>
 #include <sycl/sycl.hpp>
+#include <type_traits>
 #include <vector>
 
 #include "math_utils.hpp"
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.cpp b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
index de5e6b09d0..9ab7c0807c 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.cpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.cpp
@@ -48,12 +48,15 @@
 #include "kernels/elementwise_functions/bitwise_or.hpp"
 #include "kernels/elementwise_functions/bitwise_right_shift.hpp"
 #include "kernels/elementwise_functions/bitwise_xor.hpp"
+#include "kernels/elementwise_functions/cbrt.hpp"
 #include "kernels/elementwise_functions/ceil.hpp"
 #include "kernels/elementwise_functions/conj.hpp"
+#include "kernels/elementwise_functions/copysign.hpp"
 #include "kernels/elementwise_functions/cos.hpp"
 #include "kernels/elementwise_functions/cosh.hpp"
 #include "kernels/elementwise_functions/equal.hpp"
 #include "kernels/elementwise_functions/exp.hpp"
+#include "kernels/elementwise_functions/exp2.hpp"
 #include "kernels/elementwise_functions/expm1.hpp"
 #include "kernels/elementwise_functions/floor.hpp"
 #include "kernels/elementwise_functions/floor_divide.hpp"
@@ -86,6 +89,7 @@
 #include "kernels/elementwise_functions/real.hpp"
 #include "kernels/elementwise_functions/remainder.hpp"
 #include "kernels/elementwise_functions/round.hpp"
+#include "kernels/elementwise_functions/rsqrt.hpp"
 #include "kernels/elementwise_functions/sign.hpp"
 #include "kernels/elementwise_functions/signbit.hpp"
 #include "kernels/elementwise_functions/sin.hpp"
@@ -2749,7 +2753,6 @@ void populate_trunc_dispatch_vectors(void)
 } // namespace impl
 
 // B24:  ==== HYPOT    (x1, x2)
-
 namespace impl
 {
 namespace hypot_fn_ns = dpctl::tensor::kernels::hypot;
@@ -2788,6 +2791,151 @@ void populate_hypot_dispatch_tables(void)
 
 } // namespace impl
 
+// U37: ==== CBRT        (x)
+namespace impl
+{
+
+namespace cbrt_fn_ns = dpctl::tensor::kernels::cbrt;
+
+static unary_contig_impl_fn_ptr_t cbrt_contig_dispatch_vector[td_ns::num_types];
+static int cbrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    cbrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_cbrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = cbrt_fn_ns;
+
+    using fn_ns::CbrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, CbrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(cbrt_contig_dispatch_vector);
+
+    using fn_ns::CbrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, CbrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(cbrt_strided_dispatch_vector);
+
+    using fn_ns::CbrtTypeMapFactory;
+    DispatchVectorBuilder<int, CbrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(cbrt_output_typeid_vector);
+}
+
+} // namespace impl
+
+// B24:  ==== COPYSIGN    (x1, x2)
+namespace impl
+{
+namespace copysign_fn_ns = dpctl::tensor::kernels::copysign;
+
+static binary_contig_impl_fn_ptr_t
+    copysign_contig_dispatch_table[td_ns::num_types][td_ns::num_types];
+static int copysign_output_id_table[td_ns::num_types][td_ns::num_types];
+
+static binary_strided_impl_fn_ptr_t
+    copysign_strided_dispatch_table[td_ns::num_types][td_ns::num_types];
+
+void populate_copysign_dispatch_tables(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = copysign_fn_ns;
+
+    // which input types are supported, and what is the type of the result
+    using fn_ns::CopysignTypeMapFactory;
+    DispatchTableBuilder<int, CopysignTypeMapFactory, num_types> dtb1;
+    dtb1.populate_dispatch_table(copysign_output_id_table);
+
+    // function pointers for operation on general strided arrays
+    using fn_ns::CopysignStridedFactory;
+    DispatchTableBuilder<binary_strided_impl_fn_ptr_t, CopysignStridedFactory,
+                         num_types>
+        dtb2;
+    dtb2.populate_dispatch_table(copysign_strided_dispatch_table);
+
+    // function pointers for operation on contiguous inputs and output
+    using fn_ns::CopysignContigFactory;
+    DispatchTableBuilder<binary_contig_impl_fn_ptr_t, CopysignContigFactory,
+                         num_types>
+        dtb3;
+    dtb3.populate_dispatch_table(copysign_contig_dispatch_table);
+};
+
+} // namespace impl
+
+// U38: ==== EXP2           (x)
+namespace impl
+{
+
+namespace exp2_fn_ns = dpctl::tensor::kernels::exp2;
+
+static unary_contig_impl_fn_ptr_t exp2_contig_dispatch_vector[td_ns::num_types];
+static int exp2_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    exp2_strided_dispatch_vector[td_ns::num_types];
+
+void populate_exp2_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = exp2_fn_ns;
+
+    using fn_ns::Exp2ContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, Exp2ContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(exp2_contig_dispatch_vector);
+
+    using fn_ns::Exp2StridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, Exp2StridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(exp2_strided_dispatch_vector);
+
+    using fn_ns::Exp2TypeMapFactory;
+    DispatchVectorBuilder<int, Exp2TypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(exp2_output_typeid_vector);
+}
+
+} // namespace impl
+
+// U39: ==== RSQRT        (x)
+namespace impl
+{
+
+namespace rsqrt_fn_ns = dpctl::tensor::kernels::rsqrt;
+
+static unary_contig_impl_fn_ptr_t
+    rsqrt_contig_dispatch_vector[td_ns::num_types];
+static int rsqrt_output_typeid_vector[td_ns::num_types];
+static unary_strided_impl_fn_ptr_t
+    rsqrt_strided_dispatch_vector[td_ns::num_types];
+
+void populate_rsqrt_dispatch_vectors(void)
+{
+    using namespace td_ns;
+    namespace fn_ns = rsqrt_fn_ns;
+
+    using fn_ns::RsqrtContigFactory;
+    DispatchVectorBuilder<unary_contig_impl_fn_ptr_t, RsqrtContigFactory,
+                          num_types>
+        dvb1;
+    dvb1.populate_dispatch_vector(rsqrt_contig_dispatch_vector);
+
+    using fn_ns::RsqrtStridedFactory;
+    DispatchVectorBuilder<unary_strided_impl_fn_ptr_t, RsqrtStridedFactory,
+                          num_types>
+        dvb2;
+    dvb2.populate_dispatch_vector(rsqrt_strided_dispatch_vector);
+
+    using fn_ns::RsqrtTypeMapFactory;
+    DispatchVectorBuilder<int, RsqrtTypeMapFactory, num_types> dvb3;
+    dvb3.populate_dispatch_vector(rsqrt_output_typeid_vector);
+}
+
+} // namespace impl
+
 // ==========================================================================================
 // //
 
@@ -4889,6 +5037,117 @@ void init_elementwise_functions(py::module_ m)
               py::arg("depends") = py::list());
         m.def("_hypot_result_type", hypot_result_type_pyapi, "");
     }
+
+    // U37: ==== CBRT        (x)
+    {
+        impl::populate_cbrt_dispatch_vectors();
+        using impl::cbrt_contig_dispatch_vector;
+        using impl::cbrt_output_typeid_vector;
+        using impl::cbrt_strided_dispatch_vector;
+
+        auto cbrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, cbrt_output_typeid_vector,
+                cbrt_contig_dispatch_vector, cbrt_strided_dispatch_vector);
+        };
+        m.def("_cbrt", cbrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto cbrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, cbrt_output_typeid_vector);
+        };
+        m.def("_cbrt_result_type", cbrt_result_type_pyapi);
+    }
+
+    // B25: ==== COPYSIGN       (x1, x2)
+    {
+        impl::populate_copysign_dispatch_tables();
+        using impl::copysign_contig_dispatch_table;
+        using impl::copysign_output_id_table;
+        using impl::copysign_strided_dispatch_table;
+
+        auto copysign_pyapi = [&](const dpctl::tensor::usm_ndarray &src1,
+                                  const dpctl::tensor::usm_ndarray &src2,
+                                  const dpctl::tensor::usm_ndarray &dst,
+                                  sycl::queue &exec_q,
+                                  const std::vector<sycl::event> &depends =
+                                      {}) {
+            return py_binary_ufunc(
+                src1, src2, dst, exec_q, depends, copysign_output_id_table,
+                // function pointers to handle operation on contiguous arrays
+                // (pointers may be nullptr)
+                copysign_contig_dispatch_table,
+                // function pointers to handle operation on strided arrays (most
+                // general case)
+                copysign_strided_dispatch_table,
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t>{},
+                // function pointers to handle operation of c-contig matrix and
+                // c-contig row with broadcasting (may be nullptr)
+                td_ns::NullPtrTable<
+                    binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t>{});
+        };
+        auto copysign_result_type_pyapi = [&](const py::dtype &dtype1,
+                                              const py::dtype &dtype2) {
+            return py_binary_ufunc_result_type(dtype1, dtype2,
+                                               copysign_output_id_table);
+        };
+        m.def("_copysign", copysign_pyapi, "", py::arg("src1"), py::arg("src2"),
+              py::arg("dst"), py::arg("sycl_queue"),
+              py::arg("depends") = py::list());
+        m.def("_copysign_result_type", copysign_result_type_pyapi, "");
+    }
+
+    // U38: ==== EXP2      (x)
+    {
+        impl::populate_exp2_dispatch_vectors();
+        using impl::exp2_contig_dispatch_vector;
+        using impl::exp2_output_typeid_vector;
+        using impl::exp2_strided_dispatch_vector;
+
+        auto exp2_pyapi = [&](const arrayT &src, const arrayT &dst,
+                              sycl::queue &exec_q,
+                              const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, exp2_output_typeid_vector,
+                exp2_contig_dispatch_vector, exp2_strided_dispatch_vector);
+        };
+        m.def("_exp2", exp2_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto exp2_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype, exp2_output_typeid_vector);
+        };
+        m.def("_exp2_result_type", exp2_result_type_pyapi);
+    }
+
+    // U39: ==== RSQRT        (x)
+    {
+        impl::populate_rsqrt_dispatch_vectors();
+        using impl::rsqrt_contig_dispatch_vector;
+        using impl::rsqrt_output_typeid_vector;
+        using impl::rsqrt_strided_dispatch_vector;
+
+        auto rsqrt_pyapi = [&](const arrayT &src, const arrayT &dst,
+                               sycl::queue &exec_q,
+                               const event_vecT &depends = {}) {
+            return py_unary_ufunc(
+                src, dst, exec_q, depends, rsqrt_output_typeid_vector,
+                rsqrt_contig_dispatch_vector, rsqrt_strided_dispatch_vector);
+        };
+        m.def("_rsqrt", rsqrt_pyapi, "", py::arg("src"), py::arg("dst"),
+              py::arg("sycl_queue"), py::arg("depends") = py::list());
+
+        auto rsqrt_result_type_pyapi = [&](const py::dtype &dtype) {
+            return py_unary_ufunc_result_type(dtype,
+                                              rsqrt_output_typeid_vector);
+        };
+        m.def("_rsqrt_result_type", rsqrt_result_type_pyapi);
+    }
 }
 
 } // namespace py_internal
diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
index 46ef02f064..e9ccd1d52a 100644
--- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
+++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp
@@ -28,9 +28,9 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
-#include <type_traits>
 #include <string>
 #include <sycl/sycl.hpp>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
diff --git a/dpctl/tests/elementwise/test_cbrt.py b/dpctl/tests/elementwise/test_cbrt.py
new file mode 100644
index 0000000000..b06a8d19cf
--- /dev/null
+++ b/dpctl/tests/elementwise/test_cbrt.py
@@ -0,0 +1,79 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_cbrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.cbrt(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.cbrt(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_cbrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    X = dpt.linspace(0, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.cbrt(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.cbrt(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
+def test_cbrt_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.cbrt(X)
+    expected = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    tol = dpt.finfo(dpt.float32).resolution
+
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol, equal_nan=True)
diff --git a/dpctl/tests/elementwise/test_copysign.py b/dpctl/tests/elementwise/test_copysign.py
new file mode 100644
index 0000000000..26a285343c
--- /dev/null
+++ b/dpctl/tests/elementwise/test_copysign.py
@@ -0,0 +1,111 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import ctypes
+
+import numpy as np
+import pytest
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _compare_dtypes, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("op1_dtype", _no_complex_dtypes)
+@pytest.mark.parametrize("op2_dtype", _no_complex_dtypes)
+def test_copysign_dtype_matrix(op1_dtype, op2_dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(op1_dtype, q)
+    skip_if_dtype_not_supported(op2_dtype, q)
+
+    sz = 127
+    ar1 = dpt.ones(sz, dtype=op1_dtype)
+    ar2 = dpt.ones_like(ar1, dtype=op2_dtype)
+
+    r = dpt.copysign(ar1, ar2)
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar1.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+    assert r.sycl_queue == ar1.sycl_queue
+
+    ar3 = dpt.ones(sz, dtype=op1_dtype)
+    ar4 = dpt.ones(2 * sz, dtype=op2_dtype)
+
+    r = dpt.copysign(ar3[::-1], ar4[::2])
+    assert isinstance(r, dpt.usm_ndarray)
+    expected = np.copysign(
+        np.ones(1, dtype=op1_dtype), np.ones(1, dtype=op2_dtype)
+    )
+    assert _compare_dtypes(r.dtype, expected.dtype, sycl_queue=q)
+    assert r.shape == ar3.shape
+    assert (dpt.asnumpy(r) == expected.astype(r.dtype)).all()
+
+
+@pytest.mark.parametrize("arr_dt", _real_fp_dtypes)
+def test_copysign_python_scalar(arr_dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(arr_dt, q)
+
+    X = dpt.ones((10, 10), dtype=arr_dt, sycl_queue=q)
+    py_ones = (
+        bool(1),
+        int(1),
+        float(1),
+        np.float32(1),
+        ctypes.c_int(1),
+    )
+    for sc in py_ones:
+        R = dpt.copysign(X, sc)
+        assert isinstance(R, dpt.usm_ndarray)
+        R = dpt.copysign(sc, X)
+        assert isinstance(R, dpt.usm_ndarray)
+
+
+@pytest.mark.parametrize("dt", _real_fp_dtypes)
+def test_copysign(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    x = dpt.arange(100, dtype=dt, sycl_queue=q)
+    x[1::2] *= -1
+    y = dpt.ones(100, dtype=dt, sycl_queue=q)
+    y[::2] *= -1
+    res = dpt.copysign(x, y)
+    expected = dpt.negative(x)
+    tol = dpt.finfo(dt).resolution
+    assert dpt.allclose(res, expected, atol=tol, rtol=tol)
+
+
+def test_copysign_special_values():
+    get_queue_or_skip()
+
+    x1 = dpt.asarray([1.0, 0.0, dpt.nan, dpt.nan], dtype="f4")
+    y1 = dpt.asarray([-1.0, -0.0, -dpt.nan, -1], dtype="f4")
+    res = dpt.copysign(x1, y1)
+    assert dpt.all(dpt.signbit(res))
+    x2 = dpt.asarray([-1.0, -0.0, -dpt.nan, -dpt.nan], dtype="f4")
+    res = dpt.copysign(x2, y1)
+    assert dpt.all(dpt.signbit(res))
+    y2 = dpt.asarray([0.0, 1.0, dpt.nan, 1.0], dtype="f4")
+    res = dpt.copysign(x2, y2)
+    assert not dpt.any(dpt.signbit(res))
+    res = dpt.copysign(x1, y2)
+    assert not dpt.any(dpt.signbit(res))
diff --git a/dpctl/tests/elementwise/test_exp2.py b/dpctl/tests/elementwise/test_exp2.py
new file mode 100644
index 0000000000..d4bef1efab
--- /dev/null
+++ b/dpctl/tests/elementwise/test_exp2.py
@@ -0,0 +1,168 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import itertools
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _all_dtypes, _map_to_device_dtype, _usm_types
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.asarray(0, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.exp2(np.array(0, dtype=dtype)).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.exp2(X).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp2_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2 * 1027
+
+    X = dpt.linspace(1, 5, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    Xnp = dpt.asnumpy(X)
+
+    Y = dpt.exp2(X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), np.exp2(Xnp), atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("usm_type", _usm_types)
+def test_exp2_usm_type(usm_type):
+    q = get_queue_or_skip()
+
+    arg_dt = np.dtype("f4")
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, usm_type=usm_type, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    Y = dpt.exp2(X)
+    assert Y.usm_type == X.usm_type
+    assert Y.sycl_queue == X.sycl_queue
+    assert Y.flags.c_contiguous
+
+    expected_Y = np.empty(input_shape, dtype=arg_dt)
+    expected_Y[..., 0::2] = np.exp2(np.float32(1 / 4))
+    expected_Y[..., 1::2] = np.exp2(np.float32(1 / 2))
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _all_dtypes)
+def test_exp2_order(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    arg_dt = np.dtype(dtype)
+    input_shape = (10, 10, 10, 10)
+    X = dpt.empty(input_shape, dtype=arg_dt, sycl_queue=q)
+    X[..., 0::2] = 1 / 4
+    X[..., 1::2] = 1 / 2
+
+    for ord in ["C", "F", "A", "K"]:
+        for perms in itertools.permutations(range(4)):
+            U = dpt.permute_dims(X[:, ::-1, ::-1, :], perms)
+            Y = dpt.exp2(U, order=ord)
+            expected_Y = np.exp2(dpt.asnumpy(U))
+            tol = 8 * max(
+                dpt.finfo(Y.dtype).resolution,
+                np.finfo(expected_Y.dtype).resolution,
+            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+
+
+def test_exp2_special_cases():
+    get_queue_or_skip()
+
+    X = dpt.asarray([dpt.nan, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = np.asarray([np.nan, 1.0, 1.0, np.inf, 0.0], dtype="f4")
+
+    tol = dpt.finfo(X.dtype).resolution
+    assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
+
+    # special cases for complex variant
+    num_finite = 1.0
+    vals = [
+        complex(0.0, 0.0),
+        complex(num_finite, dpt.inf),
+        complex(num_finite, dpt.nan),
+        complex(dpt.inf, 0.0),
+        complex(-dpt.inf, num_finite),
+        complex(dpt.inf, num_finite),
+        complex(-dpt.inf, dpt.inf),
+        complex(dpt.inf, dpt.inf),
+        complex(-dpt.inf, dpt.nan),
+        complex(dpt.inf, dpt.nan),
+        complex(dpt.nan, 0.0),
+        complex(dpt.nan, num_finite),
+        complex(dpt.nan, dpt.nan),
+    ]
+    X = dpt.asarray(vals, dtype=dpt.complex64)
+    cis_1 = complex(np.cos(num_finite), np.sin(num_finite))
+    c_nan = complex(np.nan, np.nan)
+    res = np.asarray(
+        [
+            complex(1.0, 0.0),
+            c_nan,
+            c_nan,
+            complex(np.inf, 0.0),
+            0.0,
+            np.inf * cis_1,
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(0.0, 0.0),
+            complex(np.inf, np.nan),
+            complex(np.nan, 0.0),
+            c_nan,
+            c_nan,
+        ],
+        dtype=np.complex64,
+    )
+
+    tol = dpt.finfo(X.dtype).resolution
+    with np.errstate(invalid="ignore"):
+        assert_allclose(dpt.asnumpy(dpt.exp2(X)), res, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_rsqrt.py b/dpctl/tests/elementwise/test_rsqrt.py
new file mode 100644
index 0000000000..ef9378ade2
--- /dev/null
+++ b/dpctl/tests/elementwise/test_rsqrt.py
@@ -0,0 +1,74 @@
+#                       Data Parallel Control (dpctl)
+#
+#  Copyright 2020-2023 Intel Corporation
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import numpy as np
+import pytest
+from numpy.testing import assert_allclose
+
+import dpctl.tensor as dpt
+from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
+
+from .utils import _map_to_device_dtype, _no_complex_dtypes, _real_fp_dtypes
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_rsqrt_out_type(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    x = dpt.asarray(1, dtype=dtype, sycl_queue=q)
+    expected_dtype = np.reciprocal(np.sqrt(np.array(1, dtype=dtype))).dtype
+    expected_dtype = _map_to_device_dtype(expected_dtype, q.sycl_device)
+    assert dpt.rsqrt(x).dtype == expected_dtype
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_contig(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 1027
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+@pytest.mark.parametrize("dtype", _real_fp_dtypes)
+def test_rsqrt_output_strided(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    n_seq = 2054
+
+    x = dpt.linspace(1, 13, num=n_seq, dtype=dtype, sycl_queue=q)[::-2]
+    res = dpt.rsqrt(x)
+    expected = np.reciprocal(np.sqrt(dpt.asnumpy(x), dtype=dtype))
+    tol = 8 * dpt.finfo(res.dtype).resolution
+    assert_allclose(dpt.asnumpy(res), expected, atol=tol, rtol=tol)
+
+
+def test_rsqrt_special_cases():
+    get_queue_or_skip()
+
+    x = dpt.asarray([dpt.nan, -1.0, 0.0, -0.0, dpt.inf, -dpt.inf], dtype="f4")
+    res = dpt.rsqrt(x)
+    expected = dpt.asarray(
+        [dpt.nan, dpt.nan, dpt.inf, -dpt.inf, 0.0, dpt.nan], dtype="f4"
+    )
+    assert dpt.allclose(res, expected, equal_nan=True)
diff --git a/dpctl/tests/test_utils.py b/dpctl/tests/test_utils.py
index 1aab7fd7e7..05b2dc7890 100644
--- a/dpctl/tests/test_utils.py
+++ b/dpctl/tests/test_utils.py
@@ -21,7 +21,6 @@
 
 import dpctl
 import dpctl.utils
-from dpctl.enum_types import backend_type
 
 
 def test_get_execution_queue_input_validation():
@@ -132,9 +131,7 @@ def test_intel_device_info():
         pytest.skip("Default device could not be created")
     descr = dpctl.utils.intel_device_info(d)
     assert isinstance(descr, dict)
-    assert ("device_id" in descr) or (
-        not d.has_aspect_cpu and not d.backend == backend_type.level_zero
-    )
+    assert ("device_id" in descr) or not descr
     allowed_names = [
         "device_id",
         "gpu_slices",
diff --git a/dpctl/utils/CMakeLists.txt b/dpctl/utils/CMakeLists.txt
index aadc1c0fe0..e7d3951e5b 100644
--- a/dpctl/utils/CMakeLists.txt
+++ b/dpctl/utils/CMakeLists.txt
@@ -21,6 +21,19 @@ pybind11_add_module(${python_module_name} MODULE
   ${_module_src}
 )
 add_sycl_to_target(TARGET ${python_module_name} SOURCES ${_module_src})
+if(_dpctl_sycl_targets)
+    # make fat binary
+    target_compile_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        ${python_module_name}
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+endif()
 target_include_directories(${python_module_name}
     PRIVATE
     ${CMAKE_CURRENT_SOURCE_DIR}/../include
diff --git a/libsyclinterface/CMakeLists.txt b/libsyclinterface/CMakeLists.txt
index 64ec3271b1..e84959c1c3 100644
--- a/libsyclinterface/CMakeLists.txt
+++ b/libsyclinterface/CMakeLists.txt
@@ -205,6 +205,19 @@ add_library(DPCTLSyclInterface
     ${helper_sources}
 )
 add_sycl_to_target(TARGET DPCTLSyclInterface SOURCES ${sources} ${helper_sources})
+# make fat binary
+if(_dpctl_sycl_targets)
+    target_compile_options(
+        DPCTLSyclInterface
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+    )
+    target_link_options(
+        DPCTLSyclInterface
+        PRIVATE
+        -fsycl-targets=${_dpctl_sycl_targets}
+   )
+endif()
 
 if(DPCTL_GENERATE_COVERAGE)
     target_link_options(DPCTLSyclInterface
diff --git a/libsyclinterface/tests/CMakeLists.txt b/libsyclinterface/tests/CMakeLists.txt
index 472e1787fa..5a672e312f 100644
--- a/libsyclinterface/tests/CMakeLists.txt
+++ b/libsyclinterface/tests/CMakeLists.txt
@@ -52,6 +52,19 @@ add_sycl_to_target(
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_queue_interface.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/test_sycl_usm_interface.cpp
 )
+if (_dpctl_sycl_targets)
+# make fat binary
+target_compile_options(
+    dpctl_c_api_tests
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+target_link_options(
+    dpctl_c_api_tests
+    PRIVATE
+    -fsycl-targets=nvptx64-nvidia-cuda,spir64-unknown-unknown
+)
+endif()
 
 if(DPCTL_GENERATE_COVERAGE)
     target_include_directories(dpctl_c_api_tests