diff --git a/.github/workflows/generate-coverage.yaml b/.github/workflows/generate-coverage.yaml
index 5975837d55..3de1427654 100644
--- a/.github/workflows/generate-coverage.yaml
+++ b/.github/workflows/generate-coverage.yaml
@@ -79,7 +79,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
+          pip install numpy cython"<3" setuptools pytest pytest-cov scikit-build cmake coverage[toml]
 
       - name: Build dpctl with coverage
         shell: bash -l {0}
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 768d958e02..a72741c67f 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -49,7 +49,7 @@ jobs:
         if: ${{ !github.event.pull_request || github.event.action != 'closed' }}
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
+          pip install numpy cython"<3" setuptools scikit-build cmake sphinx sphinx_rtd_theme pydot graphviz sphinxcontrib-programoutput sphinxcontrib-googleanalytics
       - name: Checkout repo
         uses: actions/checkout@v3
         with:
diff --git a/.github/workflows/os-llvm-sycl-build.yml b/.github/workflows/os-llvm-sycl-build.yml
index 1aae32d4d9..e1a390aad8 100644
--- a/.github/workflows/os-llvm-sycl-build.yml
+++ b/.github/workflows/os-llvm-sycl-build.yml
@@ -108,7 +108,7 @@ jobs:
       - name: Install dpctl dependencies
         shell: bash -l {0}
         run: |
-          pip install numpy cython setuptools pytest scikit-build cmake
+          pip install numpy cython"<3" setuptools pytest scikit-build cmake
 
       - name: Checkout repo
         uses: actions/checkout@v3
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 56958d3355..aad850b060 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -20,7 +20,7 @@ requirements:
         - cmake  >=3.21
         - ninja
         - git
-        - cython
+        - cython  <3
         - python
         - scikit-build
         - numpy
diff --git a/dpctl/tensor/_elementwise_common.py b/dpctl/tensor/_elementwise_common.py
index 9c61f5e97c..55c95f5360 100644
--- a/dpctl/tensor/_elementwise_common.py
+++ b/dpctl/tensor/_elementwise_common.py
@@ -52,6 +52,15 @@ def __call__(self, x, out=None, order="K"):
         if not isinstance(x, dpt.usm_ndarray):
             raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
 
+        if order not in ["C", "F", "K", "A"]:
+            order = "K"
+        buf_dt, res_dt = _find_buf_dtype(
+            x.dtype, self.result_type_resolver_fn_, x.sycl_device
+        )
+        if res_dt is None:
+            raise RuntimeError
+
+        orig_out = out
         if out is not None:
             if not isinstance(out, dpt.usm_ndarray):
                 raise TypeError(
@@ -64,8 +73,21 @@ def __call__(self, x, out=None, order="K"):
                     f"Expected output shape is {x.shape}, got {out.shape}"
                 )
 
-            if ti._array_overlap(x, out):
-                raise TypeError("Input and output arrays have memory overlap")
+            if res_dt != out.dtype:
+                raise TypeError(
+                    f"Output array of type {res_dt} is needed,"
+                    f" got {out.dtype}"
+                )
+
+            if (
+                buf_dt is None
+                and ti._array_overlap(x, out)
+                and not ti._same_logical_tensors(x, out)
+            ):
+                # Allocate a temporary buffer to avoid memory overlapping.
+                # Note if `buf_dt` is not None, a temporary copy of `x` will be
+                # created, so the array overlap check isn't needed.
+                out = dpt.empty_like(out)
 
             if (
                 dpctl.utils.get_execution_queue((x.sycl_queue, out.sycl_queue))
@@ -75,13 +97,6 @@ def __call__(self, x, out=None, order="K"):
                     "Input and output allocation queues are not compatible"
                 )
 
-        if order not in ["C", "F", "K", "A"]:
-            order = "K"
-        buf_dt, res_dt = _find_buf_dtype(
-            x.dtype, self.result_type_resolver_fn_, x.sycl_device
-        )
-        if res_dt is None:
-            raise RuntimeError
         exec_q = x.sycl_queue
         if buf_dt is None:
             if out is None:
@@ -91,17 +106,20 @@ def __call__(self, x, out=None, order="K"):
                     if order == "A":
                         order = "F" if x.flags.f_contiguous else "C"
                     out = dpt.empty_like(x, dtype=res_dt, order=order)
-            else:
-                if res_dt != out.dtype:
-                    raise TypeError(
-                        f"Output array of type {res_dt} is needed,"
-                        f" got {out.dtype}"
-                    )
 
-            ht, _ = self.unary_fn_(x, out, sycl_queue=exec_q)
-            ht.wait()
+            ht_unary_ev, unary_ev = self.unary_fn_(x, out, sycl_queue=exec_q)
+
+            if not (orig_out is None or orig_out is out):
+                # Copy the out data from temporary buffer to original memory
+                ht_copy_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
+                    src=out, dst=orig_out, sycl_queue=exec_q, depends=[unary_ev]
+                )
+                ht_copy_ev.wait()
+                out = orig_out
 
+            ht_unary_ev.wait()
             return out
+
         if order == "K":
             buf = _empty_like_orderK(x, buf_dt)
         else:
@@ -117,11 +135,6 @@ def __call__(self, x, out=None, order="K"):
                 out = _empty_like_orderK(buf, res_dt)
             else:
                 out = dpt.empty_like(buf, dtype=res_dt, order=order)
-        else:
-            if buf_dt != out.dtype:
-                raise TypeError(
-                    f"Output array of type {buf_dt} is needed, got {out.dtype}"
-                )
 
         ht, _ = self.unary_fn_(buf, out, sycl_queue=exec_q, depends=[copy_ev])
         ht_copy_ev.wait()
diff --git a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
index e4be509a22..331ef6c5eb 100644
--- a/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
+++ b/dpctl/tensor/libtensor/include/utils/memory_overlap.hpp
@@ -100,6 +100,53 @@ struct MemoryOverlap
     }
 };
 
+struct SameLogicalTensors
+{
+    bool operator()(dpctl::tensor::usm_ndarray ar1,
+                    dpctl::tensor::usm_ndarray ar2) const
+    {
+        // Same ndim
+        int nd1 = ar1.get_ndim();
+        if (nd1 != ar2.get_ndim())
+            return false;
+
+        // Same dtype
+        int tn1 = ar1.get_typenum();
+        if (tn1 != ar2.get_typenum())
+            return false;
+
+        // Same pointer
+        const char *ar1_data = ar1.get_data();
+        const char *ar2_data = ar2.get_data();
+
+        if (ar1_data != ar2_data)
+            return false;
+
+        // Same shape
+        const py::ssize_t *ar1_shape = ar1.get_shape_raw();
+        const py::ssize_t *ar2_shape = ar2.get_shape_raw();
+
+        if (!std::equal(ar1_shape, ar1_shape + nd1, ar2_shape))
+            return false;
+
+        // Same strides
+        auto const &ar1_strides = ar1.get_strides_vector();
+        auto const &ar2_strides = ar2.get_strides_vector();
+
+        auto ar1_beg_it = std::begin(ar1_strides);
+        auto ar1_end_it = std::end(ar1_strides);
+
+        auto ar2_beg_it = std::begin(ar2_strides);
+
+        if (!std::equal(ar1_beg_it, ar1_end_it, ar2_beg_it))
+            return false;
+
+        // all checks passed: arrays are logical views
+        // into the same memory
+        return true;
+    }
+};
+
 } // namespace overlap
 } // namespace tensor
 } // namespace dpctl
diff --git a/dpctl/tensor/libtensor/source/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
index 27ee9c9fcb..453992220a 100644
--- a/dpctl/tensor/libtensor/source/elementwise_functions.hpp
+++ b/dpctl/tensor/libtensor/source/elementwise_functions.hpp
@@ -128,7 +128,9 @@ py_unary_ufunc(dpctl::tensor::usm_ndarray src,
 
     // check memory overlap
     auto const &overlap = dpctl::tensor::overlap::MemoryOverlap();
-    if (overlap(src, dst)) {
+    auto const &same_logical_tensors =
+        dpctl::tensor::overlap::SameLogicalTensors();
+    if (overlap(src, dst) && !same_logical_tensors(src, dst)) {
         throw py::value_error("Arrays index overlapping segments of memory");
     }
 
diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp
index 4b36dea534..1833c2d770 100644
--- a/dpctl/tensor/libtensor/source/tensor_py.cpp
+++ b/dpctl/tensor/libtensor/source/tensor_py.cpp
@@ -60,6 +60,7 @@ using dpctl::tensor::c_contiguous_strides;
 using dpctl::tensor::f_contiguous_strides;
 
 using dpctl::tensor::overlap::MemoryOverlap;
+using dpctl::tensor::overlap::SameLogicalTensors;
 
 using dpctl::tensor::py_internal::copy_usm_ndarray_into_usm_ndarray;
 
@@ -338,6 +339,15 @@ PYBIND11_MODULE(_tensor_impl, m)
           "Determines if the memory regions indexed by each array overlap",
           py::arg("array1"), py::arg("array2"));
 
+    auto same_logical_tensors = [](dpctl::tensor::usm_ndarray x1,
+                                   dpctl::tensor::usm_ndarray x2) -> bool {
+        auto const &same_logical_tensors = SameLogicalTensors();
+        return same_logical_tensors(x1, x2);
+    };
+    m.def("_same_logical_tensors", same_logical_tensors,
+          "Determines if the memory regions indexed by each array are the same",
+          py::arg("array1"), py::arg("array2"));
+
     m.def("_place", &py_place, "", py::arg("dst"), py::arg("cumsum"),
           py::arg("axis_start"), py::arg("axis_end"), py::arg("rhs"),
           py::arg("sycl_queue"), py::arg("depends") = py::list());
diff --git a/dpctl/tests/_numpy_warnings.py b/dpctl/tests/_numpy_warnings.py
new file mode 100644
index 0000000000..1e723c3001
--- /dev/null
+++ b/dpctl/tests/_numpy_warnings.py
@@ -0,0 +1,28 @@
+#                      Data Parallel Control (dpctl)
+#
+# Copyright 2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import pytest
+
+
+@pytest.fixture
+def suppress_invalid_numpy_warnings():
+    # invalid: treatment for invalid floating-point operation
+    # (result is not an expressible number, typically indicates
+    # that a NaN was produced)
+    old_settings = numpy.seterr(invalid="ignore")
+    yield
+    numpy.seterr(**old_settings)  # restore the previous error settings
diff --git a/dpctl/tests/conftest.py b/dpctl/tests/conftest.py
index 7fc63a5a24..600953bcf7 100644
--- a/dpctl/tests/conftest.py
+++ b/dpctl/tests/conftest.py
@@ -26,8 +26,15 @@
     invalid_filter,
     valid_filter,
 )
+from _numpy_warnings import suppress_invalid_numpy_warnings
 
 sys.path.append(os.path.join(os.path.dirname(__file__), "helper"))
 
 # common fixtures
-__all__ = ["check", "device_selector", "invalid_filter", "valid_filter"]
+__all__ = [
+    "check",
+    "device_selector",
+    "invalid_filter",
+    "suppress_invalid_numpy_warnings",
+    "valid_filter",
+]
diff --git a/dpctl/tests/elementwise/test_abs.py b/dpctl/tests/elementwise/test_abs.py
index ee7fa0cb6c..ab0d34d54d 100644
--- a/dpctl/tests/elementwise/test_abs.py
+++ b/dpctl/tests/elementwise/test_abs.py
@@ -22,7 +22,7 @@
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
 
-from .utils import _all_dtypes, _usm_types
+from .utils import _all_dtypes, _no_complex_dtypes, _usm_types
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -113,3 +113,25 @@ def test_abs_complex(dtype):
             np.testing.assert_allclose(
                 dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
             )
+
+
+@pytest.mark.parametrize("dtype", _no_complex_dtypes)
+def test_abs_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.abs(Xnp, out=Xnp)
+
+    Y = dpt.abs(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    Ynp = np.abs(Xnp, out=Xnp[::-1])
+    Y = dpt.abs(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/dpctl/tests/elementwise/test_exp.py b/dpctl/tests/elementwise/test_exp.py
index 5ea8ded018..85f21694c5 100644
--- a/dpctl/tests/elementwise/test_exp.py
+++ b/dpctl/tests/elementwise/test_exp.py
@@ -145,3 +145,26 @@ def test_exp_strided(dtype):
                 atol=tol,
                 rtol=tol,
             )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_exp_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 1, 15, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.exp(Xnp, out=Xnp)
+
+    Y = dpt.exp(X, out=X)
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert Y is X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.exp(Xnp, out=Xnp[::-1])
+    Y = dpt.exp(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_log.py b/dpctl/tests/elementwise/test_log.py
index ed56fb6468..b0cc337826 100644
--- a/dpctl/tests/elementwise/test_log.py
+++ b/dpctl/tests/elementwise/test_log.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_allclose, assert_equal
 
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
@@ -50,7 +50,7 @@ def test_log_output_contig(dtype):
     Y = dpt.log(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
@@ -66,7 +66,7 @@ def test_log_output_strided(dtype):
     Y = dpt.log(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.log(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("usm_type", _usm_types)
@@ -89,7 +89,7 @@ def test_log_usm_type(usm_type):
     expected_Y[..., 1::2] = np.log(np.float32(10 * dpt.e))
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -112,9 +112,7 @@ def test_log_order(dtype):
                 dpt.finfo(Y.dtype).resolution,
                 np.finfo(expected_Y.dtype).resolution,
             )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 def test_log_special_cases():
@@ -126,3 +124,27 @@ def test_log_special_cases():
     Xnp = dpt.asnumpy(X)
 
     assert_equal(dpt.asnumpy(dpt.log(X)), np.log(Xnp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_log_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(5, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.log(Xnp, out=Xnp)
+
+    Y = dpt.log(X, out=X)
+    assert Y is X
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.log(Xnp, out=Xnp[::-1])
+    Y = dpt.log(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_sincos.py b/dpctl/tests/elementwise/test_sincos.py
index d027ef026a..d4ca463394 100644
--- a/dpctl/tests/elementwise/test_sincos.py
+++ b/dpctl/tests/elementwise/test_sincos.py
@@ -161,12 +161,6 @@ def test_sincos_errors(callable):
         y,
     )
 
-    x = dpt.zeros(2)
-    y = x
-    assert_raises_regex(
-        TypeError, "Input and output arrays have memory overlap", callable, x, y
-    )
-
     x = dpt.zeros(2, dtype="float32")
     y = np.empty_like(x)
     assert_raises_regex(
@@ -230,3 +224,28 @@ def test_sincos_strided(dtype):
                 atol=tol,
                 rtol=tol,
             )
+
+
+@pytest.mark.parametrize(
+    "np_call, dpt_call", [(np.sin, dpt.sin), (np.cos, dpt.cos)]
+)
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sincos_out_overlap(np_call, dpt_call, dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(-np.pi / 2, np.pi / 2, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np_call(Xnp, out=Xnp)
+
+    Y = dpt_call(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    Ynp = np_call(Xnp, out=Xnp[::-1])
+    Y = dpt_call(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/dpctl/tests/elementwise/test_sqrt.py b/dpctl/tests/elementwise/test_sqrt.py
index ce168a5ccb..a15f5262a7 100644
--- a/dpctl/tests/elementwise/test_sqrt.py
+++ b/dpctl/tests/elementwise/test_sqrt.py
@@ -18,7 +18,7 @@
 
 import numpy as np
 import pytest
-from numpy.testing import assert_equal
+from numpy.testing import assert_allclose, assert_equal
 
 import dpctl.tensor as dpt
 from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported
@@ -50,7 +50,7 @@ def test_sqrt_output_contig(dtype):
     Y = dpt.sqrt(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
@@ -66,7 +66,7 @@ def test_sqrt_output_strided(dtype):
     Y = dpt.sqrt(X)
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), np.sqrt(Xnp), atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("usm_type", _usm_types)
@@ -89,7 +89,7 @@ def test_sqrt_usm_type(usm_type):
     expected_Y[..., 1::2] = np.sqrt(np.float32(23.0))
     tol = 8 * dpt.finfo(Y.dtype).resolution
 
-    np.testing.assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
 @pytest.mark.parametrize("dtype", _all_dtypes)
@@ -112,11 +112,10 @@ def test_sqrt_order(dtype):
                 dpt.finfo(Y.dtype).resolution,
                 np.finfo(expected_Y.dtype).resolution,
             )
-            np.testing.assert_allclose(
-                dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol
-            )
+            assert_allclose(dpt.asnumpy(Y), expected_Y, atol=tol, rtol=tol)
 
 
+@pytest.mark.usefixtures("suppress_invalid_numpy_warnings")
 def test_sqrt_special_cases():
     q = get_queue_or_skip()
 
@@ -126,3 +125,27 @@ def test_sqrt_special_cases():
     Xnp = dpt.asnumpy(X)
 
     assert_equal(dpt.asnumpy(dpt.sqrt(X)), np.sqrt(Xnp))
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_sqrt_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.sqrt(Xnp, out=Xnp)
+
+    Y = dpt.sqrt(X, out=X)
+    assert Y is X
+
+    tol = 8 * dpt.finfo(Y.dtype).resolution
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+
+    Ynp = np.sqrt(Xnp, out=Xnp[::-1])
+    Y = dpt.sqrt(X, out=X[::-1])
+    assert Y is not X
+    assert_allclose(dpt.asnumpy(X), Xnp, atol=tol, rtol=tol)
+    assert_allclose(dpt.asnumpy(Y), Ynp, atol=tol, rtol=tol)
diff --git a/dpctl/tests/elementwise/test_square.py b/dpctl/tests/elementwise/test_square.py
index 95ec163e2f..3af0528944 100644
--- a/dpctl/tests/elementwise/test_square.py
+++ b/dpctl/tests/elementwise/test_square.py
@@ -97,3 +97,29 @@ def test_square_special_cases(dtype):
             rtol=tol,
             equal_nan=True,
         )
+
+
+@pytest.mark.parametrize("dtype", ["f2", "f4", "f8", "c8", "c16"])
+def test_square_out_overlap(dtype):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dtype, q)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+
+    Xnp = dpt.asnumpy(X)
+    Ynp = np.square(Xnp, out=Xnp)
+
+    Y = dpt.square(X, out=X)
+    assert Y is X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+
+    X = dpt.linspace(0, 35, 60, dtype=dtype, sycl_queue=q)
+    X = dpt.reshape(X, (3, 5, 4))
+    Xnp = dpt.asnumpy(X)
+
+    Ynp = np.square(Xnp, out=Xnp[::-1])
+    Y = dpt.square(X, out=X[::-1])
+    assert Y is not X
+    assert np.allclose(dpt.asnumpy(X), Xnp)
+    assert np.allclose(dpt.asnumpy(Y), Ynp)
diff --git a/setup.py b/setup.py
index 6eda8f29f0..2ec9dbbde9 100644
--- a/setup.py
+++ b/setup.py
@@ -149,20 +149,20 @@ def _get_cmdclass():
     package_data={"dpctl": ["tests/*.*", "tests/helper/*.py"]},
     include_package_data=True,
     zip_safe=False,
-    setup_requires=["Cython"],
+    setup_requires=["Cython<3"],
     install_requires=[
         "numpy",
     ],
     extras_require={
         "docs": [
-            "Cython",
+            "Cython<3",
             "sphinx",
             "sphinx_rtd_theme",
             "pydot",
             "graphviz",
             "sphinxcontrib-programoutput",
         ],
-        "coverage": ["Cython", "pytest", "pytest-cov", "coverage", "tomli"],
+        "coverage": ["Cython<3", "pytest", "pytest-cov", "coverage", "tomli"],
     },
     keywords="dpctl",
     classifiers=[