From c2601fd0e48b0d2e9111797f8b2decbc68864c2b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:12:19 +0100 Subject: [PATCH 01/39] Impl dpnp.linalg.qr for 2d array --- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 + dpnp/backend/extensions/lapack/geqrf.cpp | 244 +++++++++++++++++ dpnp/backend/extensions/lapack/geqrf.hpp | 51 ++++ dpnp/backend/extensions/lapack/lapack_py.cpp | 17 ++ dpnp/backend/extensions/lapack/orgqr.cpp | 245 ++++++++++++++++++ dpnp/backend/extensions/lapack/orgqr.hpp | 54 ++++ .../extensions/lapack/types_matrix.hpp | 46 ++++ dpnp/linalg/dpnp_iface_linalg.py | 51 ++-- dpnp/linalg/dpnp_utils_linalg.py | 133 ++++++++++ 9 files changed, 828 insertions(+), 15 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/geqrf.cpp create mode 100644 dpnp/backend/extensions/lapack/geqrf.hpp create mode 100644 dpnp/backend/extensions/lapack/orgqr.cpp create mode 100644 dpnp/backend/extensions/lapack/orgqr.hpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 0cfdc1a1677..b1d2ecb87df 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -27,10 +27,12 @@ set(python_module_name _lapack_impl) set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp new file mode 100644 index 00000000000..c32de58fccf --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -0,0 +1,244 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "geqrf.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*geqrf_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static geqrf_impl_fn_ptr_t geqrf_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event geqrf_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + oneapi::mkl::lapack::geqrf_scratchpad_size(exec_q, m, n, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event geqrf_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + geqrf_event = oneapi::mkl::lapack::geqrf( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during geqrf() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during geqrf() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(geqrf_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return geqrf_event; +} + +std::pair + geqrf(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + geqrf_impl_fn_ptr_t geqrf_fn = geqrf_dispatch_vector[a_array_type_id]; + if (geqrf_fn == nullptr) { + throw py::value_error( + "No geqrf implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const py::ssize_t *a_array_shape = a_array.get_shape_raw(); + + // The input array is transponded + // Change the order of getting m, n + const std::int64_t m = a_array_shape[1]; + const std::int64_t n = a_array_shape[0]; + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event geqrf_ev = geqrf_fn(q, m, n, a_array_data, lda, tau_array_data, + host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, geqrf_ev); +} + +template +struct GeqrfContigFactory +{ + fnT get() + { + if constexpr (types::GeqrfTypePairSupportFactory::is_defined) { + return geqrf_impl; + } + else { + return nullptr; + } + } +}; + +void init_geqrf_dispatch_vector(void) +{ + 
dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(geqrf_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp new file mode 100644 index 00000000000..172c7fc76fa --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -0,0 +1,51 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + geqrf(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +extern void init_geqrf_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index f97a0dc4a43..6e487ccefb3 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -30,10 +30,12 @@ #include #include +#include "geqrf.hpp" #include "gesv.hpp" #include "getrf.hpp" #include "heevd.hpp" #include "linalg_exceptions.hpp" +#include "orgqr.hpp" #include "potrf.hpp" #include "syevd.hpp" @@ -43,9 +45,11 @@ namespace py = pybind11; // populate dispatch vectors void init_dispatch_vectors(void) { + lapack_ext::init_geqrf_dispatch_vector(); lapack_ext::init_gesv_dispatch_vector(); lapack_ext::init_getrf_batch_dispatch_vector(); lapack_ext::init_getrf_dispatch_vector(); + lapack_ext::init_orgqr_dispatch_vector(); lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); @@ -67,6 +71,12 @@ PYBIND11_MODULE(_lapack_impl, m) init_dispatch_vectors(); init_dispatch_tables(); + m.def("_geqrf", &lapack_ext::geqrf, + "Call `geqrf` from OneMKL LAPACK library to return " + "the QR factorization of a general m x n matrix ", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); + m.def("_gesv", &lapack_ext::gesv, "Call `gesv` from OneMKL LAPACK library to return " "the solution of a system of linear equations with " @@ -95,6 +105,13 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_orgqr", &lapack_ext::orgqr, + "Call `orgqr` from OneMKL LAPACK library to return " + "the real orthogonal matrix Q of the QR factorization", + py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"), + py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); + m.def("_potrf", &lapack_ext::potrf, "Call `potrf` from OneMKL LAPACK library to return " "the Cholesky factorization of a symmetric positive-definite matrix", diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp new file mode 100644 index 00000000000..25f2c29f3f5 --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "orgqr.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*orgqr_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static orgqr_impl_fn_ptr_t orgqr_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event orgqr_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + oneapi::mkl::lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event orgqr_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + orgqr_event = oneapi::mkl::lapack::orgqr( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + k, // The number of elementary reflectors + // whose product defines the matrix Q; (0 ≤ k ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during orgqr() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during orfqr() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(orgqr_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return orgqr_event; +} + +std::pair + orgqr(sycl::queue q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + orgqr_impl_fn_ptr_t orgqr_fn = orgqr_dispatch_vector[a_array_type_id]; + if (orgqr_fn == nullptr) { + throw py::value_error( + "No orgqr implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + const std::int64_t lda = std::max(1UL, m); + + char *tau_array_data = tau_array.get_data(); + + std::vector host_task_events; + sycl::event orgqr_ev = orgqr_fn(q, m, n, k, a_array_data, lda, + tau_array_data, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, orgqr_ev); +} + +template +struct OrgqrContigFactory +{ + fnT get() + { + if constexpr (types::OrgqrTypePairSupportFactory::is_defined) { + return orgqr_impl; + } + else { + return nullptr; + } + } +}; + +void init_orgqr_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(orgqr_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // 
namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp new file mode 100644 index 00000000000..cd0938dd72a --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + orgqr(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +extern void init_orgqr_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 0277e630738..3f7be9cf4a9 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -43,6 +43,33 @@ namespace lapack { namespace types { +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf + * function. + * + * @tparam T Type of array containing the input matrix to be QR factorized. + * Upon execution, this matrix is transformed to output arrays representing + * the orthogonal matrix Q and the upper triangular matrix R. 
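+ * Only single- and double-precision real and complex floating-point types
+ * are listed; dpnp_qr casts the input to one of these types before dispatch.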
+ */ +template +struct GeqrfTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::gesv @@ -142,6 +169,25 @@ struct HeevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr + * function. + * + * @tparam T Type of array containing the matrix A from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, the array is overwritten with the orthonormal matrix Q. + */ +template +struct OrgqrTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::potrf diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 2f7420d1329..62ce3a8ab9b 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -50,6 +50,7 @@ dpnp_cholesky, dpnp_det, dpnp_eigh, + dpnp_qr, dpnp_slogdet, dpnp_solve, ) @@ -506,7 +507,7 @@ def norm(x1, ord=None, axis=None, keepdims=False): return call_origin(numpy.linalg.norm, x1, ord, axis, keepdims) -def qr(x1, mode="reduced"): +def qr(a, mode="reduced"): """ Compute the qr factorization of a matrix. @@ -515,25 +516,45 @@ def qr(x1, mode="reduced"): For full documentation refer to :obj:`numpy.linalg.qr`. - Limitations - ----------- - Input array is supported as :obj:`dpnp.ndarray`. - Parameter mode='reduced' is supported. + Parameters + ---------- + a : {dpnp.ndarray, usm_ndarray} + The input array with the dimensionality of at least 2. + mode : {"reduced", "complete", "r", "raw"}, optional + If K = min(M, N), then + - "reduced" : returns Q, R with dimensions (…, M, K), (…, K, N) + - "complete" : returns Q, R with dimensions (…, M, M), (…, M, N) + - "r" : returns R only with dimensions (…, K, N) + - "raw" : returns h, tau with dimensions (…, N, M), (…, K,) + Default: "reduced". + + Returns + ------- + out : {dpnp.ndarray, tuple of dpnp.ndarray} + Although the type of returned object depends on the mode, + it returns a tuple of ``(Q, R)`` by default. + For details, please see the document of :func:`numpy.linalg.qr`. 
+ + Examples + -------- + >>> import dpnp as np + >>> a = np.random.randn(9, 6) + >>> Q, R = np.linalg.qr(a) + >>> np.allclose(a, np.dot(Q, R)) + array([ True]) + >>> R2 = np.linalg.qr(a, mode='r') + >>> np.allclose(R, R2) + array([ True]) """ - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False) - if x1_desc: - if x1_desc.ndim != 2: - pass - elif mode != "reduced": - pass - else: - result_tup = dpnp_qr(x1_desc, mode) + dpnp.check_supported_arrays_type(a) + check_stacked_2d(a) - return result_tup + if mode not in ("reduced", "complete", "r", "raw"): + raise ValueError(f"Unrecognized mode {mode}") - return call_origin(numpy.linalg.qr, x1, mode) + return dpnp_qr(a, mode) def solve(a, b): diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 40159ac02bc..71f27bef7c2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -740,6 +740,139 @@ def dpnp_eigh(a, UPLO): return w, out_v +def dpnp_qr(a, mode="reduced"): + """ + dpnp_qr(a, mode="reduced") + + Return the qr factorization of `a` matrix. + + """ + + if a.ndim > 2: + pass + + a_usm_arr = dpnp.get_usm_ndarray(a) + a_sycl_queue = a.sycl_queue + a_usm_type = a.usm_type + + res_type = _common_type(a) + + m, n = a.shape + k = min(m, n) + if k == 0: + if mode == "reduced": + return dpnp.empty( + (m, 0), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), dpnp.empty( + (0, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + elif mode == "complete": + return dpnp.identity( + m, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + ), dpnp.empty( + (m, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + elif mode == "r": + return dpnp.empty( + (0, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + else: # mode == "raw" + return dpnp.empty( + (n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), dpnp.empty( + (0,), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + + a = a.T + a_usm_arr = dpnp.get_usm_ndarray(a) + a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + + # use DPCTL tensor function to fill the matrix array + # with content from the input array `a` + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue + ) + + tau_h = dpnp.empty( + k, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + ) + + # Call the LAPACK extension function _geqrf to compute the QR factorization + # of a general m x n matrix. + ht_geqrf_ev, geqrf_ev = li._geqrf( + a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] + ) + + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() + + if mode == "r": + r = a_t[:, :k].transpose() + return dpnp.triu(r).astype(res_type, copy=False) + + if mode == "raw": + return ( + a_t.astype(res_type, copy=False), + tau_h.astype(res_type, copy=False), + ) + + # mc is the total number of columns in the q matrix. + # In `complete` mode, mc equals the number of rows. + # In `reduced` mode, mc is the lesser of the row count or column count. 
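+    # For example, a (5, 3) input gives mc = 5 in "complete" mode
+    # (Q has shape (5, 5)) and mc = 3 in "reduced" mode (Q has shape (5, 3)).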
+ if mode == "complete" and m > n: + mc = m + q = dpnp.empty( + (m, m), + dtype=res_type, + order="C", + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + else: + mc = k + q = dpnp.empty( + (n, m), + dtype=res_type, + order="C", + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + q[:n] = a_t + + # Call the LAPACK extension function _orgqr to generate the real orthogonal matrix + # `Q` of the QR factorization + ht_orgqr_ev, _ = li._orgqr( + a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] + ) + + ht_orgqr_ev.wait() + + q = q[:mc].transpose() + r = a_t[:, :mc].transpose() + return ( + q.astype(dtype=res_type, copy=False), + dpnp.triu(r).astype(dtype=res_type, copy=False), + ) + + def dpnp_solve(a, b): """ dpnp_solve(a, b) From 8d644aa3c29d315f962123eef8377043dc3413cc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:19:39 +0100 Subject: [PATCH 02/39] Update condition and random files in cupy/testing --- tests/third_party/cupy/testing/__init__.py | 4 +--- tests/third_party/cupy/testing/condition.py | 2 +- tests/third_party/cupy/testing/random.py | 11 +++++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/third_party/cupy/testing/__init__.py b/tests/third_party/cupy/testing/__init__.py index 701c381e2f3..aa6c113706b 100644 --- a/tests/third_party/cupy/testing/__init__.py +++ b/tests/third_party/cupy/testing/__init__.py @@ -60,6 +60,4 @@ product, product_dict, ) -from tests.third_party.cupy.testing.random import fix_random - -# from tests.third_party.cupy.testing.random import generate_seed +from tests.third_party.cupy.testing.random import fix_random, generate_seed diff --git a/tests/third_party/cupy/testing/condition.py b/tests/third_party/cupy/testing/condition.py index 4465dc3d0ee..3533ef8b84d 100644 --- a/tests/third_party/cupy/testing/condition.py +++ b/tests/third_party/cupy/testing/condition.py @@ -106,7 +106,7 @@ def repeat(times, intensive_times=None): if intensive_times is None: return repeat_with_success_at_least(times, times) - casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "0"))) + casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "1"))) times_ = times if casual_test else intensive_times return repeat_with_success_at_least(times_, times_) diff --git a/tests/third_party/cupy/testing/random.py b/tests/third_party/cupy/testing/random.py index 444f2b3352c..a4dc897bb9c 100644 --- a/tests/third_party/cupy/testing/random.py +++ b/tests/third_party/cupy/testing/random.py @@ -20,12 +20,15 @@ def do_setup(deterministic=True): global _old_cupy_random_states _old_python_random_state = random.getstate() _old_numpy_random_state = numpy.random.get_state() - _old_cupy_random_states = cupy.random.generator._random_states - cupy.random.reset_states() + _old_cupy_random_states = cupy.random.dpnp_iface_random._dpnp_random_states + cupy.random.dpnp_iface_random._dpnp_random_states = {} # Check that _random_state has been recreated in # cupy.random.reset_states(). Otherwise the contents of # _old_cupy_random_states would be overwritten. 
- assert cupy.random.generator._random_states is not _old_cupy_random_states + assert ( + cupy.random.dpnp_iface_random._dpnp_random_states + is not _old_cupy_random_states + ) if not deterministic: random.seed() @@ -43,7 +46,7 @@ def do_teardown(): global _old_cupy_random_states random.setstate(_old_python_random_state) numpy.random.set_state(_old_numpy_random_state) - cupy.random.generator._random_states = _old_cupy_random_states + cupy.random.dpnp_iface_random._dpnp_random_states = _old_cupy_random_states _old_python_random_state = None _old_numpy_random_state = None _old_cupy_random_states = None From 3a6cb9f484b13801082e7541c01b5e63419e4428 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:23:22 +0100 Subject: [PATCH 03/39] Rename cupy/testing/condition.py to .../_condition.py --- .../cupy/random_tests/test_sample.py | 24 +++++++++---------- .../testing/{condition.py => _condition.py} | 0 2 files changed, 12 insertions(+), 12 deletions(-) rename tests/third_party/cupy/testing/{condition.py => _condition.py} (100%) diff --git a/tests/third_party/cupy/random_tests/test_sample.py b/tests/third_party/cupy/random_tests/test_sample.py index f95f3e42710..79e2370ad05 100644 --- a/tests/third_party/cupy/random_tests/test_sample.py +++ b/tests/third_party/cupy/random_tests/test_sample.py @@ -7,7 +7,7 @@ import dpnp as cupy from dpnp import random from tests.third_party.cupy import testing -from tests.third_party.cupy.testing import condition, hypothesis +from tests.third_party.cupy.testing import _condition, hypothesis @testing.gpu @@ -43,7 +43,7 @@ def test_zero_sizes(self): @testing.gpu class TestRandint2(unittest.TestCase): @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_1(self): vals = [random.randint(0, 10, (2, 3)) for _ in range(10)] for val in vals: @@ -52,7 +52,7 @@ def test_bound_1(self): self.assertEqual(max(_.max() for _ in vals), 9) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_2(self): vals = [random.randint(0, 2) for _ in range(20)] for val in vals: @@ -61,7 +61,7 @@ def test_bound_2(self): self.assertEqual(max(_.max() for _ in vals), 1) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_overflow(self): # 100 - (-100) exceeds the range of int8 val = random.randint(numpy.int8(-100), numpy.int8(100), size=20) @@ -70,7 +70,7 @@ def test_bound_overflow(self): self.assertLess(val.max(), 100) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_float1(self): # generate floats s.t. 
int(low) < int(high) low, high = sorted(numpy.random.uniform(-5, 5, size=2)) @@ -90,7 +90,7 @@ def test_bound_float2(self): self.assertEqual(min(_.min() for _ in vals), -1) self.assertEqual(max(_.max() for _ in vals), 0) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 @@ -99,7 +99,7 @@ def test_goodness_of_fit(self): expected = numpy.array([float(trial) / mx] * mx) self.assertTrue(hypothesis.chi_square_test(counts, expected)) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit_2(self): mx = 5 vals = random.randint(mx, size=(5, 20)) @@ -169,7 +169,7 @@ def test_size_is_not_none(self): @testing.fix_random() @testing.gpu class TestRandomIntegers2(unittest.TestCase): - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_1(self): vals = [random.random_integers(0, 10, (2, 3)).get() for _ in range(10)] for val in vals: @@ -177,7 +177,7 @@ def test_bound_1(self): self.assertEqual(min(_.min() for _ in vals), 0) self.assertEqual(max(_.max() for _ in vals), 10) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_2(self): vals = [random.random_integers(0, 2).get() for _ in range(20)] for val in vals: @@ -185,7 +185,7 @@ def test_bound_2(self): self.assertEqual(min(vals), 0) self.assertEqual(max(vals), 2) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 @@ -194,7 +194,7 @@ def test_goodness_of_fit(self): expected = numpy.array([float(trial) / mx] * mx) self.assertTrue(hypothesis.chi_square_test(counts, expected)) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit_2(self): mx = 5 vals = random.randint(0, mx, (5, 20)).get() @@ -289,7 +289,7 @@ def test_randn_invalid_argument(self): @testing.fix_random() @testing.gpu class TestMultinomial(unittest.TestCase): - @condition.repeat(3, 10) + @_condition.repeat(3, 10) @testing.for_float_dtypes() @testing.numpy_cupy_allclose(rtol=0.05) def test_multinomial(self, xp, dtype): diff --git a/tests/third_party/cupy/testing/condition.py b/tests/third_party/cupy/testing/_condition.py similarity index 100% rename from tests/third_party/cupy/testing/condition.py rename to tests/third_party/cupy/testing/_condition.py From 872a713d9b760b93c56c6255375a5660d4c44789 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:25:44 +0100 Subject: [PATCH 04/39] Add cupy tests for dpnp.linalg.qr --- .../cupy/linalg_tests/test_decomposition.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 42bcf122ff4..a9ce10e2051 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -6,6 +6,7 @@ import dpnp as cupy from tests.helper import has_support_aspect64, is_cpu_device from tests.third_party.cupy import testing +from tests.third_party.cupy.testing import _condition def random_matrix(shape, dtype, scale, sym=False): @@ -135,3 +136,74 @@ def check_L(self, array): def test_decomposition(self, dtype): A = numpy.array([[1, -2], [-2, 1]]).astype(dtype) self.check_L(A) + + +@testing.parameterize( + *testing.product( + { + "mode": ["r", "raw", "complete", "reduced"], + } + ) +) +class TestQRDecomposition(unittest.TestCase): + @testing.for_dtypes("fdFD") + def check_mode(self, array, mode, dtype): + if dtype in (numpy.complex64, numpy.complex128): + 
pytest.skip("ungqr unsupported") + + a_cpu = numpy.asarray(array, dtype=dtype) + a_gpu = cupy.asarray(array, dtype=dtype) + result_gpu = cupy.linalg.qr(a_gpu, mode=mode) + if ( + mode != "raw" + or numpy.lib.NumpyVersion(numpy.__version__) >= "1.22.0rc1" + ): + result_cpu = numpy.linalg.qr(a_cpu, mode=mode) + self._check_result(result_cpu, result_gpu) + + def _check_result(self, result_cpu, result_gpu): + if isinstance(result_cpu, tuple): + for b_cpu, b_gpu in zip(result_cpu, result_gpu): + assert b_cpu.dtype == b_gpu.dtype + testing.assert_allclose(b_cpu, b_gpu, atol=1e-4) + else: + assert result_cpu.dtype == result_gpu.dtype + testing.assert_allclose(result_cpu, result_gpu, atol=1e-4) + + @testing.fix_random() + @_condition.repeat(3, 10) + def test_mode(self): + self.check_mode(numpy.random.randn(2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # # @testing.fix_random() + # def test_mode_rank3(self): + # self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) + # self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # # @testing.fix_random() + # def test_mode_rank4(self): + # self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 2, 5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.16") + # def test_empty_array(self): + # self.check_mode(numpy.empty((0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0)), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # def test_empty_array_rank3(self): + # self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) + # self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) + # self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) + # self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) + # self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) From 890bffecfce2cebed6f6349436eff66d685316cf Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 15:38:52 +0100 Subject: [PATCH 05/39] Add batch implementation of dpnp.linalg.qr --- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 + dpnp/backend/extensions/lapack/geqrf.hpp | 12 + .../backend/extensions/lapack/geqrf_batch.cpp | 268 +++++++++++++++++ dpnp/backend/extensions/lapack/lapack_py.cpp | 19 ++ dpnp/backend/extensions/lapack/orgqr.hpp | 13 + .../backend/extensions/lapack/orgqr_batch.cpp | 273 ++++++++++++++++++ .../extensions/lapack/types_matrix.hpp | 49 ++++ dpnp/linalg/dpnp_utils_linalg.py | 178 +++++++++++- 8 files changed, 812 insertions(+), 2 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/geqrf_batch.cpp create mode 100644 dpnp/backend/extensions/lapack/orgqr_batch.cpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index b1d2ecb87df..3dbd95fbe23 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -28,11 +28,13 @@ set(python_module_name _lapack_impl) set(_module_src 
${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geqrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/orgqr_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp index 172c7fc76fa..4ab65286b29 100644 --- a/dpnp/backend/extensions/lapack/geqrf.hpp +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -44,6 +44,18 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); +extern std::pair + geqrf_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); + +extern void init_geqrf_batch_dispatch_vector(void); extern void init_geqrf_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp new file mode 100644 index 00000000000..64964bfe153 --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -0,0 +1,268 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "geqrf.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*geqrf_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static geqrf_batch_impl_fn_ptr_t + geqrf_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event geqrf_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::geqrf_batch_scratchpad_size(exec_q, m, n, lda, stride_a, + stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event geqrf_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + geqrf_batch_event = mkl_lapack::geqrf_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during geqrf_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during geqrf_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(geqrf_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return geqrf_batch_event; +} + +std::pair + geqrf_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + geqrf_batch_impl_fn_ptr_t geqrf_batch_fn = + geqrf_batch_dispatch_vector[a_array_type_id]; + if (geqrf_batch_fn == nullptr) { + throw py::value_error( + "No geqrf_batch implementation defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event geqrf_batch_ev = + geqrf_batch_fn(q, m, n, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, geqrf_batch_ev); +} + 
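+// Dispatch factory: returns the typed geqrf_batch_impl when oneMKL supports
+// the element type T, and nullptr otherwise so the caller can report an error.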
+template +struct GeqrfBatchContigFactory +{ + fnT get() + { + if constexpr (types::GeqrfBatchTypePairSupportFactory::is_defined) { + return geqrf_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_geqrf_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(geqrf_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 6e487ccefb3..3bc3c022626 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -45,10 +45,12 @@ namespace py = pybind11; // populate dispatch vectors void init_dispatch_vectors(void) { + lapack_ext::init_geqrf_batch_dispatch_vector(); lapack_ext::init_geqrf_dispatch_vector(); lapack_ext::init_gesv_dispatch_vector(); lapack_ext::init_getrf_batch_dispatch_vector(); lapack_ext::init_getrf_dispatch_vector(); + lapack_ext::init_orgqr_batch_dispatch_vector(); lapack_ext::init_orgqr_dispatch_vector(); lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); @@ -71,6 +73,14 @@ PYBIND11_MODULE(_lapack_impl, m) init_dispatch_vectors(); init_dispatch_tables(); + m.def("_geqrf_batch", &lapack_ext::geqrf_batch, + "Call `geqrf_batch` from OneMKL LAPACK library to return " + "the QR factorization of a batch general matrix ", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_geqrf", &lapack_ext::geqrf, "Call `geqrf` from OneMKL LAPACK library to return " "the QR factorization of a general m x n matrix ", @@ -105,6 +115,15 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_orgqr_batch", &lapack_ext::orgqr_batch, + "Call `orgqr` from OneMKL LAPACK library to return " + "the real orthogonal matrix Qi of the QR factorization " + "for a batch of general matrices", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_orgqr", &lapack_ext::orgqr, "Call `orgqr` from OneMKL LAPACK library to return " "the real orthogonal matrix Q of the QR factorization", diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp index cd0938dd72a..9cc4f530d03 100644 --- a/dpnp/backend/extensions/lapack/orgqr.hpp +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -47,6 +47,19 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); +extern std::pair + orgqr_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); + +extern void init_orgqr_batch_dispatch_vector(void); extern void init_orgqr_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp new file mode 100644 index 00000000000..560e9026fd6 --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -0,0 +1,273 @@ 
+//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "orgqr.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*orgqr_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static orgqr_batch_impl_fn_ptr_t + orgqr_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event orgqr_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + std::int64_t k, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::orgqr_batch_scratchpad_size( + exec_q, m, n, k, lda, stride_a, stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event orgqr_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + orgqr_batch_event = mkl_lapack::orgqr_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). 
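+               // On entry it holds the elementary reflectors computed by
+               // geqrf_batch; on exit each matrix is overwritten with Qi.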
+ lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during orgqr_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during orgqr_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(orgqr_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return orgqr_batch_event; +} + +std::pair + orgqr_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + orgqr_batch_impl_fn_ptr_t orgqr_batch_fn = + orgqr_batch_dispatch_vector[a_array_type_id]; + if (orgqr_batch_fn == nullptr) { + throw py::value_error( + "No orgqr_batch implementation 
defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event orgqr_batch_ev = + orgqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, orgqr_batch_ev); +} + +template +struct OrgqrBatchContigFactory +{ + fnT get() + { + if constexpr (types::OrgqrBatchTypePairSupportFactory::is_defined) { + return orgqr_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_orgqr_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(orgqr_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 3f7be9cf4a9..89042fcf640 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -43,6 +43,34 @@ namespace lapack { namespace types { +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf_batch + * function. + * + * @tparam T Type of array containing the input matrices to be QR factorized in + * batch mode. Upon execution, each matrix in the batch is transformed to output + * arrays representing their respective orthogonal matrix Q and upper triangular + * matrix R. + */ +template +struct GeqrfBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf @@ -169,6 +197,27 @@ struct HeevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr_batch + * function. + * + * @tparam T Type of array containing the matrix A, + * each from a separate instance in the batch, from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, each array in the batch is overwritten with + * its respective orthonormal matrix Q. 
+ */ +template +struct OrgqrBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 71f27bef7c2..eb9d41c0bb2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -25,7 +25,7 @@ import dpctl.tensor._tensor_impl as ti -from numpy import issubdtype +from numpy import issubdtype, prod import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li @@ -119,6 +119,50 @@ def _real_type(dtype, device=None): return dpnp.dtype(real_type) +def _stacked_identity( + batch_shape, n, dtype, usm_type="device", sycl_queue=None +): + """ + Create stacked identity matrices of size `n x n`. + + Forms multiple identity matrices based on `batch_shape`. + + Parameters + ---------- + batch_shape : tuple + Shape of the batch determining the stacking of identity matrices. + n : int + Dimension of each identity matrix. + dtype : dtype + Data type of the matrix element. + usm_type : {"device", "shared", "host"}, optional + The type of SYCL USM allocation for the output array. + sycl_queue : {None, SyclQueue}, optional + A SYCL queue to use for output array allocation and copying. + + Returns + ------- + out : dpnp.ndarray + Array of stacked `n x n` identity matrices as per `batch_shape`. + + Example + ------- + >>> _stacked_identity((2,), 2, dtype=dpnp.int64) + array([[[1, 0], + [0, 1]], + + [[1, 0], + [0, 1]]]) + + """ + + shape = batch_shape + (n, n) + idx = dpnp.arange(n, usm_type=usm_type, sycl_queue=sycl_queue) + x = dpnp.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=sycl_queue) + x[..., idx, idx] = 1 + return x + + def check_stacked_2d(*arrays): """ Return ``True`` if each array in `arrays` has at least two dimensions. @@ -740,6 +784,136 @@ def dpnp_eigh(a, UPLO): return w, out_v +def dpnp_qr_batch(a, mode="reduced"): + """ + dpnp_qr_batch(a, mode="reduced") + + Return the batched qr factorization of `a` matrix. 
+ + """ + + a_sycl_queue = a.sycl_queue + a_usm_type = a.usm_type + + m, n = a.shape[-2:] + k = min(m, n) + + batch_shape = a.shape[:-2] + batch_size = prod(batch_shape) + + res_type = _common_type(a) + + if batch_size == 0 or k == 0: + if mode == "reduced": + return ( + dpnp.empty(batch_shape + (m, k), dtype=res_type), + dpnp.empty(batch_shape + (k, n), dtype=res_type), + ) + elif mode == "complete": + q = _stacked_identity(batch_shape, m, dtype=res_type) + return (q, dpnp.empty(batch_shape + (m, n), dtype=res_type)) + elif mode == "r": + return dpnp.empty(batch_shape + (k, n), dtype=res_type) + elif mode == "raw": + return ( + dpnp.empty(batch_shape + (n, m), dtype=res_type), + dpnp.empty(batch_shape + (k,), dtype=res_type), + ) + + # get 3d input arrays by reshape + a = a.reshape(-1, m, n) + + a = a.swapaxes(-2, -1) + a_usm_arr = dpnp.get_usm_ndarray(a) + + a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + + # use DPCTL tensor function to fill the matrix array + # with content from the input array `a` + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue + ) + + tau_h = dpnp.empty( + (batch_size, k), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + + a_stride = a_t.strides[0] + tau_stride = tau_h.strides[0] + + # Call the LAPACK extension function _geqrf_batch to compute the QR factorization + # of a general m x n matrix. + ht_lapack_ev, geqrf_batch_ev = li._geqrf_batch( + a_sycl_queue, + a_t.get_array(), + tau_h.get_array(), + m, + n, + a_stride, + tau_stride, + batch_size, + [a_copy_ev], + ) + + ht_lapack_ev.wait() + a_ht_copy_ev.wait() + + if mode == "r": + r = a_t[..., :k].swapaxes(-2, -1) + out_r = dpnp.triu(r).astype(res_type, copy=False) + return out_r.reshape(batch_shape + out_r.shape[-2:]) + + if mode == "raw": + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return ( + q.astype(res_type, copy=False), + r.astype(res_type, copy=False), + ) + + if mode == "complete" and m > n: + mc = m + q = dpnp.empty((batch_size, m, m), dtype=res_type) + else: + mc = k + q = dpnp.empty((batch_size, n, m), dtype=res_type) + q[..., :n, :] = a_t + + q_stride = q.strides[0] + tau_stride = tau_h.strides[0] + + # Call the LAPACK extension function _orgqr to generate the orthogonal/unitary matrix + # `Qi` of the QR factorization for a batch of general matrices. 
+ ht_lapack_ev, _ = li._orgqr_batch( + a_sycl_queue, + q.get_array(), + tau_h.get_array(), + m, + mc, + k, + q_stride, + tau_stride, + batch_size, + [geqrf_batch_ev], + ) + + q = q[..., :mc, :].swapaxes(-2, -1) + r = a_t[..., :mc].swapaxes(-2, -1) + + r_triu = dpnp.triu(r).astype(res_type, copy=False) + + q_reshape = q.reshape(batch_shape + q.shape[-2:]) + r_triu_reshape = r_triu.reshape(batch_shape + r_triu.shape[-2:]) + + return ( + q_reshape.astype(dtype=res_type, copy=False), + r_triu_reshape.astype(dtype=res_type, copy=False), + ) + + def dpnp_qr(a, mode="reduced"): """ dpnp_qr(a, mode="reduced") @@ -749,7 +923,7 @@ def dpnp_qr(a, mode="reduced"): """ if a.ndim > 2: - pass + return dpnp_qr_batch(a, mode=mode) a_usm_arr = dpnp.get_usm_ndarray(a) a_sycl_queue = a.sycl_queue From 47e309e4d429597ca3a625f9173047a440f97d06 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 16:03:23 +0100 Subject: [PATCH 06/39] Uncomment cupy tests for 3d and 4d cases --- .../cupy/linalg_tests/test_decomposition.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index a9ce10e2051..813b322d823 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -177,33 +177,33 @@ def test_mode(self): self.check_mode(numpy.random.randn(3, 3), mode=self.mode) self.check_mode(numpy.random.randn(5, 4), mode=self.mode) - # @testing.with_requires("numpy>=1.22") - # # @testing.fix_random() - # def test_mode_rank3(self): - # self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) - # self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) - - # @testing.with_requires("numpy>=1.22") - # # @testing.fix_random() - # def test_mode_rank4(self): - # self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 2, 5, 4), mode=self.mode) - - # @testing.with_requires("numpy>=1.16") - # def test_empty_array(self): - # self.check_mode(numpy.empty((0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0)), mode=self.mode) - - # @testing.with_requires("numpy>=1.22") - # def test_empty_array_rank3(self): - # self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) - # self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) - # self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) - # self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) - # self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) + @testing.with_requires("numpy>=1.22") + @testing.fix_random() + def test_mode_rank3(self): + self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + + @testing.with_requires("numpy>=1.22") + @testing.fix_random() + def test_mode_rank4(self): + self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(2, 2, 5, 4), 
mode=self.mode) + + @testing.with_requires("numpy>=1.16") + def test_empty_array(self): + self.check_mode(numpy.empty((0, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 0)), mode=self.mode) + + @testing.with_requires("numpy>=1.22") + def test_empty_array_rank3(self): + self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) + self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) + self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) + self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) + self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) + self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) + self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) From 7e9750ab446a62b49e811af4d02e0ee7f4312be3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 16:08:03 +0100 Subject: [PATCH 07/39] Remove an old impl of dpnp_qr --- dpnp/backend/include/dpnp_iface_fptr.hpp | 2 - dpnp/backend/kernels/dpnp_krnl_linalg.cpp | 34 -------------- dpnp/dpnp_algo/dpnp_algo.pxd | 2 - dpnp/linalg/dpnp_algo_linalg.pyx | 56 ----------------------- 4 files changed, 94 deletions(-) diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp index 87826cc3bb9..4640599d1c0 100644 --- a/dpnp/backend/include/dpnp_iface_fptr.hpp +++ b/dpnp/backend/include/dpnp_iface_fptr.hpp @@ -222,8 +222,6 @@ enum class DPNPFuncName : size_t DPNP_FN_PUT, /**< Used in numpy.put() impl */ DPNP_FN_PUT_ALONG_AXIS, /**< Used in numpy.put_along_axis() impl */ DPNP_FN_QR, /**< Used in numpy.linalg.qr() impl */ - DPNP_FN_QR_EXT, /**< Used in numpy.linalg.qr() impl, requires extra - parameters */ DPNP_FN_RADIANS, /**< Used in numpy.radians() impl */ DPNP_FN_RADIANS_EXT, /**< Used in numpy.radians() impl, requires extra parameters */ diff --git a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp index 297f963ba28..1e0c2c44015 100644 --- a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp @@ -731,17 +731,6 @@ template void (*dpnp_qr_default_c)(void *, void *, void *, void *, size_t, size_t) = dpnp_qr_c<_InputDT, _ComputeDT>; -template -DPCTLSyclEventRef (*dpnp_qr_ext_c)(DPCTLSyclQueueRef, - void *, - void *, - void *, - void *, - size_t, - size_t, - const DPCTLEventVectorRef) = - dpnp_qr_c<_InputDT, _ComputeDT>; - template DPCTLSyclEventRef dpnp_svd_c(DPCTLSyclQueueRef q_ref, void *array1_in, @@ -1041,29 +1030,6 @@ void func_map_init_linalg_func(func_map_t &fmap) // fmap[DPNPFuncName::DPNP_FN_QR][eft_C128][eft_C128] = { // eft_C128, (void*)dpnp_qr_c, std::complex>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_INT][eft_INT] = { - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int32_t, func_type_map_t::find_type>, - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int32_t, func_type_map_t::find_type< - get_default_floating_type()>>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_LNG][eft_LNG] = { - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int64_t, func_type_map_t::find_type>, - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int64_t, func_type_map_t::find_type< - get_default_floating_type()>>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_FLT][eft_FLT] = { - eft_FLT, (void *)dpnp_qr_ext_c}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_DBL][eft_DBL] = { - eft_DBL, (void *)dpnp_qr_ext_c}; - // fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_C128][eft_C128] = { - // 
eft_C128, (void*)dpnp_qr_c, std::complex>}; - fmap[DPNPFuncName::DPNP_FN_SVD][eft_INT][eft_INT] = { eft_DBL, (void *)dpnp_svd_default_c}; fmap[DPNPFuncName::DPNP_FN_SVD][eft_LNG][eft_LNG] = { diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd index 3fcb67aac9e..f91f7693be7 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pxd +++ b/dpnp/dpnp_algo/dpnp_algo.pxd @@ -98,8 +98,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName": # need this na DPNP_FN_PARTITION DPNP_FN_PARTITION_EXT DPNP_FN_PLACE - DPNP_FN_QR - DPNP_FN_QR_EXT DPNP_FN_RADIANS DPNP_FN_RADIANS_EXT DPNP_FN_RNG_BETA diff --git a/dpnp/linalg/dpnp_algo_linalg.pyx b/dpnp/linalg/dpnp_algo_linalg.pyx index 6cf3bd3578f..2ba9fd5646a 100644 --- a/dpnp/linalg/dpnp_algo_linalg.pyx +++ b/dpnp/linalg/dpnp_algo_linalg.pyx @@ -51,7 +51,6 @@ __all__ = [ "dpnp_inv", "dpnp_matrix_rank", "dpnp_norm", - "dpnp_qr", "dpnp_svd", ] @@ -367,61 +366,6 @@ cpdef object dpnp_norm(object input, ord=None, axis=None): raise ValueError("Improper number of dimensions to norm.") -cpdef tuple dpnp_qr(utils.dpnp_descriptor x1, str mode): - cdef size_t size_m = x1.shape[0] - cdef size_t size_n = x1.shape[1] - cdef size_t min_m_n = min(size_m, size_n) - cdef size_t size_tau = min_m_n - - cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(x1.dtype) - cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_QR_EXT, param1_type, param1_type) - - x1_obj = x1.get_array() - - cdef (DPNPFuncType, void *) ret_type_and_func = utils.get_ret_type_and_func(kernel_data, - x1_obj.sycl_device.has_aspect_fp64) - cdef DPNPFuncType return_type = ret_type_and_func[0] - cdef custom_linalg_1in_3out_shape_t func = < custom_linalg_1in_3out_shape_t > ret_type_and_func[1] - - cdef utils.dpnp_descriptor res_q = utils.create_output_descriptor((size_m, min_m_n), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - cdef utils.dpnp_descriptor res_r = utils.create_output_descriptor((min_m_n, size_n), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - cdef utils.dpnp_descriptor tau = utils.create_output_descriptor((size_tau, ), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - - result_sycl_queue = res_q.get_array().sycl_queue - - cdef c_dpctl.SyclQueue q = result_sycl_queue - cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() - - cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, - x1.get_data(), - res_q.get_data(), - res_r.get_data(), - tau.get_data(), - size_m, - size_n, - NULL) # dep_events_ref - - with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref) - c_dpctl.DPCTLEvent_Delete(event_ref) - - return (res_q.get_pyobj(), res_r.get_pyobj()) - - cpdef tuple dpnp_svd(utils.dpnp_descriptor x1, cpp_bool full_matrices, cpp_bool compute_uv, cpp_bool hermitian): cdef size_t size_m = x1.shape[0] cdef size_t size_n = x1.shape[1] From 71277b0c326d2227fbfd8b8b1b34be64c3a68be1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 17:00:28 +0100 Subject: [PATCH 08/39] Update test_qr in test_sycl_queue --- dpnp/linalg/dpnp_utils_linalg.py | 71 +++++++++++++++++++++++++++----- tests/test_sycl_queue.py | 58 +++++++++++++++++--------- 2 files changed, 99 insertions(+), 30 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index eb9d41c0bb2..4de00a11168 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ 
b/dpnp/linalg/dpnp_utils_linalg.py @@ -806,18 +806,57 @@ def dpnp_qr_batch(a, mode="reduced"): if batch_size == 0 or k == 0: if mode == "reduced": return ( - dpnp.empty(batch_shape + (m, k), dtype=res_type), - dpnp.empty(batch_shape + (k, n), dtype=res_type), + dpnp.empty( + batch_shape + (m, k), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + dpnp.empty( + batch_shape + (k, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), ) elif mode == "complete": - q = _stacked_identity(batch_shape, m, dtype=res_type) - return (q, dpnp.empty(batch_shape + (m, n), dtype=res_type)) + q = _stacked_identity( + batch_shape, + m, + dtype=res_type, + usm_type=a_usm_type, + sycl_queue=a_sycl_queue, + ) + return ( + q, + dpnp.empty( + batch_shape + (m, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + ) elif mode == "r": - return dpnp.empty(batch_shape + (k, n), dtype=res_type) + return dpnp.empty( + batch_shape + (k, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) elif mode == "raw": return ( - dpnp.empty(batch_shape + (n, m), dtype=res_type), - dpnp.empty(batch_shape + (k,), dtype=res_type), + dpnp.empty( + batch_shape + (n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + dpnp.empty( + batch_shape + (k,), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), ) # get 3d input arrays by reshape @@ -826,7 +865,7 @@ def dpnp_qr_batch(a, mode="reduced"): a = a.swapaxes(-2, -1) a_usm_arr = dpnp.get_usm_ndarray(a) - a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + a_t = dpnp.empty_like(a, order="C", dtype=res_type) # use DPCTL tensor function to fill the matrix array # with content from the input array `a` @@ -876,10 +915,20 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "complete" and m > n: mc = m - q = dpnp.empty((batch_size, m, m), dtype=res_type) + q = dpnp.empty( + (batch_size, m, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) else: mc = k - q = dpnp.empty((batch_size, n, m), dtype=res_type) + q = dpnp.empty( + (batch_size, n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) q[..., :n, :] = a_t q_stride = q.strides[0] @@ -977,7 +1026,7 @@ def dpnp_qr(a, mode="reduced"): a = a.T a_usm_arr = dpnp.get_usm_ndarray(a) - a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + a_t = dpnp.empty_like(a, order="C", dtype=res_type) # use DPCTL tensor function to fill the matrix array # with content from the input array `a` diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py index ee247aa5414..85b84b9bae8 100644 --- a/tests/test_sycl_queue.py +++ b/tests/test_sycl_queue.py @@ -1171,34 +1171,54 @@ def test_matrix_rank(device): assert_array_equal(expected, result) +@pytest.mark.parametrize( + "shape", + [ + (4, 4), + (2, 0), + (2, 2, 3), + (0, 2, 3), + (1, 0, 3), + ], + ids=[ + "(4, 4)", + "(2, 0)", + "(2, 2, 3)", + "(0, 2, 3)", + "(1, 0, 3)", + ], +) +@pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], +) @pytest.mark.parametrize( "device", valid_devices, ids=[device.filter_string for device in valid_devices], ) -def test_qr(device): - data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]] - dpnp_data = dpnp.array(data, device=device) - numpy_data = numpy.array(data, dtype=dpnp_data.dtype) - - np_q, np_r = numpy.linalg.qr(numpy_data, "reduced") - dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, 
"reduced") +def test_qr(shape, mode, device): + dtype = dpnp.default_float_type(device) + count_elems = numpy.prod(shape) + dpnp_data = dpnp.arange(count_elems, dtype=dtype, device=device).reshape( + shape + ) - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + expected_queue = dpnp_data.get_array().sycl_queue - assert_dtype_allclose(dpnp_q, np_q) - assert_dtype_allclose(dpnp_r, np_r) + if mode == "r": + dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) + dpnp_r_queue = dpnp_r.get_array().sycl_queue + assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + else: + dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) - expected_queue = dpnp_data.get_array().sycl_queue - dpnp_q_queue = dpnp_q.get_array().sycl_queue - dpnp_r_queue = dpnp_r.get_array().sycl_queue + dpnp_q_queue = dpnp_q.get_array().sycl_queue + dpnp_r_queue = dpnp_r.get_array().sycl_queue - # compare queue and device - assert_sycl_queue_equal(dpnp_q_queue, expected_queue) - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + assert_sycl_queue_equal(dpnp_q_queue, expected_queue) + assert_sycl_queue_equal(dpnp_r_queue, expected_queue) @pytest.mark.parametrize( From 0fe3346739b7a3a5cc0a20244140d0eb9fae145f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 17:12:06 +0100 Subject: [PATCH 09/39] Add test_qr in test_usm_type --- tests/test_sycl_queue.py | 22 ++++++++++------------ tests/test_usm_type.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py index 85b84b9bae8..a5b02f4007f 100644 --- a/tests/test_sycl_queue.py +++ b/tests/test_sycl_queue.py @@ -1201,24 +1201,22 @@ def test_matrix_rank(device): def test_qr(shape, mode, device): dtype = dpnp.default_float_type(device) count_elems = numpy.prod(shape) - dpnp_data = dpnp.arange(count_elems, dtype=dtype, device=device).reshape( - shape - ) + a = dpnp.arange(count_elems, dtype=dtype, device=device).reshape(shape) - expected_queue = dpnp_data.get_array().sycl_queue + expected_queue = a.get_array().sycl_queue if mode == "r": - dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) - dpnp_r_queue = dpnp_r.get_array().sycl_queue - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + dp_r = dpnp.linalg.qr(a, mode=mode) + dp_r_queue = dp_r.get_array().sycl_queue + assert_sycl_queue_equal(dp_r_queue, expected_queue) else: - dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) + dp_q, dp_r = dpnp.linalg.qr(a, mode=mode) - dpnp_q_queue = dpnp_q.get_array().sycl_queue - dpnp_r_queue = dpnp_r.get_array().sycl_queue + dp_q_queue = dp_q.get_array().sycl_queue + dp_r_queue = dp_r.get_array().sycl_queue - assert_sycl_queue_equal(dpnp_q_queue, expected_queue) - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + assert_sycl_queue_equal(dp_q_queue, expected_queue) + assert_sycl_queue_equal(dp_r_queue, expected_queue) @pytest.mark.parametrize( diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py index 554f939b208..f43cf138afd 100644 --- a/tests/test_usm_type.py +++ b/tests/test_usm_type.py @@ -710,3 +710,40 @@ def test_det(shape, is_empty, usm_type): det = dp.linalg.det(x) assert x.usm_type == det.usm_type + + +@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types) +@pytest.mark.parametrize( + "shape", + [ + (4, 4), + (2, 0), + (2, 2, 3), + (0, 2, 3), + (1, 0, 3), + ], + ids=[ + "(4, 4)", + "(2, 0)", + "(2, 2, 3)", + "(0, 2, 3)", + "(1, 0, 
3)", + ], +) +@pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], +) +def test_qr(shape, mode, usm_type): + count_elems = numpy.prod(shape) + a = dp.arange(count_elems, usm_type=usm_type).reshape(shape) + + if mode == "r": + dp_r = dp.linalg.qr(a, mode=mode) + assert a.usm_type == dp_r.usm_type + else: + dp_q, dp_r = dp.linalg.qr(a, mode=mode) + + assert a.usm_type == dp_q.usm_type + assert a.usm_type == dp_r.usm_type From 3b57f0ca1585e7ff33f812ccb8adc2de4a69af71 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 1 Feb 2024 12:33:08 +0100 Subject: [PATCH 10/39] Rename condition to _condition in test_solve.py --- tests/third_party/cupy/linalg_tests/test_solve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/third_party/cupy/linalg_tests/test_solve.py b/tests/third_party/cupy/linalg_tests/test_solve.py index b31082c8e84..cd397f6c9e1 100644 --- a/tests/third_party/cupy/linalg_tests/test_solve.py +++ b/tests/third_party/cupy/linalg_tests/test_solve.py @@ -10,7 +10,7 @@ is_cpu_device, ) from tests.third_party.cupy import testing -from tests.third_party.cupy.testing import condition +from tests.third_party.cupy.testing import _condition @testing.parameterize( @@ -104,7 +104,7 @@ def test_invalid_shape(self): ) class TestInv(unittest.TestCase): @testing.for_dtypes("ifdFD") - @condition.retry(10) + @_condition.retry(10) def check_x(self, a_shape, dtype): a_cpu = numpy.random.randint(0, 10, size=a_shape) a_cpu = a_cpu.astype(dtype, order=self.order) From dc6dbcb9e9f52e3caed628d11c6e4fc99c8580b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 1 Feb 2024 13:38:58 +0100 Subject: [PATCH 11/39] Use _real_type for _orgqr --- dpnp/linalg/dpnp_utils_linalg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index de9a0854b73..40600dfe0a9 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1224,6 +1224,9 @@ def dpnp_qr(a, mode="reduced"): tau_h.astype(res_type, copy=False), ) + # _orgqr supports only floating type + orgqr_type = _real_type(res_type) + # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. # In `reduced` mode, mc is the lesser of the row count or column count. 
@@ -1231,7 +1234,7 @@ def dpnp_qr(a, mode="reduced"): mc = m q = dpnp.empty( (m, m), - dtype=res_type, + dtype=orgqr_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, @@ -1240,7 +1243,7 @@ def dpnp_qr(a, mode="reduced"): mc = k q = dpnp.empty( (n, m), - dtype=res_type, + dtype=orgqr_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, From a2349fe04c8b1c713ed537f65c51e2cad89c3d5b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 13:15:42 +0100 Subject: [PATCH 12/39] Use _real_type for _orgqr_batch --- dpnp/linalg/dpnp_utils_linalg.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 40600dfe0a9..d34d12212bb 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1080,11 +1080,14 @@ def dpnp_qr_batch(a, mode="reduced"): r.astype(res_type, copy=False), ) + # _orgqr supports only floating type + orgqr_type = _real_type(res_type) + if mode == "complete" and m > n: mc = m q = dpnp.empty( (batch_size, m, m), - dtype=res_type, + dtype=orgqr_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1092,7 +1095,7 @@ def dpnp_qr_batch(a, mode="reduced"): mc = k q = dpnp.empty( (batch_size, n, m), - dtype=res_type, + dtype=orgqr_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1126,7 +1129,7 @@ def dpnp_qr_batch(a, mode="reduced"): return ( q_reshape.astype(dtype=res_type, copy=False), - r_triu_reshape.astype(dtype=res_type, copy=False), + r_triu_reshape, ) From 4e993a9229366756d9071ff8dcb4c4d408753fe4 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 15:11:05 +0100 Subject: [PATCH 13/39] Update dpnp tests for dpnp.linalg.qr --- tests/test_linalg.py | 193 ++++++++++++++++++++++++++++--------------- 1 file changed, 126 insertions(+), 67 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 5ea536c2887..49030fee561 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -1,7 +1,12 @@ import dpctl import numpy import pytest -from numpy.testing import assert_allclose, assert_array_equal, assert_raises +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + assert_raises, +) import dpnp as inp from tests.third_party.cupy import testing @@ -671,88 +676,142 @@ def test_norm3(array, ord, axis): assert_allclose(expected, result) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) -@pytest.mark.parametrize( - "shape", - [(2, 2), (3, 4), (5, 3), (16, 16), (0, 0), (0, 2), (2, 0)], - ids=["(2,2)", "(3,4)", "(5,3)", "(16,16)", "(0,0)", "(0,2)", "(2,0)"], -) -@pytest.mark.parametrize( - "mode", ["complete", "reduced"], ids=["complete", "reduced"] -) -def test_qr(type, shape, mode): - a = numpy.arange(shape[0] * shape[1], dtype=type).reshape(shape) - ia = inp.array(a) - - np_q, np_r = numpy.linalg.qr(a, mode) - dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) +class TestQr: + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + @pytest.mark.parametrize( + "shape", + [(2, 2), (3, 4), (5, 3), (16, 16), (2, 2, 2), (2, 4, 2), (2, 2, 4)], + ids=[ + "(2, 2)", + "(3, 4)", + "(5, 3)", + "(16, 16)", + "(2, 2, 2)", + "(2, 4, 2)", + "(2, 2, 4)", + ], + ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr(self, dtype, shape, mode): + a = 
numpy.random.rand(*shape).astype(dtype) + ia = inp.array(a) - support_aspect64 = has_support_aspect64() + if mode == "r": + np_r = numpy.linalg.qr(a, mode) + dpnp_r = inp.linalg.qr(ia, mode) + else: + np_q, np_r = numpy.linalg.qr(a, mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) + + # check decomposition + if mode in ("complete", "reduced"): + # _orgqr doesn`t support complex type + if not inp.issubdtype(dtype, inp.complexfloating): + if a.ndim == 2: + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) + else: + batch_size = a.shape[0] + for i in range(batch_size): + assert_almost_equal( + inp.dot(dpnp_q[i], dpnp_r[i]), + a[i], + decimal=5, + ) + else: # mode=="raw" + # _orgqr doesn`t support complex type + if not inp.issubdtype(dtype, inp.complexfloating): + assert_dtype_allclose(dpnp_q, np_q) + assert_dtype_allclose(dpnp_r, np_r) + if mode in ("raw", "r"): + assert_dtype_allclose(dpnp_r, np_r) - if support_aspect64: - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + @pytest.mark.parametrize( + "shape", + [(0, 0), (0, 2), (2, 0), (2, 0, 3), (2, 3, 0), (0, 2, 3)], + ids=[ + "(0, 0)", + "(0, 2)", + "(2 ,0)", + "(2, 0, 3)", + "(2, 3, 0)", + "(0, 2, 3)", + ], + ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr_empty(self, dtype, shape, mode): + a = numpy.empty(shape, dtype=dtype) + ia = inp.array(a) - tol = 1e-6 - if type == inp.float32: - tol = 1e-02 - elif not support_aspect64 and type in (inp.int32, inp.int64, None): - tol = 1e-02 + if mode == "r": + np_r = numpy.linalg.qr(a, mode) + dpnp_r = inp.linalg.qr(ia, mode) + else: + np_q, np_r = numpy.linalg.qr(a, mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) - # check decomposition - assert_allclose( - ia, - inp.dot(dpnp_q, dpnp_r), - rtol=tol, - atol=tol, - ) + assert_dtype_allclose(dpnp_q, np_q) - # NP change sign for comparison - ncols = min(a.shape[0], a.shape[1]) - for i in range(ncols): - j = numpy.where(numpy.abs(np_q[:, i]) > tol)[0][0] - if np_q[j, i] * dpnp_q[j, i] < 0: - np_q[:, i] = -np_q[:, i] - np_r[i, :] = -np_r[i, :] + assert_dtype_allclose(dpnp_r, np_r) - if numpy.any(numpy.abs(np_r[i, :]) > tol): - assert_allclose( - inp.asnumpy(dpnp_q)[:, i], np_q[:, i], rtol=tol, atol=tol - ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr_strides(self, mode): + a = numpy.random.rand(5, 5) + ia = inp.array(a) - assert_allclose(dpnp_r, np_r, rtol=tol, atol=tol) + # positive strides + if mode == "r": + np_r = numpy.linalg.qr(a[::2, ::2], mode) + dpnp_r = inp.linalg.qr(ia[::2, ::2], mode) + else: + np_q, np_r = numpy.linalg.qr(a[::2, ::2], mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia[::2, ::2], mode) + assert_dtype_allclose(dpnp_q, np_q) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -def test_qr_not_2D(): - a = numpy.arange(12, dtype=numpy.float32).reshape((3, 2, 2)) - ia = inp.array(a) + assert_dtype_allclose(dpnp_r, np_r) - np_q, np_r = numpy.linalg.qr(a) - dpnp_q, dpnp_r = inp.linalg.qr(ia) + # negative strides + if mode == "r": + np_r = numpy.linalg.qr(a[::-2, ::-2], mode) + dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode) + else: + np_q, np_r = numpy.linalg.qr(a[::-2, ::-2], mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode) - assert dpnp_q.dtype == np_q.dtype - assert 
dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + assert_dtype_allclose(dpnp_q, np_q) - assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r)) + assert_dtype_allclose(dpnp_r, np_r) - a = numpy.empty((0, 3, 2), dtype=numpy.float32) - ia = inp.array(a) + def test_qr_errors(self): + a_dp = inp.array([[1, 2], [3, 5]], dtype="float32") - np_q, np_r = numpy.linalg.qr(a) - dpnp_q, dpnp_r = inp.linalg.qr(ia) + # unsupported type + a_np = inp.asnumpy(a_dp) + assert_raises(TypeError, inp.linalg.qr, a_np) - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + # a.ndim < 2 + a_dp_ndim_1 = a_dp.flatten() + assert_raises(inp.linalg.LinAlgError, inp.linalg.qr, a_dp_ndim_1) - assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r)) + # invalid mode + assert_raises(ValueError, inp.linalg.qr, a_dp, "c") @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) From 95b842062bcf38b178743868d8d8c036d8ab0542 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 16:56:56 +0100 Subject: [PATCH 14/39] Pass scratchpad_size to the error message test --- dpnp/backend/extensions/lapack/geqrf.cpp | 3 ++- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 3 ++- dpnp/backend/extensions/lapack/orgqr.cpp | 3 ++- dpnp/backend/extensions/lapack/orgqr_batch.cpp | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index c32de58fccf..8413c2d785c 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -106,7 +106,8 @@ static sycl::event geqrf_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during geqrf() " diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 64964bfe153..14f7cae487e 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -121,7 +121,8 @@ static sycl::event geqrf_batch_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during geqrf_batch() " diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 25f2c29f3f5..49a62742764 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -110,7 +110,8 @@ static sycl::event orgqr_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. 
Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during orgqr() " diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index 560e9026fd6..4a8ecaf5156 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -125,7 +125,8 @@ static sycl::event orgqr_batch_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during orgqr_batch() " From 3a50768feae340943d4445c905c0ea78813806b1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 17:51:19 +0100 Subject: [PATCH 15/39] Add additional checks --- dpnp/backend/extensions/lapack/geqrf.cpp | 17 +++++++++++++++++ dpnp/backend/extensions/lapack/geqrf_batch.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/orgqr.cpp | 9 +++++++++ 3 files changed, 34 insertions(+) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index 8413c2d785c..0e41a9a68d3 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -189,6 +189,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } geqrf_impl_fn_ptr_t geqrf_fn = geqrf_dispatch_vector[a_array_type_id]; if (geqrf_fn == nullptr) { @@ -208,6 +216,15 @@ std::pair const std::int64_t n = a_array_shape[0]; const std::int64_t lda = std::max(1UL, m); + const size_t tau_array_size = tau_array.get_size(); + const size_t min_m_n = std::max(1UL, std::min(m, n)); + + if (tau_array_size != min_m_n) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + ", but a size=" + + std::to_string(min_m_n) + " array is expected."); + } + std::vector host_task_events; sycl::event geqrf_ev = geqrf_fn(q, m, n, a_array_data, lda, tau_array_data, host_task_events, depends); diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 14f7cae487e..19e270c0366 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -207,6 +207,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } geqrf_batch_impl_fn_ptr_t geqrf_batch_fn = geqrf_batch_dispatch_vector[a_array_type_id]; diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 49a62742764..b3f3c617083 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -193,6 +193,15 @@ 
std::pair "must be contiguous"); } + const size_t tau_array_size = tau_array.get_size(); + const size_t min_m_n = std::max(1UL, std::min(m, n)); + + if (tau_array_size != min_m_n) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + ", but a size=" + + std::to_string(min_m_n) + " array is expected."); + } + auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); From 09b6673e6e7b1b5626e6795031abf0f7d79f3674 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 17:57:22 +0100 Subject: [PATCH 16/39] Extend error handler for mkl batch funcs --- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 5 +++++ dpnp/backend/extensions/lapack/orgqr_batch.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 19e270c0366..a05b01e9172 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -124,6 +124,11 @@ static sycl::event geqrf_batch_impl(sycl::queue exec_q, << e.detail() << ", but current size is " << scratchpad_size << "."; } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. " + "Number of failed calculations: " + << info; + } else { error_msg << "Unexpected MKL exception caught during geqrf_batch() " "call:\nreason: " diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index 4a8ecaf5156..afffbf1ec48 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -128,6 +128,11 @@ static sycl::event orgqr_batch_impl(sycl::queue exec_q, << e.detail() << ", but current size is " << scratchpad_size << "."; } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. " + "Number of failed calculations: " + << info; + } else { error_msg << "Unexpected MKL exception caught during orgqr_batch() " "call:\nreason: " From 8b4528fc44a81fdbbfbdbe64302f9ddb250ad0c5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 13:16:33 +0100 Subject: [PATCH 17/39] Use mkl_lapack namespace --- dpnp/backend/extensions/lapack/geqrf.cpp | 4 ++-- dpnp/backend/extensions/lapack/orgqr.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index 0e41a9a68d3..a91f689d503 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -73,7 +73,7 @@ static sycl::event geqrf_impl(sycl::queue exec_q, T *tau = reinterpret_cast(in_tau); const std::int64_t scratchpad_size = - oneapi::mkl::lapack::geqrf_scratchpad_size(exec_q, m, n, lda); + mkl_lapack::geqrf_scratchpad_size(exec_q, m, n, lda); T *scratchpad = nullptr; std::stringstream error_msg; @@ -84,7 +84,7 @@ static sycl::event geqrf_impl(sycl::queue exec_q, try { scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - geqrf_event = oneapi::mkl::lapack::geqrf( + geqrf_event = mkl_lapack::geqrf( exec_q, m, // The number of rows in the matrix; (0 ≤ m). n, // The number of columns in the matrix; (0 ≤ n). 
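For context on the tau-size validation added above: geqrf produces k = min(m, n) elementary reflectors for an m x n matrix, so the array of Householder scalars must hold exactly k entries. The sketch below shows the same convention through NumPy's "raw" mode, used here purely as a reference; the patch itself operates on dpnp arrays through the _geqrf/_orgqr extensions, and the shapes are arbitrary.

import numpy as np

m, n = 5, 3
a = np.random.rand(m, n)
h, tau = np.linalg.qr(a, mode="raw")

assert tau.shape == (min(m, n),)  # one scalar factor per elementary reflector
assert h.shape == (n, m)          # LAPACK-style transposed layout, as in dpnp_qr/dpnp_qr_batch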
diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index b3f3c617083..cba825a7506 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -75,7 +75,7 @@ static sycl::event orgqr_impl(sycl::queue exec_q, T *tau = reinterpret_cast(in_tau); const std::int64_t scratchpad_size = - oneapi::mkl::lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); + mkl_lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); T *scratchpad = nullptr; std::stringstream error_msg; @@ -86,7 +86,7 @@ static sycl::event orgqr_impl(sycl::queue exec_q, try { scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - orgqr_event = oneapi::mkl::lapack::orgqr( + orgqr_event = mkl_lapack::orgqr( exec_q, m, // The number of rows in the matrix; (0 ≤ m). n, // The number of columns in the matrix; (0 ≤ n). From ea7cb470e5fe7633cbe56dad0900e722fd262d61 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:18:55 +0100 Subject: [PATCH 18/39] Add ungqr mkl extension to support complex dtype --- dpnp/backend/extensions/lapack/CMakeLists.txt | 1 + dpnp/backend/extensions/lapack/lapack_py.cpp | 9 + .../extensions/lapack/types_matrix.hpp | 25 ++ dpnp/backend/extensions/lapack/ungqr.cpp | 253 ++++++++++++++++++ dpnp/backend/extensions/lapack/ungqr.hpp | 67 +++++ dpnp/linalg/dpnp_utils_linalg.py | 23 +- 6 files changed, 369 insertions(+), 9 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/ungqr.cpp create mode 100644 dpnp/backend/extensions/lapack/ungqr.hpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index e8cde917364..dda702b8b0f 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -39,6 +39,7 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp ) pybind11_add_module(${python_module_name} MODULE ${_module_src}) diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 180ea71288e..95608a79db3 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -39,6 +39,7 @@ #include "orgqr.hpp" #include "potrf.hpp" #include "syevd.hpp" +#include "ungqr.hpp" namespace lapack_ext = dpnp::backend::ext::lapack; namespace py = pybind11; @@ -57,6 +58,7 @@ void init_dispatch_vectors(void) lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); + lapack_ext::init_ungqr_dispatch_vector(); } // populate dispatch tables @@ -161,4 +163,11 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"), py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + + m.def("_ungqr", &lapack_ext::ungqr, + "Call `ungqr` from OneMKL LAPACK library to return " + "the complex unitary matrix Q of the QR factorization", + py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"), + py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); } diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 804e5d2d487..f80675beb38 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -332,6 +332,31 @@ struct 
SyevdTypePairSupportFactory // fall-through dpctl_td_ns::NotDefinedEntry>::is_defined; }; + +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr + * function. + * + * @tparam T Type of array containing the matrix A from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, the array is overwritten with the complex unitary matrix Q. + */ +template +struct UngqrTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; } // namespace types } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp new file mode 100644 index 00000000000..0a2a2660ed8 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "types_matrix.hpp" +#include "ungqr.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*ungqr_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static ungqr_impl_fn_ptr_t ungqr_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event ungqr_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::ungqr_scratchpad_size(exec_q, m, n, k, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event ungqr_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + ungqr_event = mkl_lapack::ungqr( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + k, // The number of elementary reflectors + // whose product defines the matrix Q; (0 ≤ k ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail() << ", but current size is " << scratchpad_size + << "."; + } + else { + error_msg << "Unexpected MKL exception caught during ungqr() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during orfqr() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ungqr_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return ungqr_event; +} + +std::pair + ungqr(sycl::queue q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + const size_t tau_array_size = tau_array.get_size(); + if (static_cast(tau_array_size) != k) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + + ", but an array of size=" + std::to_string(k) + + " is expected."); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + ungqr_impl_fn_ptr_t ungqr_fn = ungqr_dispatch_vector[a_array_type_id]; + if (ungqr_fn == nullptr) { + throw py::value_error( + "No ungqr implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + const std::int64_t lda = std::max(1UL, m); + + char *tau_array_data = tau_array.get_data(); + + std::vector host_task_events; + sycl::event ungqr_ev = ungqr_fn(q, m, n, k, a_array_data, lda, + tau_array_data, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, ungqr_ev); +} + +template +struct UngqrContigFactory +{ + fnT get() + { 
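+        // Note (descriptive comment, not in the original patch): return the
+        // typed ungqr implementation only when the type pair is enabled in
+        // UngqrTypePairSupportFactory; otherwise return nullptr so the caller
+        // can report the dtype as unsupported.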
+ if constexpr (types::UngqrTypePairSupportFactory::is_defined) { + return ungqr_impl; + } + else { + return nullptr; + } + } +}; + +void init_ungqr_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(ungqr_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp new file mode 100644 index 00000000000..4151364ab62 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -0,0 +1,67 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + ungqr(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +// extern std::pair +// ungqr_batch(sycl::queue exec_q, +// dpctl::tensor::usm_ndarray a_array, +// dpctl::tensor::usm_ndarray tau_array, +// std::int64_t m, +// std::int64_t n, +// std::int64_t k, +// std::int64_t stride_a, +// std::int64_t stride_tau, +// std::int64_t batch_size, +// const std::vector &depends = {}); + +// extern void init_ungqr_batch_dispatch_vector(void); +extern void init_ungqr_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d34d12212bb..0836dbc3fdb 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1227,9 +1227,6 @@ def dpnp_qr(a, mode="reduced"): tau_h.astype(res_type, copy=False), ) - # _orgqr supports only floating type - orgqr_type = _real_type(res_type) - # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. 
# In `reduced` mode, mc is the lesser of the row count or column count. @@ -1237,7 +1234,7 @@ def dpnp_qr(a, mode="reduced"): mc = m q = dpnp.empty( (m, m), - dtype=orgqr_type, + dtype=res_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, @@ -1246,20 +1243,28 @@ def dpnp_qr(a, mode="reduced"): mc = k q = dpnp.empty( (n, m), - dtype=orgqr_type, + dtype=res_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) q[:n] = a_t - # Call the LAPACK extension function _orgqr to generate the real orthogonal matrix - # `Q` of the QR factorization - ht_orgqr_ev, _ = li._orgqr( + # Get LAPACK function (_orgqr for real or _ungqf for complex data types) + # for QR factorization + lapack_func = ( + "_ungqr" + if dpnp.issubdtype(res_type, dpnp.complexfloating) + else "_orgqr" + ) + + # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ + # complex unitary matrix `Q` of the QR factorization + ht_lapack_ev, _ = getattr(li, lapack_func)( a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) - ht_orgqr_ev.wait() + ht_lapack_ev.wait() q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 245ee80c68f48f962e1fa0b68735cfe495146058 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:26:35 +0100 Subject: [PATCH 19/39] Update tau array size check for orgqr --- dpnp/backend/extensions/lapack/orgqr.cpp | 8 ++++---- dpnp/backend/extensions/lapack/ungqr.cpp | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index cba825a7506..07fd72af13e 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -194,12 +194,12 @@ std::pair } const size_t tau_array_size = tau_array.get_size(); - const size_t min_m_n = std::max(1UL, std::min(m, n)); - if (tau_array_size != min_m_n) { + if (static_cast(tau_array_size) != k) { throw py::value_error("The array of Householder scalars has size=" + - std::to_string(tau_array_size) + ", but a size=" + - std::to_string(min_m_n) + " array is expected."); + std::to_string(tau_array_size) + + ", but an array of size=" + std::to_string(k) + + " is expected."); } auto array_types = dpctl_td_ns::usm_ndarray_types(); diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp index 0a2a2660ed8..6ecca118f7d 100644 --- a/dpnp/backend/extensions/lapack/ungqr.cpp +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -187,12 +187,14 @@ std::pair bool is_tau_array_c_contig = tau_array.is_c_contiguous(); bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { throw py::value_error("The array of Householder scalars " "must be contiguous"); } const size_t tau_array_size = tau_array.get_size(); + if (static_cast(tau_array_size) != k) { throw py::value_error("The array of Householder scalars has size=" + std::to_string(tau_array_size) + From c4c29bd25bad4155beb1a514a4856d5ea1d36385 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:58:30 +0100 Subject: [PATCH 20/39] Add ungqr_batch mkl extension to support complex dtype --- dpnp/backend/extensions/lapack/CMakeLists.txt | 1 + dpnp/backend/extensions/lapack/lapack_py.cpp | 12 +- .../extensions/lapack/types_matrix.hpp | 27 ++ dpnp/backend/extensions/lapack/ungqr.hpp | 24 +- .../backend/extensions/lapack/ungqr_batch.cpp | 279 ++++++++++++++++++ dpnp/linalg/dpnp_utils_linalg.py | 28 +- 6 files changed, 348 
insertions(+), 23 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/ungqr_batch.cpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index dda702b8b0f..0a0f6490bee 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -40,6 +40,7 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ungqr_batch.cpp ) pybind11_add_module(${python_module_name} MODULE ${_module_src}) diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 95608a79db3..cd95dbac0ed 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -58,6 +58,7 @@ void init_dispatch_vectors(void) lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); + lapack_ext::init_ungqr_batch_dispatch_vector(); lapack_ext::init_ungqr_dispatch_vector(); } @@ -128,7 +129,7 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("depends") = py::list()); m.def("_orgqr_batch", &lapack_ext::orgqr_batch, - "Call `orgqr` from OneMKL LAPACK library to return " + "Call `_orgqr_batch` from OneMKL LAPACK library to return " "the real orthogonal matrix Qi of the QR factorization " "for a batch of general matrices", py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), @@ -164,6 +165,15 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_ungqr_batch", &lapack_ext::ungqr_batch, + "Call `_ungqr_batch` from OneMKL LAPACK library to return " + "the complex unitary matrices matrix Qi of the QR factorization " + "for a batch of general matrices", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_ungqr", &lapack_ext::ungqr, "Call `ungqr` from OneMKL LAPACK library to return " "the complex unitary matrix Q of the QR factorization", diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index f80675beb38..30621c673a9 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -333,6 +333,33 @@ struct SyevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr_batch + * function. + * + * @tparam T Type of array containing the matrix A, + * each from a separate instance in the batch, from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, each array in the batch is overwritten with + * its respective complex unitary matrix Q. 
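+ * Note: only the std::complex<float> and std::complex<double> pairs are
+ * enabled here; real floating-point inputs are dispatched through the
+ * corresponding orgqr_batch support factory instead.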
+ */ +template +struct UngqrBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp index 4151364ab62..1a9b68e94f9 100644 --- a/dpnp/backend/extensions/lapack/ungqr.hpp +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -47,19 +47,19 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); -// extern std::pair -// ungqr_batch(sycl::queue exec_q, -// dpctl::tensor::usm_ndarray a_array, -// dpctl::tensor::usm_ndarray tau_array, -// std::int64_t m, -// std::int64_t n, -// std::int64_t k, -// std::int64_t stride_a, -// std::int64_t stride_tau, -// std::int64_t batch_size, -// const std::vector &depends = {}); +extern std::pair + ungqr_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); -// extern void init_ungqr_batch_dispatch_vector(void); +extern void init_ungqr_batch_dispatch_vector(void); extern void init_ungqr_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp b/dpnp/backend/extensions/lapack/ungqr_batch.cpp new file mode 100644 index 00000000000..8d7b6cd33d4 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp @@ -0,0 +1,279 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "types_matrix.hpp" +#include "ungqr.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*ungqr_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static ungqr_batch_impl_fn_ptr_t + ungqr_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event ungqr_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + std::int64_t k, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::ungqr_batch_scratchpad_size( + exec_q, m, n, k, lda, stride_a, stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event ungqr_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + ungqr_batch_event = mkl_lapack::ungqr_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. Required size is at least " + << e.detail() << ", but current size is " << scratchpad_size + << "."; + } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. 
" + "Number of failed calculations: " + << info; + } + else { + error_msg << "Unexpected MKL exception caught during ungqr_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during ungqr_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ungqr_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return ungqr_batch_event; +} + +std::pair + ungqr_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + ungqr_batch_impl_fn_ptr_t ungqr_batch_fn = + ungqr_batch_dispatch_vector[a_array_type_id]; + if (ungqr_batch_fn == nullptr) { + throw py::value_error( + "No ungqr_batch implementation defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event ungqr_batch_ev = + ungqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, 
ungqr_batch_ev); +} + +template +struct UngqrBatchContigFactory +{ + fnT get() + { + if constexpr (types::UngqrBatchTypePairSupportFactory::is_defined) { + return ungqr_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_ungqr_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(ungqr_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 0836dbc3fdb..8e25f1baa33 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1052,7 +1052,7 @@ def dpnp_qr_batch(a, mode="reduced"): # Call the LAPACK extension function _geqrf_batch to compute the QR factorization # of a general m x n matrix. - ht_lapack_ev, geqrf_batch_ev = li._geqrf_batch( + ht_geqrf_batch_ev, geqrf_batch_ev = li._geqrf_batch( a_sycl_queue, a_t.get_array(), tau_h.get_array(), @@ -1064,7 +1064,7 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) - ht_lapack_ev.wait() + ht_geqrf_batch_ev.wait() a_ht_copy_ev.wait() if mode == "r": @@ -1080,14 +1080,11 @@ def dpnp_qr_batch(a, mode="reduced"): r.astype(res_type, copy=False), ) - # _orgqr supports only floating type - orgqr_type = _real_type(res_type) - if mode == "complete" and m > n: mc = m q = dpnp.empty( (batch_size, m, m), - dtype=orgqr_type, + dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1095,7 +1092,7 @@ def dpnp_qr_batch(a, mode="reduced"): mc = k q = dpnp.empty( (batch_size, n, m), - dtype=orgqr_type, + dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1104,9 +1101,18 @@ def dpnp_qr_batch(a, mode="reduced"): q_stride = q.strides[0] tau_stride = tau_h.strides[0] - # Call the LAPACK extension function _orgqr to generate the orthogonal/unitary matrix - # `Qi` of the QR factorization for a batch of general matrices. - ht_lapack_ev, _ = li._orgqr_batch( + # Get LAPACK function (_orgqr_batch for real or _ungqf_batch for complex data types) + # for QR factorization + lapack_func = ( + "_ungqr_batch" + if dpnp.issubdtype(res_type, dpnp.complexfloating) + else "_orgqr_batch" + ) + + # Call the LAPACK extension function _orgqr_batch/ to generate the real orthogonal/ + # complex unitary matrices `Qi` of the QR factorization + # for a batch of general matrices. 
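+    # For example, a complex64 input resolves lapack_func to "_ungqr_batch",
+    # so the call below is equivalent to li._ungqr_batch(...); real dtypes
+    # resolve to li._orgqr_batch(...).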
+ ht_lapack_ev, _ = getattr(li, lapack_func)( a_sycl_queue, q.get_array(), tau_h.get_array(), @@ -1119,6 +1125,8 @@ def dpnp_qr_batch(a, mode="reduced"): [geqrf_batch_ev], ) + ht_lapack_ev.wait() + q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) From f097cd1c7e68979866dc73bf8d896eb6b292a5af Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:03:30 +0100 Subject: [PATCH 21/39] Add arrays type check --- dpnp/backend/extensions/lapack/orgqr.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/orgqr_batch.cpp | 17 ++++++++--------- dpnp/backend/extensions/lapack/ungqr.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/ungqr_batch.cpp | 17 ++++++++--------- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 07fd72af13e..22cbbe05bee 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -205,6 +205,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } orgqr_impl_fn_ptr_t orgqr_fn = orgqr_dispatch_vector[a_array_type_id]; if (orgqr_fn == nullptr) { diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index afffbf1ec48..dfa9932a8e0 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -217,6 +217,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } orgqr_batch_impl_fn_ptr_t orgqr_batch_fn = orgqr_batch_dispatch_vector[a_array_type_id]; @@ -226,15 +234,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp index 6ecca118f7d..7c8dea4e950 100644 --- a/dpnp/backend/extensions/lapack/ungqr.cpp +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -205,6 +205,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } ungqr_impl_fn_ptr_t ungqr_fn = ungqr_dispatch_vector[a_array_type_id]; if (ungqr_fn == nullptr) { diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp 
b/dpnp/backend/extensions/lapack/ungqr_batch.cpp index 8d7b6cd33d4..c07eaf150fc 100644 --- a/dpnp/backend/extensions/lapack/ungqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp @@ -217,6 +217,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } ungqr_batch_impl_fn_ptr_t ungqr_batch_fn = ungqr_batch_dispatch_vector[a_array_type_id]; @@ -226,15 +234,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); From a9dfe1511609ef91ef9752d40d0e499881704da9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:10:49 +0100 Subject: [PATCH 22/39] Fix test_det_singular_matrix --- tests/test_linalg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 49030fee561..9163cff6aee 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -312,8 +312,8 @@ def test_det_singular_matrix(self, matrix): a_np = numpy.array(matrix, dtype="float32") a_dp = inp.array(a_np) - expected = numpy.linalg.slogdet(a_np) - result = inp.linalg.slogdet(a_dp) + expected = numpy.linalg.det(a_np) + result = inp.linalg.det(a_dp) assert_allclose(expected, result, rtol=1e-3, atol=1e-4) From d3395af57302a4b127535034dd1a7b4a18a709af Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:36:07 +0100 Subject: [PATCH 23/39] Expand tests for dpnp.linalg.qr with complex types --- tests/test_linalg.py | 35 +++++++++++-------- .../cupy/linalg_tests/test_decomposition.py | 3 -- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 9163cff6aee..f935e434a1e 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -709,27 +709,32 @@ def test_qr(self, dtype, shape, mode): # check decomposition if mode in ("complete", "reduced"): - # _orgqr doesn`t support complex type - if not inp.issubdtype(dtype, inp.complexfloating): - if a.ndim == 2: + if a.ndim == 2: + # TODO: remove it when dpnp.dot() supports complex types + if inp.issubdtype(dtype, inp.complexfloating): + dpnp_as_np_q = inp.asnumpy(dpnp_q) + dpnp_as_np_r = inp.asnumpy(dpnp_r) + assert_almost_equal( - inp.dot(dpnp_q, dpnp_r), + numpy.dot(dpnp_as_np_q, dpnp_as_np_r), a, decimal=5, ) else: - batch_size = a.shape[0] - for i in range(batch_size): - assert_almost_equal( - inp.dot(dpnp_q[i], dpnp_r[i]), - a[i], - decimal=5, - ) + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) + else: # a.ndim > 2 + assert_almost_equal( + inp.matmul(dpnp_q, dpnp_r), + a, + decimal=5, + ) else: # mode=="raw" - # _orgqr doesn`t support complex type - if not inp.issubdtype(dtype, inp.complexfloating): - assert_dtype_allclose(dpnp_q, np_q) - assert_dtype_allclose(dpnp_r, np_r) + assert_dtype_allclose(dpnp_q, np_q) + if mode in ("raw", "r"): assert_dtype_allclose(dpnp_r, 
np_r) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 813b322d823..d562ddccde5 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -148,9 +148,6 @@ def test_decomposition(self, dtype): class TestQRDecomposition(unittest.TestCase): @testing.for_dtypes("fdFD") def check_mode(self, array, mode, dtype): - if dtype in (numpy.complex64, numpy.complex128): - pytest.skip("ungqr unsupported") - a_cpu = numpy.asarray(array, dtype=dtype) a_gpu = cupy.asarray(array, dtype=dtype) result_gpu = cupy.linalg.qr(a_gpu, mode=mode) From ce46590ea8a586b50bd3938df131426b38e8baa3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 16:48:15 +0100 Subject: [PATCH 24/39] Update examples --- dpnp/linalg/dpnp_iface_linalg.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 822054fe354..ed691bd8d92 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -562,10 +562,18 @@ def qr(a, mode="reduced"): >>> import dpnp as np >>> a = np.random.randn(9, 6) >>> Q, R = np.linalg.qr(a) - >>> np.allclose(a, np.dot(Q, R)) + >>> np.allclose(a, np.dot(Q, R)) # a does equal QR array([ True]) >>> R2 = np.linalg.qr(a, mode='r') - >>> np.allclose(R, R2) + >>> np.allclose(R, R2) # mode='r' returns the same R as mode='full' + array([ True]) + >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input + >>> Q, R = np.linalg.qr(a) + >>> Q.shape + (3, 2, 2) + >>> R.shape + (3, 2, 2) + >>> np.allclose(a, np.matmul(Q, R)) array([ True]) """ From 5c173ca262d2c123ab14435419fc23805b1ee952 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 16:57:11 +0100 Subject: [PATCH 25/39] Remove astype for output arrays --- dpnp/linalg/dpnp_utils_linalg.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 8e25f1baa33..764e9b435be 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1069,16 +1069,13 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r).astype(res_type, copy=False) + out_r = dpnp.triu(r) return out_r.reshape(batch_shape + out_r.shape[-2:]) if mode == "raw": q = a_t.reshape(batch_shape + a_t.shape[-2:]) r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return ( - q.astype(res_type, copy=False), - r.astype(res_type, copy=False), - ) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1130,14 +1127,11 @@ def dpnp_qr_batch(a, mode="reduced"): q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) - r_triu = dpnp.triu(r).astype(res_type, copy=False) - - q_reshape = q.reshape(batch_shape + q.shape[-2:]) - r_triu_reshape = r_triu.reshape(batch_shape + r_triu.shape[-2:]) + r_triu = dpnp.triu(r) return ( - q_reshape.astype(dtype=res_type, copy=False), - r_triu_reshape, + q.reshape(batch_shape + q.shape[-2:]), + r_triu.reshape(batch_shape + r_triu.shape[-2:]), ) @@ -1227,13 +1221,10 @@ def dpnp_qr(a, mode="reduced"): if mode == "r": r = a_t[:, :k].transpose() - return dpnp.triu(r).astype(res_type, copy=False) + return dpnp.triu(r) if mode == "raw": - return ( - a_t.astype(res_type, copy=False), - tau_h.astype(res_type, copy=False), - ) + 
return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. @@ -1276,10 +1267,7 @@ def dpnp_qr(a, mode="reduced"): q = q[:mc].transpose() r = a_t[:, :mc].transpose() - return ( - q.astype(dtype=res_type, copy=False), - dpnp.triu(r).astype(dtype=res_type, copy=False), - ) + return (q, dpnp.triu(r)) def dpnp_solve(a, b): From f494bfdcb024ab539d78b5bf3478c0b9b97dcf6f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 17:22:31 +0100 Subject: [PATCH 26/39] Use empty_like instead of empty --- dpnp/linalg/dpnp_utils_linalg.py | 127 +++++++++++++------------------ 1 file changed, 55 insertions(+), 72 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 764e9b435be..d18f8e390c8 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -973,17 +973,15 @@ def dpnp_qr_batch(a, mode="reduced"): if batch_size == 0 or k == 0: if mode == "reduced": return ( - dpnp.empty( - batch_shape + (m, k), + dpnp.empty_like( + a, + shape=batch_shape + (m, k), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), - dpnp.empty( - batch_shape + (k, n), + dpnp.empty_like( + a, + shape=batch_shape + (k, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) elif mode == "complete": @@ -996,33 +994,29 @@ def dpnp_qr_batch(a, mode="reduced"): ) return ( q, - dpnp.empty( - batch_shape + (m, n), + dpnp.empty_like( + a, + shape=batch_shape + (m, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) elif mode == "r": - return dpnp.empty( - batch_shape + (k, n), + return dpnp.empty_like( + a, + shape=batch_shape + (k, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) elif mode == "raw": return ( - dpnp.empty( - batch_shape + (n, m), + dpnp.empty_like( + a, + shape=batch_shape + (n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), - dpnp.empty( - batch_shape + (k,), + dpnp.empty_like( + a, + shape=batch_shape + (k,), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) @@ -1040,11 +1034,10 @@ def dpnp_qr_batch(a, mode="reduced"): src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue ) - tau_h = dpnp.empty( - (batch_size, k), + tau_h = dpnp.empty_like( + a_t, + shape=(batch_size, k), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) a_stride = a_t.strides[0] @@ -1079,19 +1072,17 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "complete" and m > n: mc = m - q = dpnp.empty( - (batch_size, m, m), + q = dpnp.empty_like( + a_t, + shape=(batch_size, m, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: mc = k - q = dpnp.empty( - (batch_size, n, m), + q = dpnp.empty_like( + a_t, + shape=(batch_size, n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) q[..., :n, :] = a_t @@ -1156,44 +1147,38 @@ def dpnp_qr(a, mode="reduced"): k = min(m, n) if k == 0: if mode == "reduced": - return dpnp.empty( - (m, 0), + return dpnp.empty_like( + a, + shape=(m, 0), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, - ), dpnp.empty( - (0, n), + ), dpnp.empty_like( + a, + shape=(0, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) elif mode == "complete": return dpnp.identity( m, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type - ), dpnp.empty( - (m, n), + ), dpnp.empty_like( + a, + shape=(m, n), dtype=res_type, - sycl_queue=a_sycl_queue, - 
usm_type=a_usm_type, ) elif mode == "r": - return dpnp.empty( - (0, n), + return dpnp.empty_like( + a, + shape=(0, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: # mode == "raw" - return dpnp.empty( - (n, m), + return dpnp.empty_like( + a, + shape=(n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, - ), dpnp.empty( - (0,), + ), dpnp.empty_like( + a, + shape=(0,), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) a = a.T @@ -1206,8 +1191,10 @@ def dpnp_qr(a, mode="reduced"): src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue ) - tau_h = dpnp.empty( - k, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + tau_h = dpnp.empty_like( + a, + shape=(k,), + dtype=res_type, ) # Call the LAPACK extension function _geqrf to compute the QR factorization @@ -1231,21 +1218,17 @@ def dpnp_qr(a, mode="reduced"): # In `reduced` mode, mc is the lesser of the row count or column count. if mode == "complete" and m > n: mc = m - q = dpnp.empty( - (m, m), + q = dpnp.empty_like( + a_t, + shape=(m, m), dtype=res_type, - order="C", - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: mc = k - q = dpnp.empty( - (n, m), + q = dpnp.empty_like( + a_t, + shape=(n, m), dtype=res_type, - order="C", - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) q[:n] = a_t From fc440440f639f92637b1a38296e098b7aafb0c29 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:39:18 +0100 Subject: [PATCH 27/39] Minor update dpnp_qr/dpnp_qr_batch logic --- dpnp/linalg/dpnp_utils_linalg.py | 40 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d18f8e390c8..e8a05c085a7 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1057,18 +1057,19 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + if mode in ["r", "raw"]: + ht_geqrf_batch_ev.wait() + a_ht_copy_ev.wait() - if mode == "r": - r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r) - return out_r.reshape(batch_shape + out_r.shape[-2:]) + if mode == "r": + r = a_t[..., :k].swapaxes(-2, -1) + out_r = dpnp.triu(r) + return out_r.reshape(batch_shape + out_r.shape[-2:]) - if mode == "raw": - q = a_t.reshape(batch_shape + a_t.shape[-2:]) - r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return (q, r) + if mode == "raw": + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1114,6 +1115,8 @@ def dpnp_qr_batch(a, mode="reduced"): ) ht_lapack_ev.wait() + ht_geqrf_batch_ev.wait() + a_ht_copy_ev.wait() q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) @@ -1203,15 +1206,16 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] ) - ht_geqrf_ev.wait() - a_ht_copy_ev.wait() + if mode in ["r", "raw"]: + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() - if mode == "r": - r = a_t[:, :k].transpose() - return dpnp.triu(r) + if mode == "r": + r = a_t[:, :k].transpose() + return dpnp.triu(r) - if mode == "raw": - return (a_t, tau_h) + if mode == "raw": + return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. 
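A minimal shape sketch of how `mc` plays out for callers, assuming a hypothetical (5, 3) float32 input (the array name is illustrative):

>>> import dpnp as np
>>> x = np.ones((5, 3), dtype=np.float32)
>>> np.linalg.qr(x, mode="reduced")[0].shape   # mc = k = 3
(5, 3)
>>> np.linalg.qr(x, mode="complete")[0].shape  # mc = m = 5
(5, 5)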
@@ -1247,6 +1251,8 @@ def dpnp_qr(a, mode="reduced"): ) ht_lapack_ev.wait() + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 1f7800d5fd0037658a07161d45e7d76cda0c1b2c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:50:37 +0100 Subject: [PATCH 28/39] Describe the returned arrays --- dpnp/linalg/dpnp_iface_linalg.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index ed691bd8d92..b3c8f2e4e4b 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -552,10 +552,21 @@ def qr(a, mode="reduced"): Returns ------- - out : {dpnp.ndarray, tuple of dpnp.ndarray} - Although the type of returned object depends on the mode, - it returns a tuple of ``(Q, R)`` by default. - For details, please see the document of :func:`numpy.linalg.qr`. + When mode is "reduced" or "complete", the result will be a namedtuple with + the attributes Q and R. + Q : dpnp.ndarray + A matrix with orthonormal columns. + When mode = "complete" the result is an orthogonal/unitary matrix + depending on whether or not a is real/complex. + The determinant may be either +/- 1 in that case. + In case the number of dimensions in the input array is greater + than 2 then a stack of the matrices with above properties is returned. + R : dpnp.ndarray + The upper-triangular matrix or a stack of upper-triangular matrices + if the number of dimensions in the input array is greater than 2. + (h, tau) : tuple of dpnp.ndarray + The h array contains the Householder reflectors that generate Q along with R. + The tau array contains scaling factors for the reflectors. Examples -------- From a9783e6452b8a5ae2983f43892ce319606214091 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:59:47 +0100 Subject: [PATCH 29/39] Use ht_list_ev with dpctl.SyclEvent.wait_for --- dpnp/linalg/dpnp_utils_linalg.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index e8a05c085a7..52a8c1a8016 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -24,6 +24,7 @@ # ***************************************************************************** +import dpctl import dpctl.tensor._tensor_impl as ti from numpy import issubdtype, prod @@ -1057,9 +1058,10 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) + ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev] + if mode in ["r", "raw"]: - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + dpctl.SyclEvent.wait_for(ht_list_ev) if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) @@ -1114,9 +1116,8 @@ def dpnp_qr_batch(a, mode="reduced"): [geqrf_batch_ev], ) - ht_lapack_ev.wait() - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + ht_list_ev.append(ht_lapack_ev) + dpctl.SyclEvent.wait_for(ht_list_ev) q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) @@ -1206,9 +1207,10 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] ) + ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev] + if mode in ["r", "raw"]: - ht_geqrf_ev.wait() - a_ht_copy_ev.wait() + dpctl.SyclEvent.wait_for(ht_list_ev) if mode == "r": r = a_t[:, :k].transpose() @@ -1250,9 +1252,8 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) - ht_lapack_ev.wait() - ht_geqrf_ev.wait() - 
a_ht_copy_ev.wait() + ht_list_ev.append(ht_lapack_ev) + dpctl.SyclEvent.wait_for(ht_list_ev) q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 2e9950afb95c0fc73278379b281971d8dbe645e6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 11:24:07 +0100 Subject: [PATCH 30/39] Add a comment for transpose --- dpnp/linalg/dpnp_utils_linalg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 52a8c1a8016..d022d32ca37 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1185,6 +1185,10 @@ def dpnp_qr(a, mode="reduced"): dtype=res_type, ) + # Transpose the input matrix to convert from row-major to column-major order. + # This adjustment is necessary for compatibility with oneAPI MKL LAPACK routines, + # which expect matrices in column-major format. + # This allows data to be handled efficiently without the need for additional conversion. a = a.T a_usm_arr = dpnp.get_usm_ndarray(a) a_t = dpnp.empty_like(a, order="C", dtype=res_type) From ea8cd4d266415bad6867ecf77d5551ab3d69d9d9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 11:59:22 +0100 Subject: [PATCH 31/39] Add _triu_inplace func --- dpnp/linalg/dpnp_utils_linalg.py | 54 +++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d022d32ca37..7251cf7415e 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -952,6 +952,47 @@ def dpnp_inv(a): return b_f +def _triu_inplace(a, host_tasks, depends=None): + """ + _triu_inplace(a, host_tasks, depends=None) + + Computes the upper triangular part of an array in-place, + but currently allocates extra memory for the result. + + Parameters + ---------- + a : {dpnp.ndarray, usm_ndarray} + Input array from which the upper triangular part is to be extracted. + host_tasks : list + A list to which the function appends the host event corresponding to the computation. + This allows for dependency management and synchronization with other tasks. + depends : list, optional + A list of events that the triangular operation depends on. + These tasks are completed before the triangular computation starts. + If ``None``, defaults to an empty list. + + Returns + ------- + out : dpnp.ndarray + A new array containing the upper triangular part of the input array `a`. 
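+
+    Examples
+    --------
+    A minimal sketch of the intended call pattern (the event names are
+    illustrative placeholders)::
+
+        host_tasks = []
+        r = _triu_inplace(r, host_tasks, depends=[geqrf_ev])
+        dpctl.SyclEvent.wait_for(host_tasks)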
+ + """ + + # TODO: implement a dedicated kernel for in-place triu instead of extra memory allocation for result + if depends is None: + depends = [] + out = dpnp.empty_like(a, order="C") + ht_triu_ev, _ = ti._triu( + src=a.get_array(), + dst=out.get_array(), + k=0, + sycl_queue=a.sycl_queue, + depends=depends, + ) + host_tasks.append(ht_triu_ev) + return out + + def dpnp_qr_batch(a, mode="reduced"): """ dpnp_qr_batch(a, mode="reduced") @@ -1065,10 +1106,12 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r) - return out_r.reshape(batch_shape + out_r.shape[-2:]) + r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) + return r.reshape(batch_shape + r.shape[-2:]) if mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) q = a_t.reshape(batch_shape + a_t.shape[-2:]) r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) return (q, r) @@ -1214,13 +1257,14 @@ def dpnp_qr(a, mode="reduced"): ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev] if mode in ["r", "raw"]: - dpctl.SyclEvent.wait_for(ht_list_ev) - if mode == "r": r = a_t[:, :k].transpose() - return dpnp.triu(r) + r = _triu_inplace(r, ht_list_ev, [geqrf_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) + return r if mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) return (a_t, tau_h) # mc is the total number of columns in the q matrix. From 1cd6c778890a1f8945ae668b38d0842359057fe3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 14:17:43 +0100 Subject: [PATCH 32/39] Use _triu_inplace for complete and reduced mode --- dpnp/linalg/dpnp_utils_linalg.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 4536b8bc7b3..92063e1926d 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1151,7 +1151,7 @@ def dpnp_qr_batch(a, mode="reduced"): # Call the LAPACK extension function _orgqr_batch/ to generate the real orthogonal/ # complex unitary matrices `Qi` of the QR factorization # for a batch of general matrices. 
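    # The compute event (lapack_ev) captured below is passed as a dependency
    # to the in-place triu applied to `r` further down.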
- ht_lapack_ev, _ = getattr(li, lapack_func)( + ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( a_sycl_queue, q.get_array(), tau_h.get_array(), @@ -1170,11 +1170,14 @@ def dpnp_qr_batch(a, mode="reduced"): q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) - r_triu = dpnp.triu(r) + ht_list_ev.append(ht_lapack_ev) + + r = _triu_inplace(r, ht_list_ev, [lapack_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) return ( q.reshape(batch_shape + q.shape[-2:]), - r_triu.reshape(batch_shape + r_triu.shape[-2:]), + r.reshape(batch_shape + r.shape[-2:]), ) @@ -1301,16 +1304,19 @@ def dpnp_qr(a, mode="reduced"): # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ # complex unitary matrix `Q` of the QR factorization - ht_lapack_ev, _ = getattr(li, lapack_func)( + ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) + q = q[:mc].transpose() + r = a_t[:, :mc].transpose() + ht_list_ev.append(ht_lapack_ev) + + r = _triu_inplace(r, ht_list_ev, [lapack_ev]) dpctl.SyclEvent.wait_for(ht_list_ev) - q = q[:mc].transpose() - r = a_t[:, :mc].transpose() - return (q, dpnp.triu(r)) + return (q, r) def dpnp_solve(a, b): From d389ddb3362a50905d2941d7a06545a6abc479ad Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 14:19:39 +0100 Subject: [PATCH 33/39] Fix validation check --- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 92063e1926d..f20011de649 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1237,7 +1237,7 @@ def dpnp_qr(a, mode="reduced"): ) # Transpose the input matrix to convert from row-major to column-major order. - # This adjustment is necessary for compatibility with oneAPI MKL LAPACK routines, + # This adjustment is necessary for compatibility with OneMKL LAPACK routines, # which expect matrices in column-major format. # This allows data to be handled efficiently without the need for additional conversion. 
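    # For a 2-D input of shape (m, n), the transpose below yields an (n, m)
    # view whose C-contiguous copy stores the original matrix in column-major
    # order.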
a = a.T From 71fec506d7fb9b527d1437d43b42f0243a180158 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 6 Feb 2024 13:52:47 +0100 Subject: [PATCH 34/39] Use copy_usm for a_t array overwritten by geqrf/geqrf_batch --- dpnp/linalg/dpnp_utils_linalg.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index f20011de649..ed8ed3eed4d 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1135,7 +1135,17 @@ def dpnp_qr_batch(a, mode="reduced"): shape=(batch_size, n, m), dtype=res_type, ) - q[..., :n, :] = a_t + + # use DPCTL tensor function to fill the matrix array `q[..., :n, :]` + # with content from the array `a_t` overwritten by geqrf_batch + a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_t.get_array(), + dst=q[..., :n, :].get_array(), + sycl_queue=a_sycl_queue, + depends=[geqrf_batch_ev], + ) + + ht_list_ev.append(a_t_ht_copy_ev) q_stride = q.strides[0] tau_stride = tau_h.strides[0] @@ -1161,7 +1171,7 @@ def dpnp_qr_batch(a, mode="reduced"): q_stride, tau_stride, batch_size, - [geqrf_batch_ev], + [a_t_copy_ev], ) ht_list_ev.append(ht_lapack_ev) @@ -1292,7 +1302,17 @@ def dpnp_qr(a, mode="reduced"): shape=(n, m), dtype=res_type, ) - q[:n] = a_t + + # use DPCTL tensor function to fill the matrix array `q[:n]` + # with content from the array `a_t` overwritten by geqrf + a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_t.get_array(), + dst=q[:n].get_array(), + sycl_queue=a_sycl_queue, + depends=[geqrf_ev], + ) + + ht_list_ev.append(a_t_ht_copy_ev) # Get LAPACK function (_orgqr for real or _ungqf for complex data types) # for QR factorization @@ -1305,7 +1325,7 @@ def dpnp_qr(a, mode="reduced"): # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ # complex unitary matrix `Q` of the QR factorization ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( - a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] + a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [a_t_copy_ev] ) q = q[:mc].transpose() From 120cf43a7d5723cbf8647e373782dd938e3479e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 11:40:02 +0100 Subject: [PATCH 35/39] Address remarks --- dpnp/linalg/dpnp_utils_linalg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index ed8ed3eed4d..13943b125ef 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1107,8 +1107,6 @@ def dpnp_qr_batch(a, mode="reduced"): ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev] if mode in ["r", "raw"]: - dpctl.SyclEvent.wait_for(ht_list_ev) - if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev]) @@ -1175,7 +1173,6 @@ def dpnp_qr_batch(a, mode="reduced"): ) ht_list_ev.append(ht_lapack_ev) - dpctl.SyclEvent.wait_for(ht_list_ev) q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) From 30188e4a15159a90e75877d1f8edbd57563d8dce Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:22:31 +0100 Subject: [PATCH 36/39] Skip cupy tests on cpu --- tests/third_party/cupy/linalg_tests/test_decomposition.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 
fc75163f072..4769fe455eb 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -416,6 +416,10 @@ def _check_result(self, result_cpu, result_gpu): assert result_cpu.dtype == result_gpu.dtype testing.assert_allclose(result_cpu, result_gpu, atol=1e-4) + # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI. + # Skip the tests on cpu until these packages are available for the external CI. + # Specifically dpcpp_linux-64>=2024.1.0 + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.fix_random() @_condition.repeat(3, 10) def test_mode(self): @@ -423,6 +427,7 @@ def test_mode(self): self.check_mode(numpy.random.randn(3, 3), mode=self.mode) self.check_mode(numpy.random.randn(5, 4), mode=self.mode) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.with_requires("numpy>=1.22") @testing.fix_random() def test_mode_rank3(self): @@ -430,6 +435,7 @@ def test_mode_rank3(self): self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.with_requires("numpy>=1.22") @testing.fix_random() def test_mode_rank4(self): From d519b1ac8abd092fdd0dbeb1c036f80b1cad5381 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:28:30 +0100 Subject: [PATCH 37/39] Skip dpnp tests on cpu --- tests/test_linalg.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 879a3893eae..e0ad716dba1 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -678,6 +678,10 @@ def test_norm3(array, ord, axis): class TestQr: + # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI. + # Skip the tests on cpu until these packages are available for the external CI. 
+ # Specifically dpcpp_linux-64>=2024.1.0 + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) @pytest.mark.parametrize( "shape", @@ -711,22 +715,11 @@ def test_qr(self, dtype, shape, mode): # check decomposition if mode in ("complete", "reduced"): if a.ndim == 2: - # TODO: remove it when dpnp.dot() supports complex types - if inp.issubdtype(dtype, inp.complexfloating): - dpnp_as_np_q = inp.asnumpy(dpnp_q) - dpnp_as_np_r = inp.asnumpy(dpnp_r) - - assert_almost_equal( - numpy.dot(dpnp_as_np_q, dpnp_as_np_r), - a, - decimal=5, - ) - else: - assert_almost_equal( - inp.dot(dpnp_q, dpnp_r), - a, - decimal=5, - ) + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) else: # a.ndim > 2 assert_almost_equal( inp.matmul(dpnp_q, dpnp_r), @@ -772,6 +765,7 @@ def test_qr_empty(self, dtype, shape, mode): assert_dtype_allclose(dpnp_r, np_r) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @pytest.mark.parametrize( "mode", ["r", "raw", "complete", "reduced"], From c8786db245dcfb414001328ec971968237daea7e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:39:08 +0100 Subject: [PATCH 38/39] Remove TODOs for svd tests --- tests/test_linalg.py | 8 ------- .../cupy/linalg_tests/test_decomposition.py | 23 +++++++------------ 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index e0ad716dba1..8e32b867b85 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -1076,14 +1076,6 @@ def check_decomposition( dpnp_diag_s = inp.zeros_like(dp_a, dtype=dp_s.dtype) for i in range(min(dp_a.shape[-2], dp_a.shape[-1])): dpnp_diag_s[..., i, i] = dp_s[..., i] - # TODO: remove it when dpnp.dot is updated - # dpnp.dot does not support complex type - if inp.issubdtype(dp_a.dtype, inp.complexfloating): - reconstructed = numpy.dot( - inp.asnumpy(dp_u), - numpy.dot(inp.asnumpy(dpnp_diag_s), inp.asnumpy(dp_vt)), - ) - else: reconstructed = inp.dot(dp_u, inp.dot(dpnp_diag_s, dp_vt)) # TODO: use assert dpnp.allclose() inside check_decomposition() # when it will support complex dtypes diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 4769fe455eb..234a2e0e381 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -201,38 +201,31 @@ def check_usv(self, shape, dtype): # reconstruct the matrix k = s_cpu.shape[-1] - # dpnp.dot/matmul does not support complex type and unstable on cpu - # TODO: remove it and use xp.dot/matmul when dpnp.dot/matmul is updated - u_gpu = u_gpu.asnumpy() - vh_gpu = vh_gpu.asnumpy() - s_gpu = s_gpu.asnumpy() - xp = numpy - if len(shape) == 2: if self.full_matrices: - a_gpu_usv = numpy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :]) + a_gpu_usv = cupy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :]) else: - a_gpu_usv = numpy.dot(u_gpu * s_gpu, vh_gpu) + a_gpu_usv = cupy.dot(u_gpu * s_gpu, vh_gpu) else: if self.full_matrices: - a_gpu_usv = numpy.matmul( + a_gpu_usv = cupy.matmul( u_gpu[..., :k] * s_gpu[..., None, :], vh_gpu[..., :k, :] ) else: - a_gpu_usv = numpy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu) + a_gpu_usv = cupy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu) testing.assert_allclose(a_gpu, a_gpu_usv, rtol=1e-4, atol=1e-4) # assert unitary u_len = u_gpu.shape[-1] vh_len = vh_gpu.shape[-2] testing.assert_allclose( - xp.matmul(u_gpu.swapaxes(-1, -2).conj(), 
u_gpu), - stacked_identity(xp, shape[:-2], u_len, dtype), + cupy.matmul(u_gpu.swapaxes(-1, -2).conj(), u_gpu), + stacked_identity(cupy, shape[:-2], u_len, dtype), atol=1e-4, ) testing.assert_allclose( - xp.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()), - stacked_identity(xp, shape[:-2], vh_len, dtype), + cupy.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()), + stacked_identity(cupy, shape[:-2], vh_len, dtype), atol=1e-4, ) From e0ee2f2a745fcae9c62a695d0dbda8b71c6c9d69 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 17:04:48 +0100 Subject: [PATCH 39/39] Address remarks --- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 9 --------- dpnp/linalg/dpnp_utils_linalg.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index a05b01e9172..a4fe980a539 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -229,15 +229,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 13943b125ef..a6dcfbf0c2b 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1053,7 +1053,7 @@ def dpnp_qr_batch(a, mode="reduced"): shape=batch_shape + (k, n), dtype=res_type, ) - elif mode == "raw": + else: # mode=="raw" return ( dpnp.empty_like( a, @@ -1113,11 +1113,11 @@ def dpnp_qr_batch(a, mode="reduced"): dpctl.SyclEvent.wait_for(ht_list_ev) return r.reshape(batch_shape + r.shape[-2:]) - if mode == "raw": - dpctl.SyclEvent.wait_for(ht_list_ev) - q = a_t.reshape(batch_shape + a_t.shape[-2:]) - r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return (q, r) + # mode=="raw" + dpctl.SyclEvent.wait_for(ht_list_ev) + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1278,9 +1278,9 @@ def dpnp_qr(a, mode="reduced"): dpctl.SyclEvent.wait_for(ht_list_ev) return r - if mode == "raw": - dpctl.SyclEvent.wait_for(ht_list_ev) - return (a_t, tau_h) + # mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) + return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows.
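
Editor's note, appended after the series and not part of any patch: the copy changes in patch 34 replace host-side synchronization with an event-dependency chain. Below is a minimal sketch of that pattern, assuming dpctl is installed; it uses the same private dpctl helper the patch calls, relies only on the (host-task event, compute event) return contract visible in the diffs, and the queue, shapes, and array names are invented for illustration.

import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti

q = dpctl.SyclQueue()

# hypothetical stand-ins for `a_t` (geqrf output) and the `q[:n]` destination
src = dpt.reshape(dpt.arange(12, dtype="f4", sycl_queue=q), (3, 4))
dst = dpt.empty((3, 4), dtype="f4", sycl_queue=q)

# asynchronous device-to-device copy; returns (host-task event, compute event)
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
    src=src, dst=dst, sycl_queue=q, depends=[]
)

# A downstream kernel (e.g. _orgqr in the patch) would list `copy_ev` in its
# `depends` argument, so ordering is enforced on the device. The host only
# collects host-task events and waits once at the end, as dpnp_qr now does.
ht_list_ev = [ht_copy_ev]
dpctl.SyclEvent.wait_for(ht_list_ev)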
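
A second appended editor's note: the `mode` branches in dpnp_qr and dpnp_qr_batch follow the numpy.linalg.qr contract, so the expected outputs and the properties the tests in this series check can be summarized with a small NumPy sketch. The 5x4 input is arbitrary; dpnp.linalg.qr is intended to mirror these semantics, but the snippet below exercises NumPy only.

import numpy

a = numpy.random.randn(5, 4)

q, r = numpy.linalg.qr(a, mode="reduced")     # q: (5, 4), r: (4, 4)
assert numpy.allclose(q @ r, a)               # reconstruction
assert numpy.allclose(q.T @ q, numpy.eye(4))  # orthonormal columns
assert numpy.allclose(r, numpy.triu(r))       # R is upper triangular

qc, rc = numpy.linalg.qr(a, mode="complete")  # qc: (5, 5), rc: (5, 4)
r_only = numpy.linalg.qr(a, mode="r")         # only R, shape (4, 4)
h, tau = numpy.linalg.qr(a, mode="raw")       # LAPACK-style: h (4, 5), tau (4,)

assert qc.shape == (5, 5) and rc.shape == (5, 4)
assert r_only.shape == (4, 4)
assert h.shape == (4, 5) and tau.shape == (4,)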