From c2601fd0e48b0d2e9111797f8b2decbc68864c2b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:12:19 +0100 Subject: [PATCH 01/39] Impl dpnp.linalg.qr for 2d array --- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 + dpnp/backend/extensions/lapack/geqrf.cpp | 244 +++++++++++++++++ dpnp/backend/extensions/lapack/geqrf.hpp | 51 ++++ dpnp/backend/extensions/lapack/lapack_py.cpp | 17 ++ dpnp/backend/extensions/lapack/orgqr.cpp | 245 ++++++++++++++++++ dpnp/backend/extensions/lapack/orgqr.hpp | 54 ++++ .../extensions/lapack/types_matrix.hpp | 46 ++++ dpnp/linalg/dpnp_iface_linalg.py | 51 ++-- dpnp/linalg/dpnp_utils_linalg.py | 133 ++++++++++ 9 files changed, 828 insertions(+), 15 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/geqrf.cpp create mode 100644 dpnp/backend/extensions/lapack/geqrf.hpp create mode 100644 dpnp/backend/extensions/lapack/orgqr.cpp create mode 100644 dpnp/backend/extensions/lapack/orgqr.hpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index 0cfdc1a1677..b1d2ecb87df 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -27,10 +27,12 @@ set(python_module_name _lapack_impl) set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp new file mode 100644 index 00000000000..c32de58fccf --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -0,0 +1,244 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "geqrf.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*geqrf_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static geqrf_impl_fn_ptr_t geqrf_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event geqrf_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + oneapi::mkl::lapack::geqrf_scratchpad_size(exec_q, m, n, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event geqrf_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + geqrf_event = oneapi::mkl::lapack::geqrf( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during geqrf() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during geqrf() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(geqrf_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return geqrf_event; +} + +std::pair + geqrf(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + geqrf_impl_fn_ptr_t geqrf_fn = geqrf_dispatch_vector[a_array_type_id]; + if (geqrf_fn == nullptr) { + throw py::value_error( + "No geqrf implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const py::ssize_t *a_array_shape = a_array.get_shape_raw(); + + // The input array is transponded + // Change the order of getting m, n + const std::int64_t m = a_array_shape[1]; + const std::int64_t n = a_array_shape[0]; + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event geqrf_ev = geqrf_fn(q, m, n, a_array_data, lda, tau_array_data, + host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, geqrf_ev); +} + +template +struct GeqrfContigFactory +{ + fnT get() + { + if constexpr (types::GeqrfTypePairSupportFactory::is_defined) { + return geqrf_impl; + } + else { + return nullptr; + } + } +}; + +void init_geqrf_dispatch_vector(void) +{ + 
dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(geqrf_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp new file mode 100644 index 00000000000..172c7fc76fa --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -0,0 +1,51 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + geqrf(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +extern void init_geqrf_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index f97a0dc4a43..6e487ccefb3 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -30,10 +30,12 @@ #include #include +#include "geqrf.hpp" #include "gesv.hpp" #include "getrf.hpp" #include "heevd.hpp" #include "linalg_exceptions.hpp" +#include "orgqr.hpp" #include "potrf.hpp" #include "syevd.hpp" @@ -43,9 +45,11 @@ namespace py = pybind11; // populate dispatch vectors void init_dispatch_vectors(void) { + lapack_ext::init_geqrf_dispatch_vector(); lapack_ext::init_gesv_dispatch_vector(); lapack_ext::init_getrf_batch_dispatch_vector(); lapack_ext::init_getrf_dispatch_vector(); + lapack_ext::init_orgqr_dispatch_vector(); lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); @@ -67,6 +71,12 @@ PYBIND11_MODULE(_lapack_impl, m) init_dispatch_vectors(); init_dispatch_tables(); + m.def("_geqrf", &lapack_ext::geqrf, + "Call `geqrf` from OneMKL LAPACK library to return " + "the QR factorization of a general m x n matrix ", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); + m.def("_gesv", &lapack_ext::gesv, "Call `gesv` from OneMKL LAPACK library to return " "the solution of a system of linear equations with " @@ -95,6 +105,13 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_orgqr", &lapack_ext::orgqr, + "Call `orgqr` from OneMKL LAPACK library to return " + "the real orthogonal matrix Q of the QR factorization", + py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"), + py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); + m.def("_potrf", &lapack_ext::potrf, "Call `potrf` from OneMKL LAPACK library to return " "the Cholesky factorization of a symmetric positive-definite matrix", diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp new file mode 100644 index 00000000000..25f2c29f3f5 --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -0,0 +1,245 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "orgqr.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*orgqr_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static orgqr_impl_fn_ptr_t orgqr_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event orgqr_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + oneapi::mkl::lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event orgqr_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + orgqr_event = oneapi::mkl::lapack::orgqr( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + k, // The number of elementary reflectors + // whose product defines the matrix Q; (0 ≤ k ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during orgqr() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during orfqr() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(orgqr_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return orgqr_event; +} + +std::pair + orgqr(sycl::queue q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + orgqr_impl_fn_ptr_t orgqr_fn = orgqr_dispatch_vector[a_array_type_id]; + if (orgqr_fn == nullptr) { + throw py::value_error( + "No orgqr implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + const std::int64_t lda = std::max(1UL, m); + + char *tau_array_data = tau_array.get_data(); + + std::vector host_task_events; + sycl::event orgqr_ev = orgqr_fn(q, m, n, k, a_array_data, lda, + tau_array_data, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, orgqr_ev); +} + +template +struct OrgqrContigFactory +{ + fnT get() + { + if constexpr (types::OrgqrTypePairSupportFactory::is_defined) { + return orgqr_impl; + } + else { + return nullptr; + } + } +}; + +void init_orgqr_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(orgqr_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // 
namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp new file mode 100644 index 00000000000..cd0938dd72a --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -0,0 +1,54 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + orgqr(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +extern void init_orgqr_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 0277e630738..3f7be9cf4a9 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -43,6 +43,33 @@ namespace lapack { namespace types { +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf + * function. + * + * @tparam T Type of array containing the input matrix to be QR factorized. + * Upon execution, this matrix is transformed to output arrays representing + * the orthogonal matrix Q and the upper triangular matrix R. 
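+ * Only single- and double-precision real and complex floating-point types
+ * are listed; dpnp_qr casts the input to one of these types before dispatch.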
+ */ +template +struct GeqrfTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::gesv @@ -142,6 +169,25 @@ struct HeevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr + * function. + * + * @tparam T Type of array containing the matrix A from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, the array is overwritten with the orthonormal matrix Q. + */ +template +struct OrgqrTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::potrf diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 2f7420d1329..62ce3a8ab9b 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -50,6 +50,7 @@ dpnp_cholesky, dpnp_det, dpnp_eigh, + dpnp_qr, dpnp_slogdet, dpnp_solve, ) @@ -506,7 +507,7 @@ def norm(x1, ord=None, axis=None, keepdims=False): return call_origin(numpy.linalg.norm, x1, ord, axis, keepdims) -def qr(x1, mode="reduced"): +def qr(a, mode="reduced"): """ Compute the qr factorization of a matrix. @@ -515,25 +516,45 @@ def qr(x1, mode="reduced"): For full documentation refer to :obj:`numpy.linalg.qr`. - Limitations - ----------- - Input array is supported as :obj:`dpnp.ndarray`. - Parameter mode='reduced' is supported. + Parameters + ---------- + a : {dpnp.ndarray, usm_ndarray} + The input array with the dimensionality of at least 2. + mode : {"reduced", "complete", "r", "raw"}, optional + If K = min(M, N), then + - "reduced" : returns Q, R with dimensions (…, M, K), (…, K, N) + - "complete" : returns Q, R with dimensions (…, M, M), (…, M, N) + - "r" : returns R only with dimensions (…, K, N) + - "raw" : returns h, tau with dimensions (…, N, M), (…, K,) + Default: "reduced". + + Returns + ------- + out : {dpnp.ndarray, tuple of dpnp.ndarray} + Although the type of returned object depends on the mode, + it returns a tuple of ``(Q, R)`` by default. + For details, please see the document of :func:`numpy.linalg.qr`. 
+ + Examples + -------- + >>> import dpnp as np + >>> a = np.random.randn(9, 6) + >>> Q, R = np.linalg.qr(a) + >>> np.allclose(a, np.dot(Q, R)) + array([ True]) + >>> R2 = np.linalg.qr(a, mode='r') + >>> np.allclose(R, R2) + array([ True]) """ - x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False) - if x1_desc: - if x1_desc.ndim != 2: - pass - elif mode != "reduced": - pass - else: - result_tup = dpnp_qr(x1_desc, mode) + dpnp.check_supported_arrays_type(a) + check_stacked_2d(a) - return result_tup + if mode not in ("reduced", "complete", "r", "raw"): + raise ValueError(f"Unrecognized mode {mode}") - return call_origin(numpy.linalg.qr, x1, mode) + return dpnp_qr(a, mode) def solve(a, b): diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 40159ac02bc..71f27bef7c2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -740,6 +740,139 @@ def dpnp_eigh(a, UPLO): return w, out_v +def dpnp_qr(a, mode="reduced"): + """ + dpnp_qr(a, mode="reduced") + + Return the qr factorization of `a` matrix. + + """ + + if a.ndim > 2: + pass + + a_usm_arr = dpnp.get_usm_ndarray(a) + a_sycl_queue = a.sycl_queue + a_usm_type = a.usm_type + + res_type = _common_type(a) + + m, n = a.shape + k = min(m, n) + if k == 0: + if mode == "reduced": + return dpnp.empty( + (m, 0), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), dpnp.empty( + (0, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + elif mode == "complete": + return dpnp.identity( + m, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + ), dpnp.empty( + (m, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + elif mode == "r": + return dpnp.empty( + (0, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + else: # mode == "raw" + return dpnp.empty( + (n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), dpnp.empty( + (0,), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + + a = a.T + a_usm_arr = dpnp.get_usm_ndarray(a) + a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + + # use DPCTL tensor function to fill the matrix array + # with content from the input array `a` + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue + ) + + tau_h = dpnp.empty( + k, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + ) + + # Call the LAPACK extension function _geqrf to compute the QR factorization + # of a general m x n matrix. + ht_geqrf_ev, geqrf_ev = li._geqrf( + a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] + ) + + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() + + if mode == "r": + r = a_t[:, :k].transpose() + return dpnp.triu(r).astype(res_type, copy=False) + + if mode == "raw": + return ( + a_t.astype(res_type, copy=False), + tau_h.astype(res_type, copy=False), + ) + + # mc is the total number of columns in the q matrix. + # In `complete` mode, mc equals the number of rows. + # In `reduced` mode, mc is the lesser of the row count or column count. 
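+    # For example, a (5, 3) input gives mc = 5 in "complete" mode
+    # (Q has shape (5, 5)) and mc = 3 in "reduced" mode (Q has shape (5, 3)).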
+ if mode == "complete" and m > n: + mc = m + q = dpnp.empty( + (m, m), + dtype=res_type, + order="C", + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + else: + mc = k + q = dpnp.empty( + (n, m), + dtype=res_type, + order="C", + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + q[:n] = a_t + + # Call the LAPACK extension function _orgqr to generate the real orthogonal matrix + # `Q` of the QR factorization + ht_orgqr_ev, _ = li._orgqr( + a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] + ) + + ht_orgqr_ev.wait() + + q = q[:mc].transpose() + r = a_t[:, :mc].transpose() + return ( + q.astype(dtype=res_type, copy=False), + dpnp.triu(r).astype(dtype=res_type, copy=False), + ) + + def dpnp_solve(a, b): """ dpnp_solve(a, b) From 8d644aa3c29d315f962123eef8377043dc3413cc Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:19:39 +0100 Subject: [PATCH 02/39] Update condition and random files in cupy/testing --- tests/third_party/cupy/testing/__init__.py | 4 +--- tests/third_party/cupy/testing/condition.py | 2 +- tests/third_party/cupy/testing/random.py | 11 +++++++---- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/third_party/cupy/testing/__init__.py b/tests/third_party/cupy/testing/__init__.py index 701c381e2f3..aa6c113706b 100644 --- a/tests/third_party/cupy/testing/__init__.py +++ b/tests/third_party/cupy/testing/__init__.py @@ -60,6 +60,4 @@ product, product_dict, ) -from tests.third_party.cupy.testing.random import fix_random - -# from tests.third_party.cupy.testing.random import generate_seed +from tests.third_party.cupy.testing.random import fix_random, generate_seed diff --git a/tests/third_party/cupy/testing/condition.py b/tests/third_party/cupy/testing/condition.py index 4465dc3d0ee..3533ef8b84d 100644 --- a/tests/third_party/cupy/testing/condition.py +++ b/tests/third_party/cupy/testing/condition.py @@ -106,7 +106,7 @@ def repeat(times, intensive_times=None): if intensive_times is None: return repeat_with_success_at_least(times, times) - casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "0"))) + casual_test = bool(int(os.environ.get("CUPY_TEST_CASUAL", "1"))) times_ = times if casual_test else intensive_times return repeat_with_success_at_least(times_, times_) diff --git a/tests/third_party/cupy/testing/random.py b/tests/third_party/cupy/testing/random.py index 444f2b3352c..a4dc897bb9c 100644 --- a/tests/third_party/cupy/testing/random.py +++ b/tests/third_party/cupy/testing/random.py @@ -20,12 +20,15 @@ def do_setup(deterministic=True): global _old_cupy_random_states _old_python_random_state = random.getstate() _old_numpy_random_state = numpy.random.get_state() - _old_cupy_random_states = cupy.random.generator._random_states - cupy.random.reset_states() + _old_cupy_random_states = cupy.random.dpnp_iface_random._dpnp_random_states + cupy.random.dpnp_iface_random._dpnp_random_states = {} # Check that _random_state has been recreated in # cupy.random.reset_states(). Otherwise the contents of # _old_cupy_random_states would be overwritten. 
- assert cupy.random.generator._random_states is not _old_cupy_random_states + assert ( + cupy.random.dpnp_iface_random._dpnp_random_states + is not _old_cupy_random_states + ) if not deterministic: random.seed() @@ -43,7 +46,7 @@ def do_teardown(): global _old_cupy_random_states random.setstate(_old_python_random_state) numpy.random.set_state(_old_numpy_random_state) - cupy.random.generator._random_states = _old_cupy_random_states + cupy.random.dpnp_iface_random._dpnp_random_states = _old_cupy_random_states _old_python_random_state = None _old_numpy_random_state = None _old_cupy_random_states = None From 3a6cb9f484b13801082e7541c01b5e63419e4428 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:23:22 +0100 Subject: [PATCH 03/39] Rename cupy/testing/condition.py to .../_condition.py --- .../cupy/random_tests/test_sample.py | 24 +++++++++---------- .../testing/{condition.py => _condition.py} | 0 2 files changed, 12 insertions(+), 12 deletions(-) rename tests/third_party/cupy/testing/{condition.py => _condition.py} (100%) diff --git a/tests/third_party/cupy/random_tests/test_sample.py b/tests/third_party/cupy/random_tests/test_sample.py index f95f3e42710..79e2370ad05 100644 --- a/tests/third_party/cupy/random_tests/test_sample.py +++ b/tests/third_party/cupy/random_tests/test_sample.py @@ -7,7 +7,7 @@ import dpnp as cupy from dpnp import random from tests.third_party.cupy import testing -from tests.third_party.cupy.testing import condition, hypothesis +from tests.third_party.cupy.testing import _condition, hypothesis @testing.gpu @@ -43,7 +43,7 @@ def test_zero_sizes(self): @testing.gpu class TestRandint2(unittest.TestCase): @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_1(self): vals = [random.randint(0, 10, (2, 3)) for _ in range(10)] for val in vals: @@ -52,7 +52,7 @@ def test_bound_1(self): self.assertEqual(max(_.max() for _ in vals), 9) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_2(self): vals = [random.randint(0, 2) for _ in range(20)] for val in vals: @@ -61,7 +61,7 @@ def test_bound_2(self): self.assertEqual(max(_.max() for _ in vals), 1) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_overflow(self): # 100 - (-100) exceeds the range of int8 val = random.randint(numpy.int8(-100), numpy.int8(100), size=20) @@ -70,7 +70,7 @@ def test_bound_overflow(self): self.assertLess(val.max(), 100) @pytest.mark.usefixtures("allow_fall_back_on_numpy") - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_float1(self): # generate floats s.t. 
int(low) < int(high) low, high = sorted(numpy.random.uniform(-5, 5, size=2)) @@ -90,7 +90,7 @@ def test_bound_float2(self): self.assertEqual(min(_.min() for _ in vals), -1) self.assertEqual(max(_.max() for _ in vals), 0) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 @@ -99,7 +99,7 @@ def test_goodness_of_fit(self): expected = numpy.array([float(trial) / mx] * mx) self.assertTrue(hypothesis.chi_square_test(counts, expected)) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit_2(self): mx = 5 vals = random.randint(mx, size=(5, 20)) @@ -169,7 +169,7 @@ def test_size_is_not_none(self): @testing.fix_random() @testing.gpu class TestRandomIntegers2(unittest.TestCase): - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_1(self): vals = [random.random_integers(0, 10, (2, 3)).get() for _ in range(10)] for val in vals: @@ -177,7 +177,7 @@ def test_bound_1(self): self.assertEqual(min(_.min() for _ in vals), 0) self.assertEqual(max(_.max() for _ in vals), 10) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_bound_2(self): vals = [random.random_integers(0, 2).get() for _ in range(20)] for val in vals: @@ -185,7 +185,7 @@ def test_bound_2(self): self.assertEqual(min(vals), 0) self.assertEqual(max(vals), 2) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit(self): mx = 5 trial = 100 @@ -194,7 +194,7 @@ def test_goodness_of_fit(self): expected = numpy.array([float(trial) / mx] * mx) self.assertTrue(hypothesis.chi_square_test(counts, expected)) - @condition.repeat(3, 10) + @_condition.repeat(3, 10) def test_goodness_of_fit_2(self): mx = 5 vals = random.randint(0, mx, (5, 20)).get() @@ -289,7 +289,7 @@ def test_randn_invalid_argument(self): @testing.fix_random() @testing.gpu class TestMultinomial(unittest.TestCase): - @condition.repeat(3, 10) + @_condition.repeat(3, 10) @testing.for_float_dtypes() @testing.numpy_cupy_allclose(rtol=0.05) def test_multinomial(self, xp, dtype): diff --git a/tests/third_party/cupy/testing/condition.py b/tests/third_party/cupy/testing/_condition.py similarity index 100% rename from tests/third_party/cupy/testing/condition.py rename to tests/third_party/cupy/testing/_condition.py From 872a713d9b760b93c56c6255375a5660d4c44789 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 12:25:44 +0100 Subject: [PATCH 04/39] Add cupy tests for dpnp.linalg.qr --- .../cupy/linalg_tests/test_decomposition.py | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 42bcf122ff4..a9ce10e2051 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -6,6 +6,7 @@ import dpnp as cupy from tests.helper import has_support_aspect64, is_cpu_device from tests.third_party.cupy import testing +from tests.third_party.cupy.testing import _condition def random_matrix(shape, dtype, scale, sym=False): @@ -135,3 +136,74 @@ def check_L(self, array): def test_decomposition(self, dtype): A = numpy.array([[1, -2], [-2, 1]]).astype(dtype) self.check_L(A) + + +@testing.parameterize( + *testing.product( + { + "mode": ["r", "raw", "complete", "reduced"], + } + ) +) +class TestQRDecomposition(unittest.TestCase): + @testing.for_dtypes("fdFD") + def check_mode(self, array, mode, dtype): + if dtype in (numpy.complex64, numpy.complex128): + 
pytest.skip("ungqr unsupported") + + a_cpu = numpy.asarray(array, dtype=dtype) + a_gpu = cupy.asarray(array, dtype=dtype) + result_gpu = cupy.linalg.qr(a_gpu, mode=mode) + if ( + mode != "raw" + or numpy.lib.NumpyVersion(numpy.__version__) >= "1.22.0rc1" + ): + result_cpu = numpy.linalg.qr(a_cpu, mode=mode) + self._check_result(result_cpu, result_gpu) + + def _check_result(self, result_cpu, result_gpu): + if isinstance(result_cpu, tuple): + for b_cpu, b_gpu in zip(result_cpu, result_gpu): + assert b_cpu.dtype == b_gpu.dtype + testing.assert_allclose(b_cpu, b_gpu, atol=1e-4) + else: + assert result_cpu.dtype == result_gpu.dtype + testing.assert_allclose(result_cpu, result_gpu, atol=1e-4) + + @testing.fix_random() + @_condition.repeat(3, 10) + def test_mode(self): + self.check_mode(numpy.random.randn(2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # # @testing.fix_random() + # def test_mode_rank3(self): + # self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) + # self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # # @testing.fix_random() + # def test_mode_rank4(self): + # self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) + # self.check_mode(numpy.random.randn(2, 2, 5, 4), mode=self.mode) + + # @testing.with_requires("numpy>=1.16") + # def test_empty_array(self): + # self.check_mode(numpy.empty((0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0)), mode=self.mode) + + # @testing.with_requires("numpy>=1.22") + # def test_empty_array_rank3(self): + # self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) + # self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) + # self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) + # self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) + # self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) + # self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) From 890bffecfce2cebed6f6349436eff66d685316cf Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 15:38:52 +0100 Subject: [PATCH 05/39] Add batch implementation of dpnp.linalg.qr --- dpnp/backend/extensions/lapack/CMakeLists.txt | 2 + dpnp/backend/extensions/lapack/geqrf.hpp | 12 + .../backend/extensions/lapack/geqrf_batch.cpp | 268 +++++++++++++++++ dpnp/backend/extensions/lapack/lapack_py.cpp | 19 ++ dpnp/backend/extensions/lapack/orgqr.hpp | 13 + .../backend/extensions/lapack/orgqr_batch.cpp | 273 ++++++++++++++++++ .../extensions/lapack/types_matrix.hpp | 49 ++++ dpnp/linalg/dpnp_utils_linalg.py | 178 +++++++++++- 8 files changed, 812 insertions(+), 2 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/geqrf_batch.cpp create mode 100644 dpnp/backend/extensions/lapack/orgqr_batch.cpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index b1d2ecb87df..3dbd95fbe23 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -28,11 +28,13 @@ set(python_module_name _lapack_impl) set(_module_src 
${CMAKE_CURRENT_SOURCE_DIR}/lapack_py.cpp ${CMAKE_CURRENT_SOURCE_DIR}/geqrf.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/geqrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/gesv.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/orgqr_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp diff --git a/dpnp/backend/extensions/lapack/geqrf.hpp b/dpnp/backend/extensions/lapack/geqrf.hpp index 172c7fc76fa..4ab65286b29 100644 --- a/dpnp/backend/extensions/lapack/geqrf.hpp +++ b/dpnp/backend/extensions/lapack/geqrf.hpp @@ -44,6 +44,18 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); +extern std::pair + geqrf_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); + +extern void init_geqrf_batch_dispatch_vector(void); extern void init_geqrf_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp new file mode 100644 index 00000000000..64964bfe153 --- /dev/null +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -0,0 +1,268 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "geqrf.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*geqrf_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static geqrf_batch_impl_fn_ptr_t + geqrf_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event geqrf_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::geqrf_batch_scratchpad_size(exec_q, m, n, lda, stride_a, + stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event geqrf_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + geqrf_batch_event = mkl_lapack::geqrf_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during geqrf_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during geqrf_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(geqrf_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return geqrf_batch_event; +} + +std::pair + geqrf_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + geqrf_batch_impl_fn_ptr_t geqrf_batch_fn = + geqrf_batch_dispatch_vector[a_array_type_id]; + if (geqrf_batch_fn == nullptr) { + throw py::value_error( + "No geqrf_batch implementation defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event geqrf_batch_ev = + geqrf_batch_fn(q, m, n, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, geqrf_batch_ev); +} + 
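+// Dispatch factory: returns the typed geqrf_batch_impl when oneMKL supports
+// the element type T, and nullptr otherwise so the caller can report an error.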
+template +struct GeqrfBatchContigFactory +{ + fnT get() + { + if constexpr (types::GeqrfBatchTypePairSupportFactory::is_defined) { + return geqrf_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_geqrf_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(geqrf_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 6e487ccefb3..3bc3c022626 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -45,10 +45,12 @@ namespace py = pybind11; // populate dispatch vectors void init_dispatch_vectors(void) { + lapack_ext::init_geqrf_batch_dispatch_vector(); lapack_ext::init_geqrf_dispatch_vector(); lapack_ext::init_gesv_dispatch_vector(); lapack_ext::init_getrf_batch_dispatch_vector(); lapack_ext::init_getrf_dispatch_vector(); + lapack_ext::init_orgqr_batch_dispatch_vector(); lapack_ext::init_orgqr_dispatch_vector(); lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); @@ -71,6 +73,14 @@ PYBIND11_MODULE(_lapack_impl, m) init_dispatch_vectors(); init_dispatch_tables(); + m.def("_geqrf_batch", &lapack_ext::geqrf_batch, + "Call `geqrf_batch` from OneMKL LAPACK library to return " + "the QR factorization of a batch general matrix ", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_geqrf", &lapack_ext::geqrf, "Call `geqrf` from OneMKL LAPACK library to return " "the QR factorization of a general m x n matrix ", @@ -105,6 +115,15 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_orgqr_batch", &lapack_ext::orgqr_batch, + "Call `orgqr` from OneMKL LAPACK library to return " + "the real orthogonal matrix Qi of the QR factorization " + "for a batch of general matrices", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_orgqr", &lapack_ext::orgqr, "Call `orgqr` from OneMKL LAPACK library to return " "the real orthogonal matrix Q of the QR factorization", diff --git a/dpnp/backend/extensions/lapack/orgqr.hpp b/dpnp/backend/extensions/lapack/orgqr.hpp index cd0938dd72a..9cc4f530d03 100644 --- a/dpnp/backend/extensions/lapack/orgqr.hpp +++ b/dpnp/backend/extensions/lapack/orgqr.hpp @@ -47,6 +47,19 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); +extern std::pair + orgqr_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); + +extern void init_orgqr_batch_dispatch_vector(void); extern void init_orgqr_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp new file mode 100644 index 00000000000..560e9026fd6 --- /dev/null +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -0,0 +1,273 @@ 
+//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "orgqr.hpp" +#include "types_matrix.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*orgqr_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static orgqr_batch_impl_fn_ptr_t + orgqr_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event orgqr_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + std::int64_t k, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::orgqr_batch_scratchpad_size( + exec_q, m, n, k, lda, stride_a, stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event orgqr_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + orgqr_batch_event = mkl_lapack::orgqr_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). 
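+               // On entry it holds the elementary reflectors computed by
+               // geqrf_batch; on exit each matrix is overwritten with Qi.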
+ lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. Required size is at least " + << e.detail(); + } + else { + error_msg << "Unexpected MKL exception caught during orgqr_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during orgqr_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(orgqr_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return orgqr_batch_event; +} + +std::pair + orgqr_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + orgqr_batch_impl_fn_ptr_t orgqr_batch_fn = + orgqr_batch_dispatch_vector[a_array_type_id]; + if (orgqr_batch_fn == nullptr) { + throw py::value_error( + "No orgqr_batch implementation 
defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event orgqr_batch_ev = + orgqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, orgqr_batch_ev); +} + +template +struct OrgqrBatchContigFactory +{ + fnT get() + { + if constexpr (types::OrgqrBatchTypePairSupportFactory::is_defined) { + return orgqr_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_orgqr_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(orgqr_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 3f7be9cf4a9..89042fcf640 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -43,6 +43,34 @@ namespace lapack { namespace types { +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf_batch + * function. + * + * @tparam T Type of array containing the input matrices to be QR factorized in + * batch mode. Upon execution, each matrix in the batch is transformed to output + * arrays representing their respective orthogonal matrix Q and upper triangular + * matrix R. + */ +template +struct GeqrfBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::geqrf @@ -169,6 +197,27 @@ struct HeevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr_batch + * function. + * + * @tparam T Type of array containing the matrix A, + * each from a separate instance in the batch, from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, each array in the batch is overwritten with + * its respective orthonormal matrix Q. 
+ */ +template +struct OrgqrBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + dpctl_td_ns::TypePairDefinedEntry, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::orgqr diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 71f27bef7c2..eb9d41c0bb2 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -25,7 +25,7 @@ import dpctl.tensor._tensor_impl as ti -from numpy import issubdtype +from numpy import issubdtype, prod import dpnp import dpnp.backend.extensions.lapack._lapack_impl as li @@ -119,6 +119,50 @@ def _real_type(dtype, device=None): return dpnp.dtype(real_type) +def _stacked_identity( + batch_shape, n, dtype, usm_type="device", sycl_queue=None +): + """ + Create stacked identity matrices of size `n x n`. + + Forms multiple identity matrices based on `batch_shape`. + + Parameters + ---------- + batch_shape : tuple + Shape of the batch determining the stacking of identity matrices. + n : int + Dimension of each identity matrix. + dtype : dtype + Data type of the matrix element. + usm_type : {"device", "shared", "host"}, optional + The type of SYCL USM allocation for the output array. + sycl_queue : {None, SyclQueue}, optional + A SYCL queue to use for output array allocation and copying. + + Returns + ------- + out : dpnp.ndarray + Array of stacked `n x n` identity matrices as per `batch_shape`. + + Example + ------- + >>> _stacked_identity((2,), 2, dtype=dpnp.int64) + array([[[1, 0], + [0, 1]], + + [[1, 0], + [0, 1]]]) + + """ + + shape = batch_shape + (n, n) + idx = dpnp.arange(n, usm_type=usm_type, sycl_queue=sycl_queue) + x = dpnp.zeros(shape, dtype=dtype, usm_type=usm_type, sycl_queue=sycl_queue) + x[..., idx, idx] = 1 + return x + + def check_stacked_2d(*arrays): """ Return ``True`` if each array in `arrays` has at least two dimensions. @@ -740,6 +784,136 @@ def dpnp_eigh(a, UPLO): return w, out_v +def dpnp_qr_batch(a, mode="reduced"): + """ + dpnp_qr_batch(a, mode="reduced") + + Return the batched qr factorization of `a` matrix. 
+ + """ + + a_sycl_queue = a.sycl_queue + a_usm_type = a.usm_type + + m, n = a.shape[-2:] + k = min(m, n) + + batch_shape = a.shape[:-2] + batch_size = prod(batch_shape) + + res_type = _common_type(a) + + if batch_size == 0 or k == 0: + if mode == "reduced": + return ( + dpnp.empty(batch_shape + (m, k), dtype=res_type), + dpnp.empty(batch_shape + (k, n), dtype=res_type), + ) + elif mode == "complete": + q = _stacked_identity(batch_shape, m, dtype=res_type) + return (q, dpnp.empty(batch_shape + (m, n), dtype=res_type)) + elif mode == "r": + return dpnp.empty(batch_shape + (k, n), dtype=res_type) + elif mode == "raw": + return ( + dpnp.empty(batch_shape + (n, m), dtype=res_type), + dpnp.empty(batch_shape + (k,), dtype=res_type), + ) + + # get 3d input arrays by reshape + a = a.reshape(-1, m, n) + + a = a.swapaxes(-2, -1) + a_usm_arr = dpnp.get_usm_ndarray(a) + + a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + + # use DPCTL tensor function to fill the matrix array + # with content from the input array `a` + a_ht_copy_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue + ) + + tau_h = dpnp.empty( + (batch_size, k), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) + + a_stride = a_t.strides[0] + tau_stride = tau_h.strides[0] + + # Call the LAPACK extension function _geqrf_batch to compute the QR factorization + # of a general m x n matrix. + ht_lapack_ev, geqrf_batch_ev = li._geqrf_batch( + a_sycl_queue, + a_t.get_array(), + tau_h.get_array(), + m, + n, + a_stride, + tau_stride, + batch_size, + [a_copy_ev], + ) + + ht_lapack_ev.wait() + a_ht_copy_ev.wait() + + if mode == "r": + r = a_t[..., :k].swapaxes(-2, -1) + out_r = dpnp.triu(r).astype(res_type, copy=False) + return out_r.reshape(batch_shape + out_r.shape[-2:]) + + if mode == "raw": + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return ( + q.astype(res_type, copy=False), + r.astype(res_type, copy=False), + ) + + if mode == "complete" and m > n: + mc = m + q = dpnp.empty((batch_size, m, m), dtype=res_type) + else: + mc = k + q = dpnp.empty((batch_size, n, m), dtype=res_type) + q[..., :n, :] = a_t + + q_stride = q.strides[0] + tau_stride = tau_h.strides[0] + + # Call the LAPACK extension function _orgqr to generate the orthogonal/unitary matrix + # `Qi` of the QR factorization for a batch of general matrices. 
+ ht_lapack_ev, _ = li._orgqr_batch( + a_sycl_queue, + q.get_array(), + tau_h.get_array(), + m, + mc, + k, + q_stride, + tau_stride, + batch_size, + [geqrf_batch_ev], + ) + + q = q[..., :mc, :].swapaxes(-2, -1) + r = a_t[..., :mc].swapaxes(-2, -1) + + r_triu = dpnp.triu(r).astype(res_type, copy=False) + + q_reshape = q.reshape(batch_shape + q.shape[-2:]) + r_triu_reshape = r_triu.reshape(batch_shape + r_triu.shape[-2:]) + + return ( + q_reshape.astype(dtype=res_type, copy=False), + r_triu_reshape.astype(dtype=res_type, copy=False), + ) + + def dpnp_qr(a, mode="reduced"): """ dpnp_qr(a, mode="reduced") @@ -749,7 +923,7 @@ def dpnp_qr(a, mode="reduced"): """ if a.ndim > 2: - pass + return dpnp_qr_batch(a, mode=mode) a_usm_arr = dpnp.get_usm_ndarray(a) a_sycl_queue = a.sycl_queue From 47e309e4d429597ca3a625f9173047a440f97d06 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 16:03:23 +0100 Subject: [PATCH 06/39] Uncomment cupy tests for 3d and 4d cases --- .../cupy/linalg_tests/test_decomposition.py | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index a9ce10e2051..813b322d823 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -177,33 +177,33 @@ def test_mode(self): self.check_mode(numpy.random.randn(3, 3), mode=self.mode) self.check_mode(numpy.random.randn(5, 4), mode=self.mode) - # @testing.with_requires("numpy>=1.22") - # # @testing.fix_random() - # def test_mode_rank3(self): - # self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) - # self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) - - # @testing.with_requires("numpy>=1.22") - # # @testing.fix_random() - # def test_mode_rank4(self): - # self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) - # self.check_mode(numpy.random.randn(2, 2, 5, 4), mode=self.mode) - - # @testing.with_requires("numpy>=1.16") - # def test_empty_array(self): - # self.check_mode(numpy.empty((0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0)), mode=self.mode) - - # @testing.with_requires("numpy>=1.22") - # def test_empty_array_rank3(self): - # self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) - # self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) - # self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) - # self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) - # self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) - # self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) + @testing.with_requires("numpy>=1.22") + @testing.fix_random() + def test_mode_rank3(self): + self.check_mode(numpy.random.randn(3, 2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + + @testing.with_requires("numpy>=1.22") + @testing.fix_random() + def test_mode_rank4(self): + self.check_mode(numpy.random.randn(2, 3, 2, 4), mode=self.mode) + self.check_mode(numpy.random.randn(2, 4, 3, 3), mode=self.mode) + self.check_mode(numpy.random.randn(2, 2, 5, 4), 
mode=self.mode) + + @testing.with_requires("numpy>=1.16") + def test_empty_array(self): + self.check_mode(numpy.empty((0, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 0)), mode=self.mode) + + @testing.with_requires("numpy>=1.22") + def test_empty_array_rank3(self): + self.check_mode(numpy.empty((0, 3, 2)), mode=self.mode) + self.check_mode(numpy.empty((3, 0, 2)), mode=self.mode) + self.check_mode(numpy.empty((3, 2, 0)), mode=self.mode) + self.check_mode(numpy.empty((0, 3, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 0, 3)), mode=self.mode) + self.check_mode(numpy.empty((3, 3, 0)), mode=self.mode) + self.check_mode(numpy.empty((0, 2, 3)), mode=self.mode) + self.check_mode(numpy.empty((2, 0, 3)), mode=self.mode) + self.check_mode(numpy.empty((2, 3, 0)), mode=self.mode) From 7e9750ab446a62b49e811af4d02e0ee7f4312be3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 16:08:03 +0100 Subject: [PATCH 07/39] Remove an old impl of dpnp_qr --- dpnp/backend/include/dpnp_iface_fptr.hpp | 2 - dpnp/backend/kernels/dpnp_krnl_linalg.cpp | 34 -------------- dpnp/dpnp_algo/dpnp_algo.pxd | 2 - dpnp/linalg/dpnp_algo_linalg.pyx | 56 ----------------------- 4 files changed, 94 deletions(-) diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp index 87826cc3bb9..4640599d1c0 100644 --- a/dpnp/backend/include/dpnp_iface_fptr.hpp +++ b/dpnp/backend/include/dpnp_iface_fptr.hpp @@ -222,8 +222,6 @@ enum class DPNPFuncName : size_t DPNP_FN_PUT, /**< Used in numpy.put() impl */ DPNP_FN_PUT_ALONG_AXIS, /**< Used in numpy.put_along_axis() impl */ DPNP_FN_QR, /**< Used in numpy.linalg.qr() impl */ - DPNP_FN_QR_EXT, /**< Used in numpy.linalg.qr() impl, requires extra - parameters */ DPNP_FN_RADIANS, /**< Used in numpy.radians() impl */ DPNP_FN_RADIANS_EXT, /**< Used in numpy.radians() impl, requires extra parameters */ diff --git a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp index 297f963ba28..1e0c2c44015 100644 --- a/dpnp/backend/kernels/dpnp_krnl_linalg.cpp +++ b/dpnp/backend/kernels/dpnp_krnl_linalg.cpp @@ -731,17 +731,6 @@ template void (*dpnp_qr_default_c)(void *, void *, void *, void *, size_t, size_t) = dpnp_qr_c<_InputDT, _ComputeDT>; -template -DPCTLSyclEventRef (*dpnp_qr_ext_c)(DPCTLSyclQueueRef, - void *, - void *, - void *, - void *, - size_t, - size_t, - const DPCTLEventVectorRef) = - dpnp_qr_c<_InputDT, _ComputeDT>; - template DPCTLSyclEventRef dpnp_svd_c(DPCTLSyclQueueRef q_ref, void *array1_in, @@ -1041,29 +1030,6 @@ void func_map_init_linalg_func(func_map_t &fmap) // fmap[DPNPFuncName::DPNP_FN_QR][eft_C128][eft_C128] = { // eft_C128, (void*)dpnp_qr_c, std::complex>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_INT][eft_INT] = { - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int32_t, func_type_map_t::find_type>, - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int32_t, func_type_map_t::find_type< - get_default_floating_type()>>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_LNG][eft_LNG] = { - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int64_t, func_type_map_t::find_type>, - get_default_floating_type(), - (void *)dpnp_qr_ext_c< - int64_t, func_type_map_t::find_type< - get_default_floating_type()>>}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_FLT][eft_FLT] = { - eft_FLT, (void *)dpnp_qr_ext_c}; - fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_DBL][eft_DBL] = { - eft_DBL, (void *)dpnp_qr_ext_c}; - // fmap[DPNPFuncName::DPNP_FN_QR_EXT][eft_C128][eft_C128] = { - // 
eft_C128, (void*)dpnp_qr_c, std::complex>}; - fmap[DPNPFuncName::DPNP_FN_SVD][eft_INT][eft_INT] = { eft_DBL, (void *)dpnp_svd_default_c}; fmap[DPNPFuncName::DPNP_FN_SVD][eft_LNG][eft_LNG] = { diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd index 3fcb67aac9e..f91f7693be7 100644 --- a/dpnp/dpnp_algo/dpnp_algo.pxd +++ b/dpnp/dpnp_algo/dpnp_algo.pxd @@ -98,8 +98,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName": # need this na DPNP_FN_PARTITION DPNP_FN_PARTITION_EXT DPNP_FN_PLACE - DPNP_FN_QR - DPNP_FN_QR_EXT DPNP_FN_RADIANS DPNP_FN_RADIANS_EXT DPNP_FN_RNG_BETA diff --git a/dpnp/linalg/dpnp_algo_linalg.pyx b/dpnp/linalg/dpnp_algo_linalg.pyx index 6cf3bd3578f..2ba9fd5646a 100644 --- a/dpnp/linalg/dpnp_algo_linalg.pyx +++ b/dpnp/linalg/dpnp_algo_linalg.pyx @@ -51,7 +51,6 @@ __all__ = [ "dpnp_inv", "dpnp_matrix_rank", "dpnp_norm", - "dpnp_qr", "dpnp_svd", ] @@ -367,61 +366,6 @@ cpdef object dpnp_norm(object input, ord=None, axis=None): raise ValueError("Improper number of dimensions to norm.") -cpdef tuple dpnp_qr(utils.dpnp_descriptor x1, str mode): - cdef size_t size_m = x1.shape[0] - cdef size_t size_n = x1.shape[1] - cdef size_t min_m_n = min(size_m, size_n) - cdef size_t size_tau = min_m_n - - cdef DPNPFuncType param1_type = dpnp_dtype_to_DPNPFuncType(x1.dtype) - cdef DPNPFuncData kernel_data = get_dpnp_function_ptr(DPNP_FN_QR_EXT, param1_type, param1_type) - - x1_obj = x1.get_array() - - cdef (DPNPFuncType, void *) ret_type_and_func = utils.get_ret_type_and_func(kernel_data, - x1_obj.sycl_device.has_aspect_fp64) - cdef DPNPFuncType return_type = ret_type_and_func[0] - cdef custom_linalg_1in_3out_shape_t func = < custom_linalg_1in_3out_shape_t > ret_type_and_func[1] - - cdef utils.dpnp_descriptor res_q = utils.create_output_descriptor((size_m, min_m_n), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - cdef utils.dpnp_descriptor res_r = utils.create_output_descriptor((min_m_n, size_n), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - cdef utils.dpnp_descriptor tau = utils.create_output_descriptor((size_tau, ), - return_type, - None, - device=x1_obj.sycl_device, - usm_type=x1_obj.usm_type, - sycl_queue=x1_obj.sycl_queue) - - result_sycl_queue = res_q.get_array().sycl_queue - - cdef c_dpctl.SyclQueue q = result_sycl_queue - cdef c_dpctl.DPCTLSyclQueueRef q_ref = q.get_queue_ref() - - cdef c_dpctl.DPCTLSyclEventRef event_ref = func(q_ref, - x1.get_data(), - res_q.get_data(), - res_r.get_data(), - tau.get_data(), - size_m, - size_n, - NULL) # dep_events_ref - - with nogil: c_dpctl.DPCTLEvent_WaitAndThrow(event_ref) - c_dpctl.DPCTLEvent_Delete(event_ref) - - return (res_q.get_pyobj(), res_r.get_pyobj()) - - cpdef tuple dpnp_svd(utils.dpnp_descriptor x1, cpp_bool full_matrices, cpp_bool compute_uv, cpp_bool hermitian): cdef size_t size_m = x1.shape[0] cdef size_t size_n = x1.shape[1] From 71277b0c326d2227fbfd8b8b1b34be64c3a68be1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 17:00:28 +0100 Subject: [PATCH 08/39] Update test_qr in test_sycl_queue --- dpnp/linalg/dpnp_utils_linalg.py | 71 +++++++++++++++++++++++++++----- tests/test_sycl_queue.py | 58 +++++++++++++++++--------- 2 files changed, 99 insertions(+), 30 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index eb9d41c0bb2..4de00a11168 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ 
b/dpnp/linalg/dpnp_utils_linalg.py @@ -806,18 +806,57 @@ def dpnp_qr_batch(a, mode="reduced"): if batch_size == 0 or k == 0: if mode == "reduced": return ( - dpnp.empty(batch_shape + (m, k), dtype=res_type), - dpnp.empty(batch_shape + (k, n), dtype=res_type), + dpnp.empty( + batch_shape + (m, k), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + dpnp.empty( + batch_shape + (k, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), ) elif mode == "complete": - q = _stacked_identity(batch_shape, m, dtype=res_type) - return (q, dpnp.empty(batch_shape + (m, n), dtype=res_type)) + q = _stacked_identity( + batch_shape, + m, + dtype=res_type, + usm_type=a_usm_type, + sycl_queue=a_sycl_queue, + ) + return ( + q, + dpnp.empty( + batch_shape + (m, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + ) elif mode == "r": - return dpnp.empty(batch_shape + (k, n), dtype=res_type) + return dpnp.empty( + batch_shape + (k, n), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) elif mode == "raw": return ( - dpnp.empty(batch_shape + (n, m), dtype=res_type), - dpnp.empty(batch_shape + (k,), dtype=res_type), + dpnp.empty( + batch_shape + (n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), + dpnp.empty( + batch_shape + (k,), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ), ) # get 3d input arrays by reshape @@ -826,7 +865,7 @@ def dpnp_qr_batch(a, mode="reduced"): a = a.swapaxes(-2, -1) a_usm_arr = dpnp.get_usm_ndarray(a) - a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + a_t = dpnp.empty_like(a, order="C", dtype=res_type) # use DPCTL tensor function to fill the matrix array # with content from the input array `a` @@ -876,10 +915,20 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "complete" and m > n: mc = m - q = dpnp.empty((batch_size, m, m), dtype=res_type) + q = dpnp.empty( + (batch_size, m, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) else: mc = k - q = dpnp.empty((batch_size, n, m), dtype=res_type) + q = dpnp.empty( + (batch_size, n, m), + dtype=res_type, + sycl_queue=a_sycl_queue, + usm_type=a_usm_type, + ) q[..., :n, :] = a_t q_stride = q.strides[0] @@ -977,7 +1026,7 @@ def dpnp_qr(a, mode="reduced"): a = a.T a_usm_arr = dpnp.get_usm_ndarray(a) - a_t = dpnp.empty_like(a, order="C", dtype=res_type, usm_type=a_usm_type) + a_t = dpnp.empty_like(a, order="C", dtype=res_type) # use DPCTL tensor function to fill the matrix array # with content from the input array `a` diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py index ee247aa5414..85b84b9bae8 100644 --- a/tests/test_sycl_queue.py +++ b/tests/test_sycl_queue.py @@ -1171,34 +1171,54 @@ def test_matrix_rank(device): assert_array_equal(expected, result) +@pytest.mark.parametrize( + "shape", + [ + (4, 4), + (2, 0), + (2, 2, 3), + (0, 2, 3), + (1, 0, 3), + ], + ids=[ + "(4, 4)", + "(2, 0)", + "(2, 2, 3)", + "(0, 2, 3)", + "(1, 0, 3)", + ], +) +@pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], +) @pytest.mark.parametrize( "device", valid_devices, ids=[device.filter_string for device in valid_devices], ) -def test_qr(device): - data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]] - dpnp_data = dpnp.array(data, device=device) - numpy_data = numpy.array(data, dtype=dpnp_data.dtype) - - np_q, np_r = numpy.linalg.qr(numpy_data, "reduced") - dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, 
"reduced") +def test_qr(shape, mode, device): + dtype = dpnp.default_float_type(device) + count_elems = numpy.prod(shape) + dpnp_data = dpnp.arange(count_elems, dtype=dtype, device=device).reshape( + shape + ) - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + expected_queue = dpnp_data.get_array().sycl_queue - assert_dtype_allclose(dpnp_q, np_q) - assert_dtype_allclose(dpnp_r, np_r) + if mode == "r": + dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) + dpnp_r_queue = dpnp_r.get_array().sycl_queue + assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + else: + dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) - expected_queue = dpnp_data.get_array().sycl_queue - dpnp_q_queue = dpnp_q.get_array().sycl_queue - dpnp_r_queue = dpnp_r.get_array().sycl_queue + dpnp_q_queue = dpnp_q.get_array().sycl_queue + dpnp_r_queue = dpnp_r.get_array().sycl_queue - # compare queue and device - assert_sycl_queue_equal(dpnp_q_queue, expected_queue) - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + assert_sycl_queue_equal(dpnp_q_queue, expected_queue) + assert_sycl_queue_equal(dpnp_r_queue, expected_queue) @pytest.mark.parametrize( From 0fe3346739b7a3a5cc0a20244140d0eb9fae145f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 31 Jan 2024 17:12:06 +0100 Subject: [PATCH 09/39] Add test_qr in test_usm_type --- tests/test_sycl_queue.py | 22 ++++++++++------------ tests/test_usm_type.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 12 deletions(-) diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py index 85b84b9bae8..a5b02f4007f 100644 --- a/tests/test_sycl_queue.py +++ b/tests/test_sycl_queue.py @@ -1201,24 +1201,22 @@ def test_matrix_rank(device): def test_qr(shape, mode, device): dtype = dpnp.default_float_type(device) count_elems = numpy.prod(shape) - dpnp_data = dpnp.arange(count_elems, dtype=dtype, device=device).reshape( - shape - ) + a = dpnp.arange(count_elems, dtype=dtype, device=device).reshape(shape) - expected_queue = dpnp_data.get_array().sycl_queue + expected_queue = a.get_array().sycl_queue if mode == "r": - dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) - dpnp_r_queue = dpnp_r.get_array().sycl_queue - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + dp_r = dpnp.linalg.qr(a, mode=mode) + dp_r_queue = dp_r.get_array().sycl_queue + assert_sycl_queue_equal(dp_r_queue, expected_queue) else: - dpnp_q, dpnp_r = dpnp.linalg.qr(dpnp_data, mode=mode) + dp_q, dp_r = dpnp.linalg.qr(a, mode=mode) - dpnp_q_queue = dpnp_q.get_array().sycl_queue - dpnp_r_queue = dpnp_r.get_array().sycl_queue + dp_q_queue = dp_q.get_array().sycl_queue + dp_r_queue = dp_r.get_array().sycl_queue - assert_sycl_queue_equal(dpnp_q_queue, expected_queue) - assert_sycl_queue_equal(dpnp_r_queue, expected_queue) + assert_sycl_queue_equal(dp_q_queue, expected_queue) + assert_sycl_queue_equal(dp_r_queue, expected_queue) @pytest.mark.parametrize( diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py index 554f939b208..f43cf138afd 100644 --- a/tests/test_usm_type.py +++ b/tests/test_usm_type.py @@ -710,3 +710,40 @@ def test_det(shape, is_empty, usm_type): det = dp.linalg.det(x) assert x.usm_type == det.usm_type + + +@pytest.mark.parametrize("usm_type", list_of_usm_types, ids=list_of_usm_types) +@pytest.mark.parametrize( + "shape", + [ + (4, 4), + (2, 0), + (2, 2, 3), + (0, 2, 3), + (1, 0, 3), + ], + ids=[ + "(4, 4)", + "(2, 0)", + "(2, 2, 3)", + "(0, 2, 3)", + "(1, 0, 
3)", + ], +) +@pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], +) +def test_qr(shape, mode, usm_type): + count_elems = numpy.prod(shape) + a = dp.arange(count_elems, usm_type=usm_type).reshape(shape) + + if mode == "r": + dp_r = dp.linalg.qr(a, mode=mode) + assert a.usm_type == dp_r.usm_type + else: + dp_q, dp_r = dp.linalg.qr(a, mode=mode) + + assert a.usm_type == dp_q.usm_type + assert a.usm_type == dp_r.usm_type From 3b57f0ca1585e7ff33f812ccb8adc2de4a69af71 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 1 Feb 2024 12:33:08 +0100 Subject: [PATCH 10/39] Rename condition to _condition in test_solve.py --- tests/third_party/cupy/linalg_tests/test_solve.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/third_party/cupy/linalg_tests/test_solve.py b/tests/third_party/cupy/linalg_tests/test_solve.py index b31082c8e84..cd397f6c9e1 100644 --- a/tests/third_party/cupy/linalg_tests/test_solve.py +++ b/tests/third_party/cupy/linalg_tests/test_solve.py @@ -10,7 +10,7 @@ is_cpu_device, ) from tests.third_party.cupy import testing -from tests.third_party.cupy.testing import condition +from tests.third_party.cupy.testing import _condition @testing.parameterize( @@ -104,7 +104,7 @@ def test_invalid_shape(self): ) class TestInv(unittest.TestCase): @testing.for_dtypes("ifdFD") - @condition.retry(10) + @_condition.retry(10) def check_x(self, a_shape, dtype): a_cpu = numpy.random.randint(0, 10, size=a_shape) a_cpu = a_cpu.astype(dtype, order=self.order) From dc6dbcb9e9f52e3caed628d11c6e4fc99c8580b0 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Thu, 1 Feb 2024 13:38:58 +0100 Subject: [PATCH 11/39] Use _real_type for _orgqr --- dpnp/linalg/dpnp_utils_linalg.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index de9a0854b73..40600dfe0a9 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1224,6 +1224,9 @@ def dpnp_qr(a, mode="reduced"): tau_h.astype(res_type, copy=False), ) + # _orgqr supports only floating type + orgqr_type = _real_type(res_type) + # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. # In `reduced` mode, mc is the lesser of the row count or column count. 
@@ -1231,7 +1234,7 @@ def dpnp_qr(a, mode="reduced"): mc = m q = dpnp.empty( (m, m), - dtype=res_type, + dtype=orgqr_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, @@ -1240,7 +1243,7 @@ def dpnp_qr(a, mode="reduced"): mc = k q = dpnp.empty( (n, m), - dtype=res_type, + dtype=orgqr_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, From a2349fe04c8b1c713ed537f65c51e2cad89c3d5b Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 13:15:42 +0100 Subject: [PATCH 12/39] Use _real_type for _orgqr_batch --- dpnp/linalg/dpnp_utils_linalg.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 40600dfe0a9..d34d12212bb 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1080,11 +1080,14 @@ def dpnp_qr_batch(a, mode="reduced"): r.astype(res_type, copy=False), ) + # _orgqr supports only floating type + orgqr_type = _real_type(res_type) + if mode == "complete" and m > n: mc = m q = dpnp.empty( (batch_size, m, m), - dtype=res_type, + dtype=orgqr_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1092,7 +1095,7 @@ def dpnp_qr_batch(a, mode="reduced"): mc = k q = dpnp.empty( (batch_size, n, m), - dtype=res_type, + dtype=orgqr_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1126,7 +1129,7 @@ def dpnp_qr_batch(a, mode="reduced"): return ( q_reshape.astype(dtype=res_type, copy=False), - r_triu_reshape.astype(dtype=res_type, copy=False), + r_triu_reshape, ) From 4e993a9229366756d9071ff8dcb4c4d408753fe4 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 15:11:05 +0100 Subject: [PATCH 13/39] Update dpnp tests for dpnp.linalg.qr --- tests/test_linalg.py | 193 ++++++++++++++++++++++++++++--------------- 1 file changed, 126 insertions(+), 67 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 5ea536c2887..49030fee561 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -1,7 +1,12 @@ import dpctl import numpy import pytest -from numpy.testing import assert_allclose, assert_array_equal, assert_raises +from numpy.testing import ( + assert_allclose, + assert_almost_equal, + assert_array_equal, + assert_raises, +) import dpnp as inp from tests.third_party.cupy import testing @@ -671,88 +676,142 @@ def test_norm3(array, ord, axis): assert_allclose(expected, result) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -@pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) -@pytest.mark.parametrize( - "shape", - [(2, 2), (3, 4), (5, 3), (16, 16), (0, 0), (0, 2), (2, 0)], - ids=["(2,2)", "(3,4)", "(5,3)", "(16,16)", "(0,0)", "(0,2)", "(2,0)"], -) -@pytest.mark.parametrize( - "mode", ["complete", "reduced"], ids=["complete", "reduced"] -) -def test_qr(type, shape, mode): - a = numpy.arange(shape[0] * shape[1], dtype=type).reshape(shape) - ia = inp.array(a) - - np_q, np_r = numpy.linalg.qr(a, mode) - dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) +class TestQr: + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + @pytest.mark.parametrize( + "shape", + [(2, 2), (3, 4), (5, 3), (16, 16), (2, 2, 2), (2, 4, 2), (2, 2, 4)], + ids=[ + "(2, 2)", + "(3, 4)", + "(5, 3)", + "(16, 16)", + "(2, 2, 2)", + "(2, 4, 2)", + "(2, 2, 4)", + ], + ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr(self, dtype, shape, mode): + a = 
numpy.random.rand(*shape).astype(dtype) + ia = inp.array(a) - support_aspect64 = has_support_aspect64() + if mode == "r": + np_r = numpy.linalg.qr(a, mode) + dpnp_r = inp.linalg.qr(ia, mode) + else: + np_q, np_r = numpy.linalg.qr(a, mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) + + # check decomposition + if mode in ("complete", "reduced"): + # _orgqr doesn`t support complex type + if not inp.issubdtype(dtype, inp.complexfloating): + if a.ndim == 2: + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) + else: + batch_size = a.shape[0] + for i in range(batch_size): + assert_almost_equal( + inp.dot(dpnp_q[i], dpnp_r[i]), + a[i], + decimal=5, + ) + else: # mode=="raw" + # _orgqr doesn`t support complex type + if not inp.issubdtype(dtype, inp.complexfloating): + assert_dtype_allclose(dpnp_q, np_q) + assert_dtype_allclose(dpnp_r, np_r) + if mode in ("raw", "r"): + assert_dtype_allclose(dpnp_r, np_r) - if support_aspect64: - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + @pytest.mark.parametrize( + "shape", + [(0, 0), (0, 2), (2, 0), (2, 0, 3), (2, 3, 0), (0, 2, 3)], + ids=[ + "(0, 0)", + "(0, 2)", + "(2 ,0)", + "(2, 0, 3)", + "(2, 3, 0)", + "(0, 2, 3)", + ], + ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr_empty(self, dtype, shape, mode): + a = numpy.empty(shape, dtype=dtype) + ia = inp.array(a) - tol = 1e-6 - if type == inp.float32: - tol = 1e-02 - elif not support_aspect64 and type in (inp.int32, inp.int64, None): - tol = 1e-02 + if mode == "r": + np_r = numpy.linalg.qr(a, mode) + dpnp_r = inp.linalg.qr(ia, mode) + else: + np_q, np_r = numpy.linalg.qr(a, mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia, mode) - # check decomposition - assert_allclose( - ia, - inp.dot(dpnp_q, dpnp_r), - rtol=tol, - atol=tol, - ) + assert_dtype_allclose(dpnp_q, np_q) - # NP change sign for comparison - ncols = min(a.shape[0], a.shape[1]) - for i in range(ncols): - j = numpy.where(numpy.abs(np_q[:, i]) > tol)[0][0] - if np_q[j, i] * dpnp_q[j, i] < 0: - np_q[:, i] = -np_q[:, i] - np_r[i, :] = -np_r[i, :] + assert_dtype_allclose(dpnp_r, np_r) - if numpy.any(numpy.abs(np_r[i, :]) > tol): - assert_allclose( - inp.asnumpy(dpnp_q)[:, i], np_q[:, i], rtol=tol, atol=tol - ) + @pytest.mark.parametrize( + "mode", + ["r", "raw", "complete", "reduced"], + ids=["r", "raw", "complete", "reduced"], + ) + def test_qr_strides(self, mode): + a = numpy.random.rand(5, 5) + ia = inp.array(a) - assert_allclose(dpnp_r, np_r, rtol=tol, atol=tol) + # positive strides + if mode == "r": + np_r = numpy.linalg.qr(a[::2, ::2], mode) + dpnp_r = inp.linalg.qr(ia[::2, ::2], mode) + else: + np_q, np_r = numpy.linalg.qr(a[::2, ::2], mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia[::2, ::2], mode) + assert_dtype_allclose(dpnp_q, np_q) -@pytest.mark.usefixtures("allow_fall_back_on_numpy") -def test_qr_not_2D(): - a = numpy.arange(12, dtype=numpy.float32).reshape((3, 2, 2)) - ia = inp.array(a) + assert_dtype_allclose(dpnp_r, np_r) - np_q, np_r = numpy.linalg.qr(a) - dpnp_q, dpnp_r = inp.linalg.qr(ia) + # negative strides + if mode == "r": + np_r = numpy.linalg.qr(a[::-2, ::-2], mode) + dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode) + else: + np_q, np_r = numpy.linalg.qr(a[::-2, ::-2], mode) + dpnp_q, dpnp_r = inp.linalg.qr(ia[::-2, ::-2], mode) - assert dpnp_q.dtype == np_q.dtype - assert 
dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + assert_dtype_allclose(dpnp_q, np_q) - assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r)) + assert_dtype_allclose(dpnp_r, np_r) - a = numpy.empty((0, 3, 2), dtype=numpy.float32) - ia = inp.array(a) + def test_qr_errors(self): + a_dp = inp.array([[1, 2], [3, 5]], dtype="float32") - np_q, np_r = numpy.linalg.qr(a) - dpnp_q, dpnp_r = inp.linalg.qr(ia) + # unsupported type + a_np = inp.asnumpy(a_dp) + assert_raises(TypeError, inp.linalg.qr, a_np) - assert dpnp_q.dtype == np_q.dtype - assert dpnp_r.dtype == np_r.dtype - assert dpnp_q.shape == np_q.shape - assert dpnp_r.shape == np_r.shape + # a.ndim < 2 + a_dp_ndim_1 = a_dp.flatten() + assert_raises(inp.linalg.LinAlgError, inp.linalg.qr, a_dp_ndim_1) - assert_allclose(ia, inp.matmul(dpnp_q, dpnp_r)) + # invalid mode + assert_raises(ValueError, inp.linalg.qr, a_dp, "c") @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True)) From 95b842062bcf38b178743868d8d8c036d8ab0542 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 16:56:56 +0100 Subject: [PATCH 14/39] Pass scratchpad_size to the error message test --- dpnp/backend/extensions/lapack/geqrf.cpp | 3 ++- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 3 ++- dpnp/backend/extensions/lapack/orgqr.cpp | 3 ++- dpnp/backend/extensions/lapack/orgqr_batch.cpp | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index c32de58fccf..8413c2d785c 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -106,7 +106,8 @@ static sycl::event geqrf_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during geqrf() " diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 64964bfe153..14f7cae487e 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -121,7 +121,8 @@ static sycl::event geqrf_batch_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during geqrf_batch() " diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 25f2c29f3f5..49a62742764 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -110,7 +110,8 @@ static sycl::event orgqr_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. 
Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during orgqr() " diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index 560e9026fd6..4a8ecaf5156 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -125,7 +125,8 @@ static sycl::event orgqr_batch_impl(sycl::queue exec_q, else if (info == scratchpad_size && e.detail() != 0) { error_msg << "Insufficient scratchpad size. Required size is at least " - << e.detail(); + << e.detail() << ", but current size is " << scratchpad_size + << "."; } else { error_msg << "Unexpected MKL exception caught during orgqr_batch() " From 3a50768feae340943d4445c905c0ea78813806b1 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 17:51:19 +0100 Subject: [PATCH 15/39] Add additional checks --- dpnp/backend/extensions/lapack/geqrf.cpp | 17 +++++++++++++++++ dpnp/backend/extensions/lapack/geqrf_batch.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/orgqr.cpp | 9 +++++++++ 3 files changed, 34 insertions(+) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index 8413c2d785c..0e41a9a68d3 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -189,6 +189,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } geqrf_impl_fn_ptr_t geqrf_fn = geqrf_dispatch_vector[a_array_type_id]; if (geqrf_fn == nullptr) { @@ -208,6 +216,15 @@ std::pair const std::int64_t n = a_array_shape[0]; const std::int64_t lda = std::max(1UL, m); + const size_t tau_array_size = tau_array.get_size(); + const size_t min_m_n = std::max(1UL, std::min(m, n)); + + if (tau_array_size != min_m_n) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + ", but a size=" + + std::to_string(min_m_n) + " array is expected."); + } + std::vector host_task_events; sycl::event geqrf_ev = geqrf_fn(q, m, n, a_array_data, lda, tau_array_data, host_task_events, depends); diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 14f7cae487e..19e270c0366 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -207,6 +207,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } geqrf_batch_impl_fn_ptr_t geqrf_batch_fn = geqrf_batch_dispatch_vector[a_array_type_id]; diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 49a62742764..b3f3c617083 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -193,6 +193,15 @@ 
std::pair "must be contiguous"); } + const size_t tau_array_size = tau_array.get_size(); + const size_t min_m_n = std::max(1UL, std::min(m, n)); + + if (tau_array_size != min_m_n) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + ", but a size=" + + std::to_string(min_m_n) + " array is expected."); + } + auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); From 09b6673e6e7b1b5626e6795031abf0f7d79f3674 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Fri, 2 Feb 2024 17:57:22 +0100 Subject: [PATCH 16/39] Extend error handler for mkl batch funcs --- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 5 +++++ dpnp/backend/extensions/lapack/orgqr_batch.cpp | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index 19e270c0366..a05b01e9172 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -124,6 +124,11 @@ static sycl::event geqrf_batch_impl(sycl::queue exec_q, << e.detail() << ", but current size is " << scratchpad_size << "."; } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. " + "Number of failed calculations: " + << info; + } else { error_msg << "Unexpected MKL exception caught during geqrf_batch() " "call:\nreason: " diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index 4a8ecaf5156..afffbf1ec48 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -128,6 +128,11 @@ static sycl::event orgqr_batch_impl(sycl::queue exec_q, << e.detail() << ", but current size is " << scratchpad_size << "."; } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. " + "Number of failed calculations: " + << info; + } else { error_msg << "Unexpected MKL exception caught during orgqr_batch() " "call:\nreason: " From 8b4528fc44a81fdbbfbdbe64302f9ddb250ad0c5 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 13:16:33 +0100 Subject: [PATCH 17/39] Use mkl_lapack namespace --- dpnp/backend/extensions/lapack/geqrf.cpp | 4 ++-- dpnp/backend/extensions/lapack/orgqr.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf.cpp b/dpnp/backend/extensions/lapack/geqrf.cpp index 0e41a9a68d3..a91f689d503 100644 --- a/dpnp/backend/extensions/lapack/geqrf.cpp +++ b/dpnp/backend/extensions/lapack/geqrf.cpp @@ -73,7 +73,7 @@ static sycl::event geqrf_impl(sycl::queue exec_q, T *tau = reinterpret_cast(in_tau); const std::int64_t scratchpad_size = - oneapi::mkl::lapack::geqrf_scratchpad_size(exec_q, m, n, lda); + mkl_lapack::geqrf_scratchpad_size(exec_q, m, n, lda); T *scratchpad = nullptr; std::stringstream error_msg; @@ -84,7 +84,7 @@ static sycl::event geqrf_impl(sycl::queue exec_q, try { scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - geqrf_event = oneapi::mkl::lapack::geqrf( + geqrf_event = mkl_lapack::geqrf( exec_q, m, // The number of rows in the matrix; (0 ≤ m). n, // The number of columns in the matrix; (0 ≤ n). 
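For context on the tau-size validation added above: geqrf produces k = min(m, n) elementary reflectors for an m x n matrix, so the array of Householder scalars must hold exactly k entries. The sketch below shows the same convention through NumPy's "raw" mode, used here purely as a reference; the patch itself operates on dpnp arrays through the _geqrf/_orgqr extensions, and the shapes are arbitrary.

import numpy as np

m, n = 5, 3
a = np.random.rand(m, n)
h, tau = np.linalg.qr(a, mode="raw")

assert tau.shape == (min(m, n),)  # one scalar factor per elementary reflector
assert h.shape == (n, m)          # LAPACK-style transposed layout, as in dpnp_qr/dpnp_qr_batch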
diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index b3f3c617083..cba825a7506 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -75,7 +75,7 @@ static sycl::event orgqr_impl(sycl::queue exec_q, T *tau = reinterpret_cast(in_tau); const std::int64_t scratchpad_size = - oneapi::mkl::lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); + mkl_lapack::orgqr_scratchpad_size(exec_q, m, n, k, lda); T *scratchpad = nullptr; std::stringstream error_msg; @@ -86,7 +86,7 @@ static sycl::event orgqr_impl(sycl::queue exec_q, try { scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - orgqr_event = oneapi::mkl::lapack::orgqr( + orgqr_event = mkl_lapack::orgqr( exec_q, m, // The number of rows in the matrix; (0 ≤ m). n, // The number of columns in the matrix; (0 ≤ n). From ea7cb470e5fe7633cbe56dad0900e722fd262d61 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:18:55 +0100 Subject: [PATCH 18/39] Add ungqr mkl extension to support complex dtype --- dpnp/backend/extensions/lapack/CMakeLists.txt | 1 + dpnp/backend/extensions/lapack/lapack_py.cpp | 9 + .../extensions/lapack/types_matrix.hpp | 25 ++ dpnp/backend/extensions/lapack/ungqr.cpp | 253 ++++++++++++++++++ dpnp/backend/extensions/lapack/ungqr.hpp | 67 +++++ dpnp/linalg/dpnp_utils_linalg.py | 23 +- 6 files changed, 369 insertions(+), 9 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/ungqr.cpp create mode 100644 dpnp/backend/extensions/lapack/ungqr.hpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index e8cde917364..dda702b8b0f 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -39,6 +39,7 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp ) pybind11_add_module(${python_module_name} MODULE ${_module_src}) diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 180ea71288e..95608a79db3 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -39,6 +39,7 @@ #include "orgqr.hpp" #include "potrf.hpp" #include "syevd.hpp" +#include "ungqr.hpp" namespace lapack_ext = dpnp::backend::ext::lapack; namespace py = pybind11; @@ -57,6 +58,7 @@ void init_dispatch_vectors(void) lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); + lapack_ext::init_ungqr_dispatch_vector(); } // populate dispatch tables @@ -161,4 +163,11 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"), py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + + m.def("_ungqr", &lapack_ext::ungqr, + "Call `ungqr` from OneMKL LAPACK library to return " + "the complex unitary matrix Q of the QR factorization", + py::arg("sycl_queue"), py::arg("m"), py::arg("n"), py::arg("k"), + py::arg("a_array"), py::arg("tau_array"), + py::arg("depends") = py::list()); } diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index 804e5d2d487..f80675beb38 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -332,6 +332,31 @@ struct 
SyevdTypePairSupportFactory // fall-through dpctl_td_ns::NotDefinedEntry>::is_defined; }; + +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr + * function. + * + * @tparam T Type of array containing the matrix A from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, the array is overwritten with the complex unitary matrix Q. + */ +template +struct UngqrTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; } // namespace types } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp new file mode 100644 index 00000000000..0a2a2660ed8 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -0,0 +1,253 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "types_matrix.hpp" +#include "ungqr.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*ungqr_impl_fn_ptr_t)(sycl::queue, + const std::int64_t, + const std::int64_t, + const std::int64_t, + char *, + std::int64_t, + char *, + std::vector &, + const std::vector &); + +static ungqr_impl_fn_ptr_t ungqr_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event ungqr_impl(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + char *in_a, + std::int64_t lda, + char *in_tau, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::ungqr_scratchpad_size(exec_q, m, n, k, lda); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event ungqr_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + ungqr_event = mkl_lapack::ungqr( + exec_q, + m, // The number of rows in the matrix; (0 ≤ m). + n, // The number of columns in the matrix; (0 ≤ n). + k, // The number of elementary reflectors + // whose product defines the matrix Q; (0 ≤ k ≤ n). + a, // Pointer to the m-by-n matrix. + lda, // The leading dimension of `a`; (1 ≤ m). + tau, // Pointer to the array of scalar factors of the + // elementary reflectors. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. 
Required size is at least " + << e.detail() << ", but current size is " << scratchpad_size + << "."; + } + else { + error_msg << "Unexpected MKL exception caught during ungqr() " + "call:\nreason: " + << e.what() << "\ninfo: " << info; + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg << "Unexpected SYCL exception caught during orfqr() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ungqr_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + + return ungqr_event; +} + +std::pair + ungqr(sycl::queue q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd != 2) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but a 2-dimensional array is expected."); + } + + if (tau_array_nd != 1) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 1-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { + throw py::value_error("The array of Householder scalars " + "must be contiguous"); + } + + const size_t tau_array_size = tau_array.get_size(); + if (static_cast(tau_array_size) != k) { + throw py::value_error("The array of Householder scalars has size=" + + std::to_string(tau_array_size) + + ", but an array of size=" + std::to_string(k) + + " is expected."); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + ungqr_impl_fn_ptr_t ungqr_fn = ungqr_dispatch_vector[a_array_type_id]; + if (ungqr_fn == nullptr) { + throw py::value_error( + "No ungqr implementation defined for the provided type " + "of the input matrix."); + } + + char *a_array_data = a_array.get_data(); + const std::int64_t lda = std::max(1UL, m); + + char *tau_array_data = tau_array.get_data(); + + std::vector host_task_events; + sycl::event ungqr_ev = ungqr_fn(q, m, n, k, a_array_data, lda, + tau_array_data, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, ungqr_ev); +} + +template +struct UngqrContigFactory +{ + fnT get() + { 
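+        // Note (descriptive comment, not in the original patch): return the
+        // typed ungqr implementation only when the type pair is enabled in
+        // UngqrTypePairSupportFactory; otherwise return nullptr so the caller
+        // can report the dtype as unsupported.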
+ if constexpr (types::UngqrTypePairSupportFactory::is_defined) { + return ungqr_impl; + } + else { + return nullptr; + } + } +}; + +void init_ungqr_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(ungqr_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp new file mode 100644 index 00000000000..4151364ab62 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -0,0 +1,67 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include +#include + +#include + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +extern std::pair + ungqr(sycl::queue exec_q, + const std::int64_t m, + const std::int64_t n, + const std::int64_t k, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + const std::vector &depends = {}); + +// extern std::pair +// ungqr_batch(sycl::queue exec_q, +// dpctl::tensor::usm_ndarray a_array, +// dpctl::tensor::usm_ndarray tau_array, +// std::int64_t m, +// std::int64_t n, +// std::int64_t k, +// std::int64_t stride_a, +// std::int64_t stride_tau, +// std::int64_t batch_size, +// const std::vector &depends = {}); + +// extern void init_ungqr_batch_dispatch_vector(void); +extern void init_ungqr_dispatch_vector(void); +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d34d12212bb..0836dbc3fdb 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1227,9 +1227,6 @@ def dpnp_qr(a, mode="reduced"): tau_h.astype(res_type, copy=False), ) - # _orgqr supports only floating type - orgqr_type = _real_type(res_type) - # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. 
# In `reduced` mode, mc is the lesser of the row count or column count. @@ -1237,7 +1234,7 @@ def dpnp_qr(a, mode="reduced"): mc = m q = dpnp.empty( (m, m), - dtype=orgqr_type, + dtype=res_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, @@ -1246,20 +1243,28 @@ def dpnp_qr(a, mode="reduced"): mc = k q = dpnp.empty( (n, m), - dtype=orgqr_type, + dtype=res_type, order="C", sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) q[:n] = a_t - # Call the LAPACK extension function _orgqr to generate the real orthogonal matrix - # `Q` of the QR factorization - ht_orgqr_ev, _ = li._orgqr( + # Get LAPACK function (_orgqr for real or _ungqf for complex data types) + # for QR factorization + lapack_func = ( + "_ungqr" + if dpnp.issubdtype(res_type, dpnp.complexfloating) + else "_orgqr" + ) + + # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ + # complex unitary matrix `Q` of the QR factorization + ht_lapack_ev, _ = getattr(li, lapack_func)( a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) - ht_orgqr_ev.wait() + ht_lapack_ev.wait() q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 245ee80c68f48f962e1fa0b68735cfe495146058 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:26:35 +0100 Subject: [PATCH 19/39] Update tau array size check for orgqr --- dpnp/backend/extensions/lapack/orgqr.cpp | 8 ++++---- dpnp/backend/extensions/lapack/ungqr.cpp | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index cba825a7506..07fd72af13e 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -194,12 +194,12 @@ std::pair } const size_t tau_array_size = tau_array.get_size(); - const size_t min_m_n = std::max(1UL, std::min(m, n)); - if (tau_array_size != min_m_n) { + if (static_cast(tau_array_size) != k) { throw py::value_error("The array of Householder scalars has size=" + - std::to_string(tau_array_size) + ", but a size=" + - std::to_string(min_m_n) + " array is expected."); + std::to_string(tau_array_size) + + ", but an array of size=" + std::to_string(k) + + " is expected."); } auto array_types = dpctl_td_ns::usm_ndarray_types(); diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp index 0a2a2660ed8..6ecca118f7d 100644 --- a/dpnp/backend/extensions/lapack/ungqr.cpp +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -187,12 +187,14 @@ std::pair bool is_tau_array_c_contig = tau_array.is_c_contiguous(); bool is_tau_array_f_contig = tau_array.is_f_contiguous(); + if (!is_tau_array_c_contig || !is_tau_array_f_contig) { throw py::value_error("The array of Householder scalars " "must be contiguous"); } const size_t tau_array_size = tau_array.get_size(); + if (static_cast(tau_array_size) != k) { throw py::value_error("The array of Householder scalars has size=" + std::to_string(tau_array_size) + From c4c29bd25bad4155beb1a514a4856d5ea1d36385 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 14:58:30 +0100 Subject: [PATCH 20/39] Add ungqr_batch mkl extension to support complex dtype --- dpnp/backend/extensions/lapack/CMakeLists.txt | 1 + dpnp/backend/extensions/lapack/lapack_py.cpp | 12 +- .../extensions/lapack/types_matrix.hpp | 27 ++ dpnp/backend/extensions/lapack/ungqr.hpp | 24 +- .../backend/extensions/lapack/ungqr_batch.cpp | 279 ++++++++++++++++++ dpnp/linalg/dpnp_utils_linalg.py | 28 +- 6 files changed, 348 
insertions(+), 23 deletions(-) create mode 100644 dpnp/backend/extensions/lapack/ungqr_batch.cpp diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index dda702b8b0f..0a0f6490bee 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -40,6 +40,7 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ungqr_batch.cpp ) pybind11_add_module(${python_module_name} MODULE ${_module_src}) diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index 95608a79db3..cd95dbac0ed 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -58,6 +58,7 @@ void init_dispatch_vectors(void) lapack_ext::init_potrf_batch_dispatch_vector(); lapack_ext::init_potrf_dispatch_vector(); lapack_ext::init_syevd_dispatch_vector(); + lapack_ext::init_ungqr_batch_dispatch_vector(); lapack_ext::init_ungqr_dispatch_vector(); } @@ -128,7 +129,7 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("depends") = py::list()); m.def("_orgqr_batch", &lapack_ext::orgqr_batch, - "Call `orgqr` from OneMKL LAPACK library to return " + "Call `_orgqr_batch` from OneMKL LAPACK library to return " "the real orthogonal matrix Qi of the QR factorization " "for a batch of general matrices", py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), @@ -164,6 +165,15 @@ PYBIND11_MODULE(_lapack_impl, m) py::arg("eig_vecs"), py::arg("eig_vals"), py::arg("depends") = py::list()); + m.def("_ungqr_batch", &lapack_ext::ungqr_batch, + "Call `_ungqr_batch` from OneMKL LAPACK library to return " + "the complex unitary matrices matrix Qi of the QR factorization " + "for a batch of general matrices", + py::arg("sycl_queue"), py::arg("a_array"), py::arg("tau_array"), + py::arg("m"), py::arg("n"), py::arg("k"), py::arg("stride_a"), + py::arg("stride_tau"), py::arg("batch_size"), + py::arg("depends") = py::list()); + m.def("_ungqr", &lapack_ext::ungqr, "Call `ungqr` from OneMKL LAPACK library to return " "the complex unitary matrix Q of the QR factorization", diff --git a/dpnp/backend/extensions/lapack/types_matrix.hpp b/dpnp/backend/extensions/lapack/types_matrix.hpp index f80675beb38..30621c673a9 100644 --- a/dpnp/backend/extensions/lapack/types_matrix.hpp +++ b/dpnp/backend/extensions/lapack/types_matrix.hpp @@ -333,6 +333,33 @@ struct SyevdTypePairSupportFactory dpctl_td_ns::NotDefinedEntry>::is_defined; }; +/** + * @brief A factory to define pairs of supported types for which + * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr_batch + * function. + * + * @tparam T Type of array containing the matrix A, + * each from a separate instance in the batch, from which the + * elementary reflectors were generated (as in QR factorization). + * Upon execution, each array in the batch is overwritten with + * its respective complex unitary matrix Q. 
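+ * Note: only the std::complex<float> and std::complex<double> pairs are
+ * enabled here; real floating-point inputs are dispatched through the
+ * corresponding orgqr_batch support factory instead.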
+ */ +template +struct UngqrBatchTypePairSupportFactory +{ + static constexpr bool is_defined = std::disjunction< + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + dpctl_td_ns::TypePairDefinedEntry, + T, + std::complex>, + // fall-through + dpctl_td_ns::NotDefinedEntry>::is_defined; +}; + /** * @brief A factory to define pairs of supported types for which * MKL LAPACK library provides support in oneapi::mkl::lapack::ungqr diff --git a/dpnp/backend/extensions/lapack/ungqr.hpp b/dpnp/backend/extensions/lapack/ungqr.hpp index 4151364ab62..1a9b68e94f9 100644 --- a/dpnp/backend/extensions/lapack/ungqr.hpp +++ b/dpnp/backend/extensions/lapack/ungqr.hpp @@ -47,19 +47,19 @@ extern std::pair dpctl::tensor::usm_ndarray tau_array, const std::vector &depends = {}); -// extern std::pair -// ungqr_batch(sycl::queue exec_q, -// dpctl::tensor::usm_ndarray a_array, -// dpctl::tensor::usm_ndarray tau_array, -// std::int64_t m, -// std::int64_t n, -// std::int64_t k, -// std::int64_t stride_a, -// std::int64_t stride_tau, -// std::int64_t batch_size, -// const std::vector &depends = {}); +extern std::pair + ungqr_batch(sycl::queue exec_q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends = {}); -// extern void init_ungqr_batch_dispatch_vector(void); +extern void init_ungqr_batch_dispatch_vector(void); extern void init_ungqr_dispatch_vector(void); } // namespace lapack } // namespace ext diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp b/dpnp/backend/extensions/lapack/ungqr_batch.cpp new file mode 100644 index 00000000000..8d7b6cd33d4 --- /dev/null +++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp @@ -0,0 +1,279 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/type_utils.hpp" + +#include "types_matrix.hpp" +#include "ungqr.hpp" + +#include "dpnp_utils.hpp" + +namespace dpnp +{ +namespace backend +{ +namespace ext +{ +namespace lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace py = pybind11; +namespace type_utils = dpctl::tensor::type_utils; + +typedef sycl::event (*ungqr_batch_impl_fn_ptr_t)( + sycl::queue, + std::int64_t, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + char *, + std::int64_t, + std::int64_t, + std::vector &, + const std::vector &); + +static ungqr_batch_impl_fn_ptr_t + ungqr_batch_dispatch_vector[dpctl_td_ns::num_types]; + +template +static sycl::event ungqr_batch_impl(sycl::queue exec_q, + std::int64_t m, + std::int64_t n, + std::int64_t k, + char *in_a, + std::int64_t lda, + std::int64_t stride_a, + char *in_tau, + std::int64_t stride_tau, + std::int64_t batch_size, + std::vector &host_task_events, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + T *tau = reinterpret_cast(in_tau); + + const std::int64_t scratchpad_size = + mkl_lapack::ungqr_batch_scratchpad_size( + exec_q, m, n, k, lda, stride_a, stride_tau, batch_size); + T *scratchpad = nullptr; + + std::stringstream error_msg; + std::int64_t info = 0; + bool is_exception_caught = false; + + sycl::event ungqr_batch_event; + try { + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + + ungqr_batch_event = mkl_lapack::ungqr_batch( + exec_q, + m, // The number of rows in each matrix in the batch; (0 ≤ m). + // It must be a non-negative integer. + n, // The number of columns in each matrix in the batch; (0 ≤ n). + // It must be a non-negative integer. + k, // The number of elementary reflectors + // whose product defines the matrices Qi; (0 ≤ k ≤ n). + a, // Pointer to the batch of matrices, each of size (m x n). + lda, // The leading dimension of each matrix in the batch. + // For row major layout, lda ≥ max(1, m). + stride_a, // Stride between consecutive matrices in the batch. + tau, // Pointer to the array of scalar factors of the elementary + // reflectors for each matrix in the batch. + stride_tau, // Stride between arrays of scalar factors in the batch. + batch_size, // The number of matrices in the batch. + scratchpad, // Pointer to scratchpad memory to be used by MKL + // routine for storing intermediate results. + scratchpad_size, depends); + } catch (mkl_lapack::exception const &e) { + is_exception_caught = true; + info = e.info(); + + if (info < 0) { + error_msg << "Parameter number " << -info + << " had an illegal value."; + } + else if (info == scratchpad_size && e.detail() != 0) { + error_msg + << "Insufficient scratchpad size. Required size is at least " + << e.detail() << ", but current size is " << scratchpad_size + << "."; + } + else if (info != 0 && e.detail() == 0) { + error_msg << "Error in batch processing. 
" + "Number of failed calculations: " + << info; + } + else { + error_msg << "Unexpected MKL exception caught during ungqr_batch() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + } + } catch (sycl::exception const &e) { + is_exception_caught = true; + error_msg + << "Unexpected SYCL exception caught during ungqr_batch() call:\n" + << e.what(); + } + + if (is_exception_caught) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + + throw std::runtime_error(error_msg.str()); + } + + sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(ungqr_batch_event); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + host_task_events.push_back(clean_up_event); + return ungqr_batch_event; +} + +std::pair + ungqr_batch(sycl::queue q, + dpctl::tensor::usm_ndarray a_array, + dpctl::tensor::usm_ndarray tau_array, + std::int64_t m, + std::int64_t n, + std::int64_t k, + std::int64_t stride_a, + std::int64_t stride_tau, + std::int64_t batch_size, + const std::vector &depends) +{ + const int a_array_nd = a_array.get_ndim(); + const int tau_array_nd = tau_array.get_ndim(); + + if (a_array_nd < 3) { + throw py::value_error( + "The input array has ndim=" + std::to_string(a_array_nd) + + ", but an array with ndim >= 3 is expected."); + } + + if (tau_array_nd != 2) { + throw py::value_error("The array of Householder scalars has ndim=" + + std::to_string(tau_array_nd) + + ", but a 2-dimensional array is expected."); + } + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(q, {a_array, tau_array})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(a_array, tau_array)) { + throw py::value_error( + "The input array and the array of Householder scalars " + "are overlapping segments of memory"); + } + + bool is_a_array_c_contig = a_array.is_c_contiguous(); + bool is_tau_array_c_contig = tau_array.is_c_contiguous(); + if (!is_a_array_c_contig) { + throw py::value_error("The input array " + "must be C-contiguous"); + } + if (!is_tau_array_c_contig) { + throw py::value_error("The array of Householder scalars " + "must be C-contiguous"); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + int a_array_type_id = + array_types.typenum_to_lookup_id(a_array.get_typenum()); + + ungqr_batch_impl_fn_ptr_t ungqr_batch_fn = + ungqr_batch_dispatch_vector[a_array_type_id]; + if (ungqr_batch_fn == nullptr) { + throw py::value_error( + "No ungqr_batch implementation defined for the provided type " + "of the input matrix."); + } + + // auto tau_types = dpctl_td_ns::usm_ndarray_types(); + // int tau_array_type_id = + // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); + + // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) + // { + // throw py::value_error("The type of 'tau_array' must be int64."); + // } + + char *a_array_data = a_array.get_data(); + char *tau_array_data = tau_array.get_data(); + + const std::int64_t lda = std::max(1UL, m); + + std::vector host_task_events; + sycl::event ungqr_batch_ev = + ungqr_batch_fn(q, m, n, k, a_array_data, lda, stride_a, tau_array_data, + stride_tau, batch_size, host_task_events, depends); + + sycl::event args_ev = dpctl::utils::keep_args_alive(q, {a_array, tau_array}, + host_task_events); + + return std::make_pair(args_ev, 
ungqr_batch_ev); +} + +template +struct UngqrBatchContigFactory +{ + fnT get() + { + if constexpr (types::UngqrBatchTypePairSupportFactory::is_defined) { + return ungqr_batch_impl; + } + else { + return nullptr; + } + } +}; + +void init_ungqr_batch_dispatch_vector(void) +{ + dpctl_td_ns::DispatchVectorBuilder + contig; + contig.populate_dispatch_vector(ungqr_batch_dispatch_vector); +} +} // namespace lapack +} // namespace ext +} // namespace backend +} // namespace dpnp diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 0836dbc3fdb..8e25f1baa33 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1052,7 +1052,7 @@ def dpnp_qr_batch(a, mode="reduced"): # Call the LAPACK extension function _geqrf_batch to compute the QR factorization # of a general m x n matrix. - ht_lapack_ev, geqrf_batch_ev = li._geqrf_batch( + ht_geqrf_batch_ev, geqrf_batch_ev = li._geqrf_batch( a_sycl_queue, a_t.get_array(), tau_h.get_array(), @@ -1064,7 +1064,7 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) - ht_lapack_ev.wait() + ht_geqrf_batch_ev.wait() a_ht_copy_ev.wait() if mode == "r": @@ -1080,14 +1080,11 @@ def dpnp_qr_batch(a, mode="reduced"): r.astype(res_type, copy=False), ) - # _orgqr supports only floating type - orgqr_type = _real_type(res_type) - if mode == "complete" and m > n: mc = m q = dpnp.empty( (batch_size, m, m), - dtype=orgqr_type, + dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1095,7 +1092,7 @@ def dpnp_qr_batch(a, mode="reduced"): mc = k q = dpnp.empty( (batch_size, n, m), - dtype=orgqr_type, + dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type, ) @@ -1104,9 +1101,18 @@ def dpnp_qr_batch(a, mode="reduced"): q_stride = q.strides[0] tau_stride = tau_h.strides[0] - # Call the LAPACK extension function _orgqr to generate the orthogonal/unitary matrix - # `Qi` of the QR factorization for a batch of general matrices. - ht_lapack_ev, _ = li._orgqr_batch( + # Get LAPACK function (_orgqr_batch for real or _ungqf_batch for complex data types) + # for QR factorization + lapack_func = ( + "_ungqr_batch" + if dpnp.issubdtype(res_type, dpnp.complexfloating) + else "_orgqr_batch" + ) + + # Call the LAPACK extension function _orgqr_batch/ to generate the real orthogonal/ + # complex unitary matrices `Qi` of the QR factorization + # for a batch of general matrices. 
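+    # For example, a complex64 input resolves lapack_func to "_ungqr_batch",
+    # so the call below is equivalent to li._ungqr_batch(...); real dtypes
+    # resolve to li._orgqr_batch(...).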
+ ht_lapack_ev, _ = getattr(li, lapack_func)( a_sycl_queue, q.get_array(), tau_h.get_array(), @@ -1119,6 +1125,8 @@ def dpnp_qr_batch(a, mode="reduced"): [geqrf_batch_ev], ) + ht_lapack_ev.wait() + q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) From f097cd1c7e68979866dc73bf8d896eb6b292a5af Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:03:30 +0100 Subject: [PATCH 21/39] Add arrays type check --- dpnp/backend/extensions/lapack/orgqr.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/orgqr_batch.cpp | 17 ++++++++--------- dpnp/backend/extensions/lapack/ungqr.cpp | 8 ++++++++ dpnp/backend/extensions/lapack/ungqr_batch.cpp | 17 ++++++++--------- 4 files changed, 32 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/orgqr.cpp b/dpnp/backend/extensions/lapack/orgqr.cpp index 07fd72af13e..22cbbe05bee 100644 --- a/dpnp/backend/extensions/lapack/orgqr.cpp +++ b/dpnp/backend/extensions/lapack/orgqr.cpp @@ -205,6 +205,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } orgqr_impl_fn_ptr_t orgqr_fn = orgqr_dispatch_vector[a_array_type_id]; if (orgqr_fn == nullptr) { diff --git a/dpnp/backend/extensions/lapack/orgqr_batch.cpp b/dpnp/backend/extensions/lapack/orgqr_batch.cpp index afffbf1ec48..dfa9932a8e0 100644 --- a/dpnp/backend/extensions/lapack/orgqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/orgqr_batch.cpp @@ -217,6 +217,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } orgqr_batch_impl_fn_ptr_t orgqr_batch_fn = orgqr_batch_dispatch_vector[a_array_type_id]; @@ -226,15 +234,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); diff --git a/dpnp/backend/extensions/lapack/ungqr.cpp b/dpnp/backend/extensions/lapack/ungqr.cpp index 6ecca118f7d..7c8dea4e950 100644 --- a/dpnp/backend/extensions/lapack/ungqr.cpp +++ b/dpnp/backend/extensions/lapack/ungqr.cpp @@ -205,6 +205,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } ungqr_impl_fn_ptr_t ungqr_fn = ungqr_dispatch_vector[a_array_type_id]; if (ungqr_fn == nullptr) { diff --git a/dpnp/backend/extensions/lapack/ungqr_batch.cpp 
b/dpnp/backend/extensions/lapack/ungqr_batch.cpp index 8d7b6cd33d4..c07eaf150fc 100644 --- a/dpnp/backend/extensions/lapack/ungqr_batch.cpp +++ b/dpnp/backend/extensions/lapack/ungqr_batch.cpp @@ -217,6 +217,14 @@ std::pair auto array_types = dpctl_td_ns::usm_ndarray_types(); int a_array_type_id = array_types.typenum_to_lookup_id(a_array.get_typenum()); + int tau_array_type_id = + array_types.typenum_to_lookup_id(tau_array.get_typenum()); + + if (a_array_type_id != tau_array_type_id) { + throw py::value_error( + "The types of the input array and " + "the array of Householder scalars are mismatched"); + } ungqr_batch_impl_fn_ptr_t ungqr_batch_fn = ungqr_batch_dispatch_vector[a_array_type_id]; @@ -226,15 +234,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); From a9dfe1511609ef91ef9752d40d0e499881704da9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:10:49 +0100 Subject: [PATCH 22/39] Fix test_det_singular_matrix --- tests/test_linalg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 49030fee561..9163cff6aee 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -312,8 +312,8 @@ def test_det_singular_matrix(self, matrix): a_np = numpy.array(matrix, dtype="float32") a_dp = inp.array(a_np) - expected = numpy.linalg.slogdet(a_np) - result = inp.linalg.slogdet(a_dp) + expected = numpy.linalg.det(a_np) + result = inp.linalg.det(a_dp) assert_allclose(expected, result, rtol=1e-3, atol=1e-4) From d3395af57302a4b127535034dd1a7b4a18a709af Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 15:36:07 +0100 Subject: [PATCH 23/39] Expand tests for dpnp.linalg.qr with complex types --- tests/test_linalg.py | 35 +++++++++++-------- .../cupy/linalg_tests/test_decomposition.py | 3 -- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 9163cff6aee..f935e434a1e 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -709,27 +709,32 @@ def test_qr(self, dtype, shape, mode): # check decomposition if mode in ("complete", "reduced"): - # _orgqr doesn`t support complex type - if not inp.issubdtype(dtype, inp.complexfloating): - if a.ndim == 2: + if a.ndim == 2: + # TODO: remove it when dpnp.dot() supports complex types + if inp.issubdtype(dtype, inp.complexfloating): + dpnp_as_np_q = inp.asnumpy(dpnp_q) + dpnp_as_np_r = inp.asnumpy(dpnp_r) + assert_almost_equal( - inp.dot(dpnp_q, dpnp_r), + numpy.dot(dpnp_as_np_q, dpnp_as_np_r), a, decimal=5, ) else: - batch_size = a.shape[0] - for i in range(batch_size): - assert_almost_equal( - inp.dot(dpnp_q[i], dpnp_r[i]), - a[i], - decimal=5, - ) + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) + else: # a.ndim > 2 + assert_almost_equal( + inp.matmul(dpnp_q, dpnp_r), + a, + decimal=5, + ) else: # mode=="raw" - # _orgqr doesn`t support complex type - if not inp.issubdtype(dtype, inp.complexfloating): - assert_dtype_allclose(dpnp_q, np_q) - assert_dtype_allclose(dpnp_r, np_r) + assert_dtype_allclose(dpnp_q, np_q) + if mode in ("raw", "r"): assert_dtype_allclose(dpnp_r, 
np_r) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 813b322d823..d562ddccde5 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -148,9 +148,6 @@ def test_decomposition(self, dtype): class TestQRDecomposition(unittest.TestCase): @testing.for_dtypes("fdFD") def check_mode(self, array, mode, dtype): - if dtype in (numpy.complex64, numpy.complex128): - pytest.skip("ungqr unsupported") - a_cpu = numpy.asarray(array, dtype=dtype) a_gpu = cupy.asarray(array, dtype=dtype) result_gpu = cupy.linalg.qr(a_gpu, mode=mode) From ce46590ea8a586b50bd3938df131426b38e8baa3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 16:48:15 +0100 Subject: [PATCH 24/39] Update examples --- dpnp/linalg/dpnp_iface_linalg.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index 822054fe354..ed691bd8d92 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -562,10 +562,18 @@ def qr(a, mode="reduced"): >>> import dpnp as np >>> a = np.random.randn(9, 6) >>> Q, R = np.linalg.qr(a) - >>> np.allclose(a, np.dot(Q, R)) + >>> np.allclose(a, np.dot(Q, R)) # a does equal QR array([ True]) >>> R2 = np.linalg.qr(a, mode='r') - >>> np.allclose(R, R2) + >>> np.allclose(R, R2) # mode='r' returns the same R as mode='full' + array([ True]) + >>> a = np.random.normal(size=(3, 2, 2)) # Stack of 2 x 2 matrices as input + >>> Q, R = np.linalg.qr(a) + >>> Q.shape + (3, 2, 2) + >>> R.shape + (3, 2, 2) + >>> np.allclose(a, np.matmul(Q, R)) array([ True]) """ From 5c173ca262d2c123ab14435419fc23805b1ee952 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 16:57:11 +0100 Subject: [PATCH 25/39] Remove astype for output arrays --- dpnp/linalg/dpnp_utils_linalg.py | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 8e25f1baa33..764e9b435be 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1069,16 +1069,13 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r).astype(res_type, copy=False) + out_r = dpnp.triu(r) return out_r.reshape(batch_shape + out_r.shape[-2:]) if mode == "raw": q = a_t.reshape(batch_shape + a_t.shape[-2:]) r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return ( - q.astype(res_type, copy=False), - r.astype(res_type, copy=False), - ) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1130,14 +1127,11 @@ def dpnp_qr_batch(a, mode="reduced"): q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) - r_triu = dpnp.triu(r).astype(res_type, copy=False) - - q_reshape = q.reshape(batch_shape + q.shape[-2:]) - r_triu_reshape = r_triu.reshape(batch_shape + r_triu.shape[-2:]) + r_triu = dpnp.triu(r) return ( - q_reshape.astype(dtype=res_type, copy=False), - r_triu_reshape, + q.reshape(batch_shape + q.shape[-2:]), + r_triu.reshape(batch_shape + r_triu.shape[-2:]), ) @@ -1227,13 +1221,10 @@ def dpnp_qr(a, mode="reduced"): if mode == "r": r = a_t[:, :k].transpose() - return dpnp.triu(r).astype(res_type, copy=False) + return dpnp.triu(r) if mode == "raw": - return ( - a_t.astype(res_type, copy=False), - tau_h.astype(res_type, copy=False), - ) + 
return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. @@ -1276,10 +1267,7 @@ def dpnp_qr(a, mode="reduced"): q = q[:mc].transpose() r = a_t[:, :mc].transpose() - return ( - q.astype(dtype=res_type, copy=False), - dpnp.triu(r).astype(dtype=res_type, copy=False), - ) + return (q, dpnp.triu(r)) def dpnp_solve(a, b): From f494bfdcb024ab539d78b5bf3478c0b9b97dcf6f Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Sun, 4 Feb 2024 17:22:31 +0100 Subject: [PATCH 26/39] Use empty_like instead of empty --- dpnp/linalg/dpnp_utils_linalg.py | 127 +++++++++++++------------------ 1 file changed, 55 insertions(+), 72 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 764e9b435be..d18f8e390c8 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -973,17 +973,15 @@ def dpnp_qr_batch(a, mode="reduced"): if batch_size == 0 or k == 0: if mode == "reduced": return ( - dpnp.empty( - batch_shape + (m, k), + dpnp.empty_like( + a, + shape=batch_shape + (m, k), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), - dpnp.empty( - batch_shape + (k, n), + dpnp.empty_like( + a, + shape=batch_shape + (k, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) elif mode == "complete": @@ -996,33 +994,29 @@ def dpnp_qr_batch(a, mode="reduced"): ) return ( q, - dpnp.empty( - batch_shape + (m, n), + dpnp.empty_like( + a, + shape=batch_shape + (m, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) elif mode == "r": - return dpnp.empty( - batch_shape + (k, n), + return dpnp.empty_like( + a, + shape=batch_shape + (k, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) elif mode == "raw": return ( - dpnp.empty( - batch_shape + (n, m), + dpnp.empty_like( + a, + shape=batch_shape + (n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), - dpnp.empty( - batch_shape + (k,), + dpnp.empty_like( + a, + shape=batch_shape + (k,), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ), ) @@ -1040,11 +1034,10 @@ def dpnp_qr_batch(a, mode="reduced"): src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue ) - tau_h = dpnp.empty( - (batch_size, k), + tau_h = dpnp.empty_like( + a_t, + shape=(batch_size, k), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) a_stride = a_t.strides[0] @@ -1079,19 +1072,17 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "complete" and m > n: mc = m - q = dpnp.empty( - (batch_size, m, m), + q = dpnp.empty_like( + a_t, + shape=(batch_size, m, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: mc = k - q = dpnp.empty( - (batch_size, n, m), + q = dpnp.empty_like( + a_t, + shape=(batch_size, n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) q[..., :n, :] = a_t @@ -1156,44 +1147,38 @@ def dpnp_qr(a, mode="reduced"): k = min(m, n) if k == 0: if mode == "reduced": - return dpnp.empty( - (m, 0), + return dpnp.empty_like( + a, + shape=(m, 0), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, - ), dpnp.empty( - (0, n), + ), dpnp.empty_like( + a, + shape=(0, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) elif mode == "complete": return dpnp.identity( m, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type - ), dpnp.empty( - (m, n), + ), dpnp.empty_like( + a, + shape=(m, n), dtype=res_type, - sycl_queue=a_sycl_queue, - 
usm_type=a_usm_type, ) elif mode == "r": - return dpnp.empty( - (0, n), + return dpnp.empty_like( + a, + shape=(0, n), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: # mode == "raw" - return dpnp.empty( - (n, m), + return dpnp.empty_like( + a, + shape=(n, m), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, - ), dpnp.empty( - (0,), + ), dpnp.empty_like( + a, + shape=(0,), dtype=res_type, - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) a = a.T @@ -1206,8 +1191,10 @@ def dpnp_qr(a, mode="reduced"): src=a_usm_arr, dst=a_t.get_array(), sycl_queue=a_sycl_queue ) - tau_h = dpnp.empty( - k, dtype=res_type, sycl_queue=a_sycl_queue, usm_type=a_usm_type + tau_h = dpnp.empty_like( + a, + shape=(k,), + dtype=res_type, ) # Call the LAPACK extension function _geqrf to compute the QR factorization @@ -1231,21 +1218,17 @@ def dpnp_qr(a, mode="reduced"): # In `reduced` mode, mc is the lesser of the row count or column count. if mode == "complete" and m > n: mc = m - q = dpnp.empty( - (m, m), + q = dpnp.empty_like( + a_t, + shape=(m, m), dtype=res_type, - order="C", - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) else: mc = k - q = dpnp.empty( - (n, m), + q = dpnp.empty_like( + a_t, + shape=(n, m), dtype=res_type, - order="C", - sycl_queue=a_sycl_queue, - usm_type=a_usm_type, ) q[:n] = a_t From fc440440f639f92637b1a38296e098b7aafb0c29 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:39:18 +0100 Subject: [PATCH 27/39] Minor update dpnp_qr/dpnp_qr_batch logic --- dpnp/linalg/dpnp_utils_linalg.py | 40 ++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d18f8e390c8..e8a05c085a7 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1057,18 +1057,19 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + if mode in ["r", "raw"]: + ht_geqrf_batch_ev.wait() + a_ht_copy_ev.wait() - if mode == "r": - r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r) - return out_r.reshape(batch_shape + out_r.shape[-2:]) + if mode == "r": + r = a_t[..., :k].swapaxes(-2, -1) + out_r = dpnp.triu(r) + return out_r.reshape(batch_shape + out_r.shape[-2:]) - if mode == "raw": - q = a_t.reshape(batch_shape + a_t.shape[-2:]) - r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return (q, r) + if mode == "raw": + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1114,6 +1115,8 @@ def dpnp_qr_batch(a, mode="reduced"): ) ht_lapack_ev.wait() + ht_geqrf_batch_ev.wait() + a_ht_copy_ev.wait() q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) @@ -1203,15 +1206,16 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] ) - ht_geqrf_ev.wait() - a_ht_copy_ev.wait() + if mode in ["r", "raw"]: + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() - if mode == "r": - r = a_t[:, :k].transpose() - return dpnp.triu(r) + if mode == "r": + r = a_t[:, :k].transpose() + return dpnp.triu(r) - if mode == "raw": - return (a_t, tau_h) + if mode == "raw": + return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows. 
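A minimal shape sketch of how `mc` plays out for callers, assuming a hypothetical (5, 3) float32 input (the array name is illustrative):

>>> import dpnp as np
>>> x = np.ones((5, 3), dtype=np.float32)
>>> np.linalg.qr(x, mode="reduced")[0].shape   # mc = k = 3
(5, 3)
>>> np.linalg.qr(x, mode="complete")[0].shape  # mc = m = 5
(5, 5)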
@@ -1247,6 +1251,8 @@ def dpnp_qr(a, mode="reduced"): ) ht_lapack_ev.wait() + ht_geqrf_ev.wait() + a_ht_copy_ev.wait() q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 1f7800d5fd0037658a07161d45e7d76cda0c1b2c Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:50:37 +0100 Subject: [PATCH 28/39] Describe the returned arrays --- dpnp/linalg/dpnp_iface_linalg.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/dpnp/linalg/dpnp_iface_linalg.py b/dpnp/linalg/dpnp_iface_linalg.py index ed691bd8d92..b3c8f2e4e4b 100644 --- a/dpnp/linalg/dpnp_iface_linalg.py +++ b/dpnp/linalg/dpnp_iface_linalg.py @@ -552,10 +552,21 @@ def qr(a, mode="reduced"): Returns ------- - out : {dpnp.ndarray, tuple of dpnp.ndarray} - Although the type of returned object depends on the mode, - it returns a tuple of ``(Q, R)`` by default. - For details, please see the document of :func:`numpy.linalg.qr`. + When mode is "reduced" or "complete", the result will be a namedtuple with + the attributes Q and R. + Q : dpnp.ndarray + A matrix with orthonormal columns. + When mode = "complete" the result is an orthogonal/unitary matrix + depending on whether or not a is real/complex. + The determinant may be either +/- 1 in that case. + In case the number of dimensions in the input array is greater + than 2 then a stack of the matrices with above properties is returned. + R : dpnp.ndarray + The upper-triangular matrix or a stack of upper-triangular matrices + if the number of dimensions in the input array is greater than 2. + (h, tau) : tuple of dpnp.ndarray + The h array contains the Householder reflectors that generate Q along with R. + The tau array contains scaling factors for the reflectors. Examples -------- From a9783e6452b8a5ae2983f43892ce319606214091 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 10:59:47 +0100 Subject: [PATCH 29/39] Use ht_list_ev with dpctl.SyclEvent.wait_for --- dpnp/linalg/dpnp_utils_linalg.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index e8a05c085a7..52a8c1a8016 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -24,6 +24,7 @@ # ***************************************************************************** +import dpctl import dpctl.tensor._tensor_impl as ti from numpy import issubdtype, prod @@ -1057,9 +1058,10 @@ def dpnp_qr_batch(a, mode="reduced"): [a_copy_ev], ) + ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev] + if mode in ["r", "raw"]: - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + dpctl.SyclEvent.wait_for(ht_list_ev) if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) @@ -1114,9 +1116,8 @@ def dpnp_qr_batch(a, mode="reduced"): [geqrf_batch_ev], ) - ht_lapack_ev.wait() - ht_geqrf_batch_ev.wait() - a_ht_copy_ev.wait() + ht_list_ev.append(ht_lapack_ev) + dpctl.SyclEvent.wait_for(ht_list_ev) q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) @@ -1206,9 +1207,10 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, a_t.get_array(), tau_h.get_array(), [a_copy_ev] ) + ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev] + if mode in ["r", "raw"]: - ht_geqrf_ev.wait() - a_ht_copy_ev.wait() + dpctl.SyclEvent.wait_for(ht_list_ev) if mode == "r": r = a_t[:, :k].transpose() @@ -1250,9 +1252,8 @@ def dpnp_qr(a, mode="reduced"): a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) - ht_lapack_ev.wait() - ht_geqrf_ev.wait() - 
a_ht_copy_ev.wait() + ht_list_ev.append(ht_lapack_ev) + dpctl.SyclEvent.wait_for(ht_list_ev) q = q[:mc].transpose() r = a_t[:, :mc].transpose() From 2e9950afb95c0fc73278379b281971d8dbe645e6 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 11:24:07 +0100 Subject: [PATCH 30/39] Add a comment for transpose --- dpnp/linalg/dpnp_utils_linalg.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 52a8c1a8016..d022d32ca37 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1185,6 +1185,10 @@ def dpnp_qr(a, mode="reduced"): dtype=res_type, ) + # Transpose the input matrix to convert from row-major to column-major order. + # This adjustment is necessary for compatibility with oneAPI MKL LAPACK routines, + # which expect matrices in column-major format. + # This allows data to be handled efficiently without the need for additional conversion. a = a.T a_usm_arr = dpnp.get_usm_ndarray(a) a_t = dpnp.empty_like(a, order="C", dtype=res_type) From ea8cd4d266415bad6867ecf77d5551ab3d69d9d9 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 11:59:22 +0100 Subject: [PATCH 31/39] Add _triu_inplace func --- dpnp/linalg/dpnp_utils_linalg.py | 54 +++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index d022d32ca37..7251cf7415e 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -952,6 +952,47 @@ def dpnp_inv(a): return b_f +def _triu_inplace(a, host_tasks, depends=None): + """ + _triu_inplace(a, host_tasks, depends=None) + + Computes the upper triangular part of an array in-place, + but currently allocates extra memory for the result. + + Parameters + ---------- + a : {dpnp.ndarray, usm_ndarray} + Input array from which the upper triangular part is to be extracted. + host_tasks : list + A list to which the function appends the host event corresponding to the computation. + This allows for dependency management and synchronization with other tasks. + depends : list, optional + A list of events that the triangular operation depends on. + These tasks are completed before the triangular computation starts. + If ``None``, defaults to an empty list. + + Returns + ------- + out : dpnp.ndarray + A new array containing the upper triangular part of the input array `a`. 
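+
+    Examples
+    --------
+    A minimal sketch of the intended call pattern (the event names are
+    illustrative placeholders)::
+
+        host_tasks = []
+        r = _triu_inplace(r, host_tasks, depends=[geqrf_ev])
+        dpctl.SyclEvent.wait_for(host_tasks)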
+ + """ + + # TODO: implement a dedicated kernel for in-place triu instead of extra memory allocation for result + if depends is None: + depends = [] + out = dpnp.empty_like(a, order="C") + ht_triu_ev, _ = ti._triu( + src=a.get_array(), + dst=out.get_array(), + k=0, + sycl_queue=a.sycl_queue, + depends=depends, + ) + host_tasks.append(ht_triu_ev) + return out + + def dpnp_qr_batch(a, mode="reduced"): """ dpnp_qr_batch(a, mode="reduced") @@ -1065,10 +1106,12 @@ def dpnp_qr_batch(a, mode="reduced"): if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) - out_r = dpnp.triu(r) - return out_r.reshape(batch_shape + out_r.shape[-2:]) + r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) + return r.reshape(batch_shape + r.shape[-2:]) if mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) q = a_t.reshape(batch_shape + a_t.shape[-2:]) r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) return (q, r) @@ -1214,13 +1257,14 @@ def dpnp_qr(a, mode="reduced"): ht_list_ev = [ht_geqrf_ev, a_ht_copy_ev] if mode in ["r", "raw"]: - dpctl.SyclEvent.wait_for(ht_list_ev) - if mode == "r": r = a_t[:, :k].transpose() - return dpnp.triu(r) + r = _triu_inplace(r, ht_list_ev, [geqrf_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) + return r if mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) return (a_t, tau_h) # mc is the total number of columns in the q matrix. From 1cd6c778890a1f8945ae668b38d0842359057fe3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 14:17:43 +0100 Subject: [PATCH 32/39] Use _triu_inplace for complete and reduced mode --- dpnp/linalg/dpnp_utils_linalg.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 4536b8bc7b3..92063e1926d 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1151,7 +1151,7 @@ def dpnp_qr_batch(a, mode="reduced"): # Call the LAPACK extension function _orgqr_batch/ to generate the real orthogonal/ # complex unitary matrices `Qi` of the QR factorization # for a batch of general matrices. 
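    # The compute event (lapack_ev) captured below is passed as a dependency
    # to the in-place triu applied to `r` further down.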
- ht_lapack_ev, _ = getattr(li, lapack_func)( + ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( a_sycl_queue, q.get_array(), tau_h.get_array(), @@ -1170,11 +1170,14 @@ def dpnp_qr_batch(a, mode="reduced"): q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) - r_triu = dpnp.triu(r) + ht_list_ev.append(ht_lapack_ev) + + r = _triu_inplace(r, ht_list_ev, [lapack_ev]) + dpctl.SyclEvent.wait_for(ht_list_ev) return ( q.reshape(batch_shape + q.shape[-2:]), - r_triu.reshape(batch_shape + r_triu.shape[-2:]), + r.reshape(batch_shape + r.shape[-2:]), ) @@ -1301,16 +1304,19 @@ def dpnp_qr(a, mode="reduced"): # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ # complex unitary matrix `Q` of the QR factorization - ht_lapack_ev, _ = getattr(li, lapack_func)( + ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] ) + q = q[:mc].transpose() + r = a_t[:, :mc].transpose() + ht_list_ev.append(ht_lapack_ev) + + r = _triu_inplace(r, ht_list_ev, [lapack_ev]) dpctl.SyclEvent.wait_for(ht_list_ev) - q = q[:mc].transpose() - r = a_t[:, :mc].transpose() - return (q, dpnp.triu(r)) + return (q, r) def dpnp_solve(a, b): From d389ddb3362a50905d2941d7a06545a6abc479ad Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Mon, 5 Feb 2024 14:19:39 +0100 Subject: [PATCH 33/39] Fix validation check --- dpnp/linalg/dpnp_utils_linalg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 92063e1926d..f20011de649 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1237,7 +1237,7 @@ def dpnp_qr(a, mode="reduced"): ) # Transpose the input matrix to convert from row-major to column-major order. - # This adjustment is necessary for compatibility with oneAPI MKL LAPACK routines, + # This adjustment is necessary for compatibility with OneMKL LAPACK routines, # which expect matrices in column-major format. # This allows data to be handled efficiently without the need for additional conversion. 
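    # For a 2-D input of shape (m, n), the transpose below yields an (n, m)
    # view whose C-contiguous copy stores the original matrix in column-major
    # order.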
a = a.T From 71fec506d7fb9b527d1437d43b42f0243a180158 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Tue, 6 Feb 2024 13:52:47 +0100 Subject: [PATCH 34/39] Use copy_usm for a_t array overwritten by geqrf/geqrf_batch --- dpnp/linalg/dpnp_utils_linalg.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index f20011de649..ed8ed3eed4d 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1135,7 +1135,17 @@ def dpnp_qr_batch(a, mode="reduced"): shape=(batch_size, n, m), dtype=res_type, ) - q[..., :n, :] = a_t + + # use DPCTL tensor function to fill the matrix array `q[..., :n, :]` + # with content from the array `a_t` overwritten by geqrf_batch + a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_t.get_array(), + dst=q[..., :n, :].get_array(), + sycl_queue=a_sycl_queue, + depends=[geqrf_batch_ev], + ) + + ht_list_ev.append(a_t_ht_copy_ev) q_stride = q.strides[0] tau_stride = tau_h.strides[0] @@ -1161,7 +1171,7 @@ def dpnp_qr_batch(a, mode="reduced"): q_stride, tau_stride, batch_size, - [geqrf_batch_ev], + [a_t_copy_ev], ) ht_list_ev.append(ht_lapack_ev) @@ -1292,7 +1302,17 @@ def dpnp_qr(a, mode="reduced"): shape=(n, m), dtype=res_type, ) - q[:n] = a_t + + # use DPCTL tensor function to fill the matrix array `q[:n]` + # with content from the array `a_t` overwritten by geqrf + a_t_ht_copy_ev, a_t_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_t.get_array(), + dst=q[:n].get_array(), + sycl_queue=a_sycl_queue, + depends=[geqrf_ev], + ) + + ht_list_ev.append(a_t_ht_copy_ev) # Get LAPACK function (_orgqr for real or _ungqf for complex data types) # for QR factorization @@ -1305,7 +1325,7 @@ def dpnp_qr(a, mode="reduced"): # Call the LAPACK extension function _orgqr/_ungqf to generate the real orthogonal/ # complex unitary matrix `Q` of the QR factorization ht_lapack_ev, lapack_ev = getattr(li, lapack_func)( - a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [geqrf_ev] + a_sycl_queue, m, mc, k, q.get_array(), tau_h.get_array(), [a_t_copy_ev] ) q = q[:mc].transpose() From 120cf43a7d5723cbf8647e373782dd938e3479e3 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 11:40:02 +0100 Subject: [PATCH 35/39] Address remarks --- dpnp/linalg/dpnp_utils_linalg.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index ed8ed3eed4d..13943b125ef 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1107,8 +1107,6 @@ def dpnp_qr_batch(a, mode="reduced"): ht_list_ev = [ht_geqrf_batch_ev, a_ht_copy_ev] if mode in ["r", "raw"]: - dpctl.SyclEvent.wait_for(ht_list_ev) - if mode == "r": r = a_t[..., :k].swapaxes(-2, -1) r = _triu_inplace(r, ht_list_ev, [geqrf_batch_ev]) @@ -1175,7 +1173,6 @@ def dpnp_qr_batch(a, mode="reduced"): ) ht_list_ev.append(ht_lapack_ev) - dpctl.SyclEvent.wait_for(ht_list_ev) q = q[..., :mc, :].swapaxes(-2, -1) r = a_t[..., :mc].swapaxes(-2, -1) From 30188e4a15159a90e75877d1f8edbd57563d8dce Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:22:31 +0100 Subject: [PATCH 36/39] Skip cupy tests on cpu --- tests/third_party/cupy/linalg_tests/test_decomposition.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 
fc75163f072..4769fe455eb 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -416,6 +416,10 @@ def _check_result(self, result_cpu, result_gpu): assert result_cpu.dtype == result_gpu.dtype testing.assert_allclose(result_cpu, result_gpu, atol=1e-4) + # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI. + # Skip the tests on cpu until these packages are available for the external CI. + # Specifically dpcpp_linux-64>=2024.1.0 + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.fix_random() @_condition.repeat(3, 10) def test_mode(self): @@ -423,6 +427,7 @@ def test_mode(self): self.check_mode(numpy.random.randn(3, 3), mode=self.mode) self.check_mode(numpy.random.randn(5, 4), mode=self.mode) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.with_requires("numpy>=1.22") @testing.fix_random() def test_mode_rank3(self): @@ -430,6 +435,7 @@ def test_mode_rank3(self): self.check_mode(numpy.random.randn(4, 3, 3), mode=self.mode) self.check_mode(numpy.random.randn(2, 5, 4), mode=self.mode) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @testing.with_requires("numpy>=1.22") @testing.fix_random() def test_mode_rank4(self): From d519b1ac8abd092fdd0dbeb1c036f80b1cad5381 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:28:30 +0100 Subject: [PATCH 37/39] Skip dpnp tests on cpu --- tests/test_linalg.py | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index 879a3893eae..e0ad716dba1 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -678,6 +678,10 @@ def test_norm3(array, ord, axis): class TestQr: + # TODO: New packages that fix issue CMPLRLLVM-53771 are only available in internal CI. + # Skip the tests on cpu until these packages are available for the external CI. 
+ # Specifically dpcpp_linux-64>=2024.1.0 + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) @pytest.mark.parametrize( "shape", @@ -711,22 +715,11 @@ def test_qr(self, dtype, shape, mode): # check decomposition if mode in ("complete", "reduced"): if a.ndim == 2: - # TODO: remove it when dpnp.dot() supports complex types - if inp.issubdtype(dtype, inp.complexfloating): - dpnp_as_np_q = inp.asnumpy(dpnp_q) - dpnp_as_np_r = inp.asnumpy(dpnp_r) - - assert_almost_equal( - numpy.dot(dpnp_as_np_q, dpnp_as_np_r), - a, - decimal=5, - ) - else: - assert_almost_equal( - inp.dot(dpnp_q, dpnp_r), - a, - decimal=5, - ) + assert_almost_equal( + inp.dot(dpnp_q, dpnp_r), + a, + decimal=5, + ) else: # a.ndim > 2 assert_almost_equal( inp.matmul(dpnp_q, dpnp_r), @@ -772,6 +765,7 @@ def test_qr_empty(self, dtype, shape, mode): assert_dtype_allclose(dpnp_r, np_r) + @pytest.mark.skipif(is_cpu_device(), reason="CMPLRLLVM-53771") @pytest.mark.parametrize( "mode", ["r", "raw", "complete", "reduced"], From c8786db245dcfb414001328ec971968237daea7e Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 13:39:08 +0100 Subject: [PATCH 38/39] Remove TODOs for svd tests --- tests/test_linalg.py | 8 ------- .../cupy/linalg_tests/test_decomposition.py | 23 +++++++------------ 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/tests/test_linalg.py b/tests/test_linalg.py index e0ad716dba1..8e32b867b85 100644 --- a/tests/test_linalg.py +++ b/tests/test_linalg.py @@ -1076,14 +1076,6 @@ def check_decomposition( dpnp_diag_s = inp.zeros_like(dp_a, dtype=dp_s.dtype) for i in range(min(dp_a.shape[-2], dp_a.shape[-1])): dpnp_diag_s[..., i, i] = dp_s[..., i] - # TODO: remove it when dpnp.dot is updated - # dpnp.dot does not support complex type - if inp.issubdtype(dp_a.dtype, inp.complexfloating): - reconstructed = numpy.dot( - inp.asnumpy(dp_u), - numpy.dot(inp.asnumpy(dpnp_diag_s), inp.asnumpy(dp_vt)), - ) - else: reconstructed = inp.dot(dp_u, inp.dot(dpnp_diag_s, dp_vt)) # TODO: use assert dpnp.allclose() inside check_decomposition() # when it will support complex dtypes diff --git a/tests/third_party/cupy/linalg_tests/test_decomposition.py b/tests/third_party/cupy/linalg_tests/test_decomposition.py index 4769fe455eb..234a2e0e381 100644 --- a/tests/third_party/cupy/linalg_tests/test_decomposition.py +++ b/tests/third_party/cupy/linalg_tests/test_decomposition.py @@ -201,38 +201,31 @@ def check_usv(self, shape, dtype): # reconstruct the matrix k = s_cpu.shape[-1] - # dpnp.dot/matmul does not support complex type and unstable on cpu - # TODO: remove it and use xp.dot/matmul when dpnp.dot/matmul is updated - u_gpu = u_gpu.asnumpy() - vh_gpu = vh_gpu.asnumpy() - s_gpu = s_gpu.asnumpy() - xp = numpy - if len(shape) == 2: if self.full_matrices: - a_gpu_usv = numpy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :]) + a_gpu_usv = cupy.dot(u_gpu[:, :k] * s_gpu, vh_gpu[:k, :]) else: - a_gpu_usv = numpy.dot(u_gpu * s_gpu, vh_gpu) + a_gpu_usv = cupy.dot(u_gpu * s_gpu, vh_gpu) else: if self.full_matrices: - a_gpu_usv = numpy.matmul( + a_gpu_usv = cupy.matmul( u_gpu[..., :k] * s_gpu[..., None, :], vh_gpu[..., :k, :] ) else: - a_gpu_usv = numpy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu) + a_gpu_usv = cupy.matmul(u_gpu * s_gpu[..., None, :], vh_gpu) testing.assert_allclose(a_gpu, a_gpu_usv, rtol=1e-4, atol=1e-4) # assert unitary u_len = u_gpu.shape[-1] vh_len = vh_gpu.shape[-2] testing.assert_allclose( - xp.matmul(u_gpu.swapaxes(-1, -2).conj(), 
u_gpu), - stacked_identity(xp, shape[:-2], u_len, dtype), + cupy.matmul(u_gpu.swapaxes(-1, -2).conj(), u_gpu), + stacked_identity(cupy, shape[:-2], u_len, dtype), atol=1e-4, ) testing.assert_allclose( - xp.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()), - stacked_identity(xp, shape[:-2], vh_len, dtype), + cupy.matmul(vh_gpu, vh_gpu.swapaxes(-1, -2).conj()), + stacked_identity(cupy, shape[:-2], vh_len, dtype), atol=1e-4, ) From e0ee2f2a745fcae9c62a695d0dbda8b71c6c9d69 Mon Sep 17 00:00:00 2001 From: Vladislav Perevezentsev Date: Wed, 7 Feb 2024 17:04:48 +0100 Subject: [PATCH 39/39] Address remarks --- dpnp/backend/extensions/lapack/geqrf_batch.cpp | 9 --------- dpnp/linalg/dpnp_utils_linalg.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/dpnp/backend/extensions/lapack/geqrf_batch.cpp b/dpnp/backend/extensions/lapack/geqrf_batch.cpp index a05b01e9172..a4fe980a539 100644 --- a/dpnp/backend/extensions/lapack/geqrf_batch.cpp +++ b/dpnp/backend/extensions/lapack/geqrf_batch.cpp @@ -229,15 +229,6 @@ std::pair "of the input matrix."); } - // auto tau_types = dpctl_td_ns::usm_ndarray_types(); - // int tau_array_type_id = - // tau_types.typenum_to_lookup_id(tau_array.get_typenum()); - - // if (tau_array_type_id != static_cast(dpctl_td_ns::typenum_t::INT64)) - // { - // throw py::value_error("The type of 'tau_array' must be int64."); - // } - char *a_array_data = a_array.get_data(); char *tau_array_data = tau_array.get_data(); diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 13943b125ef..a6dcfbf0c2b 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -1053,7 +1053,7 @@ def dpnp_qr_batch(a, mode="reduced"): shape=batch_shape + (k, n), dtype=res_type, ) - elif mode == "raw": + else: # mode=="raw" return ( dpnp.empty_like( a, @@ -1113,11 +1113,11 @@ def dpnp_qr_batch(a, mode="reduced"): dpctl.SyclEvent.wait_for(ht_list_ev) return r.reshape(batch_shape + r.shape[-2:]) - if mode == "raw": - dpctl.SyclEvent.wait_for(ht_list_ev) - q = a_t.reshape(batch_shape + a_t.shape[-2:]) - r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) - return (q, r) + # mode=="raw" + dpctl.SyclEvent.wait_for(ht_list_ev) + q = a_t.reshape(batch_shape + a_t.shape[-2:]) + r = tau_h.reshape(batch_shape + tau_h.shape[-1:]) + return (q, r) if mode == "complete" and m > n: mc = m @@ -1278,9 +1278,9 @@ def dpnp_qr(a, mode="reduced"): dpctl.SyclEvent.wait_for(ht_list_ev) return r - if mode == "raw": - dpctl.SyclEvent.wait_for(ht_list_ev) - return (a_t, tau_h) + # mode == "raw": + dpctl.SyclEvent.wait_for(ht_list_ev) + return (a_t, tau_h) # mc is the total number of columns in the q matrix. # In `complete` mode, mc equals the number of rows.
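
Editor's note, appended after the series and not part of any patch: the copy changes in patch 34 replace host-side synchronization with an event-dependency chain. Below is a minimal sketch of that pattern, assuming dpctl is installed; it uses the same private dpctl helper the patch calls, relies only on the (host-task event, compute event) return contract visible in the diffs, and the queue, shapes, and array names are invented for illustration.

import dpctl
import dpctl.tensor as dpt
import dpctl.tensor._tensor_impl as ti

q = dpctl.SyclQueue()

# hypothetical stand-ins for `a_t` (geqrf output) and the `q[:n]` destination
src = dpt.reshape(dpt.arange(12, dtype="f4", sycl_queue=q), (3, 4))
dst = dpt.empty((3, 4), dtype="f4", sycl_queue=q)

# asynchronous device-to-device copy; returns (host-task event, compute event)
ht_copy_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray(
    src=src, dst=dst, sycl_queue=q, depends=[]
)

# A downstream kernel (e.g. _orgqr in the patch) would list `copy_ev` in its
# `depends` argument, so ordering is enforced on the device. The host only
# collects host-task events and waits once at the end, as dpnp_qr now does.
ht_list_ev = [ht_copy_ev]
dpctl.SyclEvent.wait_for(ht_list_ev)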
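
A second appended editor's note: the `mode` branches in dpnp_qr and dpnp_qr_batch follow the numpy.linalg.qr contract, so the expected outputs and the properties the tests in this series check can be summarized with a small NumPy sketch. The 5x4 input is arbitrary; dpnp.linalg.qr is intended to mirror these semantics, but the snippet below exercises NumPy only.

import numpy

a = numpy.random.randn(5, 4)

q, r = numpy.linalg.qr(a, mode="reduced")     # q: (5, 4), r: (4, 4)
assert numpy.allclose(q @ r, a)               # reconstruction
assert numpy.allclose(q.T @ q, numpy.eye(4))  # orthonormal columns
assert numpy.allclose(r, numpy.triu(r))       # R is upper triangular

qc, rc = numpy.linalg.qr(a, mode="complete")  # qc: (5, 5), rc: (5, 4)
r_only = numpy.linalg.qr(a, mode="r")         # only R, shape (4, 4)
h, tau = numpy.linalg.qr(a, mode="raw")       # LAPACK-style: h (4, 5), tau (4,)

assert qc.shape == (5, 5) and rc.shape == (5, 4)
assert r_only.shape == (4, 4)
assert h.shape == (4, 5) and tau.shape == (4,)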