diff --git a/dpnp/backend/extensions/lapack/CMakeLists.txt b/dpnp/backend/extensions/lapack/CMakeLists.txt index f21f61c84df..03fceb05482 100644 --- a/dpnp/backend/extensions/lapack/CMakeLists.txt +++ b/dpnp/backend/extensions/lapack/CMakeLists.txt @@ -36,11 +36,13 @@ set(_module_src ${CMAKE_CURRENT_SOURCE_DIR}/getri_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/getrs.cpp ${CMAKE_CURRENT_SOURCE_DIR}/heevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/heevd_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/orgqr.cpp ${CMAKE_CURRENT_SOURCE_DIR}/orgqr_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf.cpp ${CMAKE_CURRENT_SOURCE_DIR}/potrf_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/syevd.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/syevd_batch.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ungqr.cpp ${CMAKE_CURRENT_SOURCE_DIR}/ungqr_batch.cpp ) diff --git a/dpnp/backend/extensions/lapack/common_helpers.hpp b/dpnp/backend/extensions/lapack/common_helpers.hpp index 0c818e74664..df2b87028e4 100644 --- a/dpnp/backend/extensions/lapack/common_helpers.hpp +++ b/dpnp/backend/extensions/lapack/common_helpers.hpp @@ -24,11 +24,15 @@ //***************************************************************************** #pragma once +#include #include +#include #include namespace dpnp::extensions::lapack::helper { +namespace py = pybind11; + template struct value_type_of { @@ -40,4 +44,23 @@ struct value_type_of> { using type = T; }; + +// Rounds up the number `value` to the nearest multiple of `mult`. +template +inline intT round_up_mult(intT value, intT mult) +{ + intT q = (value + (mult - 1)) / mult; + return q * mult; +} + +// Checks if the shape array has any non-zero dimension. 
+inline bool check_zeros_shape(int ndim, const py::ssize_t *shape) +{ + size_t src_nelems(1); + + for (int i = 0; i < ndim; ++i) { + src_nelems *= static_cast(shape[i]); + } + return src_nelems == 0; +} } // namespace dpnp::extensions::lapack::helper diff --git a/dpnp/backend/extensions/lapack/evd_batch_common.hpp b/dpnp/backend/extensions/lapack/evd_batch_common.hpp new file mode 100644 index 00000000000..5a7d985ff82 --- /dev/null +++ b/dpnp/backend/extensions/lapack/evd_batch_common.hpp @@ -0,0 +1,152 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#pragma once + +#include +#include + +// dpctl tensor headers +#include "utils/type_dispatch.hpp" + +#include "common_helpers.hpp" +#include "evd_common_utils.hpp" +#include "types_matrix.hpp" + +namespace dpnp::extensions::lapack::evd +{ +typedef sycl::event (*evd_batch_impl_fn_ptr_t)( + sycl::queue &, + const oneapi::mkl::job, + const oneapi::mkl::uplo, + const std::int64_t, + const std::int64_t, + char *, + char *, + const std::vector &); + +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; +namespace py = pybind11; + +template +std::pair + evd_batch_func(sycl::queue &exec_q, + const std::int8_t jobz, + const std::int8_t upper_lower, + dpctl::tensor::usm_ndarray &eig_vecs, + dpctl::tensor::usm_ndarray &eig_vals, + const std::vector &depends, + const dispatchT &evd_batch_dispatch_table) +{ + const int eig_vecs_nd = eig_vecs.get_ndim(); + + const py::ssize_t *eig_vecs_shape = eig_vecs.get_shape_raw(); + const py::ssize_t *eig_vals_shape = eig_vals.get_shape_raw(); + + constexpr int expected_eig_vecs_nd = 3; + constexpr int expected_eig_vals_nd = 2; + + common_evd_checks(exec_q, eig_vecs, eig_vals, eig_vecs_shape, + expected_eig_vecs_nd, expected_eig_vals_nd); + + if (eig_vecs_shape[2] != eig_vals_shape[0] || + eig_vecs_shape[0] != eig_vals_shape[1]) + { + throw py::value_error( + "The shape of 'eig_vals' must be (batch_size, n), " + "where batch_size = " + + std::to_string(eig_vecs_shape[0]) + + " and n = " + std::to_string(eig_vecs_shape[1])); + } + + // Ensure `batch_size` and `n` are non-zero, otherwise return empty events + if (helper::check_zeros_shape(eig_vecs_nd, eig_vecs_shape)) { + // nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto array_types = dpctl_td_ns::usm_ndarray_types(); + const int eig_vecs_type_id = + array_types.typenum_to_lookup_id(eig_vecs.get_typenum()); + const int eig_vals_type_id = + 
array_types.typenum_to_lookup_id(eig_vals.get_typenum()); + + evd_batch_impl_fn_ptr_t evd_batch_fn = + evd_batch_dispatch_table[eig_vecs_type_id][eig_vals_type_id]; + if (evd_batch_fn == nullptr) { + throw py::value_error( + "Types of input vectors and result array are mismatched."); + } + + char *eig_vecs_data = eig_vecs.get_data(); + char *eig_vals_data = eig_vals.get_data(); + + const std::int64_t batch_size = eig_vecs_shape[2]; + const std::int64_t n = eig_vecs_shape[1]; + + const oneapi::mkl::job jobz_val = static_cast(jobz); + const oneapi::mkl::uplo uplo_val = + static_cast(upper_lower); + + sycl::event evd_batch_ev = + evd_batch_fn(exec_q, jobz_val, uplo_val, batch_size, n, eig_vecs_data, + eig_vals_data, depends); + + sycl::event ht_ev = dpctl::utils::keep_args_alive( + exec_q, {eig_vecs, eig_vals}, {evd_batch_ev}); + + return std::make_pair(ht_ev, evd_batch_ev); +} + +template +inline T *alloc_scratchpad(std::int64_t scratchpad_size, + std::int64_t n_linear_streams, + sycl::queue &exec_q) +{ + // Get padding size to ensure memory allocations are aligned to 256 bytes + // for better performance + const std::int64_t padding = 256 / sizeof(T); + + if (scratchpad_size <= 0) { + throw std::runtime_error( + "Invalid scratchpad size: must be greater than zero." 
+ " Calculated scratchpad size: " + + std::to_string(scratchpad_size)); + } + + // Calculate the total scratchpad memory size needed for all linear + // streams with proper alignment + const size_t alloc_scratch_size = + helper::round_up_mult(n_linear_streams * scratchpad_size, padding); + + // Allocate memory for the total scratchpad + T *scratchpad = sycl::malloc_device(alloc_scratch_size, exec_q); + if (!scratchpad) { + throw std::runtime_error("Device allocation for scratchpad failed"); + } + + return scratchpad; +} +} // namespace dpnp::extensions::lapack::evd diff --git a/dpnp/backend/extensions/lapack/evd_common.hpp b/dpnp/backend/extensions/lapack/evd_common.hpp index 9ddb20fb8fa..cdccb3e79d3 100644 --- a/dpnp/backend/extensions/lapack/evd_common.hpp +++ b/dpnp/backend/extensions/lapack/evd_common.hpp @@ -29,22 +29,22 @@ #include // dpctl tensor headers -#include "utils/memory_overlap.hpp" -#include "utils/output_validation.hpp" #include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" +#include "common_helpers.hpp" +#include "evd_common_utils.hpp" #include "types_matrix.hpp" namespace dpnp::extensions::lapack::evd { +using dpnp::extensions::lapack::helper::check_zeros_shape; + typedef sycl::event (*evd_impl_fn_ptr_t)(sycl::queue &, const oneapi::mkl::job, const oneapi::mkl::uplo, const std::int64_t, char *, char *, - std::vector &, const std::vector &); namespace dpctl_td_ns = dpctl::tensor::type_dispatch; @@ -61,70 +61,30 @@ std::pair const dispatchT &evd_dispatch_table) { const int eig_vecs_nd = eig_vecs.get_ndim(); - const int eig_vals_nd = eig_vals.get_ndim(); - - if (eig_vecs_nd != 2) { - throw py::value_error("Unexpected ndim=" + std::to_string(eig_vecs_nd) + - " of an output array with eigenvectors"); - } - else if (eig_vals_nd != 1) { - throw py::value_error("Unexpected ndim=" + std::to_string(eig_vals_nd) + - " of an output array with eigenvalues"); - } const py::ssize_t *eig_vecs_shape = eig_vecs.get_shape_raw(); const py::ssize_t 
*eig_vals_shape = eig_vals.get_shape_raw(); - if (eig_vecs_shape[0] != eig_vecs_shape[1]) { - throw py::value_error("Output array with eigenvectors with be square"); - } - else if (eig_vecs_shape[0] != eig_vals_shape[0]) { - throw py::value_error( - "Eigenvectors and eigenvalues have different shapes"); - } + constexpr int expected_eig_vecs_nd = 2; + constexpr int expected_eig_vals_nd = 1; - size_t src_nelems(1); + common_evd_checks(exec_q, eig_vecs, eig_vals, eig_vecs_shape, + expected_eig_vecs_nd, expected_eig_vals_nd); - for (int i = 0; i < eig_vecs_nd; ++i) { - src_nelems *= static_cast(eig_vecs_shape[i]); + if (eig_vecs_shape[0] != eig_vals_shape[0]) { + throw py::value_error( + "Eigenvectors and eigenvalues have different shapes"); } - if (src_nelems == 0) { + if (check_zeros_shape(eig_vecs_nd, eig_vecs_shape)) { // nothing to do return std::make_pair(sycl::event(), sycl::event()); } - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(eig_vecs); - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(eig_vals); - - // check compatibility of execution queue and allocation queue - if (!dpctl::utils::queues_are_compatible(exec_q, {eig_vecs, eig_vals})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(eig_vecs, eig_vals)) { - throw py::value_error("Arrays with eigenvectors and eigenvalues are " - "overlapping segments of memory"); - } - - bool is_eig_vecs_f_contig = eig_vecs.is_f_contiguous(); - bool is_eig_vals_c_contig = eig_vals.is_c_contiguous(); - if (!is_eig_vecs_f_contig) { - throw py::value_error( - "An array with input matrix / output eigenvectors " - "must be F-contiguous"); - } - else if (!is_eig_vals_c_contig) { - throw py::value_error( - "An array with output eigenvalues must be C-contiguous"); - } - auto array_types = dpctl_td_ns::usm_ndarray_types(); - int eig_vecs_type_id = + const int 
eig_vecs_type_id = array_types.typenum_to_lookup_id(eig_vecs.get_typenum()); - int eig_vals_type_id = + const int eig_vals_type_id = array_types.typenum_to_lookup_id(eig_vals.get_typenum()); evd_impl_fn_ptr_t evd_fn = @@ -142,25 +102,12 @@ std::pair const oneapi::mkl::uplo uplo_val = static_cast(upper_lower); - std::vector host_task_events; sycl::event evd_ev = evd_fn(exec_q, jobz_val, uplo_val, n, eig_vecs_data, - eig_vals_data, host_task_events, depends); + eig_vals_data, depends); - sycl::event args_ev = dpctl::utils::keep_args_alive( - exec_q, {eig_vecs, eig_vals}, host_task_events); + sycl::event ht_ev = + dpctl::utils::keep_args_alive(exec_q, {eig_vecs, eig_vals}, {evd_ev}); - return std::make_pair(args_ev, evd_ev); -} - -template - typename factoryT> -void init_evd_dispatch_table( - dispatchT evd_dispatch_table[][dpctl_td_ns::num_types]) -{ - dpctl_td_ns::DispatchTableBuilder - contig; - contig.populate_dispatch_table(evd_dispatch_table); + return std::make_pair(ht_ev, evd_ev); } } // namespace dpnp::extensions::lapack::evd diff --git a/dpnp/backend/extensions/lapack/evd_common_utils.hpp b/dpnp/backend/extensions/lapack/evd_common_utils.hpp new file mode 100644 index 00000000000..1da74bc4ac5 --- /dev/null +++ b/dpnp/backend/extensions/lapack/evd_common_utils.hpp @@ -0,0 +1,106 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +// dpctl tensor headers +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpnp::extensions::lapack::evd +{ +namespace dpctl_td_ns = dpctl::tensor::type_dispatch; +namespace py = pybind11; + +template + typename factoryT> +void init_evd_dispatch_table( + dispatchT evd_dispatch_table[][dpctl_td_ns::num_types]) +{ + dpctl_td_ns::DispatchTableBuilder + contig; + contig.populate_dispatch_table(evd_dispatch_table); +} + +inline void common_evd_checks(sycl::queue &exec_q, + dpctl::tensor::usm_ndarray &eig_vecs, + dpctl::tensor::usm_ndarray &eig_vals, + const py::ssize_t *eig_vecs_shape, + const int expected_eig_vecs_nd, + const int expected_eig_vals_nd) +{ + const int eig_vecs_nd = eig_vecs.get_ndim(); + const int eig_vals_nd = eig_vals.get_ndim(); + + if (eig_vecs_nd != expected_eig_vecs_nd) { + throw py::value_error("The output eigenvectors array has ndim=" + + std::to_string(eig_vecs_nd) + ", but a " + + std::to_string(expected_eig_vecs_nd) + + "-dimensional array is expected."); + } + else if 
(eig_vals_nd != expected_eig_vals_nd) { + throw py::value_error("The output eigenvalues array has ndim=" + + std::to_string(eig_vals_nd) + ", but a " + + std::to_string(expected_eig_vals_nd) + + "-dimensional array is expected."); + } + + if (eig_vecs_shape[0] != eig_vecs_shape[1]) { + throw py::value_error("Output array with eigenvectors must be square"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(eig_vecs); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(eig_vals); + + // check compatibility of execution queue and allocation queue + if (!dpctl::utils::queues_are_compatible(exec_q, {eig_vecs, eig_vals})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(eig_vecs, eig_vals)) { + throw py::value_error("Arrays with eigenvectors and eigenvalues are " + "overlapping segments of memory"); + } + + const bool is_eig_vecs_f_contig = eig_vecs.is_f_contiguous(); + const bool is_eig_vals_c_contig = eig_vals.is_c_contiguous(); + if (!is_eig_vecs_f_contig) { + throw py::value_error( + "An array with input matrix / output eigenvectors " + "must be F-contiguous"); + } + else if (!is_eig_vals_c_contig) { + throw py::value_error( + "An array with output eigenvalues must be C-contiguous"); + } +} +} // namespace dpnp::extensions::lapack::evd diff --git a/dpnp/backend/extensions/lapack/heevd.cpp b/dpnp/backend/extensions/lapack/heevd.cpp index 285926bb598..a623a0dd182 100644 --- a/dpnp/backend/extensions/lapack/heevd.cpp +++ b/dpnp/backend/extensions/lapack/heevd.cpp @@ -23,8 +23,14 @@ // THE POSSIBILITY OF SUCH DAMAGE. 
//***************************************************************************** +#include + +#include "evd_common.hpp" #include "heevd.hpp" +// dpctl tensor headers +#include "utils/type_utils.hpp" + namespace dpnp::extensions::lapack { namespace mkl_lapack = oneapi::mkl::lapack; @@ -37,7 +43,6 @@ static sycl::event heevd_impl(sycl::queue &exec_q, const std::int64_t n, char *in_a, char *out_w, - std::vector &host_task_events, const std::vector &depends) { type_utils::validate_type_for_device(exec_q); @@ -49,15 +54,25 @@ static sycl::event heevd_impl(sycl::queue &exec_q, const std::int64_t lda = std::max(1UL, n); const std::int64_t scratchpad_size = mkl_lapack::heevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); + + if (scratchpad_size <= 0) { + throw std::runtime_error( + "Invalid scratchpad size: must be greater than zero." + "Calculated scratchpad size: " + + std::to_string(scratchpad_size)); + } + T *scratchpad = nullptr; + // Allocate memory for the scratchpad + scratchpad = sycl::malloc_device(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); std::stringstream error_msg; std::int64_t info = 0; sycl::event heevd_event; try { - scratchpad = sycl::malloc_device(scratchpad_size, exec_q); - heevd_event = mkl_lapack::heevd( exec_q, jobz, // 'jobz == job::vec' means eigenvalues and eigenvectors are @@ -94,14 +109,13 @@ static sycl::event heevd_impl(sycl::queue &exec_q, throw std::runtime_error(error_msg.str()); } - sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(heevd_event); auto ctx = exec_q.get_context(); cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); }); - host_task_events.push_back(clean_up_event); - return heevd_event; + return ht_ev; } template diff --git a/dpnp/backend/extensions/lapack/heevd.hpp b/dpnp/backend/extensions/lapack/heevd.hpp index 34f1702af00..25ed7f0a7a3 
100644 --- a/dpnp/backend/extensions/lapack/heevd.hpp +++ b/dpnp/backend/extensions/lapack/heevd.hpp @@ -26,9 +26,6 @@ #pragma once #include -#include - -#include "evd_common.hpp" namespace py = pybind11; diff --git a/dpnp/backend/extensions/lapack/heevd_batch.cpp b/dpnp/backend/extensions/lapack/heevd_batch.cpp new file mode 100644 index 00000000000..bc21b3da35d --- /dev/null +++ b/dpnp/backend/extensions/lapack/heevd_batch.cpp @@ -0,0 +1,196 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +#include "common_helpers.hpp" +#include "evd_batch_common.hpp" +#include "heevd_batch.hpp" + +// dpctl tensor headers +#include "utils/type_utils.hpp" + +namespace dpnp::extensions::lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace type_utils = dpctl::tensor::type_utils; + +template +static sycl::event heevd_batch_impl(sycl::queue &exec_q, + const oneapi::mkl::job jobz, + const oneapi::mkl::uplo upper_lower, + const std::int64_t batch_size, + const std::int64_t n, + char *in_a, + char *out_w, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + RealT *w = reinterpret_cast(out_w); + + const std::int64_t a_size = n * n; + const std::int64_t w_size = n; + + const std::int64_t lda = std::max(1UL, n); + + // Get the number of independent linear streams + const std::int64_t n_linear_streams = + (batch_size > 16) ? 4 : ((batch_size > 4 ? 
2 : 1)); + + const std::int64_t scratchpad_size = + mkl_lapack::heevd_scratchpad_size(exec_q, jobz, upper_lower, n, lda); + + T *scratchpad = + evd::alloc_scratchpad(scratchpad_size, n_linear_streams, exec_q); + + // Computation events to manage dependencies for each linear stream + std::vector> comp_evs(n_linear_streams, depends); + + std::stringstream error_msg; + std::int64_t info = 0; + + // Release GIL to avoid serialization of host task + // submissions to the same queue in OneMKL + py::gil_scoped_release release; + + for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) { + T *a_batch = a + batch_id * a_size; + RealT *w_batch = w + batch_id * w_size; + + std::int64_t stream_id = (batch_id % n_linear_streams); + + T *current_scratch_heevd = scratchpad + stream_id * scratchpad_size; + + // Get the event dependencies for the current stream + const auto ¤t_dep = comp_evs[stream_id]; + + sycl::event heevd_event; + try { + heevd_event = mkl_lapack::heevd( + exec_q, + jobz, // 'jobz == job::vec' means eigenvalues and eigenvectors + // are computed. 
+ upper_lower, // 'upper_lower == job::upper' means the upper + // triangular part of A, or the lower triangular + // otherwise + n, // The order of the matrix A (0 <= n) + a_batch, // Pointer to the square A (n x n) + // If 'jobz == job::vec', then on exit it will + // contain the eigenvectors of A + lda, // The leading dimension of A, must be at least max(1, n) + w_batch, // Pointer to array of size at least n, it will contain + // the eigenvalues of A in ascending order + current_scratch_heevd, // Pointer to scratchpad memory to be + // used by MKL routine for storing + // intermediate results + scratchpad_size, current_dep); + } catch (mkl_lapack::exception const &e) { + error_msg << "Unexpected MKL exception caught during heevd() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + info = e.info(); + } catch (sycl::exception const &e) { + error_msg + << "Unexpected SYCL exception caught during heevd() call:\n" + << e.what(); + info = -1; + } + + // Update the event dependencies for the current stream + comp_evs[stream_id] = {heevd_event}; + } + + if (info != 0) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { + for (const auto &ev : comp_evs) { + cgh.depends_on(ev); + } + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + + return ht_ev; +} + +template +struct HeevdBatchContigFactory +{ + fnT get() + { + if constexpr (types::HeevdTypePairSupportFactory::is_defined) + { + return heevd_batch_impl; + } + else { + return nullptr; + } + } +}; + +using evd::evd_batch_impl_fn_ptr_t; + +void init_heevd_batch(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + static evd_batch_impl_fn_ptr_t + heevd_batch_dispatch_table[dpctl_td_ns::num_types] + [dpctl_td_ns::num_types]; + + { + 
evd::init_evd_dispatch_table( + heevd_batch_dispatch_table); + + auto heevd_batch_pyapi = + [&](sycl::queue &exec_q, const std::int8_t jobz, + const std::int8_t upper_lower, arrayT &eig_vecs, + arrayT &eig_vals, const event_vecT &depends = {}) { + return evd::evd_batch_func(exec_q, jobz, upper_lower, eig_vecs, + eig_vals, depends, + heevd_batch_dispatch_table); + }; + m.def( + "_heevd_batch", heevd_batch_pyapi, + "Call `heevd` from OneMKL LAPACK library in a loop to return " + "the eigenvalues and eigenvectors of a batch of complex Hermitian " + "matrices", + py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"), + py::arg("eig_vecs"), py::arg("eig_vals"), + py::arg("depends") = py::list()); + } +} +} // namespace dpnp::extensions::lapack diff --git a/dpnp/backend/extensions/lapack/heevd_batch.hpp b/dpnp/backend/extensions/lapack/heevd_batch.hpp new file mode 100644 index 00000000000..668c3e6175c --- /dev/null +++ b/dpnp/backend/extensions/lapack/heevd_batch.hpp @@ -0,0 +1,35 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpnp::extensions::lapack +{ +void init_heevd_batch(py::module_ m); +} // namespace dpnp::extensions::lapack diff --git a/dpnp/backend/extensions/lapack/lapack_py.cpp b/dpnp/backend/extensions/lapack/lapack_py.cpp index b854a3136a9..7afa67e84ec 100644 --- a/dpnp/backend/extensions/lapack/lapack_py.cpp +++ b/dpnp/backend/extensions/lapack/lapack_py.cpp @@ -37,10 +37,12 @@ #include "getri.hpp" #include "getrs.hpp" #include "heevd.hpp" +#include "heevd_batch.hpp" #include "linalg_exceptions.hpp" #include "orgqr.hpp" #include "potrf.hpp" #include "syevd.hpp" +#include "syevd_batch.hpp" #include "ungqr.hpp" namespace lapack_ext = dpnp::extensions::lapack; @@ -83,6 +85,9 @@ PYBIND11_MODULE(_lapack_impl, m) lapack_ext::init_heevd(m); lapack_ext::init_syevd(m); + lapack_ext::init_heevd_batch(m); + lapack_ext::init_syevd_batch(m); + m.def("_geqrf_batch", &lapack_ext::geqrf_batch, "Call `geqrf_batch` from OneMKL LAPACK library to return " "the QR factorization of a batch general matrix ", diff --git a/dpnp/backend/extensions/lapack/syevd.cpp b/dpnp/backend/extensions/lapack/syevd.cpp index e30ecf0208d..bbc975b43d0 100644 --- a/dpnp/backend/extensions/lapack/syevd.cpp +++ b/dpnp/backend/extensions/lapack/syevd.cpp @@ -23,8 +23,14 @@ // THE POSSIBILITY OF SUCH DAMAGE. 
//***************************************************************************** +#include <oneapi/mkl.hpp> + +#include "evd_common.hpp" #include "syevd.hpp" +// dpctl tensor headers +#include "utils/type_utils.hpp" + namespace dpnp::extensions::lapack { namespace mkl_lapack = oneapi::mkl::lapack; @@ -37,7 +43,6 @@ static sycl::event syevd_impl(sycl::queue &exec_q, const std::int64_t n, char *in_a, char *out_w, - std::vector<sycl::event> &host_task_events, const std::vector<sycl::event> &depends) { type_utils::validate_type_for_device<T>(exec_q); @@ -49,15 +54,25 @@ static sycl::event syevd_impl(sycl::queue &exec_q, const std::int64_t lda = std::max(1UL, n); const std::int64_t scratchpad_size = mkl_lapack::syevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda); + + if (scratchpad_size <= 0) { + throw std::runtime_error( + "Invalid scratchpad size: must be greater than zero. " + "Calculated scratchpad size: " + + std::to_string(scratchpad_size)); + } + T *scratchpad = nullptr; + // Allocate memory for the scratchpad + scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q); + if (!scratchpad) + throw std::runtime_error("Device allocation for scratchpad failed"); std::stringstream error_msg; std::int64_t info = 0; sycl::event syevd_event; try { - scratchpad = sycl::malloc_device<T>(scratchpad_size, exec_q); - syevd_event = mkl_lapack::syevd( exec_q, jobz, // 'jobz == job::vec' means eigenvalues and eigenvectors are @@ -94,14 +109,13 @@ static sycl::event syevd_impl(sycl::queue &exec_q, throw std::runtime_error(error_msg.str()); } - sycl::event clean_up_event = exec_q.submit([&](sycl::handler &cgh) { + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(syevd_event); auto ctx = exec_q.get_context(); cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); }); - host_task_events.push_back(clean_up_event); - return syevd_event; + return ht_ev; } template diff --git a/dpnp/backend/extensions/lapack/syevd.hpp b/dpnp/backend/extensions/lapack/syevd.hpp index 1fb2409834f..9bbb515ffd4 
100644 --- a/dpnp/backend/extensions/lapack/syevd.hpp +++ b/dpnp/backend/extensions/lapack/syevd.hpp @@ -26,9 +26,6 @@ #pragma once #include -#include - -#include "evd_common.hpp" namespace py = pybind11; diff --git a/dpnp/backend/extensions/lapack/syevd_batch.cpp b/dpnp/backend/extensions/lapack/syevd_batch.cpp new file mode 100644 index 00000000000..d2b87fc260c --- /dev/null +++ b/dpnp/backend/extensions/lapack/syevd_batch.cpp @@ -0,0 +1,195 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. 
+//***************************************************************************** + +#include + +#include "common_helpers.hpp" +#include "evd_batch_common.hpp" +#include "syevd_batch.hpp" + +// dpctl tensor headers +#include "utils/type_utils.hpp" + +namespace dpnp::extensions::lapack +{ +namespace mkl_lapack = oneapi::mkl::lapack; +namespace type_utils = dpctl::tensor::type_utils; + +template +static sycl::event syevd_batch_impl(sycl::queue &exec_q, + const oneapi::mkl::job jobz, + const oneapi::mkl::uplo upper_lower, + const std::int64_t batch_size, + const std::int64_t n, + char *in_a, + char *out_w, + const std::vector &depends) +{ + type_utils::validate_type_for_device(exec_q); + type_utils::validate_type_for_device(exec_q); + + T *a = reinterpret_cast(in_a); + RealT *w = reinterpret_cast(out_w); + + const std::int64_t a_size = n * n; + const std::int64_t w_size = n; + + const std::int64_t lda = std::max(1UL, n); + + // Get the number of independent linear streams + const std::int64_t n_linear_streams = + (batch_size > 16) ? 4 : ((batch_size > 4 ? 
2 : 1)); + + const std::int64_t scratchpad_size = + mkl_lapack::syevd_scratchpad_size<T>(exec_q, jobz, upper_lower, n, lda); + + T *scratchpad = + evd::alloc_scratchpad<T>(scratchpad_size, n_linear_streams, exec_q); + + // Computation events to manage dependencies for each linear stream + std::vector<std::vector<sycl::event>> comp_evs(n_linear_streams, depends); + + std::stringstream error_msg; + std::int64_t info = 0; + + // Release GIL to avoid serialization of host task + // submissions to the same queue in OneMKL + py::gil_scoped_release release; + + for (std::int64_t batch_id = 0; batch_id < batch_size; ++batch_id) { + T *a_batch = a + batch_id * a_size; + RealT *w_batch = w + batch_id * w_size; + + std::int64_t stream_id = (batch_id % n_linear_streams); + + T *current_scratch_syevd = scratchpad + stream_id * scratchpad_size; + + // Get the event dependencies for the current stream + const auto &current_dep = comp_evs[stream_id]; + + sycl::event syevd_event; + try { + syevd_event = mkl_lapack::syevd( + exec_q, + jobz, // 'jobz == job::vec' means eigenvalues and eigenvectors + // are computed. 
+ upper_lower, // 'upper_lower == uplo::upper' means the upper + // triangular part of A, or the lower triangular + // otherwise + n, // The order of the matrix A (0 <= n) + a_batch, // Pointer to the square A (n x n) + // If 'jobz == job::vec', then on exit it will + // contain the eigenvectors of A + lda, // The leading dimension of A, must be at least max(1, n) + w_batch, // Pointer to array of size at least n, it will contain + // the eigenvalues of A in ascending order + current_scratch_syevd, // Pointer to scratchpad memory to be + // used by MKL routine for storing + // intermediate results + scratchpad_size, current_dep); + } catch (mkl_lapack::exception const &e) { + error_msg << "Unexpected MKL exception caught during syevd() " + "call:\nreason: " + << e.what() << "\ninfo: " << e.info(); + info = e.info(); + } catch (sycl::exception const &e) { + error_msg + << "Unexpected SYCL exception caught during syevd() call:\n" + << e.what(); + info = -1; + } + + // Update the event dependencies for the current stream + comp_evs[stream_id] = {syevd_event}; + } + + if (info != 0) // an unexpected error occurs + { + if (scratchpad != nullptr) { + sycl::free(scratchpad, exec_q); + } + throw std::runtime_error(error_msg.str()); + } + + sycl::event ht_ev = exec_q.submit([&](sycl::handler &cgh) { + for (const auto &ev : comp_evs) { + cgh.depends_on(ev); + } + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, scratchpad]() { sycl::free(scratchpad, ctx); }); + }); + + return ht_ev; +} + +template <typename fnT, typename T, typename RealT> +struct SyevdBatchContigFactory +{ + fnT get() + { + if constexpr (types::SyevdTypePairSupportFactory<T, RealT>::is_defined) + { + return syevd_batch_impl<T, RealT>; + } + else { + return nullptr; + } + } +}; + +using evd::evd_batch_impl_fn_ptr_t; + +void init_syevd_batch(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector<sycl::event>; + + static evd_batch_impl_fn_ptr_t + syevd_batch_dispatch_table[dpctl_td_ns::num_types] + [dpctl_td_ns::num_types]; + + { + 
evd::init_evd_dispatch_table( + syevd_batch_dispatch_table); + + auto syevd_batch_pyapi = + [&](sycl::queue &exec_q, const std::int8_t jobz, + const std::int8_t upper_lower, arrayT &eig_vecs, + arrayT &eig_vals, const event_vecT &depends = {}) { + return evd::evd_batch_func(exec_q, jobz, upper_lower, eig_vecs, + eig_vals, depends, + syevd_batch_dispatch_table); + }; + m.def("_syevd_batch", syevd_batch_pyapi, + "Call `syevd` from OneMKL LAPACK library in a loop to return " + "the eigenvalues and eigenvectors of a batch of real symmetric " + "matrices", + py::arg("sycl_queue"), py::arg("jobz"), py::arg("upper_lower"), + py::arg("eig_vecs"), py::arg("eig_vals"), + py::arg("depends") = py::list()); + } +} +} // namespace dpnp::extensions::lapack diff --git a/dpnp/backend/extensions/lapack/syevd_batch.hpp b/dpnp/backend/extensions/lapack/syevd_batch.hpp new file mode 100644 index 00000000000..62d6fbb2e66 --- /dev/null +++ b/dpnp/backend/extensions/lapack/syevd_batch.hpp @@ -0,0 +1,35 @@ +//***************************************************************************** +// Copyright (c) 2024, Intel Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +// THE POSSIBILITY OF SUCH DAMAGE. +//***************************************************************************** + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpnp::extensions::lapack +{ +void init_syevd_batch(py::module_ m); +} // namespace dpnp::extensions::lapack diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py index 87a3781b734..330efa6d4a6 100644 --- a/dpnp/linalg/dpnp_utils_linalg.py +++ b/dpnp/linalg/dpnp_utils_linalg.py @@ -96,22 +96,6 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type): """ - is_cpu_device = a.sycl_device.has_aspect_cpu - orig_shape = a.shape - # get 3d input array by reshape - a = dpnp.reshape(a, (-1, orig_shape[-2], orig_shape[-1])) - a_usm_arr = dpnp.get_usm_ndarray(a) - - # allocate a memory for dpnp array of eigenvalues - w = dpnp.empty_like( - a, - shape=orig_shape[:-1], - dtype=w_type, - ) - w_orig_shape = w.shape - # get 2d dpnp array with eigenvalues by reshape - w = w.reshape(-1, w_orig_shape[-1]) - # `eigen_mode` can be either "N" or "V", specifying the computation mode # for OneMKL LAPACK `syevd` and `heevd` routines. 
# "V" (default) means both eigenvectors and eigenvalues will be calculated @@ -119,62 +103,65 @@ def _batched_eigh(a, UPLO, eigen_mode, w_type, v_type): jobz = _jobz[eigen_mode] uplo = _upper_lower[UPLO] - # Get LAPACK function (_syevd for real or _heevd for complex data types) + # Get LAPACK function (_syevd_batch for real or _heevd_batch + # for complex data types) # to compute all eigenvalues and, optionally, all eigenvectors lapack_func = ( - "_heevd" if dpnp.issubdtype(v_type, dpnp.complexfloating) else "_syevd" + "_heevd_batch" + if dpnp.issubdtype(v_type, dpnp.complexfloating) + else "_syevd_batch" ) a_sycl_queue = a.sycl_queue - a_order = "C" if a.flags.c_contiguous else "F" + + a_orig_shape = a.shape + a_orig_order = "C" if a.flags.c_contiguous else "F" + # get 3d input array by reshape + a = dpnp.reshape(a, (-1, a_orig_shape[-2], a_orig_shape[-1])) + a_new_shape = a.shape + + # Reorder the elements by moving the last two axes of `a` to the front + # to match fortran-like array order which is assumed by syevd/heevd. 
+ a = dpnp.moveaxis(a, (-2, -1), (0, 1)) + a_usm_arr = dpnp.get_usm_ndarray(a) _manager = dpu.SequentialOrderManager[a_sycl_queue] + dep_evs = _manager.submitted_events - # need to loop over the 1st dimension to get eigenvalues and - # eigenvectors of 3d matrix A - batch_size = a.shape[0] - eig_vecs = [None] * batch_size - for i in range(batch_size): - # oneMKL LAPACK assumes fortran-like array as input, so - # allocate a memory with 'F' order for dpnp array of eigenvectors - eig_vecs[i] = dpnp.empty_like(a[i], order="F", dtype=v_type) + a_copy = dpnp.empty_like(a, dtype=v_type, order="F") + ht_ev, a_copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=a_usm_arr, + dst=a_copy.get_array(), + sycl_queue=a_sycl_queue, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, a_copy_ev) - # use DPCTL tensor function to fill the array of eigenvectors with - # content of input array - ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=a_usm_arr[i], - dst=eig_vecs[i].get_array(), - sycl_queue=a_sycl_queue, - depends=_manager.submitted_events, - ) - _manager.add_event_pair(ht_ev, copy_ev) + w_orig_shape = a_orig_shape[:-1] + # allocate a memory for 2d dpnp array of eigenvalues + w = dpnp.empty_like(a, shape=a_new_shape[:-1], dtype=w_type) - # TODO: Remove this w/a when MKLD-17201 is solved. - # Waiting for a host task executing an OneMKL LAPACK syevd/heevd call - # on CPU causes deadlock due to serialization of all host tasks - # in the queue. - # We need to wait for each host tasks before calling _seyvd and _heevd - # to avoid deadlock. 
- if is_cpu_device: - dpnp.synchronize_array_data(a) + ht_ev, evd_batch_ev = getattr(li, lapack_func)( + a_sycl_queue, + jobz, + uplo, + a_copy.get_array(), + w.get_array(), + depends=[a_copy_ev], + ) - # call LAPACK extension function to get eigenvalues and - # eigenvectors of a portion of matrix A - ht_ev, lapack_ev = getattr(li, lapack_func)( - a_sycl_queue, - jobz, - uplo, - eig_vecs[i].get_array(), - w[i].get_array(), - depends=[copy_ev], - ) - _manager.add_event_pair(ht_ev, lapack_ev) + _manager.add_event_pair(ht_ev, evd_batch_ev) w = w.reshape(w_orig_shape) if eigen_mode == "V": - # combine the list of eigenvectors into a single array - v = dpnp.array(eig_vecs, order=a_order).reshape(orig_shape) + # syevd/heevd call overwrites `a` in Fortran order, reorder the axes + # to match C order by moving the last axis to the front and + # reshape it back to the original shape of `a`. + v = dpnp.moveaxis(a_copy, -1, 0).reshape(a_orig_shape) + # Convert to contiguous to align with NumPy + if a_orig_order == "C": + v = dpnp.ascontiguousarray(v) return w, v return w