From b70e348a3a2cca0fa282f2c91fcb2e407bafe46d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 10 Mar 2023 12:14:02 -0500 Subject: [PATCH 001/592] cleanup [skip ci] --- src/TiledArray/math/solvers/cp.h | 2 +- src/TiledArray/tensor/tensor.h | 3 ++- src/TiledArray/tile_interface/scale.h | 17 ++++++++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/math/solvers/cp.h b/src/TiledArray/math/solvers/cp.h index d21df2d46d..f94ea259ff 100644 --- a/src/TiledArray/math/solvers/cp.h +++ b/src/TiledArray/math/solvers/cp.h @@ -35,4 +35,4 @@ using TiledArray::math::cp::CP_ALS; using TiledArray::math::cp::cp_reconstruct; } // namespace TiledArray -#endif // TILEDARRAY_MATH_SOLVERS_DIIS_H__INCLUDED +#endif // TILEDARRAY_MATH_SOLVERS_CP_H__INCLUDED diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index d3c91361f5..1f09b92701 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -872,8 +872,9 @@ class Tensor { const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); TA_ASSERT(this->batch_size() == 1); + using Int = std::common_type_t; const auto iord = this->range_.ordinal( - std::array, sizeof...(Index)>{{i...}}); + std::array{{static_cast(i)...}}); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; } diff --git a/src/TiledArray/tile_interface/scale.h b/src/TiledArray/tile_interface/scale.h index 8d548114b9..cbfa48c972 100644 --- a/src/TiledArray/tile_interface/scale.h +++ b/src/TiledArray/tile_interface/scale.h @@ -38,9 +38,13 @@ namespace TiledArray { /// \tparam Scalar A numeric type (i.e. 
TiledArray::detail::is_numeric_v /// is true) \param arg The left-hand argument to be scaled \param factor The /// scaling factor \return A tile that is equal to arg * factor -template && - !TiledArray::detail::is_array_v>* = nullptr> +template < + typename Arg, typename Scalar, + std::enable_if_t< + TiledArray::detail::is_numeric_v && + !TiledArray::detail::is_array_v && + std::is_void_v().scale( + std::declval()))>>>* = nullptr> inline auto scale(const Arg& arg, const Scalar factor) { return arg.scale(factor); } @@ -55,8 +59,11 @@ inline auto scale(const Arg& arg, const Scalar factor) { /// \return A tile that is equal to perm ^ (arg * factor) template < typename Arg, typename Scalar, typename Perm, - std::enable_if_t && - TiledArray::detail::is_permutation_v>* = nullptr> + std::enable_if_t< + TiledArray::detail::is_numeric_v && + TiledArray::detail::is_permutation_v && + std::is_void_v().scale( + std::declval(), std::declval()))>>>* = nullptr> inline auto scale(const Arg& arg, const Scalar factor, const Perm& perm) { return arg.scale(factor, perm); } From e52c733e9f1400e70425d5be171a8e7cd5b56876 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 10 Mar 2023 16:51:25 -0500 Subject: [PATCH 002/592] TiledRange1::make_uniform uses @kmp5's implementation --- src/TiledArray/tiled_range1.h | 40 ++++++++++++++--------- tests/conversions.cpp | 5 ++- tests/kmp5_compute_trange1.h | 61 ----------------------------------- tests/tiled_range1.cpp | 18 +++++++++++ 4 files changed, 45 insertions(+), 79 deletions(-) delete mode 100644 tests/kmp5_compute_trange1.h diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 239321b567..0f6d18130b 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -59,7 +59,7 @@ class TiledRange1 { TiledRange1() : range_(0, 0), elements_range_(0, 0), tiles_ranges_(), elem2tile_() {} - /// Constructs a range with the boundaries provided by + /// Constructs a range with the tile boundaries 
("hashmarks") provided by /// the range [ \p first , \p last ). /// \note validity of the [ \p first , \p last ) range is checked using /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined @@ -79,7 +79,7 @@ class TiledRange1 { /// Construct a 1D tiled range. - /// This will construct a 1D tiled range with tile boundaries + /// This will construct a 1D tiled range with tile boundaries ("hashmarks") /// {\p t0 , \p t_rest... } /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0, t1), [t1, t2), [t2, t3), ... @@ -96,7 +96,7 @@ class TiledRange1 { /// Construct a 1D tiled range. - /// This will construct a 1D tiled range with tile boundaries + /// This will construct a 1D tiled range with tile boundaries ("hashmarks") /// {\p t0 , \p t_rest... } /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... @@ -242,22 +242,32 @@ class TiledRange1 { /// @brief makes a uniform (or, as uniform as possible) TiledRange1 /// @param[in] range_size the range size - /// @param[in] target_block_size the desired block size - /// @return TiledRange1 obtained by tiling range `[0,range_size)` into `(range_size + target_block_size - 1)/target_block_size` - /// blocks of approximately @p target_block_size size + /// @param[in] target_tile_size the desired tile size + /// @return TiledRange1 obtained by tiling range `[0,range_size)` into + /// `ntiles = (range_size + target_tile_size - 1)/target_tile_size` + /// tiles; if `x = range_size % ntiles` is not zero, first `x` tiles + /// have size `target_tile_size` and last + /// `ntiles - x` tiles have size `target_tile_size - 1`, else + /// all tiles have size `target_tile_size` . 
// clang-format on static TiledRange1 make_uniform(std::size_t range_size, - std::size_t target_block_size) { + std::size_t target_tile_size) { if (range_size > 0) { - TA_ASSERT(target_block_size > 0); - std::size_t nblocks = - (range_size + target_block_size - 1) / target_block_size; - std::size_t block_size = (range_size + nblocks - 1) / nblocks; + TA_ASSERT(target_tile_size > 0); + std::size_t ntiles = + (range_size + target_tile_size - 1) / target_tile_size; + auto dv = std::div((long)(range_size + ntiles - 1), (long)ntiles); + auto avg_tile_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1; std::vector hashmarks; - hashmarks.reserve(nblocks + 1); - hashmarks.push_back(0); - for (auto i = block_size; i < range_size; i += block_size) { - hashmarks.push_back(i); + hashmarks.reserve(ntiles + 1); + std::size_t element = 0; + for (auto i = 0; i < num_avg_plus_one; + ++i, element += avg_tile_size + 1) { + hashmarks.push_back(element); + } + for (auto i = num_avg_plus_one; i < ntiles; + ++i, element += avg_tile_size) { + hashmarks.push_back(element); } hashmarks.push_back(range_size); return TiledRange1(hashmarks.begin(), hashmarks.end()); diff --git a/tests/conversions.cpp b/tests/conversions.cpp index a66386564d..e9ae430bbb 100644 --- a/tests/conversions.cpp +++ b/tests/conversions.cpp @@ -23,7 +23,6 @@ * */ -#include "kmp5_compute_trange1.h" #include "range_fixture.h" #include "tiledarray.h" #include "unit_test_config.h" @@ -340,8 +339,8 @@ BOOST_AUTO_TEST_CASE(tiles_of_arrays_non_unit_blocking) { std::size_t dim_one = 1336; std::size_t dim_two = 552; { - TA::TiledRange1 tr1_mode0 = kmp5_compute_trange1(dim_one, block_size); - TA::TiledRange1 tr1_mode1 = kmp5_compute_trange1(dim_two, 10); + TA::TiledRange1 tr1_mode0 = TiledRange1::make_uniform(dim_one, block_size); + TA::TiledRange1 tr1_mode1 = TiledRange1::make_uniform(dim_two, 10); tr = TiledArray::TiledRange({tr1_mode0, tr1_mode1}); tr_split = TiledArray::TiledRange({tr1_mode1}); } diff --git 
a/tests/kmp5_compute_trange1.h b/tests/kmp5_compute_trange1.h deleted file mode 100644 index 1e0d0b9a47..0000000000 --- a/tests/kmp5_compute_trange1.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Karl Pierce - * Department of Chemistry, Virginia Tech - * - * make_trange1.cpp - * June 7, 2022 - * - */ - -#ifndef TILEDARRAY_COMPUTE_TRANGE1__H -#define TILEDARRAY_COMPUTE_TRANGE1__H - -#include "tiledarray.h" - -namespace TiledArray { - -/// this creates "uniform" TiledRange1 object using same logic as assumed in -/// vector_of_array.h -inline TiledArray::TiledRange1 kmp5_compute_trange1( - std::size_t range_size, std::size_t target_block_size) { - if (range_size > 0) { - std::size_t nblocks = - (range_size + target_block_size - 1) / target_block_size; - auto dv = std::div((int)(range_size + nblocks - 1), (int)nblocks); - auto avg_block_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1; - std::vector hashmarks; - hashmarks.reserve(nblocks + 1); - auto block_counter = 0; - for (auto i = 0; i < num_avg_plus_one; - ++i, block_counter += avg_block_size + 1) { - hashmarks.push_back(block_counter); - } - for (auto i = num_avg_plus_one; i < nblocks; - ++i, block_counter += avg_block_size) { - hashmarks.push_back(block_counter); - } - hashmarks.push_back(range_size); - return 
TA::TiledRange1(hashmarks.begin(), hashmarks.end()); - } else - return TA::TiledRange1{}; -} - -} // namespace TiledArray - -#endif // TILEDARRAY_COMPUTE_TRANGE1__H diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 312389ff84..d7379e2fbb 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -253,4 +253,22 @@ BOOST_AUTO_TEST_CASE(concatenation) { BOOST_CHECK(concat(r2, r1) == (TiledRange1{0, 3, 4, 5, 7, 11, 13})); } +BOOST_AUTO_TEST_CASE(make_uniform) { + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 0)); + BOOST_CHECK(TiledRange1::make_uniform(0, 0) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 1)); + BOOST_CHECK(TiledRange1::make_uniform(0, 1) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(3, 10)); + BOOST_CHECK(TiledRange1::make_uniform(3, 10) == (TiledRange1{0, 3})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(50, 10)); + BOOST_CHECK(TiledRange1::make_uniform(50, 10) == + (TiledRange1{0, 10, 20, 30, 40, 50})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(55, 10)); + BOOST_CHECK(TiledRange1::make_uniform(55, 10) == + (TiledRange1{0, 10, 19, 28, 37, 46, 55})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(59, 10)); + BOOST_CHECK(TiledRange1::make_uniform(59, 10) == + (TiledRange1{0, 10, 20, 30, 40, 50, 59})); +} + BOOST_AUTO_TEST_SUITE_END() From 57f60d4c8f10191504f26ccaf0514c6587b91e80 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Mar 2023 15:10:26 -0400 Subject: [PATCH 003/592] make TensorInterface::{abs_{max,min},squared_norm} behave as Tensor's counterparts for complex numeric_type --- src/TiledArray/tensor/kernels.h | 43 ++++++++------- src/TiledArray/tensor/tensor.h | 42 +++++++------- src/TiledArray/tensor/tensor_interface.h | 70 ++++++++++++++---------- 3 files changed, 84 insertions(+), 71 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index c65d0e5c69..81141f4982 100644 --- 
a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -646,34 +646,34 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, /// \tparam ReduceOp The element-wise reduction /// operation type /// \tparam JoinOp The result operation type -/// \tparam Scalar A -/// scalar type +/// \tparam Identity A type that can be used as an argument to ReduceOp /// \tparam T1 The first argument tensor type -/// \tparam Ts The -/// argument tensor types +/// \tparam Ts The argument tensor types /// \param reduce_op The element-wise reduction operation /// \param identity The initial value for the reduction and the result /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) template < - typename ReduceOp, typename JoinOp, typename Scalar, typename T1, + typename ReduceOp, typename JoinOp, typename Identity, typename T1, typename... Ts, typename std::enable_if_t< is_tensor::value && is_contiguous_tensor::value && - !is_reduce_op_v, std::decay_t, + !is_reduce_op_v, std::decay_t, std::decay_t, std::decay_t...>>* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, + const T1& tensor1, const Ts&... 
tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = tensor1.range().volume(); - math::reduce_op(reduce_op, join_op, identity, volume, identity, + auto init = std::forward(identity); + math::reduce_op(std::forward(reduce_op), + std::forward(join_op), init, volume, init, tensor1.data(), tensors.data()...); - return identity; + return init; } /// Reduction operation for tensors @@ -698,8 +698,8 @@ template < is_tensor::value && is_contiguous_tensor::value && is_reduce_op_v, std::decay_t, std::decay_t, std::decay_t...>>* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, + const T1& tensor1, const Ts&... tensors) { reduce_op(identity, &tensor1, &tensors...); return identity; } @@ -723,13 +723,14 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) -template ::value && is_contiguous_tensor::value>::type* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + const Identity& identity, const T1& tensor1, + const Ts&... 
tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); @@ -765,24 +766,24 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) -template ::value && !is_contiguous_tensor::value>::type* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - const Scalar identity, const T1& tensor1, - const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + const Identity& identity, const T1& tensor1, + const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); - Scalar result = identity; + auto result = identity; for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ord += stride) { - Scalar temp = identity; + auto temp = identity; math::reduce_op(reduce_op, join_op, identity, stride, temp, tensor1.data() + tensor1.range().ordinal(ord), (tensors.data() + tensors.range().ordinal(ord))...); diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 1f09b92701..f25bba68f7 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2174,18 +2174,19 @@ class Tensor { /// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor, /// the reduction will be executed in an undefined order, otherwise will /// execute in the order of increasing \c i . 
- /// \tparam ReduceOp The reduction - /// operation type + /// \tparam ReduceOp The reduction operation type /// \tparam JoinOp The join operation type - /// \param reduce_op The - /// element-wise reduction operation + /// \tparam T a type that can be used as argument to ReduceOp + /// \param reduce_op The element-wise reduction operation /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template - decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - Scalar identity) const { - return detail::tensor_reduce(reduce_op, join_op, identity, *this); + template + auto reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { + return detail::tensor_reduce(std::forward(reduce_op), + std::forward(join_op), + std::forward(identity), *this); } /// Binary reduction operation @@ -2196,22 +2197,23 @@ class Tensor { /// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous /// tensor, the reduction will be executed in an undefined order, otherwise /// will execute in the order of increasing \c i . 
- /// \tparam Right The - /// right-hand argument tensor type - /// \tparam ReduceOp The reduction operation - /// type + /// \tparam Right The right-hand argument tensor type + /// \tparam ReduceOp The reduction operation type /// \tparam JoinOp The join operation type - /// \param other The right-hand - /// argument of the binary reduction - /// \param reduce_op The element-wise - /// reduction operation \param join_op The join result operation + /// \tparam Identity A type that can be used as argument to ReduceOp + /// \param other The right-hand argument of the binary reduction + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template ::value>::type* = nullptr> - decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op, - JoinOp&& join_op, Scalar identity) const { - return detail::tensor_reduce(reduce_op, join_op, identity, *this, other); + auto reduce(const Right& other, ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { + return detail::tensor_reduce( + std::forward(reduce_op), std::forward(join_op), + std::forward(identity), *this, other); } /// Sum of elements diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index f39e6ff88d..76413a51a3 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -979,17 +979,20 @@ class TensorInterface { /// \c i in the index range of \c this . \c result is initialized to \c /// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor, /// the reduction will be executed in an undefined order, otherwise will - /// execute in the order of increasing \c i . 
\tparam ReduceOp The reduction - /// operation type \tparam JoinOp The join operation type \param reduce_op The - /// element-wise reduction operation \param join_op The join result operation + /// execute in the order of increasing \c i . + /// \tparam ReduceOp The reduction operation type + /// \tparam JoinOp The join operation type + /// \tparam Identity a type that can be used as argument to ReduceOp + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template - numeric_type reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - const numeric_type identity) const { + template + decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { return detail::tensor_reduce(std::forward(reduce_op), - std::forward(join_op), identity, - *this); + std::forward(join_op), + std::forward(identity), *this); } /// Binary reduction operation @@ -999,19 +1002,24 @@ class TensorInterface { /// for each \c i in the index range of \c this . \c result is initialized to /// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous /// tensor, the reduction will be executed in an undefined order, otherwise - /// will execute in the order of increasing \c i . \tparam Right The - /// right-hand argument tensor type \tparam ReduceOp The reduction operation - /// type \tparam JoinOp The join operation type \param other The right-hand - /// argument of the binary reduction \param reduce_op The element-wise - /// reduction operation \param join_op The join result operation \param - /// identity The identity value of the reduction \return The reduced value + /// will execute in the order of increasing \c i . 
+ /// \tparam Right The right-hand argument tensor type + /// \tparam ReduceOp The reduction operation type + /// \tparam JoinOp The join operation type + /// \tparam Identity a type that can be used as argument to ReduceOp + /// \param other The right-hand argument of the binary reduction + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation + /// \param identity The identity value of the reduction + /// \return The reduced value template ::value>::type* = nullptr> - numeric_type reduce(const Right& other, ReduceOp&& reduce_op, - JoinOp&& join_op, const numeric_type identity) const { - return detail::tensor_reduce(std::forward(reduce_op), - std::forward(join_op), identity, *this, - other); + decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op, + JoinOp&& join_op, Identity&& identity) const { + return detail::tensor_reduce( + std::forward(reduce_op), std::forward(join_op), + std::forward(identity), *this, other); } /// Sum of elements @@ -1043,7 +1051,7 @@ class TensorInterface { auto sum_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { res += arg; }; - return reduce(square_op, sum_op, numeric_type(0)); + return reduce(square_op, sum_op, scalar_type(0)); } /// Vector 2-norm @@ -1077,27 +1085,29 @@ class TensorInterface { /// Absolute minimum element /// \return The minimum elements of this tensor - numeric_type abs_min() const { - auto abs_min_op = [](numeric_type& MADNESS_RESTRICT res, + scalar_type abs_min() const { + auto abs_min_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res = std::min(res, std::abs(arg)); }; - auto min_op = [](numeric_type& MADNESS_RESTRICT res, - const numeric_type arg) { res = std::min(res, arg); }; - return reduce(abs_min_op, min_op, std::numeric_limits::max()); + auto min_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { + res = std::min(res, arg); + }; + return reduce(abs_min_op, min_op, 
std::numeric_limits::max()); } /// Absolute maximum element /// \return The maximum elements of this tensor - numeric_type abs_max() const { - auto abs_max_op = [](numeric_type& MADNESS_RESTRICT res, + scalar_type abs_max() const { + auto abs_max_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res = std::max(res, std::abs(arg)); }; - auto max_op = [](numeric_type& MADNESS_RESTRICT res, - const numeric_type arg) { res = std::max(res, arg); }; - return reduce(abs_max_op, max_op, numeric_type(0)); + auto max_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { + res = std::max(res, arg); + }; + return reduce(abs_max_op, max_op, scalar_type(0)); } /// Vector dot product From b214439cd8b4bb9368ce59160cc493211a316c0b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Mar 2023 15:12:04 -0400 Subject: [PATCH 004/592] ta_dense_asymm can use complex scalars + do memtrace --- examples/dgemm/ta_dense_asymm.cpp | 133 ++++++++++++++++++++++-------- 1 file changed, 99 insertions(+), 34 deletions(-) diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp index d33fd6192a..393922cd71 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/dgemm/ta_dense_asymm.cpp @@ -31,7 +31,8 @@ int main(int argc, char** argv) { "blocked by Bm, Bn, and Bk, respectively" << std::endl << "Usage: " << argv[0] - << " Nm Bm Nn Bn Nk Bk [repetitions=5] [real=double]\n"; + << " Nm Bm Nn Bn Nk Bk [repetitions=5] [scalar=double] " + "[do_memtrace=0]\n"; return 0; } const long Nm = atol(argv[1]); @@ -59,12 +60,17 @@ int main(int argc, char** argv) { return 1; } - const std::string real_type_str = (argc >= 9 ? argv[8] : "double"); - if (real_type_str != "double" && real_type_str != "float") { - std::cerr << "Error: invalid real type " << real_type_str << ".\n"; + const std::string scalar_type_str = (argc >= 9 ? 
argv[8] : "double"); + if (scalar_type_str != "double" && scalar_type_str != "float" && + scalar_type_str != "zdouble" && scalar_type_str != "zfloat") { + std::cerr << "Error: invalid real type " << scalar_type_str << ".\n"; + std::cerr << " valid real types are \"double\", \"float\", " + "\"zdouble\", and \"zfloat\".\n"; return 1; } + const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false); + const std::size_t Tm = Nm / Bm; const std::size_t Tn = Nn / Bn; const std::size_t Tk = Nk / Bk; @@ -72,6 +78,7 @@ int main(int argc, char** argv) { if (world.rank() == 0) std::cout << "TiledArray: dense matrix multiply test...\n" << "Number of nodes = " << world.size() + << "\nScalar type = " << scalar_type_str << "\nSize of A = " << Nm << "x" << Nk << " (" << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" << "\nSize of A block = " << Bm << "x" << Bk @@ -133,54 +140,112 @@ int main(int argc, char** argv) { auto run = [&](auto* tarray_ptr) { using Array = std::decay_t>; - - // Construct and initialize arrays - Array a(world, trange_a); - Array b(world, trange_b); - Array c(world, trange_c); - a.fill(1.0); - b.fill(1.0); - - // Start clock - world.gop.fence(); - const double wall_time_start = madness::wall_time(); - - // Do matrix multiplication - for (int i = 0; i < repeat; ++i) { - c("m,n") = a("m,k") * b("k,n"); + using scalar_type = TiledArray::detail::scalar_t; + + const auto complex_T = TiledArray::detail::is_complex_v; + const std::int64_t nflops = + (complex_T ? 
8 : 2) // 1 multiply takes 6/1 flops for complex/real + // 1 add takes 2/1 flops for complex/real + * static_cast(Nn) * static_cast(Nm) * + static_cast(Nk); + + auto memtrace = [do_memtrace, &world](const std::string& str) -> void { + if (do_memtrace) { + world.gop.fence(); + madness::print_meminfo(world.rank(), str); + } +#ifdef TA_TENSOR_MEM_PROFILE + { + world.gop.fence(); + std::cout + << str << ": TA::Tensor allocated " + << TA::hostEnv::instance()->host_allocator_getActualHighWatermark() + << " bytes and used " + << TA::hostEnv::instance()->host_allocator().getHighWatermark() + << " bytes" << std::endl; + } +#endif + }; + + memtrace("start"); + { // array lifetime scope + // Construct and initialize arrays + Array a(world, trange_a); + Array b(world, trange_b); + Array c(world, trange_c); + a.fill(1.0); + b.fill(1.0); + memtrace("allocated a and b"); + + // Start clock world.gop.fence(); - if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n"; - } - - // Stop clock - const double wall_time_stop = madness::wall_time(); - - if (world.rank() == 0) - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * 2.0 * double(Nn * Nm * Nk) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + if (world.rank() == 0) + std::cout << "Starting iterations: " + << "\n"; + + double total_time = 0.0; + double total_gflop_rate = 0.0; + + // Do matrix multiplication + for (int i = 0; i < repeat; ++i) { + const double start = madness::wall_time(); + c("m,n") = a("m,k") * b("k,n"); + memtrace("c=a*b"); + const double time = madness::wall_time() - start; + total_time += time; + const double gflop_rate = double(nflops) / (time * 1.e9); + total_gflop_rate += gflop_rate; + if (world.rank() == 0) + std::cout << "Iteration " << i + 1 << " time=" << time + << " GFLOPS=" << gflop_rate << "\n"; + } + + // Stop clock + const double wall_time_stop = madness::wall_time(); + + if 
(world.rank() == 0) { + std::cout << "Average wall time = " << total_time / double(repeat) + << " sec\nAverage GFLOPS = " + << total_gflop_rate / double(repeat) << "\n"; + } + + } // array lifetime scope + memtrace("stop"); }; // by default use TiledArray tensors constexpr bool use_btas = false; // btas::Tensor instead - if (real_type_str == "double") { + if (scalar_type_str == "double") { if constexpr (!use_btas) run(static_cast(nullptr)); else run(static_cast>>*>( nullptr)); - } else { + } else if (scalar_type_str == "float") { if constexpr (!use_btas) run(static_cast(nullptr)); else run(static_cast>>*>( nullptr)); + } else if (scalar_type_str == "zdouble") { + if constexpr (!use_btas) + run(static_cast(nullptr)); + else + run(static_cast, TiledArray::Range>>>*>( + nullptr)); + } else if (scalar_type_str == "zfloat") { + if constexpr (!use_btas) + run(static_cast(nullptr)); + else + run(static_cast, TiledArray::Range>>>*>( + nullptr)); + } else { + abort(); // unreachable } return 0; From 08f6405d772a51671dfc9de4005e89f4523ce452 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 22 Mar 2023 06:07:08 -0400 Subject: [PATCH 005/592] fixup ta_dense_asymm flop computation for the complex case --- examples/dgemm/ta_dense_asymm.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp index 393922cd71..ac72a39209 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/dgemm/ta_dense_asymm.cpp @@ -140,9 +140,8 @@ int main(int argc, char** argv) { auto run = [&](auto* tarray_ptr) { using Array = std::decay_t>; - using scalar_type = TiledArray::detail::scalar_t; - - const auto complex_T = TiledArray::detail::is_complex_v; + using T = TiledArray::detail::numeric_t; + const auto complex_T = TiledArray::detail::is_complex_v; const std::int64_t nflops = (complex_T ? 
8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real From c7ae19ed75c2f9378c0649aa92a5dbca4344cb92 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 22 Mar 2023 14:49:58 -0400 Subject: [PATCH 006/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/464 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index ad33841e44..c2901a7e11 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0b44ef319643cb9721fbe17d294987c146e6460e . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 91fff76deba20c751d0646c54f2f1c1e07bd6156 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 676ff2ee4f..40321f91ac 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 29a2bf3d3c2670c608b7bfdf2299d76fbc20e041) +set(TA_TRACKED_MADNESS_TAG 91fff76deba20c751d0646c54f2f1c1e07bd6156) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From c7d6eac3d81c91cf9c41462944dc5847e387364a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 22 Mar 2023 15:25:20 -0400 Subject: [PATCH 007/592] misc fixes for complex-valued API --- src/TiledArray/math/solvers/conjgrad.h | 2 +- src/TiledArray/math/solvers/diis.h | 2 +- src/TiledArray/tensor/complex.h | 26 ++++++++++++++++++++++++++ src/TiledArray/tensor/tensor.h | 11 ++++++++--- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/math/solvers/conjgrad.h b/src/TiledArray/math/solvers/conjgrad.h index 91992cf7de..cacfd55d63 100644 --- a/src/TiledArray/math/solvers/conjgrad.h +++ b/src/TiledArray/math/solvers/conjgrad.h @@ -60,7 +60,7 @@ namespace TiledArray::math { // clang-format on template struct ConjugateGradientSolver { - typedef typename D::element_type value_type; + typedef typename D::numeric_type value_type; /// \param a object of type F /// \param b RHS diff --git a/src/TiledArray/math/solvers/diis.h b/src/TiledArray/math/solvers/diis.h index 252d40480b..1407ff327e 100644 --- a/src/TiledArray/math/solvers/diis.h +++ b/src/TiledArray/math/solvers/diis.h @@ -82,7 +82,7 @@ namespace TiledArray::math { template class DIIS { public: - typedef typename 
D::element_type value_type; + typedef typename D::numeric_type value_type; typedef typename TiledArray::detail::scalar_t scalar_type; typedef Eigen::Matrix diff --git a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index 33698521a2..cfa330101d 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -274,6 +274,32 @@ inline auto abs(const ComplexConjugate& a) { inline int abs(const ComplexConjugate& a) { return 1; } +template >> +TILEDARRAY_FORCE_INLINE auto operator*(const L l, const std::complex r) { + return static_cast(l) * r; +} + +template >> +TILEDARRAY_FORCE_INLINE auto operator*(const std::complex l, const R r) { + return l * static_cast(r); +} + +template +TILEDARRAY_FORCE_INLINE + std::enable_if_t, std::complex> + operator*(const L l, const std::complex r) { + return std::complex(l, 0.) * r; +} + +template +TILEDARRAY_FORCE_INLINE + std::enable_if_t, std::complex> + operator*(const std::complex l, const R r) { + return l * std::complex(r, 0.); +} + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index f25bba68f7..0eab016b92 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1412,8 +1412,10 @@ class Tensor { template >::type* = nullptr> Tensor scale(const Scalar factor) const { - return unary( - [factor](const numeric_type a) -> numeric_type { return a * factor; }); + return unary([factor](const numeric_type a) -> numeric_type { + using namespace TiledArray::detail; + return a * factor; + }); } /// Construct a scaled and permuted copy of this tensor @@ -1429,7 +1431,10 @@ class Tensor { detail::is_permutation_v>> Tensor scale(const Scalar factor, const Perm& perm) const { return unary( - [factor](const numeric_type a) -> numeric_type { return a * factor; }, + [factor](const numeric_type a) -> numeric_type { + using namespace TiledArray::detail; + return a * factor; + }, perm); } From 
fbfc31e1f6d569cb98ac27afe7b3f414c6c327a5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 22 Mar 2023 15:25:20 -0400 Subject: [PATCH 008/592] TensorInterface::operator=(other) compiles for other.data() of different type from this->data() --- src/TiledArray/tensor/tensor_interface.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index 76413a51a3..bc5e9abab2 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -191,7 +191,9 @@ class TensorInterface { template ::value>::type* = nullptr> TensorInterface_& operator=(const T1& other) { - TA_ASSERT(data_ != other.data()); + if constexpr (std::is_same_v>) { + TA_ASSERT(data_ != other.data()); + } detail::inplace_tensor_op([](numeric_type& MADNESS_RESTRICT result, const numeric_t arg) { result = arg; }, From b144fcd96cf39740e9256563a0cd00cea673501c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 24 Mar 2023 12:20:54 -0400 Subject: [PATCH 009/592] bump BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/152 + introduce {DistArray,Tensor,Tile}::{rebind_t,rebind_numeric_t} --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/dist_array.h | 21 +++++++++++++++++++++ src/TiledArray/tensor/tensor.h | 26 ++++++++++++++++++++++++++ src/TiledArray/tile.h | 25 +++++++++++++++++++++++++ src/TiledArray/type_traits.h | 25 +++++++++++++++++++++++++ tests/dist_array.cpp | 26 ++++++++++++++++++++++++-- tests/tensor.cpp | 7 +++++++ tests/tensor_of_tensor.cpp | 11 +++++++++++ 9 files changed, 142 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c2901a7e11..6060c4bd29 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 91fff76deba20c751d0646c54f2f1c1e07bd6156 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 40321f91ac..e9cfb45375 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 474ddc095cbea12a1d28aca5435703dd9f69b166) +set(TA_TRACKED_BTAS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3) set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index ea6a066441..384e272fac 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -136,6 +136,27 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::is_same_v, Future> || std::is_same_v, value_type>; + /// compute type of DistArray with different 
Policy and/or Tile + template + using rebind_t = DistArray; + + private: + template + struct rebind_numeric; + template + struct rebind_numeric< + Numeric, std::enable_if_t>> { + using type = + DistArray, Policy>; + }; + + public: + /// compute type of DistArray with Tile's rebound numeric type + /// @note this is SFINAE-disabled if `Tile::rebind_numeric_t` is not + /// defined + template + using rebind_numeric_t = typename rebind_numeric::type; + private: pimpl_type pimpl_; ///< managed ptr to Array implementation bool defer_deleter_to_next_fence_ = diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 0eab016b92..a7bf9ce68f 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -108,6 +108,32 @@ class Tensor { detail::is_tensor_of_tensor::value; }; + public: + /// compute type of Tensor with different element type + template ::template rebind_alloc> + using rebind_t = Tensor; + + template + struct rebind_numeric; + template + struct rebind_numeric::value>> { + using VU = typename V::template rebind_numeric::type; + using type = Tensor::template rebind_alloc>; + }; + template + struct rebind_numeric::value>> { + using type = Tensor< + U, typename std::allocator_traits::template rebind_alloc>; + }; + + /// compute type of Tensor with different numeric type + template + using rebind_numeric_t = typename rebind_numeric::type; + + private: using default_construct = bool; Tensor(const range_type& range, size_t batch_size, bool default_construct) diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 57366dbe60..86e8abea99 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -95,6 +95,31 @@ class Tile { using scalar_type = typename TiledArray::detail::scalar_type< tensor_type>::type; ///< the scalar type that supports T + private: + template + struct rebind; + template + struct rebind>> { + using type = Tile>; + }; + + template + struct rebind_numeric; + template + struct rebind_numeric< + 
Numeric, std::enable_if_t>> { + using type = Tile>; + }; + + public: + /// compute type of Tile with different element type + template + using rebind_t = typename rebind::type; + + /// compute type of Tile with different numeric type + template + using rebind_numeric_t = typename rebind_numeric::type; + private: std::shared_ptr pimpl_; diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index ece535d929..bfb022188e 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -760,6 +760,31 @@ struct scalar_type>::type> template using scalar_t = typename TiledArray::detail::scalar_type::type; +/// is true type if `T::rebind_t` is defined +template +struct has_rebind : std::false_type {}; +template +struct has_rebind>> + : std::true_type {}; + +/// alias to has_rebind::value +template +inline constexpr bool has_rebind_v = has_rebind::value; + +/// is true type if `T::rebind_numeric_t` is defined +template +struct has_rebind_numeric : std::false_type {}; +template +struct has_rebind_numeric< + T, Numeric, std::void_t>> + : std::true_type {}; + +/// alias to has_rebind_numeric::value +template +inline constexpr bool has_rebind_numeric_v = + has_rebind_numeric::value; + template struct is_strictly_ordered_helper { using Yes = char; diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 4f2e1dbe9b..b4028ce524 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -513,7 +513,7 @@ BOOST_AUTO_TEST_CASE(make_replicated) { BOOST_REQUIRE_NO_THROW(a.make_replicated()); // check for cda7b8a33b85f9ebe92bc369d6a362c94f1eae40 bug - for (const auto &tile : a) { + for (const auto& tile : a) { BOOST_CHECK(tile.get().size() != 0); } @@ -532,7 +532,6 @@ BOOST_AUTO_TEST_CASE(make_replicated) { it != tile.get().end(); ++it) BOOST_CHECK_EQUAL(*it, distributed_pmap->owner(i) + 1); } - } BOOST_AUTO_TEST_CASE(serialization_by_tile) { @@ -710,4 +709,27 @@ BOOST_AUTO_TEST_CASE(issue_225) { std::remove(archive_file_name); } 
+BOOST_AUTO_TEST_CASE(rebind) { + static_assert( + std::is_same_v, TArrayD>); + static_assert( + std::is_same_v, + TArrayD>); + static_assert( + std::is_same_v, TSpArrayD>); + static_assert( + std::is_same_v, + TSpArrayD>); + + // DistArray of Tensors + using SpArrayTD = DistArray, SparsePolicy>; + using SpArrayTZ = DistArray, SparsePolicy>; + static_assert(std::is_same_v, + TSpArrayZ>); + static_assert( + std::is_same_v< + typename SpArrayTD::template rebind_numeric_t>, + SpArrayTZ>); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor.cpp b/tests/tensor.cpp index b329b5af44..bc68c9d7bd 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -724,4 +724,11 @@ BOOST_AUTO_TEST_CASE(block) { #endif } +BOOST_AUTO_TEST_CASE(rebind) { + static_assert( + std::is_same_v>, TensorZ>); + static_assert( + std::is_same_v>, TensorZ>); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index 0f4683d174..c623784654 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -1234,4 +1234,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(serialization, ITensor, itensor_types) { cend(a_roundtrip)); } +BOOST_AUTO_TEST_CASE_TEMPLATE(rebind, ITensor, itensor_types) { + using ITensorD = typename ITensor::template rebind_t; + using ITensorZ = typename ITensor::template rebind_t>; + static_assert( + std::is_same_v::template rebind_t, + TensorD>); + static_assert(std::is_same_v< + typename Tensor::template rebind_numeric_t, + Tensor>); +} + BOOST_AUTO_TEST_SUITE_END() From 4029dabedc3d37b869017e5433e49fdaaea31058 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 24 Mar 2023 14:42:19 -0400 Subject: [PATCH 010/592] introduced TA::detail::{real,complex}_t --- src/TiledArray/dist_array.h | 16 +++++++++++++++ src/TiledArray/external/btas.h | 14 +++++++++++++ src/TiledArray/tensor/tensor.h | 16 +++++++++++++++ src/TiledArray/tile.h | 16 +++++++++++++++ src/TiledArray/type_traits.h | 36 ++++++++++++++++++++++++++++++++++ 
tests/dist_array.cpp | 11 +++++++++++ tests/tensor.cpp | 3 +++ tests/tensor_of_tensor.cpp | 4 ++++ 8 files changed, 116 insertions(+) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 384e272fac..09e99eda86 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1859,6 +1859,22 @@ DistArray replicated(const DistArray& a) { return result; } +namespace detail { + +template +struct real_t_impl> { + using type = typename DistArray::template rebind_numeric_t< + typename Tile::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename DistArray::template rebind_numeric_t< + std::complex>; +}; + +} // namespace detail + } // namespace TiledArray // serialization diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index 483be905df..7dbd115d4d 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -841,6 +841,20 @@ struct ordinal_traits> { : OrdinalType::ColMajor; }; +template +struct real_t_impl> { + using type = + typename btas::Tensor::template rebind_numeric_t< + typename btas::Tensor::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = + typename btas::Tensor::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index a7bf9ce68f..fcb5ffbe7a 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2703,6 +2703,22 @@ struct transform> { }; } // namespace detail +namespace detail { + +template +struct real_t_impl> { + using type = typename Tensor::template rebind_numeric_t< + typename Tensor::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename Tensor::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + +} // namespace detail + #ifndef TILEDARRAY_HEADER_ONLY extern template class Tensor; diff --git a/src/TiledArray/tile.h 
b/src/TiledArray/tile.h index 86e8abea99..b8242fbf19 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -1673,6 +1673,22 @@ bool operator!=(const Tile& t1, const Tile& t2) { return !(t1 == t2); } +namespace detail { + +template +struct real_t_impl> { + using type = typename Tile::template rebind_numeric_t< + typename Tile::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename Tile::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_TILE_H__INCLUDED diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index bfb022188e..47c90f0130 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -632,6 +632,42 @@ struct is_complex> : public std::true_type {}; template constexpr const bool is_complex_v = is_complex::value; +template +struct complex_t_impl; + +template +struct complex_t_impl> { + using type = std::complex; +}; + +template +struct complex_t_impl>> { + using type = std::complex; +}; + +/// evaluates to std::complex if T is real, else T +/// @note specialize complex_t_impl to customize the behavior for type T +template +using complex_t = typename complex_t_impl::type; + +template +struct real_t_impl; + +template +struct real_t_impl> { + using type = T; +}; + +template +struct real_t_impl>> { + using type = T; +}; + +/// evaluates to U if T is std::complex, or if T is real then evaluates to T +/// @note specialize real_t_impl to customize the behavior for type T +template +using real_t = typename real_t_impl::type; + template struct is_numeric : public std::is_arithmetic {}; diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index b4028ce524..061c5fdd17 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -720,6 +720,13 @@ BOOST_AUTO_TEST_CASE(rebind) { static_assert( std::is_same_v, TSpArrayD>); + static_assert(std::is_same_v, TArrayD>); + static_assert( + std::is_same_v, 
TArrayZ>); + static_assert( + std::is_same_v, TSpArrayD>); + static_assert( + std::is_same_v, TSpArrayZ>); // DistArray of Tensors using SpArrayTD = DistArray, SparsePolicy>; @@ -730,6 +737,10 @@ BOOST_AUTO_TEST_CASE(rebind) { std::is_same_v< typename SpArrayTD::template rebind_numeric_t>, SpArrayTZ>); + static_assert( + std::is_same_v, SpArrayTD>); + static_assert( + std::is_same_v, SpArrayTZ>); } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor.cpp b/tests/tensor.cpp index bc68c9d7bd..1281e5d164 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -729,6 +729,9 @@ BOOST_AUTO_TEST_CASE(rebind) { std::is_same_v>, TensorZ>); static_assert( std::is_same_v>, TensorZ>); + static_assert( + std::is_same_v, TensorZ>); + static_assert(std::is_same_v, TensorD>); } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index c623784654..21d136b67c 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -1243,6 +1243,10 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(rebind, ITensor, itensor_types) { static_assert(std::is_same_v< typename Tensor::template rebind_numeric_t, Tensor>); + static_assert(std::is_same_v>, + Tensor>); + static_assert(std::is_same_v>, + Tensor>); } BOOST_AUTO_TEST_SUITE_END() From 17414fa61de920cb9cf2278f3784b057cecf9959 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 24 Mar 2023 16:43:36 -0400 Subject: [PATCH 011/592] non-distributed heig returns real eigenvalues even for complex matrices --- .../math/linalg/non-distributed/heig.h | 8 ++++---- src/TiledArray/math/linalg/rank-local.h | 4 ++-- src/TiledArray/math/linalg/scalapack/heig.h | 17 ++++++++--------- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/math/linalg/non-distributed/heig.h b/src/TiledArray/math/linalg/non-distributed/heig.h index 8a7c244bbc..5490b6b757 100644 --- a/src/TiledArray/math/linalg/non-distributed/heig.h +++ b/src/TiledArray/math/linalg/non-distributed/heig.h @@ -52,10 +52,10 @@ 
namespace TiledArray::math::linalg::non_distributed { */ template auto heig(const Array& A, TiledRange evec_trange = TiledRange()) { - using numeric_type = typename detail::array_traits::numeric_type; + using scalar_type = typename detail::array_traits::scalar_type; World& world = A.world(); auto A_eig = detail::make_matrix(A); - std::vector evals; + std::vector evals; if (world.rank() == 0) { linalg::rank_local::heig(A_eig, evals); } @@ -93,12 +93,12 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange()) { template auto heig(const ArrayA& A, const ArrayB& B, TiledRange evec_trange = TiledRange()) { - using numeric_type = typename detail::array_traits::numeric_type; + using scalar_type = typename detail::array_traits::scalar_type; (void)detail::array_traits{}; World& world = A.world(); auto A_eig = detail::make_matrix(A); auto B_eig = detail::make_matrix(B); - std::vector evals; + std::vector evals; if (world.rank() == 0) { linalg::rank_local::heig(A_eig, B_eig, evals); } diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index 77774c195a..f1621164db 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -42,10 +42,10 @@ template void cholesky_lsolve(Op transpose, Matrix &A, Matrix &X); template -void heig(Matrix &A, std::vector &W); +void heig(Matrix &A, std::vector> &W); template -void heig(Matrix &A, Matrix &B, std::vector &W); +void heig(Matrix &A, Matrix &B, std::vector> &W); template void svd(Job jobu, Job jobvt, Matrix &A, std::vector &S, Matrix *U, diff --git a/src/TiledArray/math/linalg/scalapack/heig.h b/src/TiledArray/math/linalg/scalapack/heig.h index bc9edeaa91..d7e84ae706 100644 --- a/src/TiledArray/math/linalg/scalapack/heig.h +++ b/src/TiledArray/math/linalg/scalapack/heig.h @@ -58,7 +58,7 @@ namespace TiledArray::math::linalg::scalapack { template auto heig(const Array& A, TiledRange evec_trange = TiledRange(), size_t NB = default_block_size()) { 
- using value_type = typename Array::element_type; + using value_type = typename Array::numeric_type; using real_type = scalapackpp::detail::real_t; auto& world = A.world(); @@ -80,9 +80,8 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange(), scalapack::BlockCyclicMatrix evecs(world, grid, N, N, NB, NB); auto info = scalapackpp::hereig( - scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, - matrix.local_mat().data(), 1, 1, desc, evals.data(), - evecs.local_mat().data(), 1, 1, desc); + scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, matrix.local_mat().data(), + 1, 1, desc, evals.data(), evecs.local_mat().data(), 1, 1, desc); if (info) TA_EXCEPTION("EVP Failed"); if (evec_trange.rank() == 0) evec_trange = A.trange(); @@ -122,8 +121,8 @@ template auto heig(const ArrayA& A, const ArrayB& B, TiledRange evec_trange = TiledRange(), size_t NB = default_block_size()) { - using value_type = typename ArrayA::element_type; - static_assert(std::is_same_v); + using value_type = typename ArrayA::numeric_type; + static_assert(std::is_same_v); using real_type = scalapackpp::detail::real_t; auto& world = A.world(); @@ -150,9 +149,9 @@ auto heig(const ArrayA& A, const ArrayB& B, scalapack::BlockCyclicMatrix evecs(world, grid, N, N, NB, NB); auto info = scalapackpp::hereig_gen( - scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, - A_sca.local_mat().data(), 1, 1, desc, B_sca.local_mat().data(), 1, 1, - desc, evals.data(), evecs.local_mat().data(), 1, 1, desc); + scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, A_sca.local_mat().data(), + 1, 1, desc, B_sca.local_mat().data(), 1, 1, desc, evals.data(), + evecs.local_mat().data(), 1, 1, desc); if (info) TA_EXCEPTION("EVP Failed"); if (evec_trange.rank() == 0) evec_trange = A.trange(); From c1c27db8c7df04d1679e168c20b7622fc085ce43 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 24 Mar 2023 17:23:33 -0400 Subject: [PATCH 012/592] non-distributed svd returns real singular values even for complex matrices --- 
.../math/linalg/non-distributed/svd.h | 20 ++--- src/TiledArray/math/linalg/rank-local.cpp | 78 ++++++++++--------- src/TiledArray/math/linalg/rank-local.h | 9 ++- 3 files changed, 57 insertions(+), 50 deletions(-) diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h index 9c146784ef..e6ea5ef1da 100644 --- a/src/TiledArray/math/linalg/non-distributed/svd.h +++ b/src/TiledArray/math/linalg/non-distributed/svd.h @@ -27,9 +27,9 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { @@ -52,13 +52,14 @@ namespace TiledArray::math::linalg::non_distributed { * @param[in] vt_trange TiledRange for resulting right singular vectors * (transposed). * - * @returns A tuple containing the eigenvalues and eigenvectors of input array - * as std::vector and in TA format, respectively. + * @returns A tuple containing the singular values and singular vectors of + * input array as std::vector and in TA format, respectively. 
*/ -template -auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trange = TiledRange()) { - +template +auto svd(const Array& A, TiledRange u_trange = TiledRange(), + TiledRange vt_trange = TiledRange()) { using T = typename Array::numeric_type; + using TS = typename Array::scalar_type; using Matrix = linalg::rank_local::Matrix; World& world = A.world(); @@ -68,7 +69,7 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trang constexpr bool need_u = (Vectors == SVD::LeftVectors) or svd_all_vectors; constexpr bool need_vt = (Vectors == SVD::RightVectors) or svd_all_vectors; - std::vector S; + std::vector S; std::unique_ptr U, VT; if constexpr (need_u) U = std::make_unique(); @@ -82,7 +83,7 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trang if (U) world.gop.broadcast_serializable(*U, 0); if (VT) world.gop.broadcast_serializable(*VT, 0); - auto make_array = [&world](auto && ... args) { + auto make_array = [&world](auto&&... 
args) { return eigen_to_array(world, args...); }; @@ -97,7 +98,6 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trang } if constexpr (!need_u && !need_vt) return S; - } } // namespace TiledArray::math::linalg::non_distributed diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index a1e2e5538b..2152460c97 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -113,19 +113,20 @@ void cholesky_lsolve(Op transpose, Matrix& A, Matrix& X) { } template -void heig(Matrix& A, std::vector& W) { +void heig(Matrix& A, std::vector>& W) { auto jobz = lapack::Job::Vec; auto uplo = lapack::Uplo::Lower; integer n = A.rows(); T* a = A.data(); integer lda = A.rows(); W.resize(n); - T* w = W.data(); + auto* w = W.data(); TA_LAPACK(syev, jobz, uplo, n, a, lda, w); } template -void heig(Matrix& A, Matrix& B, std::vector& W) { +void heig(Matrix& A, Matrix& B, + std::vector>& W) { integer itype = 1; auto jobz = lapack::Job::Vec; auto uplo = lapack::Uplo::Lower; @@ -135,12 +136,14 @@ void heig(Matrix& A, Matrix& B, std::vector& W) { T* b = B.data(); integer ldb = B.rows(); W.resize(n); - T* w = W.data(); + auto* w = W.data(); TA_LAPACK(sygv, itype, jobz, uplo, n, a, lda, b, ldb, w); } template -void svd(Job jobu, Job jobvt, Matrix& A, std::vector& S, Matrix* U, Matrix* VT) { +void svd(Job jobu, Job jobvt, Matrix& A, + std::vector>& S, Matrix* U, + Matrix* VT) { integer m = A.rows(); integer n = A.cols(); integer k = std::min(m, n); @@ -148,40 +151,42 @@ void svd(Job jobu, Job jobvt, Matrix& A, std::vector& S, Matrix* U, Mat integer lda = A.rows(); S.resize(k); - T* s = S.data(); + auto* s = S.data(); - T* u = nullptr; + T* u = nullptr; T* vt = nullptr; integer ldu = 1, ldvt = 1; - if( (jobu == Job::SomeVec or jobu == Job::AllVec) and (not U) ) - TA_LAPACK_ERROR("Requested out-of-place right singular vectors with null U input"); - if( (jobvt == Job::SomeVec or 
jobvt == Job::AllVec) and (not VT) ) - TA_LAPACK_ERROR("Requested out-of-place left singular vectors with null VT input"); + if ((jobu == Job::SomeVec or jobu == Job::AllVec) and (not U)) + TA_LAPACK_ERROR( + "Requested out-of-place right singular vectors with null U input"); + if ((jobvt == Job::SomeVec or jobvt == Job::AllVec) and (not VT)) + TA_LAPACK_ERROR( + "Requested out-of-place left singular vectors with null VT input"); - if( jobu == Job::SomeVec ) { + if (jobu == Job::SomeVec) { U->resize(m, k); u = U->data(); ldu = m; } - if( jobu == Job::AllVec ) { + if (jobu == Job::AllVec) { U->resize(m, m); u = U->data(); ldu = m; } - if( jobvt == Job::SomeVec ) { + if (jobvt == Job::SomeVec) { VT->resize(k, n); vt = VT->data(); ldvt = k; } - if( jobvt == Job::AllVec ) { + if (jobvt == Job::AllVec) { VT->resize(n, n); vt = VT->data(); ldvt = n; } - + TA_LAPACK(gesvd, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt); } @@ -208,45 +213,44 @@ void lu_inv(Matrix& A) { } template -void householder_qr( Matrix &V, Matrix &R ) { +void householder_qr(Matrix& V, Matrix& R) { integer m = V.rows(); integer n = V.cols(); - integer k = std::min(m,n); - integer ldv = V.rows(); // Col Major + integer k = std::min(m, n); + integer ldv = V.rows(); // Col Major T* v = V.data(); std::vector tau(k); - lapack::geqrf( m, n, v, ldv, tau.data() ); + lapack::geqrf(m, n, v, ldv, tau.data()); // Extract R - if constexpr ( not QOnly ) { + if constexpr (not QOnly) { // Resize R just in case - R.resize(k,n); + R.resize(k, n); R.fill(0.); // Extract Upper triangle into R integer ldr = R.rows(); T* r = R.data(); - lapack::lacpy( lapack::MatrixType::Upper, k, n, v, ldv, r, ldr ); + lapack::lacpy(lapack::MatrixType::Upper, k, n, v, ldv, r, ldr); } // Explicitly form Q // TODO: This is wrong for complex, but it doesn't look like R/C is caught // anywhere else either... 
- lapack::orgqr( m, n, k, v, ldv, tau.data() ); - + lapack::orgqr(m, n, k, v, ldv, tau.data()); } -#define TA_LAPACK_EXPLICIT(MATRIX, VECTOR) \ - template void cholesky(MATRIX&); \ - template void cholesky_linv(MATRIX&); \ - template void cholesky_solve(MATRIX&, MATRIX&); \ - template void cholesky_lsolve(Op, MATRIX&, MATRIX&); \ - template void heig(MATRIX&, VECTOR&); \ - template void heig(MATRIX&, MATRIX&, VECTOR&); \ - template void svd(Job,Job,MATRIX&, VECTOR&, MATRIX*, MATRIX*); \ - template void lu_solve(MATRIX&, MATRIX&); \ - template void lu_inv(MATRIX&); \ - template void householder_qr(MATRIX&,MATRIX&); \ - template void householder_qr(MATRIX&,MATRIX&); +#define TA_LAPACK_EXPLICIT(MATRIX, VECTOR) \ + template void cholesky(MATRIX&); \ + template void cholesky_linv(MATRIX&); \ + template void cholesky_solve(MATRIX&, MATRIX&); \ + template void cholesky_lsolve(Op, MATRIX&, MATRIX&); \ + template void heig(MATRIX&, VECTOR&); \ + template void heig(MATRIX&, MATRIX&, VECTOR&); \ + template void svd(Job, Job, MATRIX&, VECTOR&, MATRIX*, MATRIX*); \ + template void lu_solve(MATRIX&, MATRIX&); \ + template void lu_inv(MATRIX&); \ + template void householder_qr(MATRIX&, MATRIX&); \ + template void householder_qr(MATRIX&, MATRIX&); TA_LAPACK_EXPLICIT(Matrix, std::vector); TA_LAPACK_EXPLICIT(Matrix, std::vector); diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index f1621164db..5c46550bd3 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -45,14 +45,17 @@ template void heig(Matrix &A, std::vector> &W); template -void heig(Matrix &A, Matrix &B, std::vector> &W); +void heig(Matrix &A, Matrix &B, + std::vector> &W); template -void svd(Job jobu, Job jobvt, Matrix &A, std::vector &S, Matrix *U, +void svd(Job jobu, Job jobvt, Matrix &A, + std::vector> &S, Matrix *U, Matrix *VT); template -void svd(Matrix &A, std::vector &S, Matrix *U, Matrix *VT) { +void svd(Matrix &A, 
std::vector> &S, + Matrix *U, Matrix *VT) { svd(U ? Job::SomeVec : Job::NoVec, VT ? Job::SomeVec : Job::NoVec, A, S, U, VT); } From f31a3194d20b09eb7aef3709e0b88ebe243c9f74 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 24 Mar 2023 21:22:21 -0400 Subject: [PATCH 013/592] instantiate non-distributed complex lapack bindings ... this subsumes https://github.com/ValeevGroup/tiledarray/pull/317/commits/4837f3d5fe986b6b81e0c103762dd6863b3d4690 --- src/TiledArray/math/linalg/rank-local.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index 2152460c97..74e1aac526 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -121,7 +121,10 @@ void heig(Matrix& A, std::vector>& W) { integer lda = A.rows(); W.resize(n); auto* w = W.data(); - TA_LAPACK(syev, jobz, uplo, n, a, lda, w); + if constexpr (TiledArray::detail::is_complex_v) + TA_LAPACK(heev, jobz, uplo, n, a, lda, w); + else + TA_LAPACK(syev, jobz, uplo, n, a, lda, w); } template @@ -137,7 +140,10 @@ void heig(Matrix& A, Matrix& B, integer ldb = B.rows(); W.resize(n); auto* w = W.data(); - TA_LAPACK(sygv, itype, jobz, uplo, n, a, lda, b, ldb, w); + if constexpr (TiledArray::detail::is_complex_v) + TA_LAPACK(hegv, itype, jobz, uplo, n, a, lda, b, ldb, w); + else + TA_LAPACK(sygv, itype, jobz, uplo, n, a, lda, b, ldb, w); } template @@ -236,7 +242,10 @@ void householder_qr(Matrix& V, Matrix& R) { // Explicitly form Q // TODO: This is wrong for complex, but it doesn't look like R/C is caught // anywhere else either... 
- lapack::orgqr(m, n, k, v, ldv, tau.data()); + if constexpr (TiledArray::detail::is_complex_v) + lapack::ungqr(m, n, k, v, ldv, tau.data()); + else + lapack::orgqr(m, n, k, v, ldv, tau.data()); } #define TA_LAPACK_EXPLICIT(MATRIX, VECTOR) \ @@ -254,5 +263,7 @@ void householder_qr(Matrix& V, Matrix& R) { TA_LAPACK_EXPLICIT(Matrix, std::vector); TA_LAPACK_EXPLICIT(Matrix, std::vector); +TA_LAPACK_EXPLICIT(Matrix>, std::vector); +TA_LAPACK_EXPLICIT(Matrix>, std::vector); } // namespace TiledArray::math::linalg::rank_local From ca27eb0782861fe4e21678461e294594df269179 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Mar 2023 19:45:14 -0400 Subject: [PATCH 014/592] extend device API for complex types, as illustrated by ta_dense_cuda --- examples/cuda/ta_dense_cuda.cpp | 110 ++++++++++++------ examples/dgemm/ta_dense_asymm.cpp | 34 +++--- src/TiledArray/cuda/btas_um_tensor.cpp | 16 +++ src/TiledArray/cuda/btas_um_tensor.h | 16 +++ src/TiledArray/cuda/cpu_cuda_vector.cu | 21 +++- src/TiledArray/cuda/cpu_cuda_vector.h | 2 + src/TiledArray/cuda/cublas.h | 61 ++++++++++ src/TiledArray/cuda/kernel/mult_kernel.cu | 19 +++ src/TiledArray/cuda/kernel/mult_kernel.h | 20 ++++ src/TiledArray/cuda/kernel/reduce_kernel.cu | 41 ++++++- src/TiledArray/cuda/kernel/reduce_kernel.h | 34 ++++++ .../cuda/kernel/reduce_kernel_impl.h | 31 +++-- 12 files changed, 337 insertions(+), 68 deletions(-) diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index 14f692329b..4a035f176b 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -137,23 +137,31 @@ template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, const long Nn, const long Bn, const long Nk, const long Bk, const long nrepeat) { - using Real = typename Storage::value_type; + using T = TiledArray::detail::numeric_t; + using RT = TiledArray::detail::scalar_t; + constexpr auto complex_T = TiledArray::detail::is_complex_v; const std::size_t Tm = 
Nm / Bm; const std::size_t Tn = Nn / Bn; const std::size_t Tk = Nk / Bk; + const std::int64_t nflops = + (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real + // 1 add takes 2/1 flops for complex/real + * static_cast(Nn) * static_cast(Nm) * + static_cast(Nk); + if (world.rank() == 0) std::cout << "TiledArray: dense matrix multiply test...\n" << "Number of nodes = " << world.size() << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" + << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of A block = " << Bm << "x" << Bk << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" + << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of B block = " << Bk << "x" << Bn << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" + << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" << "\nSize of C block = " << Bm << "x" << Bn << "\n# of blocks of C = " << Tm * Tn << "\nAverage # of blocks of C/node = " @@ -205,14 +213,13 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, TiledArray::TiledRange // TRange for b trange_b(blocking_B.begin(), blocking_B.end()); - using value_type = typename Storage::value_type; - using CUDATile = btas::Tensor; + using CUDATile = btas::Tensor; using CUDAMatrix = TA::DistArray>; - using TAMatrix = TA::DistArray>; + using TAMatrix = TA::DistArray>; CUDAMatrix c(world, trange_c); - value_type val_a = 0.03; - value_type val_b = 0.02; + auto val_a = 0.03; + auto val_b = 0.02; { // Construct and initialize arrays @@ -235,19 +242,26 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, // start profiler cudaProfilerStart(); - // Start clock - const double wall_time_start = madness::wall_time(); + double total_time = 0.0; + double total_gflop_rate = 0.0; // Do matrix multiplication for (int i = 0; i < nrepeat; ++i) { double iter_time_start = 
madness::wall_time(); // c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n"); c("m,n") = a("m,k") * b("k,n"); + c.world().gop.fence(); // fence since GEMM can return early double iter_time_stop = madness::wall_time(); + const double iter_time = iter_time_stop - iter_time_start; + total_time += iter_time; + const double gflop_rate = double(nflops) / (iter_time * 1.e9); + total_gflop_rate += gflop_rate; if (world.rank() == 0) - std::cout << "Iteration " << i + 1 - << " wall time: " << (iter_time_stop - iter_time_start) + std::cout << "Iteration " << i + 1 << " wall time: " << iter_time << "\n"; + if (world.rank() == 0) + std::cout << "Iteration " << i + 1 << " time=" << time + << " GFLOPS=" << gflop_rate << "\n"; } // Stop clock const double wall_time_stop = madness::wall_time(); @@ -256,32 +270,43 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, cudaProfilerStop(); if (world.rank() == 0) - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(nrepeat) + std::cout << "Average wall time = " << total_time / double(nrepeat) << " sec\nAverage GFLOPS = " - << double(nrepeat) * 2.0 * double(Nn * Nm * Nm) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + << total_gflop_rate / double(nrepeat) << "\n"; } - double threshold = - std::numeric_limits::epsilon(); + double threshold = std::numeric_limits::epsilon(); auto dot_length = Nk; // auto result = dot_length * val_a * val_b + val_a - val_b; - auto result = dot_length * val_a * val_b; + T result; + if constexpr (complex_T) { + result = T(dot_length * val_a * val_b, 0.); + } else + result = dot_length * val_a * val_b; auto verify = [&world, &threshold, &result, &dot_length](TA::Tile &tile) { auto n_elements = tile.size(); for (std::size_t i = 0; i < n_elements; i++) { - double abs_err = fabs(tile[i] - result); + double abs_err = std::abs(tile[i] - result); // double abs_val = fabs(tile[i]); - double rel_err = abs_err / result / dot_length; + double rel_err = 
abs_err / std::abs(result) / dot_length; if (rel_err > threshold) { + auto to_string = [](const auto &v) { + constexpr bool complex_T = + TiledArray::detail::is_complex_v>; + if constexpr (complex_T) { + std::string result; + result = "{" + std::to_string(v.real()) + "," + + std::to_string(v.imag()) + "}"; + return result; + } else + return std::to_string(v); + }; std::cout << "Node: " << world.rank() << " Tile: " << tile.range() << " id: " << i - << std::string(" gpu: " + std::to_string(tile[i]) + - " cpu: " + std::to_string(result) + "\n"); + << std::string(" gpu: " + to_string(tile[i]) + + " cpu: " + to_string(result) + "\n"); break; } } @@ -308,7 +333,7 @@ int try_main(int argc, char **argv) { "blocked by Bm, Bn, and Bk, respectively" << std::endl << "Usage: " << argv[0] - << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] " + << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] " "[storage type = cuda_um_btas_varray]\n"; return 0; } @@ -337,13 +362,13 @@ int try_main(int argc, char **argv) { return 1; } - const auto real_type_str = - (argc >= 9) ? std::string(argv[8]) : std::string("double"); - - if (real_type_str != "float" && real_type_str != "double") { - std::cerr << "Error: invalid real type: " << real_type_str - << "\n Valid option includes: float or " - "double. \n"; + const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double"); + if (scalar_type_str != "double" && scalar_type_str != "float" && + scalar_type_str != "zdouble" && scalar_type_str != "zfloat") { + std::cerr << "Error: invalid real type " << scalar_type_str << ".\n"; + std::cerr << " valid real types are \"double\", \"float\", " + "\"zdouble\", and \"zfloat\".\n"; + return 1; } const auto storage_type = @@ -357,7 +382,7 @@ int try_main(int argc, char **argv) { "cuda_um_btas_varray or cuda_um_thrust_vector " "or cpu_cuda_vector. 
\n"; } - std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">" + std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" << std::endl; // auto to_bool = [](const std::string &str) { // return (str == "true" || str == "True" || str == "TRUE" || str == "1" || @@ -424,7 +449,7 @@ int try_main(int argc, char **argv) { } // print device properties // if (storage_type == "cpu_cuda_vector") { - // if (real_type_str == "double") + // if (scalar_type_str == "double") // do_main_body>(world, Nm, Bm, Nn, // Bn, // Nk, Bk, nrepeat); @@ -434,15 +459,24 @@ int try_main(int argc, char **argv) { // Nk, Bk, nrepeat); // } else if (storage_type == "cuda_um_btas_varray") { if (storage_type == "cuda_um_btas_varray") { - if (real_type_str == "double") + if (scalar_type_str == "double") do_main_body>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - else + else if (scalar_type_str == "float") do_main_body>(world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else if (scalar_type_str == "zdouble") + do_main_body>>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else if (scalar_type_str == "zfloat") + do_main_body>>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); + else { + abort(); // unreachable + } } // else if (storage_type == "cuda_um_thrust_vector") { - // if (real_type_str == "double") + // if (scalar_type_str == "double") // do_main_body>( // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); // else diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp index ac72a39209..40183603bb 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/dgemm/ta_dense_asymm.cpp @@ -75,23 +75,6 @@ int main(int argc, char** argv) { const std::size_t Tn = Nn / Bn; const std::size_t Tk = Nk / Bk; - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test...\n" - << "Number of nodes = " << world.size() - << "\nScalar type = " << scalar_type_str - << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 
1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk - << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn - << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn - << "\n# of blocks of C = " << Tm * Tn - << "\nAverage # of blocks of C/node = " - << double(Tm * Tn) / double(world.size()) << "\n"; - // Construct TiledRange std::vector blocking_m; blocking_m.reserve(Tm + 1); @@ -148,6 +131,23 @@ int main(int argc, char** argv) { * static_cast(Nn) * static_cast(Nm) * static_cast(Nk); + if (world.rank() == 0) + std::cout << "TiledArray: dense matrix multiply test...\n" + << "Number of nodes = " << world.size() + << "\nScalar type = " << scalar_type_str + << "\nSize of A = " << Nm << "x" << Nk << " (" + << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of A block = " << Bm << "x" << Bk + << "\nSize of B = " << Nk << "x" << Nn << " (" + << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of B block = " << Bk << "x" << Bn + << "\nSize of C = " << Nm << "x" << Nn << " (" + << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of C block = " << Bm << "x" << Bn + << "\n# of blocks of C = " << Tm * Tn + << "\nAverage # of blocks of C/node = " + << double(Tm * Tn) / double(world.size()) << "\n"; + auto memtrace = [do_memtrace, &world](const std::string& str) -> void { if (do_memtrace) { world.gop.fence(); diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp index 58c3981f18..9423e7563d 100644 --- a/src/TiledArray/cuda/btas_um_tensor.cpp +++ b/src/TiledArray/cuda/btas_um_tensor.cpp @@ -11,6 +11,10 @@ template class btas::varray>; template class btas::varray>; +template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; +template class btas::varray, + TiledArray::cuda_um_allocator>>; template 
class btas::varray>; template class btas::varray>; @@ -18,6 +22,12 @@ template class btas::Tensor>; template class btas::Tensor>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; template class btas::Tensor>; template class btas::Tensor>>; template class TiledArray::Tile>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; template class TiledArray::Tile< btas::Tensor>>; template class TiledArray::Tile &array) { extern template class btas::varray>; extern template class btas::varray>; +extern template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; +extern template class btas::varray< + std::complex, TiledArray::cuda_um_allocator>>; extern template class btas::varray>; extern template class btas::varray>; @@ -787,6 +791,12 @@ extern template class btas::Tensor>; extern template class btas::Tensor>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>; extern template class btas::Tensor>; extern template class btas::Tensor>>; extern template class TiledArray::Tile>>; +extern template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; +extern template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::cuda_um_btas_varray>>>; extern template class TiledArray::Tile< btas::Tensor>>; extern template class TiledArray::Tile>( size_t size) { dev_vec.resize(size); } +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + 
dev_vec.resize(size); +} +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + dev_vec.resize(size); +} } namespace TiledArray { template class cpu_cuda_vector; template class cpu_cuda_vector; +template class cpu_cuda_vector>; +template class cpu_cuda_vector>; } // Thrust included in CUDA 9+ seems to generate uninstantiated CUB calls @@ -35,6 +49,12 @@ auto force_missing_copy_instantiations_double() { auto force_missing_copy_instantiations_float() { return force_missing_copy_instantiations(); } +auto force_missing_copy_instantiations_zdouble() { + return force_missing_copy_instantiations>(); +} +auto force_missing_copy_instantiations_zfloat() { + return force_missing_copy_instantiations>(); +} auto force_missing_copy_instantiations_unsigned_long() { return force_missing_copy_instantiations(); } @@ -65,4 +85,3 @@ auto force_missing_copy_n_instantiations_long_long(){ } #endif // __CUDACC_VER_MAJOR__ >= 9 - diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/cuda/cpu_cuda_vector.h index 7370eeaa2e..5a6e52beb5 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.h +++ b/src/TiledArray/cuda/cpu_cuda_vector.h @@ -158,6 +158,8 @@ class cpu_cuda_vector { extern template class cpu_cuda_vector; extern template class cpu_cuda_vector; +extern template class cpu_cuda_vector>; +extern template class cpu_cuda_vector>; template diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/cuda/cublas.h index a5d3da7afc..8d4085eabb 100644 --- a/src/TiledArray/cuda/cublas.h +++ b/src/TiledArray/cuda/cublas.h @@ -54,6 +54,25 @@ inline void __cublasSafeCall(cublasStatus_t err, const char *file, namespace TiledArray { +namespace detail { + +template +auto cublasPointer(T *std_complex_ptr) { + using Scalar = TiledArray::detail::scalar_t; + static_assert(std::is_same_v || + std::is_same_v); + constexpr bool DP = std::is_same_v; + using cuT = std::conditional_t, + cuDoubleComplex, cuComplex>; + if 
constexpr (std::is_const_v< + std::remove_pointer_t>) { + return reinterpret_cast(std_complex_ptr); + } else + return reinterpret_cast(std_complex_ptr); +}; + +} // namespace detail + /* * cuBLAS interface functions */ @@ -117,6 +136,29 @@ inline cublasStatus_t cublasGemm( return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } +template <> +inline cublasStatus_t cublasGemm>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const std::complex *alpha, + const std::complex *A, int lda, const std::complex *B, + int ldb, const std::complex *beta, std::complex *C, int ldc) { + using detail::cublasPointer; + return cublasCgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), + cublasPointer(A), lda, cublasPointer(B), ldb, + cublasPointer(beta), cublasPointer(C), ldc); +} +template <> +inline cublasStatus_t cublasGemm>( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const std::complex *alpha, + const std::complex *A, int lda, const std::complex *B, + int ldb, const std::complex *beta, std::complex *C, + int ldc) { + using detail::cublasPointer; + return cublasZgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), + cublasPointer(A), lda, cublasPointer(B), ldb, + cublasPointer(beta), cublasPointer(C), ldc); +} /// AXPY interface functions @@ -139,6 +181,25 @@ inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, return cublasDaxpy(handle, n, alpha, x, incx, y, incy); } +template <> +inline cublasStatus_t cublasAxpy, std::complex>( + cublasHandle_t handle, int n, const std::complex *alpha, + const std::complex *x, int incx, std::complex *y, int incy) { + using detail::cublasPointer; + return cublasCaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, + cublasPointer(y), incy); +} + +template <> +inline cublasStatus_t cublasAxpy, std::complex>( + cublasHandle_t handle, int n, const std::complex *alpha, + 
const std::complex *x, int incx, std::complex *y, + int incy) { + using detail::cublasPointer; + return cublasZaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, + cublasPointer(y), incy); +} + template <> inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, const int *alpha, const float *x, diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/cuda/kernel/mult_kernel.cu index 8bbcae4927..aa3cadbc72 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.cu +++ b/src/TiledArray/cuda/kernel/mult_kernel.cu @@ -45,6 +45,16 @@ void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); } +void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id) { + mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); +} + +void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id) { + mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); +} + /// result[i] = arg1[i] * arg2[i] void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, cudaStream_t stream, int device_id){ @@ -61,6 +71,15 @@ void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, st mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); } +void mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id){ + mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); +} + +void mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id){ + mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); +} } // namespace TiledArray diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/cuda/kernel/mult_kernel.h index 7c333e879a..0c5c3f7822 
100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.h +++ b/src/TiledArray/cuda/kernel/mult_kernel.h @@ -28,6 +28,8 @@ #ifdef TILEDARRAY_HAS_CUDA +#include + namespace TiledArray { /// result[i] = result[i] * arg[i] @@ -40,6 +42,14 @@ void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, cudaStream_t stream, int device_id); +void mult_to_cuda_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id); + +void mult_to_cuda_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + cudaStream_t stream, int device_id); + /// result[i] = arg1[i] * arg2[i] void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, cudaStream_t stream, int device_id); @@ -50,6 +60,16 @@ void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, cudaStream_t stream, int device_id); +void mult_cuda_kernel(std::complex *result, + const std::complex *arg1, + const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id); + +void mult_cuda_kernel(std::complex *result, + const std::complex *arg1, + const std::complex *arg2, std::size_t n, + cudaStream_t stream, int device_id); + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/cuda/kernel/reduce_kernel.cu index 1e1550260f..d24669b920 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.cu +++ b/src/TiledArray/cuda/kernel/reduce_kernel.cu @@ -33,7 +33,6 @@ namespace TiledArray { int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); - } float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, @@ -47,6 +46,16 @@ double 
product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + + return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} // foreach(i) result += arg[i] int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, @@ -64,6 +73,16 @@ double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + // foreach(i) result = max(result, arg[i]) int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ @@ -112,6 +131,16 @@ double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + // foreach(i) result = min(result, abs(arg[i])) int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id){ @@ -128,6 +157,16 @@ double 
absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); } +std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + +std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, + int device_id){ + return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); +} + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/cuda/kernel/reduce_kernel.h index 857cad6c0c..1bcf526ee4 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.h +++ b/src/TiledArray/cuda/kernel/reduce_kernel.h @@ -28,6 +28,8 @@ #ifdef TILEDARRAY_HAS_CUDA +#include + namespace TiledArray { // foreach(i) result *= arg[i] @@ -40,6 +42,14 @@ float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex product_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex product_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result += arg[i] int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id); @@ -50,6 +60,14 @@ float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex sum_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex sum_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result = max(result, arg[i]) int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int 
device_id); @@ -80,6 +98,14 @@ float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex absmax_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex absmax_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + // foreach(i) result = min(result, abs(arg[i])) int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, int device_id); @@ -90,6 +116,14 @@ float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, int device_id); +std::complex absmin_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + +std::complex absmin_cuda_kernel(const std::complex *arg, + std::size_t n, cudaStream_t stream, + int device_id); + } // namespace TiledArray #endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h index 12a8aa1e19..9dc6507cca 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -38,9 +39,15 @@ namespace TiledArray { namespace detail { template -struct absolute_value : public thrust::unary_function { - __host__ __device__ T operator()(const T &x) const { - return x < T(0) ? -x : x; +struct absolute_value + : public thrust::unary_function> { + __host__ __device__ TiledArray::detail::scalar_t operator()( + const T &x) const { + using RT = TiledArray::detail::scalar_t; + if constexpr (!TiledArray::detail::is_complex_v) { + return x < RT(0) ? 
-x : x; + } else + return std::sqrt(x.real() * x.real() + x.imag() * x.imag()); } }; @@ -93,10 +100,11 @@ T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, } template -T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::maximum max_op; +TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( + const T *arg, std::size_t n, cudaStream_t stream, int device_id) { + using TR = TiledArray::detail::scalar_t; + TR init(0); + thrust::maximum max_op; detail::absolute_value abs_op; CudaSafeCall(cudaSetDevice(device_id)); @@ -110,10 +118,11 @@ T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, } template -T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::minimum min_op; +TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( + const T *arg, std::size_t n, cudaStream_t stream, int device_id) { + using TR = TiledArray::detail::scalar_t; + TR init = std::numeric_limits::max(); + thrust::minimum min_op; detail::absolute_value abs_op; CudaSafeCall(cudaSetDevice(device_id)); From 67b2360a57635bfed8a6fe598960ef9bda0e9cae Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Mar 2023 22:12:05 -0400 Subject: [PATCH 015/592] add --expt-relaxed-constexpr to CMAKE_CUDA_FLAGS to be able to handle std::complex --- external/cuda.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/external/cuda.cmake b/external/cuda.cmake index 3b2eb6ce37..49f2cbc558 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -5,6 +5,13 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_EXTENSIONS OFF) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) +# N.B. 
need relaxed constexpr for std::complex +# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions%5B/url%5D: +if (DEFINED CMAKE_CUDA_FLAGS) + set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr ${CMAKE_CUDA_FLAGS}") +else() + set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr") +endif() enable_language(CUDA) set(CUDA_FOUND TRUE) From 661da1025f2f0e014113a6846a51b442afe91fdb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Mar 2023 18:55:06 -0400 Subject: [PATCH 016/592] a("i,j") = b("i,j") when a and b are same type is now deep copy of b into a to reduce surprise, since e.g. a("i,j") = b("j,i") makes new a --- src/TiledArray/expressions/tsr_expr.h | 10 ++++++++-- tests/expressions_impl.h | 12 ++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index a17fa65cbc..8430a3c852 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -112,8 +112,14 @@ class TsrExpr : public Expr> { /// Expression assignment operator /// \param other The expression that will be assigned to the array - array_type& operator=(TsrExpr_& other) { - other.eval_to(*this); + array_type& operator=(const TsrExpr_& other) { + // N.B. corner case: whether A("i,j") = B("i,j") is deep or shallow copy + // depends on whether the copy semantics of tiles ... 
to be sure use clone + if (IndexList(this->annotation()) == IndexList(other.annotation())) { + array_ = other.array().clone(); + } else { + other.eval_to(*this); + } return array_; } diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 388cdd8e5d..91bcb10cc4 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -64,6 +64,18 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { ca("a,b,c").block(boost::combine(lobound, upbound))); BOOST_CHECK_NO_THROW(c("a,b,c") = ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5))); + + // make sure that c("abc") = a("abc") does a deep copy + { + BOOST_CHECK_NO_THROW(c("a,b,c") = a("a, b, c")); + for (auto&& idx : c.tiles_range()) { + if (c.is_local(idx) && !c.is_local(idx) && a.is_local(idx) && + !a.is_zero(idx)) { + BOOST_CHECK(c.find_local(idx).get().data() != + a.find_local(idx).get().data()); + } + } + } } BOOST_FIXTURE_TEST_CASE_TEMPLATE(block_tensor_factories, F, Fixtures, F) { From 9fab62564e90e590c8df58aec1b3ffd39bc03b80 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:56:33 -0400 Subject: [PATCH 017/592] SizeArray can be compared to a sized range --- src/TiledArray/size_array.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index 6edbecb222..bd52139ce5 100644 --- a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -42,6 +42,15 @@ class SizeArray { T* first_ = nullptr; ///< First element of the array T* last_ = nullptr; ///< Last element of the array + // can compare to any sized range + template + friend std::enable_if_t< + is_sized_range_v> && + !std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> + operator==(const SizeArray&, SizedRange&&); + public: // type definitions typedef T value_type; @@ -436,6 +445,19 @@ class SizeArray { }; // class SizeArray +template +std::enable_if_t< + is_sized_range_v> && + 
!std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> +operator==(const SizeArray& idx1, SizedRange&& idx2) { + if (idx1.size() == idx2.size()) + return std::equal(idx1.begin(), idx1.end(), idx2.begin()); + else + return false; +} + template inline std::vector operator*(const Permutation& perm, const SizeArray& orig) { From 07fb1e51c90d88530821e8b57a45de2ababbea73 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:58:10 -0400 Subject: [PATCH 018/592] introduce {Index,IndexView} aliases to Range::{index_type,index_view_type} --- src/TiledArray/range.h | 4 ++++ tests/range.cpp | 24 +++++++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 8108ecf227..b7a38d38b0 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1245,6 +1245,10 @@ class Range { }; // class Range +// lift Range::index_type and Range::index_view_type into user-land +using Index = Range::index_type; +using IndexView = Range::index_view_type; + inline Range& Range::operator*=(const Permutation& perm) { TA_ASSERT(perm.size() == rank_); if (rank_ > 1ul) { diff --git a/tests/range.cpp b/tests/range.cpp index 1ad294363f..a71a0629d0 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -65,15 +65,37 @@ BOOST_FIXTURE_TEST_SUITE(range_suite, RangeFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(dimension_accessor) { BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound_data(), r.lobound_data() + r.rank(), start.begin(), start.end()); // check start() + BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound().begin(), r.lobound().end(), + start.begin(), start.end()); // check start() + BOOST_CHECK_EQUAL(r.lobound(), start); // check start() + BOOST_CHECK_EQUAL(r.lobound(), + (Index{start.begin(), start.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound_data(), r.upbound_data() + r.rank(), finish.begin(), finish.end()); // check finish() + 
BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound().begin(), r.upbound().end(), + finish.begin(), + finish.end()); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), finish); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), + (Index{finish.begin(), finish.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.extent_data(), r.extent_data() + r.rank(), size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL_COLLECTIONS(r.extent().begin(), r.extent().end(), + size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL(r.extent(), size); // check size() + BOOST_CHECK_EQUAL(r.extent(), + (Index{size.begin(), size.end()})); // check size() BOOST_CHECK_EQUAL_COLLECTIONS(r.stride_data(), r.stride_data() + r.rank(), weight.begin(), weight.end()); // check weight() - BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() + BOOST_CHECK_EQUAL_COLLECTIONS(r.stride().begin(), r.stride().end(), + weight.begin(), + weight.end()); // check weight() + BOOST_CHECK_EQUAL(r.stride(), weight); // check weight() + BOOST_CHECK_EQUAL(r.stride(), + (Index{weight.begin(), weight.end()})); // check weight() + BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() for (size_t d = 0; d != r.rank(); ++d) { auto range_d = r.dim(d); BOOST_CHECK_EQUAL(range_d.first, start[d]); From d8ddc210b5ecf838dcfb66fd7acf38e869ff1810 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 06:59:36 -0400 Subject: [PATCH 019/592] added demo2 exec for the paper code snippets --- examples/demo/CMakeLists.txt | 8 ++-- examples/demo/demo2.cpp | 93 ++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 examples/demo/demo2.cpp diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt index c4c533cbf0..c2da9cb36e 100644 --- a/examples/demo/CMakeLists.txt +++ b/examples/demo/CMakeLists.txt @@ -16,8 +16,10 @@ # along with this program. If not, see . 
# -# Create the ta_fock_build executable - -# Add the demo executable +# Standard TA demo to accompany the keynote slides add_ta_executable(demo "demo.cpp" "tiledarray") add_dependencies(examples-tiledarray demo) + +# TA demo snippets for the paper +add_ta_executable(demo2 "demo2.cpp" "tiledarray") +add_dependencies(examples-tiledarray demo2) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp new file mode 100644 index 0000000000..64144eab70 --- /dev/null +++ b/examples/demo/demo2.cpp @@ -0,0 +1,93 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2023 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#ifndef EXAMPLES_DEMO_DEMO2_CPP_ +#define EXAMPLES_DEMO_DEMO2_CPP_ + +#include +#include + +#include +#include + +int main(int argc, char* argv[]) { + using namespace std; + + TA::srand(2017); + TA::World& world = TA::initialize(argc, argv); + + using namespace TA; + + // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ + Range ρ{{1, 11}, {-1, 9}}; + assert((ρ.lobound() == Index{1, -1})); + assert((ρ.upbound() == Index{11, 9})); + assert((ρ.extent() == Index{10, 10})); + assert(ρ.volume() == 100); + assert((ρ.stride() == Index{10, 1})); + assert((ρ.ordinal({1, -1}) == 0)); + assert((ρ.ordinal({10, 8}) + 1 == ρ.volume())); + // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. 
[10,8] " + for (auto&& idx : ρ) cout << idx << " "; + + // $\mathbb{Z}_{1,11}$ tiled into $\mathbb{Z}_{1,5}$, $\mathbb{Z}_{5,8}$, and + // $\mathbb{Z}_{8,11}$ + TiledRange1 τ0{1, 5, 8, 11}; // hashmarks + assert(τ0.extent() == 10); // there are 10 elements in τ0 + assert((τ0.elements_range() == + Range1{1, 11})); // elements indexed by $\mathbb{Z}_{1,11}$ + assert(τ0.tile_extent() == 3); // there are 3 tiles in τ0 + assert((τ0.tiles_range() == + Range1{0, 3})); // tiles indexed by $\mathbb{Z}_{0,3}$ + assert((τ0.tile(1) == Range1{5, 8})); // 1st tile of τ0 is $\mathbb{Z}_{5,8}$ + + // $\mathbb{Z}_{-1,9}$ tiled into $\mathbb{Z}_{-1,5}$ and $\mathbb{Z}_{5,9}$ + TiledRange1 τ1{-1, 5, 9}; + + // 2nd tile of $\code{tau0}$ is $\mathbb{Z}_{5,8}$ + assert((τ0.tile(1) == Range1{5, 8})); + // 1st tile of $\code{tau1}$ is $\mathbb{Z}_{-1,5}$ + assert((τ1.tile(0) == Range1{-1, 5})); + + // prints "-1 0 1 2 3 4 " + for (auto&& i : τ1.tile(0)) cout << i << " "; + + // tiling of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ by tensor product + // of + // $\code{τ0}$ and $\code{τ1}$ + TiledRange τ{τ0, τ1}; + // shortcut + TiledRange same_as_τ{{1, 5, 8, 11}, {-1, 5, 9}}; + + // tile index {0,0} refers to tile $\mathbb{Z}_{1,5} \otimes + // \mathbb{Z}_{-1,5}$ + auto tile_0_0 = τ.tile({0, 0}); + assert((tile_0_0 == Range{{1, 5}, {-1, 5}})); + + // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s + DistArray array0(world, τ); + array0.fill(1.0); // fill $\code{array0}$ with 1's + + // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2. 
+ auto t00 = array0.find({0, 0}); + + return 0; +} + +#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */ From 94a8fd9e71cd4bcc3f639bb05726314034e21841 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 31 Mar 2023 07:00:10 -0400 Subject: [PATCH 020/592] demo: use TA::srand --- examples/demo/demo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/demo/demo.cpp b/examples/demo/demo.cpp index d6c1612d95..05f9a25bf5 100644 --- a/examples/demo/demo.cpp +++ b/examples/demo/demo.cpp @@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) { int main(int argc, char *argv[]) { using namespace std; - std::srand(2017); + TA::srand(2017); TA::World &world = TA::initialize(argc, argv); using namespace TA; @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) { SparseShape shape(shape_tensor, TR); TSpArrayD a1(world, TR, shape); a1.fill_random(); // for deterministic fill: - // TA::srand(seed); // a1.fill_random(); cout << "a1:\n" << a1 << endl; world.gop.fence(); From 6a55cdceb1a9808bb7135f9b25a5ccadd73f3787 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 1 Apr 2023 13:21:29 -0500 Subject: [PATCH 021/592] demo2: comment out most code unless compiler can support unicode chars in variable names --- examples/demo/demo2.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp index 64144eab70..5818fae22d 100644 --- a/examples/demo/demo2.cpp +++ b/examples/demo/demo2.cpp @@ -34,14 +34,34 @@ int main(int argc, char* argv[]) { using namespace TA; + // requires compiler new enough to support unicode characters in variable + // names +#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES +#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC +#if __GNUC__ >= 10 +#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1 +#endif +#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC) +#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1 +#endif +#endif // 
!defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) + +#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES + // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ Range ρ{{1, 11}, {-1, 9}}; + // lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.lobound() == Index{1, -1})); + // upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.upbound() == Index{11, 9})); + // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.extent() == Index{10, 10})); + // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert(ρ.volume() == 100); + // row-major order assert((ρ.stride() == Index{10, 1})); assert((ρ.ordinal({1, -1}) == 0)); + assert((ρ.ordinal({1, 0}) == 1)); assert((ρ.ordinal({10, 8}) + 1 == ρ.volume())); // prints "[1,-1] [1,0] .. [1,8] [2,-1] .. [10,8] " for (auto&& idx : ρ) cout << idx << " "; @@ -87,6 +107,8 @@ int main(int argc, char* argv[]) { // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2. 
auto t00 = array0.find({0, 0}); +#endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) + return 0; } From 28127f72ef4d84539fdbd2d30f1e9e28f418bc1f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 3 Apr 2023 08:34:24 -0500 Subject: [PATCH 022/592] Tensor's data is shared_ptr, not shared_ptr --- src/TiledArray/tensor/tensor.h | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index fcb5ffbe7a..329355bf1d 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -163,7 +163,7 @@ class Tensor { #endif allocator.deallocate(ptr, size); }; - this->data_ = std::shared_ptr(ptr, std::move(deleter)); + this->data_ = std::shared_ptr(ptr, std::move(deleter)); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -201,7 +201,7 @@ class Tensor { #endif allocator.deallocate(ptr, size); }; - this->data_ = std::shared_ptr(ptr, std::move(deleter)); + this->data_ = std::shared_ptr(ptr, std::move(deleter)); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -216,7 +216,7 @@ class Tensor { /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison size_t batch_size_ = 1; - std::shared_ptr data_; ///< Shared pointer to the data + std::shared_ptr data_; ///< Shared pointer to the data public: /// constructs an empty (null) Tensor @@ -491,8 +491,8 @@ class Tensor { /// \param batch_size The batch size /// \param data shared pointer to the data Tensor(const range_type& range, size_t batch_size, - std::shared_ptr data) - : range_(range), batch_size_(batch_size), data_(data) { + std::shared_ptr data) + : range_(range), batch_size_(batch_size), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -502,6 +502,20 @@ class Tensor { #endif } + 
/// Construct a tensor with a range equal to \c range using existing data + /// assuming unit batch size \param range The range of the tensor \param data + /// shared pointer to the data + Tensor(const range_type& range, std::shared_ptr data) + : range_(range), batch_size_(1), data_(std::move(data)) { +#ifdef TA_TENSOR_MEM_TRACE + if (nbytes() >= trace_if_larger_than_) { + ptr_registry()->insert( + this, + make_string("TA::Tensor(range, data)::data_.get()=", data_.get())); + } +#endif + } + /// The batch size accessor /// @return the size of tensor batch represented by `*this` @@ -513,8 +527,8 @@ class Tensor { /// the batch Tensor batch(size_t idx) const { TA_ASSERT(idx < this->batch_size()); - std::shared_ptr data(this->data_, - this->data_.get() + idx * this->size()); + std::shared_ptr data(this->data_, + this->data_.get() + idx * this->size()); return Tensor(this->range(), 1, data); } @@ -962,12 +976,14 @@ class Tensor { /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data - std::shared_ptr data_shared() const { return this->data_; } + std::shared_ptr data_shared() const { + return this->data_; + } /// Mutable shared_ptr to the data /// \return A mutable shared_ptr to the tensor data - std::shared_ptr data_shared() { return this->data_; } + std::shared_ptr data_shared() { return this->data_; } /// Test if the tensor is empty From e6f8fd26903c7a00c4bf484518397cb31f86b7d3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 3 Apr 2023 09:11:34 -0500 Subject: [PATCH 023/592] demo2: added Tensor snippet --- examples/demo/demo2.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp index 5818fae22d..126f7ad1bb 100644 --- a/examples/demo/demo2.cpp +++ b/examples/demo/demo2.cpp @@ -56,6 +56,10 @@ int main(int argc, char* argv[]) { assert((ρ.upbound() == Index{11, 9})); // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert((ρ.extent() == 
Index{10, 10})); + // 1st dimension of ρ is $\mathbb{Z}_{1,11}$ + assert((ρ.dim(0) == Range1{1, 11})); + // 2nd dimension of ρ is $\mathbb{Z}_{-1,9}$ + assert((ρ.dim(1) == Range1{-1, 9})); // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$ assert(ρ.volume() == 100); // row-major order @@ -100,6 +104,28 @@ int main(int argc, char* argv[]) { auto tile_0_0 = τ.tile({0, 0}); assert((tile_0_0 == Range{{1, 5}, {-1, 5}})); + // clang-format off + + // 2-d array of $\code{double}$ 0s, indexed by ρ + Tensor t0(ρ, 0.); + // same as $\code{t0}$ but filled with ordinals + TensorD t1(ρ, [&ρ](auto&& idx) { + return ρ.ordinal(idx); + }); + // print out "0 1 .. 99 " + for (auto&& v : t1) cout << v << " "; + // same as $\code{t0}$, using external buffer + shared_ptr v(new double[ρ.volume()]); + TensorD t2(ρ, v); + v[0] = 1.; + assert(t2(1, -1) == 1.); + // Tensor has shallow-copy semantics + auto t3 = t0; + t0(1, -1) = 2.; + assert(t3(1, -1) == 2.); + + // clang-format on + // default instance of $\code{DistArray}$ is a dense array of $\code{double}$s DistArray array0(world, τ); array0.fill(1.0); // fill $\code{array0}$ with 1's From 0a76a7ad65a9cff81d83aba23fa4dde88a5a3de1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 3 Apr 2023 11:22:33 -0500 Subject: [PATCH 024/592] added ctors for DistArray that take default world --- src/TiledArray/dist_array.h | 103 +++++++++++++++++++++++++++++++++++- tests/dist_array.cpp | 87 ++++++++++++++++++++++++++++++ 2 files changed, 188 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 09e99eda86..fd0450ed8e 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -158,7 +158,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { using rebind_numeric_t = typename rebind_numeric::type; private: - pimpl_type pimpl_; ///< managed ptr to Array implementation + pimpl_type pimpl_ = {}; ///< managed ptr to Array 
implementation bool defer_deleter_to_next_fence_ = false; ///< if true, the impl object is scheduled to be destroyed in the ///< next fence @@ -277,7 +277,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// array is uninitialized, but these arrays may be assign via a tensor /// expression assignment or the copy construction. - DistArray() : pimpl_() {} + DistArray() = default; /// Copy constructor @@ -298,6 +298,19 @@ class DistArray : public madness::archive::ParallelSerializableObject { const std::shared_ptr& pmap = {}) : pimpl_(init(world, trange, shape_type(1, trange), pmap)) {} + /// Dense array constructor + + /// Constructs an array with the given meta data in default World. + /// This constructor only initializes the array meta data; + /// the array tiles are empty and must be assigned by the user. + /// \param trange The tiled range object that will be used to set the array + /// tiling. + /// \param pmap The tile index -> process map + explicit DistArray(const trange_type& trange, + const std::shared_ptr& pmap = {}) + : pimpl_(init(get_default_world(), trange, shape_type(1, trange), pmap)) { + } + /// Sparse array constructor /// Constructs an array with the given meta data. This constructor only @@ -312,6 +325,19 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::shared_ptr()) : pimpl_(init(world, trange, shape, pmap)) {} + /// Sparse array constructor + + /// Constructs an array with the given meta data in default World. + /// This constructor only initializes the array meta data; the array tiles + /// are empty and must be assigned by the user. + /// \param trange The tiled range object that will be used to set the array + /// tiling. 
\param shape The array shape that defines zero and non-zero tiles + /// \param pmap The tile index -> process map + DistArray(const trange_type& trange, const shape_type& shape, + const std::shared_ptr& pmap = + std::shared_ptr()) + : pimpl_(init(get_default_world(), trange, shape, pmap)) {} + /// \name Initializer list constructors /// \brief Creates a new tensor containing the elements in the provided /// `std::initializer_list`. @@ -374,6 +400,41 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::initializer_list>>>>> il) : DistArray(array_from_il(world, il)) {} + + template + explicit DistArray(std::initializer_list il) // N.B. clang does not like + // detail::vector_il here + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray(std::initializer_list> il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list>> il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray(std::initializer_list>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list>>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list< + std::initializer_list>>>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} ///@} /// \name Tiling initializer list constructors @@ -440,6 +501,44 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::initializer_list>>>>> il) : DistArray(array_from_il(world, trange, il)) {} + + template + DistArray(const trange_type& trange, std::initializer_list il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list> il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray( + const trange_type& trange, + 
std::initializer_list>> il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list>>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray( + const trange_type& trange, + std::initializer_list< + std::initializer_list>>>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} /// @} /// converting copy constructor diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 061c5fdd17..c2ac8262d0 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -76,6 +76,13 @@ BOOST_AUTO_TEST_CASE(constructors) { for (ArrayN::const_iterator it = ad.begin(); it != ad.end(); ++it) BOOST_CHECK(!it->probe()); + // Construct a dense array in default world + { + BOOST_REQUIRE_NO_THROW(ArrayN ad(tr)); + ArrayN ad(tr); + BOOST_CHECK_EQUAL(ad.world().id(), get_default_world().id()); + } + // Construct a sparse array BOOST_REQUIRE_NO_THROW( SpArrayN as(world, tr, TiledArray::SparseShape(shape_tensor, tr))); @@ -88,6 +95,14 @@ BOOST_AUTO_TEST_CASE(constructors) { // now fill it BOOST_REQUIRE_NO_THROW(as.fill(1)); + // Construct a sparse array in default world + { + BOOST_REQUIRE_NO_THROW( + SpArrayN as(tr, TiledArray::SparseShape(shape_tensor, tr))); + SpArrayN as(tr, TiledArray::SparseShape(shape_tensor, tr)); + BOOST_CHECK_EQUAL(as.world().id(), get_default_world().id()); + } + // Construct a sparse array from another sparse array { auto op = [](auto& result, const auto& input) { result = input.clone(); }; @@ -107,6 +122,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { ++itr; } } + + // now with default world + { + TArray a_vector(il); + BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id()); + } } // Create a matrix with an initializer list @@ -122,6 
+143,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_matrix(il); + BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id()); + } } // Create a rank 3 tensor with an initializer list @@ -144,6 +171,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor3(il); + BOOST_CHECK_EQUAL(a_tensor3.world().id(), get_default_world().id()); + } } // Create a rank 4 tensor with an initializer list @@ -168,6 +201,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor4(il); + BOOST_CHECK_EQUAL(a_tensor4.world().id(), get_default_world().id()); + } } // Create a rank 5 tensor with an initializer list @@ -194,6 +233,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor5(il); + BOOST_CHECK_EQUAL(a_tensor5.world().id(), get_default_world().id()); + } } // Create a rank 6 tensor with an initializer list @@ -222,6 +267,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor6(il); + BOOST_CHECK_EQUAL(a_tensor6.world().id(), get_default_world().id()); + } } } @@ -232,6 +283,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 3}}; TArray a_vector(world, tr, il); BOOST_CHECK_EQUAL(a_vector.size(), 2); + + // now with default world + { + TArray a_vector(tr, il); + BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id()); + } } { @@ -239,6 +296,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 3}}; TArray a_matrix(world, tr, il); BOOST_CHECK_EQUAL(a_matrix.size(), 4); + + // now with default world + { + TArray a_matrix(tr, il); + BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id()); + } } { @@ -247,6 +310,12 @@ 
BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 8); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -257,6 +326,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 16); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -269,6 +344,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 32); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -286,6 +367,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 64); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } } From e4737719c2a7c1096d71de917d630e08640c153e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 3 Apr 2023 16:07:11 -0500 Subject: [PATCH 025/592] added shapes to fwd.h to make demo2 cleaner --- examples/demo/demo2.cpp | 25 +++++++++++++++++++------ src/TiledArray/fwd.h | 5 +++++ src/TiledArray/sparse_shape.h | 2 ++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp index 126f7ad1bb..ec5b819ad3 100644 --- a/examples/demo/demo2.cpp +++ b/examples/demo/demo2.cpp @@ -126,12 +126,25 @@ int main(int argc, char* argv[]) { // clang-format on - // default instance of 
$\code{DistArray}$ is a dense array of $\code{double}$s - DistArray array0(world, τ); - array0.fill(1.0); // fill $\code{array0}$ with 1's - - // grab a tile NB this returns a ${\bf future}$ to a tile; see Section 3.2. - auto t00 = array0.find({0, 0}); + // default instance of $\code{DistArray}$ is + // a {\em dense} array of $\code{double}$s + // NB can use TArrayD instead of DistArray<> + DistArray<> a0(τ); + a0.fill(1.); // fill $\code{da}$ with 1s + // every tile exists in a dense array + assert(!a0.is_zero({0, 0})); + // grab a ${\em future}$ to the {0,0} tile + auto t00 = a0.find({0, 0}); + + // shape of a {\em sparse} array over τ + // tiles with even ordinals ({0,0}, {0,2}, {1,1}) are zero + SparseShape s(TensorF(τ.tiles_range(), {0, 1, 0, 1, 0, 1}), τ); + // a sparse array of $\code{double}$s + // TSpArrayX $\equiv$ DistArray + TSpArrayD a1(τ, s); + // only some tiles are nonzero in sparse array + assert(a1.is_zero({0, 0})); + assert(!a1.is_zero({0, 1})); #endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 87af0e6115..f09a98c0e5 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -117,6 +117,11 @@ namespace symmetry { class Permutation; } +// shapes +class DenseShape; +template +class SparseShape; + // TiledArray Arrays template class DistArray; diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 7346f45d1c..bf51487922 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -26,6 +26,8 @@ #ifndef TILEDARRAY_SPARSE_SHAPE_H__INCLUDED #define TILEDARRAY_SPARSE_SHAPE_H__INCLUDED +#include + #include #include #include From f7b672ea0d16415cf5719c90b2e4ea161ad65488 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 6 Apr 2023 17:50:12 -0400 Subject: [PATCH 026/592] fixups for c++20 --- src/TiledArray/util/vector.h | 9 ++++++--- tests/eigen.cpp | 5 +++++ tests/einsum.cpp | 5 +++++ 3 files changed, 16 
insertions(+), 3 deletions(-) diff --git a/src/TiledArray/util/vector.h b/src/TiledArray/util/vector.h index 12c5d0dfcd..48446aa806 100644 --- a/src/TiledArray/util/vector.h +++ b/src/TiledArray/util/vector.h @@ -27,13 +27,16 @@ #define TILEDARRAY_UTIL_VECTOR_H #include +#include // Boost.Container 1.75 and earlier uses standard exception classes, 1.76+ use -// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is defined: +// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is +// defined: // https://www.boost.org/doc/libs/master/doc/html/container/release_notes.html#container.release_notes.release_notes_boost_1_76_00 -// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception checking can use this macro with all versions of Boost +// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception +// checking can use this macro with all versions of Boost #if BOOST_VERSION < 107600 && !defined(BOOST_CONTAINER_USE_STD_EXCEPTIONS) -# define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1 +#define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1 #endif #include diff --git a/tests/eigen.cpp b/tests/eigen.cpp index bfa4f1a0db..d577804417 100644 --- a/tests/eigen.cpp +++ b/tests/eigen.cpp @@ -421,7 +421,12 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) { decltype(tensor) tensor_copy; if (GlobalFixture::world->rank() == 1) tensor_copy = tensor; GlobalFixture::world->gop.broadcast_serializable(tensor_copy, 1); +// Eigen::TensorBase::operator== is ambiguously defined in C++20 +#if __cplusplus >= 202002L + Eigen::Tensor eq = ((tensor - tensor_copy).abs() == 0).all(); +#else Eigen::Tensor eq = (tensor == tensor_copy).all(); +#endif BOOST_CHECK(eq() == true); } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 1c0172e554..0fcb71f072 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -764,7 +764,12 @@ bool isApprox(const Eigen::TensorBase& A, Eigen::Tensor r; if constexpr (std::is_integral_v && std::is_integral_v) { +// 
Eigen::TensorBase::operator== is ambiguously defined in C++20 +#if __cplusplus >= 202002L + r = ((derived(A) - derived(B)).abs() == 0).all(); +#else r = (derived(A) == derived(B)).all(); +#endif } else { // soft floating-point comparison r = ((derived(A) - derived(B)).abs() <= abs_comparison_threshold).all(); } From 80537b53b5f0520f9cb156dc0d7bdad5347ea4f4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 7 Apr 2023 08:47:16 -0400 Subject: [PATCH 027/592] dox cleanup --- src/TiledArray/tensor/kernels.h | 125 ++++++++++++++------------------ src/TiledArray/tensor/tensor.h | 9 ++- 2 files changed, 58 insertions(+), 76 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 81141f4982..2fa6535dc1 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -107,12 +107,14 @@ inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1, } /// provides transform functionality to class \p T, useful for nonintrusive -/// extension of a tensor type \p T to be usable as element type \p T in \c -/// Tensor \tparam T a tensor type \note The default implementation +/// extension of a tensor type \p T to be usable as element type \p T in +/// \c Tensor +/// \tparam T a tensor type +/// \note The default implementation /// constructs T, then computes it by coiterating over elements of the argument /// tensors and transforming with the transform \c Op . 
-/// This should be specialized for classes like TiledArray::Tensor that -/// already include the appropriate transform constructors already +/// This should be specialized for classes like TiledArray::Tensor that +/// already include the appropriate transform constructors already template struct transform { /// creates a result tensor in which element \c i is obtained by \c @@ -283,29 +285,23 @@ inline void inplace_tensor_op(InputOp&& input_op, OutputOp&& output_op, /// \endcode /// The expected signature of the output /// operations is: -/// \code void op(TR::value_type::value_type*, const +/// \code +/// void op(TR::value_type::value_type*, const /// TR::value_type::value_type) /// \endcode -/// \tparam InputOp The input operation -/// type +/// \tparam InputOp The input operation type /// \tparam OutputOp The output operation type -/// \tparam TR The result tensor -/// type +/// \tparam TR The result tensor type /// \tparam T1 The first argument tensor type -/// \tparam Ts The remaining -/// argument tensor types +/// \tparam Ts The remaining argument tensor types /// \param[in] input_op The operation that is used to /// generate the output value from the input arguments -/// \param[in] output_op The -/// operation that is used to set the value of the result tensor given the -/// element pointer and the result value -/// \param[in] perm The permutation applied -/// to the argument tensors +/// \param[in] output_op The operation that is used to set the value +/// of the result tensor given the element pointer and the result value +/// \param[in] perm The permutation applied to the argument tensors /// \param[in,out] result The result tensor -/// \param[in] -/// tensor1 The first argument tensor -/// \param[in] tensors The remaining argument -/// tensors +/// \param[in] tensor1 The first argument tensor +/// \param[in] tensors The remaining argument tensors template ::value>::type* = nullptr> @@ -505,8 +495,7 @@ inline void tensor_init(Op&& op, const Permutation& 
perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c result has been -/// allocated but not initialized. +/// \pre The memory of \c result has been allocated but not initialized. /// \tparam Op The element initialization operation type /// \tparam Perm A permutation type /// \tparam TR The result tensor type @@ -546,18 +535,13 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c tensor1 has been -/// allocated but not initialized. -/// \tparam Op The element initialization -/// operation type +/// \pre The memory of \c tensor1 has been allocated but not initialized. +/// \tparam Op The element initialization operation type /// \tparam T1 The result tensor type -/// \tparam Ts The argument -/// tensor types -/// \param[in] op The result tensor element initialization -/// operation +/// \tparam Ts The argument tensor types +/// \param[in] op The result tensor element initialization operation /// \param[out] result The result tensor -/// \param[in] tensor1 The first -/// argument tensor +/// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename T1, typename... Ts, @@ -591,13 +575,10 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, /// of \c op(tensor1[i],tensors[i]...) /// \pre The memory of \c tensor1 has been /// allocated but not initialized. 
-/// \tparam Op The element initialization -/// operation type +/// \tparam Op The element initialization operation type /// \tparam T1 The result tensor type -/// \tparam Ts The argument -/// tensor types -/// \param[in] op The result tensor element initialization -/// operation +/// \tparam Ts The argument tensor types +/// \param[in] op The result tensor element initialization operation /// \param[out] result The result tensor /// \param[in] tensor1 The first /// argument tensor @@ -639,12 +620,11 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(result, &tensor1[i], -/// &tensors[i]...)) for each \c i in the index range of \c tensor1 . \c -/// result is initialized to \c identity . If HAVE_INTEL_TBB is defined, the -/// reduction will be executed in an undefined order, otherwise will execute in -/// the order of increasing \c i . -/// \tparam ReduceOp The element-wise reduction -/// operation type +/// &tensors[i]...)) for each \c i in the index range of \c tensor1 . +/// \c result is initialized to \c identity . If `HAVE_INTEL_TBB` is defined, +/// the reduction will be executed in an undefined order, otherwise will +/// execute in the order of increasing \c i . +/// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Identity A type that can be used as an argument to ReduceOp /// \tparam T1 The first argument tensor type @@ -708,10 +688,10 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// Perform reduction of the tensor-of-tensors' elements by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . 
This will execute serially, in the order of increasing \c i (each -/// element's reduction can however be executed in parallel, depending on the -/// element type). +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element's reduction can however be executed in parallel, +/// depending on the element type). /// \tparam ReduceOp The tensor-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type @@ -751,10 +731,10 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . This will execute serially, in the order of increasing \c i (each -/// element-wise reduction can however be executed in parallel, depending on the -/// element type). +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element-wise reduction can however be executed in parallel, +/// depending on the element type). /// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type @@ -797,10 +777,11 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . This will execute serially, in the order of increasing \c i (each -/// element-wise reduction can however be executed in parallel, depending on the -/// element type). 
\tparam ReduceOp The element-wise reduction operation type +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element-wise reduction can however be executed in parallel, +/// depending on the element type). +/// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type /// \tparam T1 The first argument tensor type diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 329355bf1d..982fef1f21 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -308,9 +308,10 @@ class Tensor { /// Construct a tensor with a fill op that takes an element index - /// \tparam ElementIndexOp callable of signature `value_type(const - /// Range::index_type&)` \param range An array with the size of of each - /// dimension \param element_idx_op a callable of type ElementIndexOp + /// \tparam ElementIndexOp callable of signature + /// `value_type(const Range::index_type&)` + /// \param range An array with the size of of each dimension + /// \param element_idx_op a callable of type ElementIndexOp template >> @@ -1494,7 +1495,7 @@ class Tensor { // Addition operations - /// Add this and \c other to construct a new tensors + /// Add this and \c other to construct a new tensor /// \tparam Right The right-hand tensor type /// \param right The tensor that will be added to this tensor From 860ce673389b7b43518e3fb8744bb009b623aa17 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 7 Apr 2023 09:59:47 -0400 Subject: [PATCH 028/592] introduced Range::{index1,index}_difference_type --- src/TiledArray/range.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index b7a38d38b0..b4f2a0d48f 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -49,9 +49,12 @@ class 
Range { typedef Range Range_; ///< This object type typedef TA_1INDEX_TYPE index1_type; ///< 1-index type, to conform to ///< Tensor Working Group (TWG) spec + typedef std::make_signed_t + index1_difference_type; ///< type representing difference of 1-indices typedef container::svector - index_type; ///< Coordinate index type, to conform to - ///< TWG spec + index_type; ///< Coordinate index type, to conform to + ///< TWG spec + typedef container::svector index_difference_type; typedef index_type index; ///< Coordinate index type (deprecated) typedef detail::SizeArray index_view_type; ///< Non-owning variant of index_type From c30462b37b7c0cab489230cfa29ba84620949172 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 7 Apr 2023 12:27:40 -0400 Subject: [PATCH 029/592] tensor/kernels.h: make work correctly and more efficiently for noncontiguous tensors --- src/TiledArray/tensor/kernels.h | 224 ++++++++++++++++++++++---------- src/TiledArray/util/vector.h | 27 ++++ 2 files changed, 184 insertions(+), 67 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 2fa6535dc1..2cd2d46fe3 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -28,6 +28,7 @@ #include #include +#include namespace TiledArray { @@ -353,13 +354,26 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto stride = inner_size(result, tensors...); const auto volume = result.range().volume(); - for (decltype(result.range().volume()) i = 0ul; i < volume; i += stride) - math::inplace_vector_op(std::forward(op), stride, - result.data() + result.range().ordinal(i), - (tensors.data() + tensors.range().ordinal(i))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(result, tensors...); + for (std::decay_t i = 0ul; i < volume; i += stride) + math::inplace_vector_op(std::forward(op), stride, + result.data() + result.range().ordinal(i), + (tensors.data() + tensors.range().ordinal(i))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + std::forward(op)( + result[idx], (tensors[idx - result_lobound + + signed_idx_t(tensors.range().lobound())])...); + } + } } /// In-place tensor of tensors operations with non-contiguous data @@ -380,20 +394,33 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto stride = inner_size(result, tensors...); const auto volume = result.range().volume(); - auto inplace_tensor_range = - [&op, stride]( - typename TR::pointer MADNESS_RESTRICT const result_data, - typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) - inplace_tensor_op(op, result_data[i], tensors_data[i]...); - }; - - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ord += stride) - inplace_tensor_range(result.data() + result.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(result, tensors...); + auto inplace_tensor_range = + [&op, stride]( + typename TR::pointer MADNESS_RESTRICT const result_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) + inplace_tensor_op(op, result_data[i], tensors_data[i]...); + }; + + for (std::decay_t ord = 0ul; ord < volume; ord += stride) + inplace_tensor_range(result.data() + result.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + std::forward(op)( + result[idx], (tensors[idx - result_lobound + + signed_idx_t(tensors.range().lobound())])...); + } + } } // ------------------------------------------------------------------------- @@ -553,7 +580,6 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, TA_ASSERT(!empty(result, tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); auto wrapper_op = [&op](typename TR::pointer MADNESS_RESTRICT result_ptr, @@ -562,11 +588,27 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, new 
(result_ptr) typename T1::value_type(op(value1, values...)); }; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) - math::vector_ptr_op(wrapper_op, stride, result.data() + ord, - (tensor1.data() + tensor1.range().ordinal(ord)), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; + ord += stride) + math::vector_ptr_op(wrapper_op, stride, result.data() + ord, + (tensor1.data() + tensor1.range().ordinal(ord)), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + const signed_idx_t relidx = idx - result_lobound; + wrapper_op( + &(result[idx]), + tensor1[relidx + signed_idx_t(tensor1.range().lobound())], + (tensors[relidx + signed_idx_t(tensors.range().lobound())])...); + } + } } /// Initialize tensor with one or more non-contiguous tensor arguments @@ -593,24 +635,40 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, TA_ASSERT(!empty(result, tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); - auto inplace_tensor_range = - [&op, stride]( - typename TR::pointer MADNESS_RESTRICT const result_data, - typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, - typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) - new (result_data + i) - typename TR::value_type(tensor_op( - op, tensor1_data[i], tensors_data[i]...)); - }; - - for (decltype(volume) ord = 0ul; ord < volume; ord += stride) - inplace_tensor_range(result.data() + ord, - (tensor1.data() + tensor1.range().ordinal(ord)), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + auto inplace_tensor_range = + [&op, stride]( + typename TR::pointer MADNESS_RESTRICT const result_data, + typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (std::decay_t i = 0ul; i < stride; ++i) + new (result_data + i) + typename TR::value_type(tensor_op( + op, tensor1_data[i], tensors_data[i]...)); + }; + + for (std::decay_t ord = 0ul; ord < volume; ord += stride) + inplace_tensor_range(result.data() + ord, + (tensor1.data() + tensor1.range().ordinal(ord)), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + const signed_idx_t relidx = idx - result_lobound; + + new (&(result[idx])) + typename TR::value_type(tensor_op( + op, tensor1[relidx + signed_idx_t(tensor1.range().lobound())], + (tensors[relidx + signed_idx_t(tensors.range().lobound())])...)); + } + } } // ------------------------------------------------------------------------- @@ -757,17 +815,31 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto stride = inner_size(tensor1, 
tensors...); const auto volume = tensor1.range().volume(); auto result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) { - auto temp = identity; - math::reduce_op(reduce_op, join_op, identity, stride, temp, - tensor1.data() + tensor1.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); - join_op(result, temp); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + for (std::decay_t ord = 0ul; ord < volume; + ord += stride) { + auto temp = identity; + math::reduce_op(reduce_op, join_op, identity, stride, temp, + tensor1.data() + tensor1.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + join_op(result, temp); + } + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& t1_rng = tensor1.range(); + using signed_idx_t = Range::index_difference_type; + auto t1_lobound = signed_idx_t(t1_rng.lobound()); + for (auto&& idx : t1_rng) { + using namespace container::operators; + signed_idx_t relidx = idx - t1_lobound; + reduce_op(result, tensor1[idx], + (tensors[idx - t1_lobound + + signed_idx_t(tensors.range().lobound())])...); + } } return result; @@ -803,31 +875,49 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); - auto tensor_reduce_range = - [&reduce_op, &join_op, &identity, stride]( - Scalar& MADNESS_RESTRICT result, - typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, - typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { - Scalar temp = tensor_reduce(reduce_op, join_op, identity, - tensor1_data[i], tensors_data[i]...); - join_op(result, temp); - } - }; - Scalar result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) { - Scalar temp = tensor_reduce_range( - result, tensor1.data() + tensor1.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); - join_op(result, temp); + + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + auto tensor_reduce_range = + [&reduce_op, &join_op, &identity, stride]( + Scalar& MADNESS_RESTRICT result, + typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { + Scalar temp = tensor_reduce(reduce_op, join_op, identity, + tensor1_data[i], tensors_data[i]...); + join_op(result, temp); + } + }; + + for (std::decay_t ord = 0ul; ord < volume; + ord += stride) { + Scalar temp = tensor_reduce_range( + result, tensor1.data() + tensor1.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + join_op(result, temp); + } + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& t1_rng = tensor1.range(); + using signed_idx_t = Range::index_difference_type; + auto t1_lobound = signed_idx_t(t1_rng.lobound()); + for (auto&& idx : t1_rng) { + using namespace container::operators; + signed_idx_t relidx = idx - t1_lobound; + + Scalar temp = + tensor_reduce(reduce_op, join_op, identity, tensor1[idx], + (tensors[idx - t1_lobound + + signed_idx_t(tensors.range().lobound())])...); + join_op(result, temp); + } } - return identity; + return result; } } // namespace detail diff --git a/src/TiledArray/util/vector.h 
b/src/TiledArray/util/vector.h index 48446aa806..6e69f523f4 100644 --- a/src/TiledArray/util/vector.h +++ b/src/TiledArray/util/vector.h @@ -44,6 +44,7 @@ #include #include +#include "TiledArray/error.h" namespace TiledArray { @@ -93,6 +94,32 @@ constexpr auto iv(Int i0, Ints... rest) { return result; } +namespace operators { + +template +decltype(auto) operator+(const boost::container::small_vector& v1, + const boost::container::small_vector& v2) { + TA_ASSERT(v1.size() == v2.size()); + boost::container::small_vector, std::max(N1, N2)> + result(v1.size()); + std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(), + [](auto&& a, auto&& b) { return a + b; }); + return result; +} + +template +decltype(auto) operator-(const boost::container::small_vector& v1, + const boost::container::small_vector& v2) { + TA_ASSERT(v1.size() == v2.size()); + boost::container::small_vector, std::max(N1, N2)> + result(v1.size()); + std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(), + [](auto&& a, auto&& b) { return a - b; }); + return result; +} + +} // namespace operators + } // namespace container } // namespace TiledArray From a8fdcf9579cc4e86c9953a401d6fb7150b7974bc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 7 Apr 2023 12:29:34 -0400 Subject: [PATCH 030/592] optimize tensor/tile add operations by being able to consume arguments automatically and/or reorder args --- src/TiledArray/tensor/operators.h | 12 +-- src/TiledArray/tensor/tensor.h | 15 +++- src/TiledArray/tile_interface/add.h | 110 ++++++++++++++++++++++------ tests/sparse_tile.h | 72 ++++++++++++------ tests/tile_op_add.cpp | 84 ++++++++++++++++++++- 5 files changed, 240 insertions(+), 53 deletions(-) diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index f7c7a5f2ae..f243e46d7e 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -41,11 +41,13 @@ namespace TiledArray { /// \param right The right-hand tensor argument 
/// \return A tensor where element \c i is equal to left[i] + right[i] template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator+(const T1& left, const T2& right) { - return add(left, right); + typename = std::enable_if_t< + detail::is_tensor, + detail::remove_cvr_t>::value || + detail::is_tensor_of_tensor, + detail::remove_cvr_t>::value>> +inline decltype(auto) operator+(T1&& left, T2&& right) { + return add(std::forward(left), std::forward(right)); } /// Tensor minus operator diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 982fef1f21..138b92266c 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1503,7 +1503,7 @@ class Tensor { /// \c this and \c other template ::value>::type* = nullptr> - Tensor add(const Right& right) const { + Tensor add(const Right& right) const& { return binary( right, [](const numeric_type l, const numeric_t r) -> numeric_type { @@ -1511,6 +1511,19 @@ class Tensor { }); } + /// Add this and \c other to construct a new tensor + + /// \tparam Right The right-hand tensor type + /// \param right The tensor that will be added to this tensor + /// \return A new tensor where the elements are the sum of the elements of + /// \c this and \c other + template ::value>::type* = nullptr> + Tensor add(const Right& right) && { + add_to(right); + return std::move(*this); + } + /// Add this and \c other to construct a new, permuted tensor /// \tparam Right The right-hand tensor type diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h index 9c0e02e558..b0bb7cf968 100644 --- a/src/TiledArray/tile_interface/add.h +++ b/src/TiledArray/tile_interface/add.h @@ -39,10 +39,21 @@ namespace TiledArray { /// \param left The left-hand argument to be added /// \param right The right-hand argument to be added /// \return A tile that is equal to (left + right) -template -inline auto add(const Left& left, const Right& right) - -> 
decltype(left.add(right)) { - return left.add(right); +template || + detail::has_member_function_add_anyreturn_v>> +inline decltype(auto) add(Left&& left, Right&& right) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right)); + else + return std::forward(right).add(std::forward(left)); } /// Add and scale tile arguments @@ -56,9 +67,26 @@ inline auto add(const Left& left, const Right& right) /// \return A tile that is equal to (left + right) * factor template < typename Left, typename Right, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline auto add(const Left& left, const Right& right, const Scalar factor) { - return left.add(right, factor); + typename = std::enable_if_t && + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Scalar> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Scalar>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right), factor); + else + return std::forward(right).add(std::forward(left), factor); } /// Add and permute tile arguments @@ -72,10 +100,25 @@ inline auto add(const Left& left, const Right& right, const Scalar factor) { template < typename Left, typename Right, typename Perm, typename = std::enable_if_t && - detail::has_member_function_add_anyreturn_v< - const Left, const Right&, const 
Perm&>>> -inline auto add(const Left& left, const Right& right, const Perm& perm) { - return left.add(right, perm); + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Perm&> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Perm&>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Perm& perm) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right), perm); + else + return std::forward(right).add(std::forward(left), perm); } /// Add, scale, and permute tile arguments @@ -88,13 +131,31 @@ inline auto add(const Left& left, const Right& right, const Perm& perm) { /// \param factor The scaling factor /// \param perm The permutation to be applied to the result /// \return A tile that is equal to perm ^ (left + right) * factor -template < - typename Left, typename Right, typename Scalar, typename Perm, - typename std::enable_if && - detail::is_permutation_v>::type* = nullptr> -inline auto add(const Left& left, const Right& right, const Scalar factor, - const Perm& perm) { - return left.add(right, factor, perm); +template && detail::is_permutation_v && + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Scalar, const Perm&> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Scalar, const Perm&>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor, + const Perm& perm) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if 
constexpr (left_right) + return std::forward(left).add(std::forward(right), factor, + perm); + else + return std::forward(right).add(std::forward(left), factor, + perm); } /// Add to the result tile @@ -104,7 +165,10 @@ inline auto add(const Left& left, const Right& right, const Scalar factor, /// \param result The result tile /// \param arg The argument to be added to the result /// \return A tile that is equal to result[i] += arg[i] -template +template < + typename Result, typename Arg, + typename = std::enable_if_t< + detail::has_member_function_add_to_anyreturn_v>> inline Result& add_to(Result& result, const Arg& arg) { return result.add_to(arg); } @@ -118,9 +182,11 @@ inline Result& add_to(Result& result, const Arg& arg) { /// \param arg The argument to be added to \c result /// \param factor The scaling factor /// \return A tile that is equal to (result[i] += arg[i]) *= factor -template < - typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> +template && + detail::has_member_function_add_to_anyreturn_v< + Result&, const Arg&, const Scalar>>::type* = nullptr> inline Result& add_to(Result& result, const Arg& arg, const Scalar factor) { return result.add_to(arg, factor); } diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 6c365334fa..1b7cdd07e1 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -122,10 +122,36 @@ class EigenSparseTile { matrix_type& matrix() { return std::get<0>(*impl_); } /// data read-write accessor - template + template >> value_type& operator[](const Index& idx) { auto start = range().lobound_data(); - return std::get<0>(*impl_).coeffRef(idx[0] - start[0], idx[1] - start[1]); + return matrix().coeffRef(idx[0] - start[0], idx[1] - start[1]); + } + + /// data read-write accessor + template >* = nullptr> + value_type& operator[](const Ordinal& ord) { + auto idx = range().idx(ord); + auto start = range().lobound_data(); + return matrix().coeffRef(idx[0] - start[0], idx[1] - 
start[1]); + } + + /// data read-only accessor + template >> + value_type operator[](const Index& idx) const { + auto start = range().lobound_data(); + return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); + } + + /// data read-only accessor + template >> + value_type operator[](const Ordinal& ord) const { + auto idx = range().idx(ord); + auto start = range().lobound_data(); + return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); } /// Maximum # of elements in the tile @@ -138,8 +164,8 @@ class EigenSparseTile { // output template >::type* = nullptr> + typename std::enable_if< + madness::is_output_archive_v>::type* = nullptr> void serialize(Archive& ar) { if (impl_) { ar & true; @@ -151,7 +177,7 @@ class EigenSparseTile { for (typename matrix_type::InnerIterator it(mat, k); it; ++it) { datavec.push_back(Eigen::Triplet(it.row(), it.col(), it.value())); } - ar& datavec & this->range(); + ar& datavec& this->range(); } else { ar & false; } @@ -159,8 +185,8 @@ class EigenSparseTile { // output template >::type* = nullptr> + typename std::enable_if< + madness::is_input_archive_v>::type* = nullptr> void serialize(Archive& ar) { bool have_impl = false; ar& have_impl; @@ -229,22 +255,22 @@ EigenSparseTile add(const EigenSparseTile& arg1, arg1.range()); } -// dense_result[i] = dense_arg1[i] + sparse_arg2[i] -template -TiledArray::Tensor add(const TiledArray::Tensor& arg1, - const EigenSparseTile& arg2) { - TA_ASSERT(arg1.range() == arg2.range()); - - // this could be done better ... 
- return TiledArray::add(arg1, static_cast>(arg2)); -} - -// dense_result[i] = sparse_arg1[i] + dense_arg2[i] -template -TiledArray::Tensor add(const EigenSparseTile& arg1, - const TiledArray::Tensor& arg2) { - return TiledArray::add(arg2, arg1); -} +//// dense_result[i] = dense_arg1[i] + sparse_arg2[i] +// template +// TiledArray::Tensor add(const TiledArray::Tensor& arg1, +// const EigenSparseTile& arg2) { +// TA_ASSERT(arg1.range() == arg2.range()); +// +// // this could be done better ... +// return TiledArray::add(arg1, static_cast>(arg2)); +// } +// +//// dense_result[i] = sparse_arg1[i] + dense_arg2[i] +// template +// TiledArray::Tensor add(const EigenSparseTile& arg1, +// const TiledArray::Tensor& arg2) { +// return TiledArray::add(arg2, static_cast>(arg1)); +// } // dense_result[perm ^ i] = dense_arg1[i] + sparse_arg2[i] template < diff --git a/tests/tile_op_add.cpp b/tests/tile_op_add.cpp index b264ae15bf..c2e08c170c 100644 --- a/tests/tile_op_add.cpp +++ b/tests/tile_op_add.cpp @@ -26,6 +26,7 @@ #include "../src/TiledArray/tile_op/add.h" #include "../src/tiledarray.h" #include "range_fixture.h" +#include "sparse_tile.h" #include "unit_test_config.h" // using TiledArray::detail::Add; @@ -49,8 +50,7 @@ struct AddFixture : public RangeFixture { }; // AddFixture -BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture, - TA_UT_LABEL_SERIAL) +BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(constructor) { // Check that the constructors can be called without throwing exceptions @@ -398,4 +398,84 @@ BOOST_AUTO_TEST_CASE(binary_add_right_zero_perm_consume_right) { } } +BOOST_AUTO_TEST_CASE(binary_add_heterogeneous) { + TensorD a(RangeFixture::r, [](auto&) { return TiledArray::drand(); }); + EigenSparseTile b(RangeFixture::r); + + ///////////////// + // dense + sparse + ///////////////// + {{// a is persistent + auto c = add(a, b); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), 
a.range()); + + // Check that a nor b were consumed + BOOST_CHECK_NE(a.data(), nullptr); + BOOST_CHECK_NE(c.data(), a.data()); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], a[i] + b[i]); + } +} +{ // a is consumed + auto a_copy = a.clone(); + if (r.rank() == 3) a.shift({-7, 7, 0}); + auto c = add(std::move(a), std::move(b)); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was consumed + BOOST_CHECK_EQUAL(a.data(), nullptr); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], a_copy[i] + b[i]); + } + a = a_copy; +} +} + +///////////////// +// sparse + dense +///////////////// +{ + { // a is persistent + auto c = add(b, a); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was not consumed + BOOST_CHECK_NE(a.data(), nullptr); + BOOST_CHECK_NE(c.data(), a.data()); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], b[i] + a[i]); + } + } + { // a is consumed + auto a_copy = a.clone(); + if (r.rank() == 3) a.shift({-7, 7, 0}); + auto c = add(std::move(b), std::move(a)); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was consumed + BOOST_CHECK_EQUAL(a.data(), nullptr); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], b[i] + a_copy[i]); + } + a = a_copy; + } +} +} + BOOST_AUTO_TEST_SUITE_END() From 6c4ad7ef88f7d89c2c8d8eb67094aa1ec0e669e4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 7 Apr 2023 12:30:06 -0400 Subject: [PATCH 031/592] moar demo2 --- examples/demo/demo2.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git 
a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp index ec5b819ad3..7ef5ca45c8 100644 --- a/examples/demo/demo2.cpp +++ b/examples/demo/demo2.cpp @@ -114,15 +114,19 @@ int main(int argc, char* argv[]) { }); // print out "0 1 .. 99 " for (auto&& v : t1) cout << v << " "; - // same as $\code{t0}$, using external buffer + // same as $\code{t0}$, using existing buffer shared_ptr v(new double[ρ.volume()]); - TensorD t2(ρ, v); + TensorD t2(ρ, v); // t2 and v co-manage buffer lifetime v[0] = 1.; assert(t2(1, -1) == 1.); - // Tensor has shallow-copy semantics - auto t3 = t0; - t0(1, -1) = 2.; + // same as $\code{t0}$, using existing (unmanaged) buffer + auto t3 = make_map(v.get(), ρ); + v[0] = 2.; assert(t3(1, -1) == 2.); + // Tensor has shallow-copy semantics + auto t4 = t0; + t0(1, -1) = 3.; + assert(t4(1, -1) == 3.); // clang-format on From 79c419f0f417b2eb0c4b14a3b7b1a2672166abd3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 14 Apr 2023 22:51:23 -0400 Subject: [PATCH 032/592] similar to https://github.com/ValeevGroup/tiledarray/pull/401/commits/5ca9b8625feb92827b89ee054d1c27b495f75cfc but for subt_to --- src/TiledArray/tensor/operators.h | 141 ++++++++++++++---------- src/TiledArray/tile_interface/add.h | 13 ++- src/TiledArray/tile_op/tile_interface.h | 4 +- 3 files changed, 89 insertions(+), 69 deletions(-) diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index f243e46d7e..4be515d3b3 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -58,12 +58,16 @@ inline decltype(auto) operator+(T1&& left, T2&& right) { /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] - right[i] -template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator-(const T1& left, const T2& right) { - return subt(left, right); +template < + typename T1, typename T2, + typename 
std::enable_if< + detail::is_tensor, + detail::remove_cvr_t>::value || + detail::is_tensor_of_tensor, + detail::remove_cvr_t>::value>::type* = + nullptr> +inline decltype(auto) operator-(T1&& left, T2&& right) { + return subt(std::forward(left), std::forward(right)); } /// Tensor multiplication operator @@ -74,12 +78,16 @@ inline auto operator-(const T1& left, const T2& right) { /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] * right[i] -template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator*(const T1& left, const T2& right) { - return mult(left, right); +template < + typename T1, typename T2, + typename std::enable_if< + detail::is_tensor, + detail::remove_cvr_t>::value || + detail::is_tensor_of_tensor, + detail::remove_cvr_t>::value>::type* = + nullptr> +inline decltype(auto) operator*(T1&& left, T2&& right) { + return mult(std::forward(left), std::forward(right)); } /// Create a copy of \c left that is scaled by \c right @@ -91,11 +99,12 @@ inline auto operator*(const T1& left, const T2& right) { /// \param right The right-hand scalar argument /// \return A tensor where element \c i is equal to left[i] * right template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator*(const T& left, N right) { - return scale(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator*(T&& left, N right) { + return scale(std::forward(left), right); } /// Create a copy of \c right that is scaled by \c left @@ -105,13 +114,15 @@ inline auto operator*(const T& left, N right) { /// \param left The left-hand scalar argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left * right[i] 
-template && - (detail::is_tensor::value || - detail::is_tensor_of_tensor::value)>::type* = nullptr> -inline auto operator*(N left, const T& right) { - return scale(right, left); +template < + typename N, typename T, + typename std::enable_if< + detail::is_numeric_v && + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value)>::type* = + nullptr> +inline decltype(auto) operator*(N left, T&& right) { + return scale(std::forward(right), left); } /// Create a negated copy of \c arg @@ -119,11 +130,12 @@ inline auto operator*(N left, const T& right) { /// \tparam T The element type of \c arg /// \param arg The argument tensor /// \return A tensor where element \c i is equal to \c -arg[i] -template ::value || - detail::is_tensor_of_tensor< - T>::value>::type* = nullptr> -inline auto operator-(const T& arg) -> decltype(arg.neg()) { - return neg(arg); +template >::value || + detail::is_tensor_of_tensor< + detail::remove_cvr_t>::value>::type* = nullptr> +inline decltype(auto) operator-(T&& arg) { + return neg(std::forward(arg)); } /// Create a permuted copy of \c arg @@ -131,11 +143,12 @@ inline auto operator-(const T& arg) -> decltype(arg.neg()) { /// \tparam T The argument tensor type /// \param perm The permutation to be applied to \c arg /// \param arg The argument tensor to be permuted -template ::value || - detail::is_tensor_of_tensor< - T>::value>::type* = nullptr> -inline auto operator*(const Permutation& perm, const T& arg) { - return permute(arg, perm); +template >::value || + detail::is_tensor_of_tensor< + detail::remove_cvr_t>::value>::type* = nullptr> +inline decltype(auto) operator*(const Permutation& perm, T&& arg) { + return permute(std::forward(arg), perm); } /// Tensor plus operator @@ -148,10 +161,11 @@ inline auto operator*(const Permutation& perm, const T& arg) { /// \return A tensor where element \c i is equal to left[i] + right[i] template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator+=(T1& 
left, const T2& right) { - return add_to(left, right); + detail::is_tensor, T2>::value || + detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator+=(T1&& left, const T2& right) { + return add_to(std::forward(left), right); } /// Tensor minus operator @@ -164,10 +178,11 @@ inline auto operator+=(T1& left, const T2& right) { /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator-=(T1& left, const T2& right) { - return sub_to(left, right); + detail::is_tensor, T2>::value || + detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator-=(T1&& left, const T2& right) { + return subt_to(std::forward(left), right); } /// In place tensor multiplication @@ -180,10 +195,11 @@ inline auto operator-=(T1& left, const T2& right) { /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator*=(T1& left, const T2& right) { - return mult_to(left, right); + detail::is_tensor, T2>::value || + detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator*=(T1&& left, const T2& right) { + return mult_to(std::forward(left), right); } /// In place tensor add constant @@ -195,11 +211,12 @@ inline auto operator*=(T1& left, const T2& right) { /// \param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator+=(T& left, N right) { - return add_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator+=(T&& left, N right) { + return add_to(std::forward(left), right); } /// In place tensor subtract constant @@ -211,11 +228,12 @@ inline auto operator+=(T& left, N right) { /// 
\param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator-=(T& left, N right) { - return subt_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator-=(T&& left, N right) { + return subt_to(std::forward(left), right); } /// In place tensor scale @@ -227,11 +245,12 @@ inline auto operator-=(T& left, N right) { /// \param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator*=(T& left, N right) { - return scale_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator*=(T&& left, N right) { + return scale_to(std::forward(left), right); } } // namespace TiledArray diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h index b0bb7cf968..879d2ed9d2 100644 --- a/src/TiledArray/tile_interface/add.h +++ b/src/TiledArray/tile_interface/add.h @@ -168,9 +168,9 @@ inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor, template < typename Result, typename Arg, typename = std::enable_if_t< - detail::has_member_function_add_to_anyreturn_v>> -inline Result& add_to(Result& result, const Arg& arg) { - return result.add_to(arg); + detail::has_member_function_add_to_anyreturn_v>> +inline decltype(auto) add_to(Result&& result, const Arg& arg) { + return std::forward(result).add_to(arg); } /// Add and scale to the result tile @@ -186,9 +186,10 @@ template && detail::has_member_function_add_to_anyreturn_v< - Result&, const Arg&, const Scalar>>::type* = nullptr> -inline Result& 
add_to(Result& result, const Arg& arg, const Scalar factor) { - return result.add_to(arg, factor); + Result&&, const Arg&, const Scalar>>::type* = nullptr> +inline decltype(auto) add_to(Result&& result, const Arg& arg, + const Scalar factor) { + return std::forward(result).add_to(arg, factor); } namespace tile_interface { diff --git a/src/TiledArray/tile_op/tile_interface.h b/src/TiledArray/tile_op/tile_interface.h index 65d970ebeb..ee8c1093a2 100644 --- a/src/TiledArray/tile_op/tile_interface.h +++ b/src/TiledArray/tile_op/tile_interface.h @@ -372,8 +372,8 @@ inline auto subt(const Arg& arg, const Scalar value, const Perm& perm) { /// \param arg The argument to be subtracted from the result /// \return A tile that is equal to result[i] -= arg[i] template -inline Result& subt_to(Result& result, const Arg& arg) { - return result.subt_to(arg); +inline decltype(auto) subt_to(Result&& result, const Arg& arg) { + return std::forward(result).subt_to(arg); } /// Subtract and scale from the result tile From d9c6386f5978eb4247acd9e22df56298068ed11f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 15 Apr 2023 01:23:01 -0400 Subject: [PATCH 033/592] introduce is_congruent for mixed BTAS/TA range comparisons --- src/TiledArray/external/btas.h | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index 7dbd115d4d..11971c269e 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -109,6 +109,34 @@ inline bool is_congruent(const btas::RangeNd& r1, r2.extent_data()); } +/// Test if a BTAS range and a TA range are congruent + +/// This function tests that the rank and extent of +/// \c r1 are equal to those of \c r2. 
+/// \param r1 The first Range to compare +/// \param r2 The second Range to compare +template +inline bool is_congruent(const btas::RangeNd& r1, + const TiledArray::Range& r2) { + return (r1.rank() == r2.rank()) && + std::equal(r1.extent_data(), r1.extent_data() + r1.rank(), + r2.extent_data()); +} + +/// Test if a TA range and a BTAS range are congruent + +/// This function tests that the rank and extent of +/// \c r1 are equal to those of \c r2. +/// \param r1 The first Range to compare +/// \param r2 The second Range to compare +template +inline bool is_congruent(const TiledArray::Range& r1, + const btas::RangeNd& r2) { + return (r1.rank() == r2.rank()) && + std::equal(r1.extent_data(), r1.extent_data() + r1.rank(), + r2.extent_data()); +} + template decltype(auto) make_ti(const btas::Tensor& arg) { return TiledArray::detail::TensorInterface Date: Sun, 16 Apr 2023 12:29:17 -0400 Subject: [PATCH 034/592] fixup host_allocator to handle zero size allocations (by allocating 1 byte in same pool as the rest of allocations, rather than in special pool Umpire uses) and handle rebinding correctly --- src/TiledArray/external/umpire.h | 21 ++++++++++++--------- src/TiledArray/host/allocator.h | 4 +++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index 644039abe7..ac42f3bf1c 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -91,14 +91,16 @@ class umpire_allocator_impl { : umpalloc_(umpalloc) {} template - umpire_allocator_impl(const umpire_allocator_impl& rhs) noexcept + umpire_allocator_impl( + const umpire_allocator_impl& rhs) noexcept : umpalloc_(rhs.umpalloc_) {} /// allocates memory using umpire dynamic pool pointer allocate(size_t n) { TA_ASSERT(umpalloc_); - size_t nbytes = n * sizeof(T); + // QuickPool::allocate_internal does not handle zero-size allocations + size_t nbytes = n == 0 ? 
1 : n * sizeof(T); pointer result = nullptr; auto* allocation_strategy = umpalloc_->getAllocationStrategy(); @@ -117,7 +119,8 @@ class umpire_allocator_impl { void deallocate(pointer ptr, size_t n) { TA_ASSERT(umpalloc_); - const auto nbytes = n * sizeof(T); + // QuickPool::allocate_internal does not handle zero-size allocations + const auto nbytes = n == 0 ? 1 : n * sizeof(T); auto* allocation_strategy = umpalloc_->getAllocationStrategy(); // N.B. with multiple threads would have to do this test in @@ -137,15 +140,15 @@ class umpire_allocator_impl { umpire::Allocator* umpalloc_; }; // class umpire_allocator -template -bool operator==(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +template +bool operator==(const umpire_allocator_impl& lhs, + const umpire_allocator_impl& rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } -template -bool operator!=(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +template +bool operator!=(const umpire_allocator_impl& lhs, + const umpire_allocator_impl& rhs) noexcept { return !(lhs == rhs); } diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h index 9e221c42d7..dbb8f53b55 100644 --- a/src/TiledArray/host/allocator.h +++ b/src/TiledArray/host/allocator.h @@ -53,7 +53,9 @@ class host_allocator_impl template host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast&>(rhs)) {} + : base_type(static_cast< + const umpire_allocator_impl>&>( + rhs)) {} template friend bool operator==(const host_allocator_impl& lhs, From 074cbeaa75f567e25c7d6bc221c3d8f93e231105 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 17 Apr 2023 08:58:03 -0400 Subject: [PATCH 035/592] Tensor::clone: handle corner case of data_==null && range_!=null --- src/TiledArray/tensor/tensor.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 
138b92266c..d6ce1b62be 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -552,6 +552,10 @@ class Tensor { result = detail::tensor_op( [](const numeric_type value) -> numeric_type { return value; }, *this); + } else if (range_) { // corner case: data_ = null implies range_.volume() + // == 0; + TA_ASSERT(range_.volume() == 0); + result = Tensor(range_); } return result; } From c76f0c0fe80ae4bb5a2e02255cf6f5003fb3021c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Apr 2023 16:09:04 -0400 Subject: [PATCH 036/592] dox cleanup --- src/TiledArray/special/diagonal_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h index 825d66fd98..dd62db1498 100644 --- a/src/TiledArray/special/diagonal_array.h +++ b/src/TiledArray/special/diagonal_array.h @@ -267,7 +267,7 @@ Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { /// \param[in] diagonals_begin the begin iterator of the range of the diagonals /// \param[in] diagonals_end the end iterator of the range of the diagonals; /// if not given, default initialized and thus will not be checked -/// \return a constant diagonal DistArray +/// \return a diagonal DistArray template std::enable_if_t::value, Array> diagonal_array(World &world, TiledRange const &trange, From f1c91458213c32002415c45acf58af7f085ae2d5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Apr 2023 16:11:19 -0400 Subject: [PATCH 037/592] TA_LAPACK_ERROR throws lapack::Error, the exception class used by lapackpp --- src/TiledArray/math/linalg/rank-local.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index 74e1aac526..d23f3b4e3f 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -40,7 +40,7 @@ inline int ta_lapack_fortran_call(F 
f, Args... args) { return info; } -#define TA_LAPACK_ERROR(F) throw std::runtime_error("lapack::" #F " failed") +#define TA_LAPACK_ERROR(F) throw lapack::Error("lapack::" #F " failed") #define TA_LAPACK_FORTRAN_CALL(F, ARGS...) \ ((ta_lapack_fortran_call(F, ARGS) == 0) || (TA_LAPACK_ERROR(F), 0)) From 764dadbb5247efa64bdabdf604f4f8f3c056acee Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Apr 2023 16:12:43 -0400 Subject: [PATCH 038/592] make lapack::Error serializable --- src/TiledArray/math/linalg/rank-local.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index 5c46550bd3..3326dbc5d5 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -71,4 +71,26 @@ void householder_qr(Matrix &V, Matrix &R); } // namespace TiledArray::math::linalg::rank_local +namespace madness::archive { + +/// Serialize (deserialize) an lapack::Error + +/// \tparam Archive The archive type. 
+template +struct ArchiveSerializeImpl { + static inline void serialize(const Archive &ar, lapack::Error &e) { + MAD_ARCHIVE_DEBUG(std::cout << "(de)serialize lapack::Error" << std::endl); + if constexpr (is_output_archive_v) { // serialize + const std::string msg = e.what(); + ar &msg; + } else { + std::string msg; + ar &msg; + e = lapack::Error(msg); + } + } +}; + +} // namespace madness::archive + #endif // TILEDARRAY_MATH_LINALG_RANK_LOCAL_H__INCLUDED From b2254f352260d80e15c5f4a94b7bfcad71f118fb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Apr 2023 16:51:27 -0400 Subject: [PATCH 039/592] errors received from rank-local lapack calls are broadcast/rethrown on every rank bumps MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/469 --- INSTALL.md | 2 +- external/versions.cmake | 4 +-- .../math/linalg/non-distributed/cholesky.h | 29 ++++++++++--------- .../math/linalg/non-distributed/heig.h | 8 ++--- .../math/linalg/non-distributed/lu.h | 17 +++++------ .../math/linalg/non-distributed/qr.h | 27 ++++++++--------- .../math/linalg/non-distributed/svd.h | 4 +-- src/TiledArray/math/linalg/rank-local.h | 16 ++++++++++ 8 files changed, 57 insertions(+), 50 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 6060c4bd29..ea5a1d87ae 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 91fff76deba20c751d0646c54f2f1c1e07bd6156 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag dc3294160209cbd683bfb57cb2b933bd5f86e07e . 
Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index e9cfb45375..9d834e47a1 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 91fff76deba20c751d0646c54f2f1c1e07bd6156) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) +set(TA_TRACKED_MADNESS_TAG dc3294160209cbd683bfb57cb2b933bd5f86e07e) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 91fff76deba20c751d0646c54f2f1c1e07bd6156) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/math/linalg/non-distributed/cholesky.h b/src/TiledArray/math/linalg/non-distributed/cholesky.h index 4196002533..fc96a6bf1c 100644 --- a/src/TiledArray/math/linalg/non-distributed/cholesky.h +++ b/src/TiledArray/math/linalg/non-distributed/cholesky.h @@ -42,9 +42,7 @@ auto rank_local_cholesky(const DistArray& A) { World& world = A.world(); auto A_eig = detail::make_matrix(A); - if (world.rank() == 0) { - linalg::rank_local::cholesky(A_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky, world, A_eig); world.gop.broadcast_serializable(A_eig, 0); return A_eig; } @@ -140,11 +138,20 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = TiledRange()) { // if need to return L use its copy to compute inverse decltype(L_eig) L_inv_eig; + std::optional error_opt; if (world.rank() == 0) { - if (Both) L_inv_eig = L_eig; - auto& L_inv_eig_ref = Both ? 
L_inv_eig : L_eig; - linalg::rank_local::cholesky_linv(L_inv_eig_ref); - detail::zero_out_upper_triangle(L_inv_eig_ref); + try { + if (Both) L_inv_eig = L_eig; + auto& L_inv_eig_ref = Both ? L_inv_eig : L_eig; + linalg::rank_local::cholesky_linv(L_inv_eig_ref); + detail::zero_out_upper_triangle(L_inv_eig_ref); + } catch (lapack::Error& err) { + error_opt = err; + } + } + world.gop.broadcast_serializable(error_opt, 0); + if (error_opt) { + throw error_opt.value(); } world.gop.broadcast_serializable(Both ? L_inv_eig : L_eig, 0); @@ -169,9 +176,7 @@ auto cholesky_solve(const Array& A, const Array& B, auto A_eig = detail::make_matrix(A); auto X_eig = detail::make_matrix(B); World& world = A.world(); - if (world.rank() == 0) { - linalg::rank_local::cholesky_solve(A_eig, X_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky_solve, world, A_eig, X_eig); world.gop.broadcast_serializable(X_eig, 0); if (x_trange.rank() == 0) x_trange = B.trange(); return eigen_to_array(world, x_trange, X_eig); @@ -192,9 +197,7 @@ auto cholesky_lsolve(Op transpose, const Array& A, const Array& B, "scalar types"); auto X_eig = detail::make_matrix(B); - if (world.rank() == 0) { - linalg::rank_local::cholesky_lsolve(transpose, L_eig, X_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky_lsolve, world, transpose, L_eig, X_eig); world.gop.broadcast_serializable(X_eig, 0); if (l_trange.rank() == 0) l_trange = A.trange(); if (x_trange.rank() == 0) x_trange = B.trange(); diff --git a/src/TiledArray/math/linalg/non-distributed/heig.h b/src/TiledArray/math/linalg/non-distributed/heig.h index 5490b6b757..85079f356c 100644 --- a/src/TiledArray/math/linalg/non-distributed/heig.h +++ b/src/TiledArray/math/linalg/non-distributed/heig.h @@ -56,9 +56,7 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange()) { World& world = A.world(); auto A_eig = detail::make_matrix(A); std::vector evals; - if (world.rank() == 0) { - linalg::rank_local::heig(A_eig, evals); - } + TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, evals); 
world.gop.broadcast_serializable(A_eig, 0); world.gop.broadcast_serializable(evals, 0); if (evec_trange.rank() == 0) evec_trange = A.trange(); @@ -99,9 +97,7 @@ auto heig(const ArrayA& A, const ArrayB& B, auto A_eig = detail::make_matrix(A); auto B_eig = detail::make_matrix(B); std::vector evals; - if (world.rank() == 0) { - linalg::rank_local::heig(A_eig, B_eig, evals); - } + TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, B_eig, evals); world.gop.broadcast_serializable(A_eig, 0); world.gop.broadcast_serializable(evals, 0); if (evec_trange.rank() == 0) evec_trange = A.trange(); diff --git a/src/TiledArray/math/linalg/non-distributed/lu.h b/src/TiledArray/math/linalg/non-distributed/lu.h index d1b06bbb1c..6a3e1ea424 100644 --- a/src/TiledArray/math/linalg/non-distributed/lu.h +++ b/src/TiledArray/math/linalg/non-distributed/lu.h @@ -27,9 +27,9 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { @@ -37,15 +37,14 @@ namespace TiledArray::math::linalg::non_distributed { * @brief Solve a linear system via LU factorization */ template -auto lu_solve(const ArrayA& A, const ArrayB& B, TiledRange x_trange = TiledRange()) { +auto lu_solve(const ArrayA& A, const ArrayB& B, + TiledRange x_trange = TiledRange()) { (void)detail::array_traits{}; (void)detail::array_traits{}; auto& world = A.world(); auto A_eig = detail::make_matrix(A); auto B_eig = detail::make_matrix(B); - if (world.rank() == 0) { - linalg::rank_local::lu_solve(A_eig, B_eig); - } + TA_LAPACK_ON_RANK_ZERO(lu_solve, world, A_eig, B_eig); world.gop.broadcast_serializable(B_eig, 0); if (x_trange.rank() == 0) x_trange = B.trange(); return eigen_to_array(world, x_trange, B_eig); @@ -59,14 +58,12 @@ auto lu_inv(const Array& A, TiledRange ainv_trange = TiledRange()) { (void)detail::array_traits{}; auto& world = A.world(); auto A_eig = detail::make_matrix(A); - if (world.rank() == 0) { - linalg::rank_local::lu_inv(A_eig); - } + TA_LAPACK_ON_RANK_ZERO(lu_inv, 
world, A_eig); world.gop.broadcast_serializable(A_eig, 0); if (ainv_trange.rank() == 0) ainv_trange = A.trange(); return eigen_to_array(A.world(), ainv_trange, A_eig); } -} // namespace TiledArray::math::linalg::lapack +} // namespace TiledArray::math::linalg::non_distributed #endif // TILEDARRAY_MATH_LINALG_NON_DISTRIBUTED_LU_H__INCLUDED diff --git a/src/TiledArray/math/linalg/non-distributed/qr.h b/src/TiledArray/math/linalg/non-distributed/qr.h index e43cec632d..b66ee222ea 100644 --- a/src/TiledArray/math/linalg/non-distributed/qr.h +++ b/src/TiledArray/math/linalg/non-distributed/qr.h @@ -3,35 +3,32 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { template -auto householder_qr( const ArrayV& V, TiledRange q_trange = TiledRange(), - TiledRange r_trange = TiledRange() ) { - +auto householder_qr(const ArrayV& V, TiledRange q_trange = TiledRange(), + TiledRange r_trange = TiledRange()) { (void)detail::array_traits{}; auto& world = V.world(); auto V_eig = detail::make_matrix(V); decltype(V_eig) R_eig; - if( !world.rank() ) { - linalg::rank_local::householder_qr( V_eig, R_eig ); - } - world.gop.broadcast_serializable( V_eig, 0 ); - if(q_trange.rank() == 0) q_trange = V.trange(); - auto Q = eigen_to_array( world, q_trange, V_eig ); + TA_LAPACK_ON_RANK_ZERO(householder_qr, world, V_eig, R_eig); + world.gop.broadcast_serializable(V_eig, 0); + if (q_trange.rank() == 0) q_trange = V.trange(); + auto Q = eigen_to_array(world, q_trange, V_eig); if constexpr (not QOnly) { - world.gop.broadcast_serializable( R_eig, 0 ); + world.gop.broadcast_serializable(R_eig, 0); if (r_trange.rank() == 0) { // Generate a TRange based on column tiling of V auto col_tiling = V.trange().dim(1); - r_trange = TiledRange( {col_tiling, col_tiling} ); + r_trange = TiledRange({col_tiling, col_tiling}); } - auto R = eigen_to_array( world, r_trange, R_eig ); - return std::make_tuple( Q, R ); + auto R = eigen_to_array(world, r_trange, 
R_eig); + return std::make_tuple(Q, R); } else { return Q; } diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h index e6ea5ef1da..e0094ef906 100644 --- a/src/TiledArray/math/linalg/non-distributed/svd.h +++ b/src/TiledArray/math/linalg/non-distributed/svd.h @@ -75,9 +75,7 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), if constexpr (need_u) U = std::make_unique(); if constexpr (need_vt) VT = std::make_unique(); - if (world.rank() == 0) { - linalg::rank_local::svd(A_eig, S, U.get(), VT.get()); - } + TA_LAPACK_ON_RANK_ZERO(svd, world, A_eig, S, U.get(), VT.get()); world.gop.broadcast_serializable(S, 0); if (U) world.gop.broadcast_serializable(*U, 0); diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index 3326dbc5d5..f7db4abd01 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -93,4 +93,20 @@ struct ArchiveSerializeImpl { } // namespace madness::archive +/// TA_LAPACK_ON_RANK_ZERO(fn,args...) invokes linalg::rank_local::fn(args...) +/// on rank 0 and broadcasts/rethrows the exception, if any +#define TA_LAPACK_ON_RANK_ZERO(fn, world, args...) 
\ + std::optional error_opt; \ + if (world.rank() == 0) { \ + try { \ + linalg::rank_local::fn(args); \ + } catch (lapack::Error & err) { \ + error_opt = err; \ + } \ + } \ + world.gop.broadcast_serializable(error_opt, 0); \ + if (error_opt) { \ + throw error_opt.value(); \ + } + #endif // TILEDARRAY_MATH_LINALG_RANK_LOCAL_H__INCLUDED From 9911c8179e2ca2450c6e8004575a0a7c077fe21a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 8 May 2023 09:59:57 -0400 Subject: [PATCH 040/592] no more WORLD_INSTANTIATE_STATIC_TEMPLATES bumps MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/470 which removes the need for WORLD_INSTANTIATE_STATIC_TEMPLATES --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/external/madness.h | 5 ----- 3 files changed, 3 insertions(+), 8 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index ea5a1d87ae..6517dc86c6 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag dc3294160209cbd683bfb57cb2b933bd5f86e07e . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 58b3e2c623d772f6e4a2e9cf5758073de32ecc50 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 9d834e47a1..313b8d5d11 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG dc3294160209cbd683bfb57cb2b933bd5f86e07e) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 91fff76deba20c751d0646c54f2f1c1e07bd6156) +set(TA_TRACKED_MADNESS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG dc3294160209cbd683bfb57cb2b933bd5f86e07e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/external/madness.h b/src/TiledArray/external/madness.h index ecfa313d9b..8a8efd6b2e 100644 --- a/src/TiledArray/external/madness.h +++ b/src/TiledArray/external/madness.h @@ -20,11 +20,6 @@ #ifndef TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED #define TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED -// This needs to be defined before world/worldreduce.h and world/worlddc.h -#ifndef WORLD_INSTANTIATE_STATIC_TEMPLATES -#define WORLD_INSTANTIATE_STATIC_TEMPLATES -#endif // WORLD_INSTANTIATE_STATIC_TEMPLATES - #include #include From 0ab453e32f40a324b770637a5dc0f995caca30dc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 11 May 2023 12:57:59 -0400 Subject: [PATCH 041/592] bumps MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/471 which allows use of std::pair as key in ConcurrentHashMap --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 6517dc86c6..5a190a93aa 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 58b3e2c623d772f6e4a2e9cf5758073de32ecc50 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index 313b8d5d11..ea45a87437 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG dc3294160209cbd683bfb57cb2b933bd5f86e07e) +set(TA_TRACKED_MADNESS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 4bde729663e9309805af6b08aba84369e54960c7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 19 May 2023 10:19:25 -0400 Subject: [PATCH 042/592] created a placeholded test --- tests/einsum.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 0fcb71f072..f62be4e016 
100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -714,7 +714,55 @@ BOOST_AUTO_TEST_CASE(xxx) { BOOST_CHECK(are_equal); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_tot + +BOOST_AUTO_TEST_SUITE(einsum_tot_t) + +BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // dist_array_t out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,k;m,n"); + // const bool are_equal = ToTArrayFixture::are_equal(corr, out); + // 
BOOST_CHECK(are_equal); +} + +BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices BOOST_AUTO_TEST_SUITE(einsum_index, TA_UT_LABEL_SERIAL) @@ -740,7 +788,7 @@ BOOST_AUTO_TEST_CASE(einsum_index) { BOOST_CHECK((v.range() == Range{src})); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_index #include "TiledArray/einsum/eigen.h" @@ -919,7 +967,7 @@ BOOST_AUTO_TEST_CASE(einsum_eigen_hji_jih_hj) { BOOST_CHECK(isApprox(reference, result)); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_eigen // TiledArray einsum expressions BOOST_AUTO_TEST_SUITE(einsum_tiledarray) @@ -1098,4 +1146,4 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_dot) { // BOOST_CHECK(hik_hkj_hji == hkj_hji_hik); // } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_tiledarray From 6f63289c391290bdae8c450daa7d082656aabc12 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 21 May 2023 16:44:40 -0400 Subject: [PATCH 043/592] [unit] re-enable type_traits --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32a8e9ee6c..49b5c61f95 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ set(executable ta_test) set(ta_test_src_files ta_test.cpp range1.cpp range.cpp + type_traits.cpp tensor.cpp tensor_of_tensor.cpp tensor_tensor_view.cpp From 47223893caf8602436bf044e2565344e2d4625e2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 21 May 2023 17:31:08 -0400 Subject: [PATCH 044/592] introduced is_nested_tensor and tensors_have_equal_nested_rank also cleaned up implementation of is_tensor_of_tensor --- src/TiledArray/expressions/expr_engine.h | 2 +- src/TiledArray/tensor/type_traits.h | 97 +++++++++++++++++++++--- tests/type_traits.cpp | 28 +++++++ 3 files changed, 114 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/expressions/expr_engine.h b/src/TiledArray/expressions/expr_engine.h index bd4dbd9ccd..c364a5c1ba 100644 --- 
a/src/TiledArray/expressions/expr_engine.h +++ b/src/TiledArray/expressions/expr_engine.h @@ -73,7 +73,7 @@ class ExprEngine : private NO_DEFAULTS { World* world_; ///< The world where this expression will be evaluated BipartiteIndexList indices_; ///< The index list of this expression; bipartite due to need - ///< to support recursive tensors (i.e. Tensor-of-Tensor) + ///< to support nested tensors (e.g. tensors of tensors) bool permute_tiles_; ///< Result tile permutation flag (\c true == permute ///< tile) /// The permutation that will be applied to the outer tensor of tensors diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 62448336a3..2903e5e7f7 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -28,9 +28,9 @@ #include +#include #include #include -#include namespace Eigen { @@ -60,10 +60,23 @@ class ShiftWrapper; // Note: These type traits help differentiate different implementation // functions for tensors, so a tensor of tensors is not considered a tensor. +/// is true type if all `Ts...` are tensors of scalars template struct is_tensor; +/// is true type if all `Ts...` are tensors of tensors of scalars template struct is_tensor_of_tensor; +/// is true type if all `Ts...` are _nested_ tensors; a nested tensor is a +/// tensors of scalars or tensors of nested tensors +template +struct is_nested_tensor; +/// is true type if `T1`, `T2`, and `Ts...` are tensors of same nested +/// rank, i.e. they are all tensors of scalars or tensors of tensors of scalars, +/// etc. 
; +/// \warning the types must be tensors, hence +/// `tensors_have_equal_nested_rank` is false +template +struct tensors_have_equal_nested_rank; template struct is_tensor_helper : public std::false_type {}; @@ -83,23 +96,41 @@ struct is_tensor_helper> : public is_tensor_helper {}; template struct is_tensor_helper> : public is_tensor_helper {}; +//////////////////////////////////////////////////////////////////////////////// + +template <> +struct is_nested_tensor<> : public std::false_type {}; + template -struct is_tensor_of_tensor_helper : public std::false_type {}; +struct is_nested_tensor : is_tensor_helper {}; -template -struct is_tensor_of_tensor_helper> : public is_tensor_helper {}; +template +struct is_nested_tensor { + static constexpr bool value = + is_tensor_helper::value && is_nested_tensor::value; +}; -template -struct is_tensor_of_tensor_helper> - : public is_tensor_helper {}; +/// @tparam Ts a parameter pack +/// @c is_nested_tensor_v is an alias for @c +/// is_nested_tensor::value +template +constexpr const bool is_nested_tensor_v = is_nested_tensor::value; -template -struct is_tensor_of_tensor_helper> - : public is_tensor_of_tensor_helper {}; +//////////////////////////////////////////////////////////////////////////////// + +template +struct is_tensor_of_tensor_helper : public std::false_type {}; template -struct is_tensor_of_tensor_helper> - : public is_tensor_of_tensor_helper {}; +struct is_tensor_of_tensor_helper< + T, std::enable_if_t::value>> { + static constexpr bool value = + is_tensor_helper>::value && + !is_tensor_of_tensor_helper< + detail::remove_cvr_t>::value; +}; + +//////////////////////////////////////////////////////////////////////////////// template <> struct is_tensor<> : public std::false_type {}; @@ -121,6 +152,8 @@ struct is_tensor { template constexpr const bool is_tensor_v = is_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + template <> struct is_tensor_of_tensor<> : public 
std::false_type {}; @@ -141,6 +174,42 @@ struct is_tensor_of_tensor { template constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + +template +struct tensors_have_equal_nested_rank_helper : std::false_type {}; + +template +struct tensors_have_equal_nested_rank_helper< + T1, T2, std::enable_if_t>> { + static constexpr bool value = + tensors_have_equal_nested_rank_helper< + detail::remove_cvr_t, + detail::remove_cvr_t>::value || + (detail::is_numeric_v> && + detail::is_numeric_v>); +}; + +template +struct tensors_have_equal_nested_rank + : tensors_have_equal_nested_rank_helper {}; + +template +struct tensors_have_equal_nested_rank { + static constexpr bool value = + tensors_have_equal_nested_rank::value && + tensors_have_equal_nested_rank::value; +}; + +/// @tparam Ts a parameter pack +/// @c tensors_have_equal_nested_rank_v is an alias for @c +/// tensors_have_equal_nested_rank::value +template +constexpr const bool tensors_have_equal_nested_rank_v = + tensors_have_equal_nested_rank::value; + +//////////////////////////////////////////////////////////////////////////////// + template struct is_ta_tensor : public std::false_type {}; @@ -150,6 +219,8 @@ struct is_ta_tensor> : public std::true_type {}; template constexpr const bool is_ta_tensor_v = is_ta_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is contiguous template @@ -198,6 +269,8 @@ template constexpr const bool is_contiguous_tensor_v = is_contiguous_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is shifted template diff --git a/tests/type_traits.cpp b/tests/type_traits.cpp index 105ae6ff72..77940bcb6f 100644 --- a/tests/type_traits.cpp +++ b/tests/type_traits.cpp @@ -275,4 +275,32 @@ BOOST_AUTO_TEST_CASE(convertibility) { } } +BOOST_AUTO_TEST_CASE(tensor) { + 
using TI = TiledArray::Tensor; + using TTI = TiledArray::Tensor>; + using TTTI = TiledArray::Tensor>>; + using TD = TiledArray::Tensor; + using TTD = TiledArray::Tensor>; + using TTTD = + TiledArray::Tensor>>; + + using namespace TiledArray::detail; + BOOST_CHECK((is_tensor_v)); + BOOST_CHECK(!(is_tensor_v)); + BOOST_CHECK((is_tensor_of_tensor_v)); + BOOST_CHECK(!(is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((is_nested_tensor_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); +} + BOOST_AUTO_TEST_SUITE_END() From 1f2c08c090e93e033af92bf633ed1eb60d80d0aa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 May 2023 23:44:19 -0600 Subject: [PATCH 045/592] Tensor::mult(other,...) 
can deal with other of different nested rank than this --- src/TiledArray/expressions/mult_engine.h | 14 +- src/TiledArray/tensor/kernels.h | 33 +++-- src/TiledArray/tensor/operators.h | 33 ++--- src/TiledArray/tensor/permute.h | 18 ++- src/TiledArray/tensor/tensor.h | 173 ++++++++++++++--------- tests/einsum.cpp | 16 ++- tests/sparse_tile.h | 46 +++++- tests/tensor_of_tensor.cpp | 62 +++++++- 8 files changed, 267 insertions(+), 128 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 19788505fd..a53133d4b0 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -189,12 +189,14 @@ struct EngineTrait> { /// Multiplication expression engine /// This implements any expression encoded with the multiplication operator. -/// This includes Hadamard product, e.g. \code (c("i,j")=)a("i,j")*b("i,j") -/// \endcode , and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") -/// \endcode . \internal mixed Hadamard-contraction case, e.g. \code -/// c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , is not supported since -/// this requires that the result labels are assigned by user (currently they -/// are computed by this engine) +/// This includes Hadamard product, e.g. +/// \code (c("i,j")=)a("i,j")*b("i,j") \endcode , +/// and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") \endcode . +/// \internal mixed Hadamard-contraction case, e.g. 
+/// \code c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , +/// is not supported since +/// this requires that the result labels are assigned by user (currently they +/// are computed by this engine) /// \tparam Left The left-hand engine type /// \tparam Right The right-hand engine type /// \tparam Result The result tile type diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 2cd2d46fe3..87db8c1cc6 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -61,13 +61,14 @@ struct transform; /// \param tensor1 The first argument tensor /// \param tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value>::type* = nullptr> + typename = std::enable_if_t< + detail::is_nested_tensor_v || + std::is_invocable_r_v>> inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... tensors) { if constexpr (std::is_invocable_r_v) { return std::forward(op)(tensor1, tensors...); } else { + static_assert(detail::is_nested_tensor_v); return TiledArray::detail::transform()(std::forward(op), tensor1, tensors...); } @@ -93,8 +94,7 @@ inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... tensors) { /// \param[in] tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value) && + is_nested_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1, const Ts&... tensors) { @@ -219,7 +219,7 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template ::value && + !is_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); @@ -228,7 +228,11 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... 
tensors) { const auto volume = result.range().volume(); for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); + if constexpr (std::is_invocable_r_v) + op(result.at_ordinal(ord), tensors.at_ordinal(ord)...); + else + inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); } } @@ -457,7 +461,7 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { tensors.data()...); } -/// Initialize tensor of tensors with contiguous tensor arguments +/// Initialize nested tensor with contiguous tensor arguments /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensors[i]...) @@ -470,7 +474,8 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename... Ts, - typename std::enable_if::value && + typename std::enable_if<(is_nested_tensor::value && + !is_tensor::value) && is_contiguous_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); @@ -478,9 +483,13 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... 
tensors) { const auto volume = result.range().volume(); - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - new (result.data() + ord) typename TR::value_type( - tensor_op(op, tensors.at_ordinal(ord)...)); + if constexpr (std::is_invocable_r_v) { + result = std::forward(op)(tensors...); + } else { + for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { + new (result.data() + ord) typename TR::value_type( + tensor_op(op, tensors.at_ordinal(ord)...)); + } } } diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index 4be515d3b3..b8ed77671d 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -41,11 +41,8 @@ namespace TiledArray { /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] + right[i] template , - detail::remove_cvr_t>::value || - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>> + typename = std::enable_if_t, detail::remove_cvr_t>>> inline decltype(auto) operator+(T1&& left, T2&& right) { return add(std::forward(left), std::forward(right)); } @@ -58,14 +55,9 @@ inline decltype(auto) operator+(T1&& left, T2&& right) { /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] - right[i] -template < - typename T1, typename T2, - typename std::enable_if< - detail::is_tensor, - detail::remove_cvr_t>::value || - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>::type* = - nullptr> +template , detail::remove_cvr_t>>> inline decltype(auto) operator-(T1&& left, T2&& right) { return subt(std::forward(left), std::forward(right)); } @@ -80,12 +72,8 @@ inline decltype(auto) operator-(T1&& left, T2&& right) { /// \return A tensor where element \c i is equal to left[i] * right[i] template < typename T1, typename T2, - typename std::enable_if< - detail::is_tensor, - detail::remove_cvr_t>::value 
|| - detail::is_tensor_of_tensor, - detail::remove_cvr_t>::value>::type* = - nullptr> + typename std::enable_if, detail::remove_cvr_t>>::type* = nullptr> inline decltype(auto) operator*(T1&& left, T2&& right) { return mult(std::forward(left), std::forward(right)); } @@ -100,8 +88,7 @@ inline decltype(auto) operator*(T1&& left, T2&& right) { /// \return A tensor where element \c i is equal to left[i] * right template >::value || - detail::is_tensor_of_tensor>::value) && + detail::is_nested_tensor_v> && detail::is_numeric_v>::type* = nullptr> inline decltype(auto) operator*(T&& left, N right) { return scale(std::forward(left), right); @@ -118,9 +105,7 @@ template < typename N, typename T, typename std::enable_if< detail::is_numeric_v && - (detail::is_tensor>::value || - detail::is_tensor_of_tensor>::value)>::type* = - nullptr> + detail::is_nested_tensor_v>>::type* = nullptr> inline decltype(auto) operator*(N left, T&& right) { return scale(std::forward(right), left); } diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 1b888e3a3d..7fb103217f 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -97,10 +97,14 @@ inline void fuse_dimensions(SizeType* MADNESS_RESTRICT const fused_size, /// The expected signature of the input operations is: /// \code -/// Result::value_type input_op(const Arg0::value_type, const -/// Args::value_type...) \endcode The expected signature of the output -/// operations is: \code void output_op(Result::value_type*, const -/// Result::value_type) \endcode \tparam InputOp The input operation type +/// Result::value_type input_op(const Arg0::value_type, +/// const Args::value_type...) 
+/// \endcode +/// The expected signature of the output +/// operations is: +/// \code void output_op(Result::value_type*, const Result::value_type) +/// \endcode +/// \tparam InputOp The input operation type /// \tparam OutputOp The output operation type /// \tparam Result The result tensor type /// \tparam Arg0 The first tensor argument type @@ -152,7 +156,7 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, // Copy the block math::vector_ptr_op(op, block_size, result.data() + perm_index, - arg0.data() + index, (args.data() + index)...); + &arg0[index], &args[index]...); } } else { @@ -194,8 +198,8 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, math::transpose(input_op, output_op, other_fused_size[1], other_fused_size[3], result_outer_stride, - result.data() + perm_index, other_fused_weight[1], - arg0.data() + index, (args.data() + index)...); + &result[perm_index], other_fused_weight[1], + &arg0[index], &args[index]...); } } } diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index d6ce1b62be..73e3fc0caf 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -99,6 +99,8 @@ class Tensor { scalar_type; ///< the scalar type that supports T private: + template + using value_t = typename X::value_type; template using numeric_t = typename TiledArray::detail::numeric_type::type; @@ -350,6 +352,12 @@ class Tensor { /// \param other The tensor to be copied /// \note this constructor is disabled if \p T1 already has a conversion /// operator to this type + /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather + /// than copied to make the semantics of this to be consistent + /// between tensors of scalars and tensors of scalars; specifically, + /// if `T1` is a tensor of scalars the constructed tensor is + /// is independent of \p other, thus should apply clone to inner + /// tensor nests to behave similarly for nested tensors 
template < typename T1, typename std::enable_if< @@ -357,7 +365,13 @@ class Tensor { !detail::has_conversion_operator_v>::type* = nullptr> explicit Tensor(const T1& other) : Tensor(detail::clone_range(other), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; + auto op = [](const value_type& arg) -> decltype(auto) { + // clone nested tensors + if constexpr (detail::is_tensor_v) + return arg.clone(); + else + return arg; + }; detail::tensor_init(op, *this, other); } @@ -368,13 +382,25 @@ class Tensor { /// \tparam Perm A permutation type /// \param other The tensor to be copied /// \param perm The permutation that will be applied to the copy + /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather + /// than copied to make the semantics of this to be consistent + /// between tensors of scalars and tensors of scalars; specifically, + /// if `T1` is a tensor of scalars the constructed tensor is + /// is independent of \p other, thus should apply clone to inner + /// tensor nests to behave similarly for nested tensors template < typename T1, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; + auto op = [](const value_type& arg) -> decltype(auto) { + // clone nested tensors + if constexpr (detail::is_tensor_v) + return arg.clone(); + else + return arg; + }; detail::tensor_init(op, outer(perm), *this, other); @@ -448,7 +474,7 @@ class Tensor { /// \param right The right-hand tensor argument /// \param op The element-wise operation template ::value>::type* = nullptr> + typename = std::enable_if_t>> Tensor(const T1& left, const T2& right, Op&& op) : Tensor(detail::clone_range(left), 1, default_construct{false}) { detail::tensor_init(op, *this, left, right); @@ 
-1331,8 +1357,10 @@ class Tensor { /// \c op(*this[i],other[i]) template ::value>::type* = nullptr> - Tensor binary(const Right& right, Op&& op) const { - return Tensor(*this, right, op); + auto binary(const Right& right, Op&& op) const { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + return Tensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor @@ -1341,7 +1369,7 @@ class Tensor { /// \tparam Op The binary operation type /// \tparam Perm A permutation tile /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i],other[i]) @@ -1349,7 +1377,7 @@ class Tensor { typename Right, typename Op, typename Perm, typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> - Tensor binary(const Right& right, Op&& op, const Perm& perm) const { + auto binary(const Right& right, Op&& op, const Perm& perm) const { constexpr bool is_tot = detail::is_tensor_of_tensor_v; [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; @@ -1357,16 +1385,19 @@ class Tensor { // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does // not match Tensor"); if constexpr (!is_tot) { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + using ResultTensor = Tensor; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, right, op, outer(perm)); + return ResultTensor(*this, right, op, outer(perm)); } else - return Tensor(*this, right, op, perm); + return ResultTensor(*this, right, op, perm); } else { // AFAIK the other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. 
auto temp = binary(right, std::forward(op)); - Permute p; + Permute p; return p(temp, perm); } abort(); // unreachable @@ -1377,7 +1408,7 @@ class Tensor { /// \tparam Right The right-hand tensor type /// \tparam Op The binary operation type /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \return A reference to this object /// \throw TiledArray::Exception When this tensor is empty. /// \throw TiledArray::Exception When \c other is empty. @@ -1385,7 +1416,8 @@ class Tensor { /// to the range of \c other. /// \throw TiledArray::Exception When this and \c other are the same. template ::value>::type* = nullptr> + typename std::enable_if>::type* = + nullptr> Tensor& inplace_binary(const Right& right, Op&& op) { detail::inplace_tensor_op(op, *this, right); return *this; @@ -1394,7 +1426,7 @@ class Tensor { /// Use a unary, element wise operation to construct a new tensor /// \tparam Op The unary operation type - /// \param op The unary, element-wise operation + /// \param op The unary element-wise operation /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i]) /// \throw TiledArray::Exception When this tensor is empty. @@ -1407,7 +1439,7 @@ class Tensor { /// \tparam Op The unary operation type /// \tparam Perm A permutation tile - /// \param op The unary operation + /// \param op The unary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A permuted tensor with elements that have been modified by \c op /// \throw TiledArray::Exception When this tensor is empty. 
@@ -1459,7 +1491,7 @@ class Tensor { template >::type* = nullptr> Tensor scale(const Scalar factor) const { - return unary([factor](const numeric_type a) -> numeric_type { + return unary([factor](const value_type& a) -> decltype(auto) { using namespace TiledArray::detail; return a * factor; }); @@ -1494,7 +1526,7 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& scale_to(const Scalar factor) { return inplace_unary( - [factor](numeric_type& MADNESS_RESTRICT res) { res *= factor; }); + [factor](value_type& MADNESS_RESTRICT res) { res *= factor; }); } // Addition operations @@ -1510,7 +1542,7 @@ class Tensor { Tensor add(const Right& right) const& { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l + r; }); } @@ -1543,7 +1575,7 @@ class Tensor { Tensor add(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l + r; }, perm); @@ -1562,9 +1594,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor add(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }); } /// Scale and add this and \c other to construct a new, permuted tensor @@ -1584,8 +1618,9 @@ class Tensor { Tensor add(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }, perm); } @@ -1622,8 +1657,8 @@ class Tensor 
{ template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l += r; }); + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t r) { l += r; }); } /// Add \c other to this tensor, and scale the result @@ -1639,8 +1674,8 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& add_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l += r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t r) { (l += r) *= factor; }); } /// Add a constant to this tensor @@ -1661,11 +1696,11 @@ class Tensor { /// \return A new tensor where the elements are the different between the /// elements of \c this and \c right template ::value>::type* = nullptr> + typename = std::enable_if< + detail::tensors_have_equal_nested_rank_v, Right>>> Tensor subt(const Right& right) const { return binary( - right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + right, [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }); } @@ -1685,7 +1720,7 @@ class Tensor { Tensor subt(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }, perm); @@ -1705,9 +1740,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }); } /// Subtract \c right from this and return the result scaled by a scaling \c @@ 
-1728,8 +1765,9 @@ class Tensor { Tensor subt(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }, perm); } @@ -1760,8 +1798,8 @@ class Tensor { template ::value>::type* = nullptr> Tensor& subt_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l -= r; }); + return inplace_binary( + right, [](auto& MADNESS_RESTRICT l, const auto& r) { l -= r; }); } /// Subtract \c right from and scale this tensor @@ -1776,9 +1814,10 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor& subt_to(const Right& right, const Scalar factor) { - return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l -= r) *= factor; }); + return inplace_binary(right, + [factor](auto& MADNESS_RESTRICT l, const auto& r) { + (l -= r) *= factor; + }); } /// Subtract a constant from this tensor @@ -1795,11 +1834,12 @@ class Tensor { /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right template ::value>::type* = nullptr> - Tensor mult(const Right& right) const { + typename std::enable_if>::type* = + nullptr> + decltype(auto) mult(const Right& right) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l * r; }); } @@ -1814,12 +1854,12 @@ class Tensor { /// of \c this and \c right template < typename Right, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Perm& perm) const { + decltype(auto) mult(const Right& right, const Perm& perm) const { return 
binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l * r; }, perm); @@ -1835,12 +1875,12 @@ class Tensor { /// of \c this and \c right, scaled by \c factor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor) const { + decltype(auto) mult(const Right& right, const Scalar factor) const { return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }); + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }); } /// Scale and multiply this by \c right to create a new, permuted tensor @@ -1853,15 +1893,17 @@ class Tensor { /// \param perm The permutation to be applied to this tensor /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right, scaled by \c factor - template ::value && detail::is_numeric_v && - detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor, const Perm& perm) const { + template < + typename Right, typename Scalar, typename Perm, + typename std::enable_if && + detail::is_numeric_v && + detail::is_permutation_v>::type* = nullptr> + decltype(auto) mult(const Right& right, const Scalar factor, + const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }, + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }, perm); } @@ -1871,10 +1913,11 @@ class Tensor { /// \param right The tensor that will be multiplied by this tensor /// \return A reference to this tensor template ::value>::type* = nullptr> + typename std::enable_if>::type* = + nullptr> Tensor& mult_to(const Right& right) { - return 
inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l *= r; }); + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t& r) { l *= r; }); } /// Scale and multiply this tensor by \c right @@ -1886,12 +1929,12 @@ class Tensor { /// \return A reference to this tensor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> Tensor& mult_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l *= r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t& r) { (l *= r) *= factor; }); } // Negation operations diff --git a/tests/einsum.cpp b/tests/einsum.cpp index f62be4e016..ee06cf099f 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -755,11 +755,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { tot_type ref_result(world, ref_result_trange); // TODO compute ref_result - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // dist_array_t out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,k;m,n"); - // const bool are_equal = ToTArrayFixture::are_equal(corr, out); - // BOOST_CHECK(are_equal); + ///////////////////////////////////////////////////////// + // ToT * T + + // this is not supported by the expression layer since this is a + // - general product w.r.t. 
outer indices + // - involves ToT * T + // tot_type result; + // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + + // will try to make this work + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 1b7cdd07e1..360790b2de 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -47,6 +47,7 @@ class EigenSparseTile { typedef T value_type; // Element type typedef T numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type + typedef const T& const_reference; // other typedefs typedef Eigen::SparseMatrix matrix_type; @@ -139,19 +140,24 @@ class EigenSparseTile { } /// data read-only accessor - template >> - value_type operator[](const Index& idx) const { + template + std::enable_if_t, const value_type&> + operator[](const Index& idx) const { + static const value_type zero = 0; auto start = range().lobound_data(); - return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; } /// data read-only accessor - template >> - value_type operator[](const Ordinal& ord) const { + template >> + const value_type& operator[](const Ordinal& ord) const { + static const value_type zero = 0; auto idx = range().idx(ord); auto start = range().lobound_data(); - return matrix().coeff(idx[0] - start[0], idx[1] - start[1]); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; } /// Maximum # of elements in the tile @@ -218,6 +224,32 @@ class EigenSparseTile { private: std::shared_ptr impl_; + // pointer-based coeffRef + const value_type* coeffPtr(Eigen::Index row, Eigen::Index col) const { + auto& mat = matrix(); + constexpr bool IsRowMajor = + std::decay_t::Flags & Eigen::RowMajorBit ? 
1 : 0; + using Eigen::Index; + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + auto* outerIndexPtr = mat.outerIndexPtr(); + auto* innerNonZeros = mat.innerNonZeroPtr(); + const auto start = outerIndexPtr[outer]; + const auto end = innerNonZeros ? outerIndexPtr[outer] + innerNonZeros[outer] + : outerIndexPtr[outer + 1]; + TA_ASSERT(end >= start && + "you probably called coeffRef on a non finalized matrix"); + if (end <= start) return nullptr; + const Index p = mat.data().searchLowerIndex( + start, end - 1, + (typename std::decay_t::StorageIndex)inner); + if ((p < end) && (mat.data().index(p) == inner)) + return &(mat.data().value(p)); + else + return nullptr; + } + }; // class EigenSparseTile // configure TA traits to be usable as tile diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index 21d136b67c..f6fae22be5 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -47,7 +47,10 @@ struct TensorOfTensorFixture { TensorOfTensorFixture() : a(make_rand_tensor_of_tensor(Range(size))), b(make_rand_tensor_of_tensor(Range(size))), - c(a - b) + c(a - b), + aa(make_rand_tensor(Range(size))), + bb(make_rand_tensor(Range(size))), + cc(aa - bb) #ifdef TILEDARRAY_HAS_BTAS , d(make_rand_TobT(Range(size))), @@ -123,13 +126,15 @@ struct TensorOfTensorFixture { static const BipartitePermutation bperm; Tensor> a, b, c; + Tensor aa, bb, cc; #ifdef TILEDARRAY_HAS_BTAS Tensor d, e, f, g, h; #endif // defined(TILEDARRAY_HAS_BTAS) template Tensor& ToT(size_t idx); - + template + T& ToS(size_t idx); }; // TensorOfTensorFixture template <> @@ -158,6 +163,18 @@ Tensor& TensorOfTensorFixture::ToT(size_t idx) { } #endif +template <> +Tensor& TensorOfTensorFixture::ToS>(size_t idx) { + if (idx == 0) + return aa; + else if (idx == 1) + return bb; + else if (idx == 2) + return cc; + else + throw std::range_error("idx out of range"); +} + const std::array TensorOfTensorFixture::size{{10, 11}}; const Permutation 
TensorOfTensorFixture::perm{1, 0}; const BipartitePermutation TensorOfTensorFixture::bperm(Permutation{1, 0, 3, 2}, @@ -171,6 +188,7 @@ typedef boost::mpl::list, bTensorI> itensor_types; #else typedef boost::mpl::list> itensor_types; #endif +typedef boost::mpl::list> itensor_nobtas_types; BOOST_AUTO_TEST_CASE_TEMPLATE(default_constructor, ITensor, itensor_types) { BOOST_CHECK_NO_THROW(Tensor t); @@ -964,6 +982,46 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(scal_mult_to, ITensor, itensor_types) { } } +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_TxS, ITensor, itensor_nobtas_types) { + const auto& a = ToT(0); + const auto& b = ToS(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), a(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j)[index] * b(i, j)); + } + } + } +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_SxT, ITensor, itensor_nobtas_types) { + const auto& a = ToS(0); + const auto& b = ToT(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), b(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j) * b(i, j)[index]); + } + } + } +} + BOOST_AUTO_TEST_CASE_TEMPLATE(neg, ITensor, itensor_types) { const auto& a = ToT(0); Tensor t; From a9e5029293b26d67ea2ad32c998758bcf8a90ed5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 May 2023 
21:18:04 -0600 Subject: [PATCH 046/592] introduce TensorInterface::at_ordinal --- src/TiledArray/tensor/tensor_interface.h | 40 +++++++++++++++++------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index bc5e9abab2..a514959cab 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -219,25 +219,43 @@ class TensorInterface { /// Element subscript accessor - /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - const_reference operator[](const ordinal_type index) const { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + const_reference operator[](const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element subscript accessor /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - reference operator[](const ordinal_type index) { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \return A const reference to the element at \c index_ordinal. + reference operator[](const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + const_reference at_ordinal(const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. 
+ reference at_ordinal(const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template reference operator()(const Index&... idx) { @@ -247,7 +265,7 @@ class TensorInterface { /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template const_reference operator()(const Index&... idx) const { From 4d8d7019056febc5748450cd2e4637678350aaaa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 May 2023 21:19:50 -0600 Subject: [PATCH 047/592] TA::detail::permute prefers Tensor::at_ordinal bump BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/158 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/tensor/permute.h | 20 ++++++++++---------- tests/sparse_tile.h | 9 +++++++++ 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 5a190a93aa..765d443235 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag fdb6aa000f4314b16d74e2dd35bfb527c268cac5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. 
*This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index ea45a87437..3362c306c6 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3) +set(TA_TRACKED_BTAS_TAG fdb6aa000f4314b16d74e2dd35bfb527c268cac5) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 7fb103217f..43fbfc9328 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -150,13 +150,13 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, }; // Permute the data - for (typename Result::ordinal_type index = 0ul; index < volume; - index += block_size) { - const typename Result::ordinal_type perm_index = perm_index_op(index); + for (typename Result::ordinal_type ord = 0ul; ord < volume; + ord += block_size) { + const typename Result::ordinal_type perm_ord = perm_index_op(ord); // Copy the block - math::vector_ptr_op(op, block_size, result.data() + perm_index, - &arg0[index], &args[index]...); + math::vector_ptr_op(op, block_size, result.data() + perm_ord, + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } else { @@ -190,16 +190,16 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& 
result, // Copy data from the input to the output matrix via a series of matrix // transposes. for (typename Result::ordinal_type i = 0ul; i < other_fused_size[0]; ++i) { - typename Result::ordinal_type index = i * other_fused_weight[0]; + typename Result::ordinal_type ord = i * other_fused_weight[0]; for (typename Result::ordinal_type j = 0ul; j < other_fused_size[2]; - ++j, index += other_fused_weight[2]) { + ++j, ord += other_fused_weight[2]) { // Compute the ordinal index of the input and output matrices. - typename Result::ordinal_type perm_index = perm_index_op(index); + typename Result::ordinal_type perm_ord = perm_index_op(ord); math::transpose(input_op, output_op, other_fused_size[1], other_fused_size[3], result_outer_stride, - &result[perm_index], other_fused_weight[1], - &arg0[index], &args[index]...); + &result.at_ordinal(perm_ord), other_fused_weight[1], + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } } diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 360790b2de..888c39811f 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -48,6 +48,7 @@ class EigenSparseTile { typedef T numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type typedef const T& const_reference; + typedef size_type ordinal_type; // other typedefs typedef Eigen::SparseMatrix matrix_type; @@ -160,6 +161,14 @@ class EigenSparseTile { return ptr == nullptr ? 
zero : *ptr; } + const value_type& at_ordinal(const ordinal_type index_ordinal) const { + return this->operator[](index_ordinal); + } + + value_type& at_ordinal(const ordinal_type index_ordinal) { + return this->operator[](index_ordinal); + } + /// Maximum # of elements in the tile size_type size() const { return std::get<0>(*impl_).volume(); } From 95b5e9805095df2b11b1fadc315654c923a9d0c6 Mon Sep 17 00:00:00 2001 From: topazus Date: Wed, 24 May 2023 12:12:00 +0800 Subject: [PATCH 048/592] Use GNUInstallDirs variables instead of hard-coded paths --- CMakeLists.txt | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a50f0a789f..ec53f6bccd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,23 +76,6 @@ project(TiledArray HOMEPAGE_URL "https://valeevgroup.github.io/tiledarray/") enable_language(C) # C needed even for basic platform introspection -# Set install paths ============================================================ - -set(TILEDARRAY_INSTALL_BINDIR "bin" - CACHE PATH "TiledArray binary install directory") -set(TILEDARRAY_INSTALL_INCLUDEDIR "include" - CACHE PATH "TiledArray INCLUDE install directory") -set(TILEDARRAY_INSTALL_LIBDIR "lib" - CACHE PATH "TiledArray LIB install directory") -set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" - CACHE PATH "TiledArray DOC install directory") -set(TILEDARRAY_INSTALL_CMAKEDIR "lib/cmake/tiledarray" - CACHE PATH "TiledArray CMAKE install directory") - # Add module directory and modules ============================================= list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules/) include(CMakePushCheckState) @@ -106,6 +89,23 @@ 
include(FindPackageRegimport) init_package_regimport() include(LoadFetchContent) +# Set install paths ============================================================ + +set(TILEDARRAY_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" + CACHE PATH "TiledArray binary install directory") +set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" + CACHE PATH "TiledArray INCLUDE install directory") +set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" + CACHE PATH "TiledArray LIB install directory") +set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" + CACHE PATH "TiledArray DOC install directory") +set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" + CACHE PATH "TiledArray CMAKE install directory") + # Load extra CMake features ==================================================== include(CMakeDependentOption) @@ -431,7 +431,7 @@ CONFIGURE_FILE( # install config files install(FILES ${PROJECT_BINARY_DIR}/tiledarray.pc - DESTINATION lib/pkgconfig) + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # include extra cmake files install(FILES From a575b358d4f2b6bb8346ab4060798a921f9dba59 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:43:21 -0400 Subject: [PATCH 049/592] fix up casting in mixed expressions where casting is not provided by the tile evaluation e.g. 
complex = real --- src/TiledArray/expressions/expr.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index bcc65cb412..1a7bc2ff05 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -252,13 +252,13 @@ class Expr { >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template eval_tile, - Op>; - array.set(index, array.world().taskq.add( - eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + array.world().taskq.add( + eval_tile_fn, tile, + TiledArray::Cast(), op)); } #ifdef TILEDARRAY_HAS_CUDA @@ -278,13 +278,13 @@ class Expr { ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template eval_tile, - Op>; - array.set(index, madness::add_cuda_task( - array.world(), eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + madness::add_cuda_task( + array.world(), eval_tile_fn, tile, + TiledArray::Cast(), op)); } #endif From 751ec783c87a7f6e0ac519175d4c2eb59ab51a44 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:44:53 -0400 Subject: [PATCH 050/592] introduced TA::conversions::to that can be used for custom element conversions in e.g. 
TA::Tensor(TA::Tensor) --- src/TiledArray/fwd.h | 13 +++++++++++++ src/TiledArray/tensor/complex.h | 14 ++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index f09a98c0e5..2d99c1078a 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -156,6 +156,19 @@ using Array enum class HostExecutor { Thread, MADWorld, Default = MADWorld }; +namespace conversions { + +/// user defined conversions + +/// must define +/// \code +/// To operator()(From&& from); +/// \endcode +template +struct to; + +} // namespace conversions + } // namespace TiledArray #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA diff --git a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index cfa330101d..69a8971bf6 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -27,6 +27,7 @@ #define TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED #include +#include #include namespace TiledArray { @@ -301,6 +302,19 @@ TILEDARRAY_FORCE_INLINE } } // namespace detail + +namespace conversions { + +template +struct to> { + T operator()(const std::complex& v) { + TA_ASSERT(v.imag() == 0); + return v.real(); + } +}; + +} // namespace conversions + } // namespace TiledArray #endif // TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED From 56868245a2da7ef5d9b0fba9ea724d68b3983759 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:46:22 -0400 Subject: [PATCH 051/592] converting constructor of TA::Tensor, i.e. 
TA::Tensor(TA::Tensor) and TA::Tensor(TA::Tensor, TA::Permutation), can use custom element conversions provided by TA::conversions::to --- src/TiledArray/tensor/tensor.h | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 73e3fc0caf..b0141faa19 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -214,6 +214,20 @@ class Tensor { #endif } + template + static decltype(auto) value_converter(const T_& arg) { + using arg_type = detail::remove_cvr_t; + if constexpr (detail::is_tensor_v) // clone nested tensors + return arg.clone(); + else if constexpr (!std::is_same_v) { // convert + if constexpr (std::is_convertible_v) + return static_cast(arg); + else + return conversions::to()(arg); + } else + return arg; + }; + range_type range_; ///< Range /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison @@ -365,15 +379,7 @@ class Tensor { !detail::has_conversion_operator_v>::type* = nullptr> explicit Tensor(const T1& other) : Tensor(detail::clone_range(other), 1, default_construct{false}) { - auto op = [](const value_type& arg) -> decltype(auto) { - // clone nested tensors - if constexpr (detail::is_tensor_v) - return arg.clone(); - else - return arg; - }; - - detail::tensor_init(op, *this, other); + detail::tensor_init(value_converter, *this, other); } /// Construct a permuted tensor copy @@ -394,15 +400,8 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - auto op = [](const value_type& arg) -> decltype(auto) { - // clone nested tensors - if constexpr (detail::is_tensor_v) - return arg.clone(); - else - return arg; - }; - - detail::tensor_init(op, outer(perm), *this, other); + detail::tensor_init(value_converter, outer(perm), + *this, other); // If we 
actually have a ToT the inner permutation was not applied above so // we do that now From edc9f9f5bc4849c0981e62df40d7809d273e0666 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 May 2023 20:46:59 -0400 Subject: [PATCH 052/592] converting constructor of TA::DistArray is not explicit to support TZArray = TArray --- src/TiledArray/dist_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index fd0450ed8e..c6d6cddb79 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -548,7 +548,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// initialized using TiledArray::Cast /// \param other The array to be copied template > - explicit DistArray(const DistArray& other) : pimpl_() { + DistArray(const DistArray& other) : pimpl_() { *this = foreach(other, [](Tile& result, const OtherTile& source) { result = TiledArray::Cast{}(source); }); From 2a6fcdfd9c076112e4d043f7efaf25aabfc68440 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 30 May 2023 16:15:49 -0400 Subject: [PATCH 053/592] bump MAD tag to use master PaRSEC backend + associated TTG bump - pulls in https://github.com/m-a-d-n-e-s-s/madness/pull/472 - pull in https://github.com/TESSEorg/ttg/pull/252 --- INSTALL.md | 2 +- external/versions.cmake | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 5a190a93aa..16439f9c77 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. 
-- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 41324ea8f2c04df687ac2095c9001230db83b5cc . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index ea45a87437..2e92dfa679 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) -set(TA_TRACKED_TTG_PREVIOUS_TAG 1251bec25e07a74a05e5cd4cdec181a95a9baa66) +set(TA_TRACKED_TTG_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) +set(TA_TRACKED_TTG_PREVIOUS_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) From 7f9ffea286c34104478944225ac02e498beef631 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 30 May 2023 16:15:49 -0400 Subject: [PATCH 054/592] bump MAD tag to use master PaRSEC backend + associated TTG bump - pulls in 
https://github.com/m-a-d-n-e-s-s/madness/pull/472 - pull in https://github.com/TESSEorg/ttg/pull/252 --- INSTALL.md | 2 +- external/versions.cmake | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 765d443235..9279d59e26 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag fdb6aa000f4314b16d74e2dd35bfb527c268cac5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 41324ea8f2c04df687ac2095c9001230db83b5cc . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 3362c306c6..d5c71cb632 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) -set(TA_TRACKED_TTG_PREVIOUS_TAG 1251bec25e07a74a05e5cd4cdec181a95a9baa66) +set(TA_TRACKED_TTG_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) +set(TA_TRACKED_TTG_PREVIOUS_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) From dca3d5ecefc51bfb5ee82fe8fcb84aa991148c05 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 30 May 2023 19:47:00 -0400 Subject: [PATCH 055/592] microopt --- src/TiledArray/math/linalg/scalapack/block_cyclic.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/math/linalg/scalapack/block_cyclic.h b/src/TiledArray/math/linalg/scalapack/block_cyclic.h index 902312788b..34e2a17b38 100644 --- a/src/TiledArray/math/linalg/scalapack/block_cyclic.h +++ b/src/TiledArray/math/linalg/scalapack/block_cyclic.h @@ -133,7 +133,7 @@ class BlockCyclicMatrix : public madness::WorldObject> { template >> - Tile extract_submatrix(std::vector lo, std::vector up) { + 
Tile extract_submatrix(std::array lo, std::array up) { assert(bc_dist_.i_own(lo[0], lo[1])); auto [i_st, j_st] = bc_dist_.local_indx(lo[0], lo[1]); @@ -247,8 +247,10 @@ class BlockCyclicMatrix : public madness::WorldObject> { const auto j_block_end = std::min(n, j_block_begin + nb); // Cut block if necessary to adhere to tile dimensions - const auto i_last = std::min(i_block_end, static_cast(up[0])); - const auto j_last = std::min(j_block_end, static_cast(up[1])); + const auto i_last = + std::min(i_block_end, static_cast(up[0])); + const auto j_last = + std::min(j_block_end, static_cast(up[1])); // Calculate extents of the block to be copied i_extent = i_last - i; @@ -263,8 +265,8 @@ class BlockCyclicMatrix : public madness::WorldObject> { local_mat_.block(i_local, j_local, i_extent, j_extent); } else { - std::vector lo{i, j}; - std::vector up{i_last, j_last}; + std::array lo{i, j}; + std::array up{i_last, j_last}; madness::Future> remtile_fut = world_base_t::send( owner(i, j), &BlockCyclicMatrix::template extract_submatrix>, From 289d5d93c4a3264e5d11ad92e0dc2af62907e0f3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 14:59:27 -0400 Subject: [PATCH 056/592] bump BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/153 , https://github.com/ValeevGroup/BTAS/pull/154 , https://github.com/ValeevGroup/BTAS/pull/158 , and https://github.com/ValeevGroup/BTAS/pull/159 --- INSTALL.md | 2 +- cmake/modules/FindOrFetchBTAS.cmake | 4 ++-- external/versions.cmake | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 5a190a93aa..78a89d4db1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 39b4f2603df500891da8a5ce58f1f4a0d4bdc268 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/cmake/modules/FindOrFetchBTAS.cmake b/cmake/modules/FindOrFetchBTAS.cmake index 764ec7046e..57a4b94ac0 100644 --- a/cmake/modules/FindOrFetchBTAS.cmake +++ b/cmake/modules/FindOrFetchBTAS.cmake @@ -13,9 +13,9 @@ if (NOT TARGET BTAS::BTAS) # BTAS will load BLAS++/LAPACK++ ... if those use CMake's FindBLAS/FindLAPACK (as indicated by defined BLA_VENDOR) # will need to specify Fortran linkage convention ... 
manually for now, switching to NWX's linear algebra discovery # is necessary to handle all the corner cases for automatic discovery - if (BLA_VENDOR) + if (DEFINED BLA_VENDOR) set(_linalgpp_use_standard_linalg_kits TRUE) - endif(BLA_VENDOR) + endif(DEFINED BLA_VENDOR) if (NOT TILEDARRAY_HAS_CUDA) # tell BLAS++/LAPACK++ to ignore CUDA diff --git a/external/versions.cmake b/external/versions.cmake index ea45a87437..5fdcc2ea08 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3) +set(TA_TRACKED_BTAS_TAG 39b4f2603df500891da8a5ce58f1f4a0d4bdc268) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) From 14ba6533971ae27bcf60a2b8eba626bc72bfeabd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 15:06:34 -0400 Subject: [PATCH 057/592] bump Librett tag to pull in https://github.com/victor-anisimov/Librett/pull/8 so that it compiles correctly as a shared library --- INSTALL.md | 2 +- external/versions.cmake | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 78a89d4db1..1d0e8951b6 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. 
Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da). + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 35db79a1acaa723f468e7a88a8cd17fa67baf09a). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: diff --git a/external/versions.cmake b/external/versions.cmake index 5fdcc2ea08..2b7629ec14 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,9 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG 39b4f2603df500891da8a5ce58f1f4a0d4bdc268) set(TA_TRACKED_BTAS_PREVIOUS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) -set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) -set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) +set(LIBRETT_URL https://github.com/ValeevGroup/Librett) +set(TA_TRACKED_LIBRETT_TAG 35db79a1acaa723f468e7a88a8cd17fa67baf09a) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) From 37c6b288f33a0f20d2da20732d0668961eaa10cf Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 15:08:24 -0400 Subject: [PATCH 058/592] bug.h : #include cstdint for std::int64_t --- src/TiledArray/util/bug.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index 829c592ee1..ea4b980d55 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -29,6 +29,7 @@ #define TILEDARRAY_UTIL_BUG_H_ #include +#include #include #include #include From 6a3ddff41fb41b5697d80cfa3f5091f181c30c23 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 15:59:36 -0400 Subject: [PATCH 059/592] hostEnv: dox++ + initialize takes World& --- src/TiledArray/host/env.h | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 3feef3c4cc..5852cf6a20 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -65,13 +65,18 @@ class hostEnv { return instance_accessor(); } + // clang-format 
off /// initialize the instance using explicit params - /// \param max_memory_size max amount of memory (bytes) that TiledArray - /// can use for storage of TA::Tensor objects (these by default + /// \param world the world to use for initialization + /// \param host_alloc_limit the maximum total amount of memory (in bytes) that + /// allocator returned by `this->host_allocator()` can allocate; + /// this allocator is used by TiledArray for storage of TA::Tensor objects (these by default /// store DistArray tile data and (if sparse) shape [default=2^40] /// \param page_size memory added to the pool in chunks of at least /// this size (bytes) [default=2^25] - static void initialize(const std::uint64_t max_memory_size = (1ul << 40), + // clang-format on + static void initialize(World& world = TiledArray::get_default_world(), + const std::uint64_t host_alloc_limit = (1ul << 40), const std::uint64_t page_size = (1ul << 25)) { static std::mutex mtx; // to make initialize() reentrant std::scoped_lock lock{mtx}; @@ -92,14 +97,14 @@ class hostEnv { // use QuickPool for host memory allocation, with min grain of 1 page auto host_size_limited_alloc = rm.makeAllocator( - "SizeLimited_HOST", rm.getAllocator("HOST"), max_memory_size); + "SizeLimited_HOST", rm.getAllocator("HOST"), host_alloc_limit); auto host_dynamic_pool = rm.makeAllocator( "QuickPool_SizeLimited_HOST", host_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto host_env = std::unique_ptr( - new hostEnv(TiledArray::get_default_world(), host_dynamic_pool)); + auto host_env = + std::unique_ptr(new hostEnv(world, host_dynamic_pool)); instance_accessor() = std::move(host_env); } } From aff4c3ce888bc829f43af2bd4d3f58288683ddc1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 20:57:58 -0400 Subject: [PATCH 060/592] cudaEnv: introduced pinned_allocation + cleanup/dox++ --- src/TiledArray/external/cuda.h | 91 ++++++++++++++++++++++++++-------- src/TiledArray/fwd.h | 
26 ++++++++-- 2 files changed, 93 insertions(+), 24 deletions(-) diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index 26424fa9f6..dd141f6f52 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -191,11 +191,11 @@ inline void synchronize_stream(const cudaStream_t* stream) { } /** - * cudaEnv set up global environment + * cudaEnv maintains the CUDA-related part of the runtime environment, + * such as CUDA-specific memory allocators * - * Singleton class + * \note this is a Singleton */ - class cudaEnv { public: ~cudaEnv() { @@ -210,19 +210,31 @@ class cudaEnv { cudaEnv& operator=(const cudaEnv&) = delete; cudaEnv& operator=(cudaEnv&&) = delete; - /// access to static member + /// access the singleton instance; if not initialized will be + /// initialized via cudaEnv::initialize() with the default params static std::unique_ptr& instance() { - static std::unique_ptr instance_{nullptr}; - if (!instance_) { - initialize(instance_, TiledArray::get_default_world()); + if (!instance_accessor()) { + initialize(); } - return instance_; + return instance_accessor(); } - /// initialize static member - static void initialize(std::unique_ptr& instance, World& world) { - // initialize only when not initialized - if (instance == nullptr) { + // clang-format off + /// initialize the instance using explicit params + /// \param world the world to use for initialization + /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least + /// this size (bytes) [default=2^25] + /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that + /// allocator returned by `this->pinned_allocator()` can allocate; + /// this allocator is not used by default [default=0] + // clang-format on + static void initialize(World& world = TiledArray::get_default_world(), + const std::uint64_t page_size = (1ul << 25), + const 
std::uint64_t pinned_alloc_limit = (1ul << 40)) { + static std::mutex mtx; // to make initialize() reentrant + std::scoped_lock lock{mtx}; + // only the winner of the lock race gets to initialize + if (instance_accessor() == nullptr) { int num_streams = detail::num_cuda_streams(); int num_devices = detail::num_cuda_devices(); int device_id = detail::current_cuda_device_id(world); @@ -248,27 +260,35 @@ class cudaEnv { constexpr auto introspect = false; #endif - // allocate all free memory for UM pool - // subsequent allocs will use 1/10 of the total device memory - auto alloc_grain = mem_total_free.second / 10; + // allocate all currently-free memory for UM pool auto um_dynamic_pool = rm.makeAllocator( "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - alloc_grain); + pinned_alloc_limit); - // allocate zero memory for device pool, same grain for subsequent allocs + // allocate zero memory for device pool auto dev_size_limited_alloc = rm.makeAllocator( "size_limited_alloc", rm.getAllocator("DEVICE"), mem_total_free.first); auto dev_dynamic_pool = rm.makeAllocator( - "CUDADynamicPool", dev_size_limited_alloc, 0, alloc_grain); + "CUDADynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); + + // allocate pinned_alloc_limit in pinned memory + auto pinned_size_limited_alloc = + rm.makeAllocator( + "SizeLimited_PINNED", rm.getAllocator("PINNED"), + pinned_alloc_limit); + auto pinned_dynamic_pool = + rm.makeAllocator( + "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, + page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); auto cuda_env = std::unique_ptr( new cudaEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool)); - instance = std::move(cuda_env); + um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); + instance_accessor() = std::move(cuda_env); } } @@ -361,12 +381,33 @@ class cudaEnv { ->getActualHighwaterMark(); } + /// @return an Umpire allocator that allocates from a + /// pinned memory pool 
+ /// @warning this is not a thread-safe allocator, should be only used when + /// wrapped into umpire_allocator_impl + umpire::Allocator& pinned_allocator() { return pinned_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by pinned_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()` + // clang-format on + std::size_t pinned_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + pinned_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + pinned_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + protected: cudaEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc) + umpire::Allocator um_alloc, umpire::Allocator device_alloc, + umpire::Allocator pinned_alloc) : world_(&world), um_allocator_(um_alloc), device_allocator_(device_alloc), + pinned_allocator_(pinned_alloc), num_cuda_devices_(num_devices), current_cuda_device_id_(device_id), num_cuda_streams_(num_streams) { @@ -411,6 +452,9 @@ class cudaEnv { /// allocator backed by a (non-thread-safe) dynamically-sized pool for device /// memory umpire::Allocator device_allocator_; + // allocates from a dynamic, size-limited pinned memory pool + // N.B. 
not thread safe, so must be wrapped into umpire_allocator_impl + umpire::Allocator pinned_allocator_; int num_cuda_devices_; int current_cuda_device_id_; @@ -418,6 +462,11 @@ class cudaEnv { int num_cuda_streams_; std::vector cuda_streams_; + + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; + return instance_; + } }; namespace detail { diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index f09a98c0e5..3d5c728d0e 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -84,14 +84,34 @@ typedef Tensor> TensorC; // CUDA tensor #ifdef TILEDARRAY_HAS_CUDA -template -class cuda_um_allocator_impl; +class cudaEnv; + +template +class cuda_allocator_impl; template > class default_init_allocator; +namespace detail { +struct get_um_allocator; +struct get_pinned_allocator; +struct NullLock; +template +class MutexLock; +} // namespace detail + +/// pooled thread-safe CUDA UM allocator +template +using cuda_um_allocator = + default_init_allocator, + detail::get_um_allocator>>; + +/// pooled thread-safe CUDA-based pinned host memory allocator template -using cuda_um_allocator = default_init_allocator>; +using cuda_pinned_allocator = + default_init_allocator, + detail::get_pinned_allocator>>; /// \brief a vector that lives in CUDA Unified Memory, with most operations /// implemented on the CPU From 44b96e1cd4764680014e98813bae93f4ad2bec8b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 20:59:39 -0400 Subject: [PATCH 061/592] rename um_allocator.h -> allocators.h and generalized cuda_um_allocator_impl to support both UM and pinned allocators --- src/CMakeLists.txt | 2 +- src/TiledArray/cuda/allocators.h | 135 +++++++++++++++++++++++++++++ src/TiledArray/cuda/um_allocator.h | 82 ------------------ src/TiledArray/cuda/um_storage.cu | 2 +- src/TiledArray/cuda/um_storage.h | 2 +- 5 files changed, 138 insertions(+), 85 deletions(-) create mode 100644 src/TiledArray/cuda/allocators.h delete mode 100644 
src/TiledArray/cuda/um_allocator.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index afd67dc797..b96180ce76 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -223,7 +223,7 @@ if(CUDA_FOUND) TiledArray/cuda/kernel/reduce_kernel_impl.h TiledArray/cuda/platform.h TiledArray/cuda/thrust.h - TiledArray/cuda/um_allocator.h + TiledArray/cuda/allocators.h TiledArray/cuda/um_storage.h) endif(CUDA_FOUND) diff --git a/src/TiledArray/cuda/allocators.h b/src/TiledArray/cuda/allocators.h new file mode 100644 index 0000000000..d9dba94897 --- /dev/null +++ b/src/TiledArray/cuda/allocators.h @@ -0,0 +1,135 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * Jan 31, 2018 + * + */ + +#ifndef TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED +#define TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED + +#include + +#ifdef TILEDARRAY_HAS_CUDA + +#include +#include + +#include + +#include +#include + +namespace TiledArray { + +template +class cuda_allocator_impl : public umpire_allocator_impl { + public: + using base_type = umpire_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; + + cuda_allocator_impl() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + + template + cuda_allocator_impl( + const cuda_allocator_impl& + rhs) noexcept + : base_type( + static_cast&>(rhs)) {} + + template + friend bool operator==( + const cuda_allocator_impl& lhs, + const cuda_allocator_impl& + rhs) noexcept; +}; // class cuda_allocator_impl + +template +bool operator==( + const cuda_allocator_impl& lhs, + const cuda_allocator_impl& + rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=( + const cuda_allocator_impl& lhs, + const cuda_allocator_impl& + rhs) noexcept { + return !(lhs == rhs); +} + +namespace detail { + +struct get_um_allocator { + umpire::Allocator& operator()() { + return cudaEnv::instance()->um_allocator(); + } +}; + +struct get_pinned_allocator { + umpire::Allocator& operator()() { + return cudaEnv::instance()->um_allocator(); + } +}; + +} // namespace detail + +} // namespace TiledArray + +namespace madness { +namespace archive { + +template +struct ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::cuda_allocator_impl& + allocator) { + allocator = TiledArray::cuda_allocator_impl{}; + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::cuda_allocator_impl< 
+ T, StaticLock, UmpireAllocatorAccessor>& allocator) {} +}; + +} // namespace archive +} // namespace madness + +#endif // TILEDARRAY_HAS_CUDA + +#endif // TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/cuda/um_allocator.h b/src/TiledArray/cuda/um_allocator.h deleted file mode 100644 index 99b281dc51..0000000000 --- a/src/TiledArray/cuda/um_allocator.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include - -#include -#include - -namespace TiledArray { - -/// pooled thread-safe CUDA UM allocator -template -class cuda_um_allocator_impl - : public umpire_allocator_impl> { - public: - using base_type = umpire_allocator_impl>; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - cuda_um_allocator_impl() noexcept - : base_type(&cudaEnv::instance()->um_allocator()) {} - - template - cuda_um_allocator_impl(const cuda_um_allocator_impl& rhs) noexcept - : base_type(static_cast&>(rhs)) {} - - template - friend bool operator==(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept; -}; // class cuda_um_allocator - -template -bool operator==(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept { - return !(lhs == rhs); -} - -template -using cuda_um_allocator = default_init_allocator>; - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/cuda/um_storage.cu b/src/TiledArray/cuda/um_storage.cu index 3462f7d7c1..a16811e91b 100644 --- a/src/TiledArray/cuda/um_storage.cu +++ b/src/TiledArray/cuda/um_storage.cu @@ -22,7 +22,7 @@ */ -#include +#include #include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/cuda/um_storage.h index 4b3781185c..27515a528e 100644 --- a/src/TiledArray/cuda/um_storage.h +++ 
b/src/TiledArray/cuda/um_storage.h @@ -24,8 +24,8 @@ #ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED #define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED +#include #include -#include #ifdef TILEDARRAY_HAS_CUDA From 70c1fc48437b21db32db4b8b1bf6a25170a6a2e6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 21:01:43 -0400 Subject: [PATCH 062/592] make umpire_allocator_impl and default_init_allocator MAD-serializable custom serializer for cuda_um_btas_varray now no longer needed since fixes to btas::varray serializer + these fixes make btas::varray with custom TA allocators fully serializable --- src/TiledArray/cuda/um_storage.h | 20 ----------- src/TiledArray/external/umpire.h | 59 ++++++++++++++++++++++++++++++-- 2 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/cuda/um_storage.h index 27515a528e..bea591cbb2 100644 --- a/src/TiledArray/cuda/um_storage.h +++ b/src/TiledArray/cuda/um_storage.h @@ -147,26 +147,6 @@ struct ArchiveStoreImpl> { } }; -template -struct ArchiveLoadImpl> { - static inline void load(const Archive& ar, - TiledArray::cuda_um_btas_varray& x) { - typename TiledArray::cuda_um_btas_varray::size_type n(0); - ar& n; - x.resize(n); - for (auto& xi : x) ar& xi; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void store(const Archive& ar, - const TiledArray::cuda_um_btas_varray& x) { - ar& x.size(); - for (const auto& xi : x) ar& xi; - } -}; - } // namespace archive } // namespace madness diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index ac42f3bf1c..9dba387dc7 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -33,6 +33,8 @@ #include #include +#include + #include #include @@ -45,7 +47,7 @@ struct NullLock { static void unlock() {} }; -template +template class MutexLock { static std::mutex mtx_; @@ -138,7 +140,7 @@ class umpire_allocator_impl { private: umpire::Allocator* umpalloc_; -}; // class 
umpire_allocator +}; // class umpire_allocator_impl template bool operator==(const umpire_allocator_impl& lhs, @@ -172,6 +174,9 @@ class default_init_allocator : public A { using A::A; + default_init_allocator(A const& a) noexcept : A(a) {} + default_init_allocator(A&& a) noexcept : A(std::move(a)) {} + template void construct(U* ptr) noexcept( std::is_nothrow_default_constructible::value) { @@ -185,4 +190,52 @@ class default_init_allocator : public A { } // namespace TiledArray -#endif // TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED +namespace madness { +namespace archive { + +template +struct ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::umpire_allocator_impl& allocator) { + std::string allocator_name; + ar& allocator_name; + allocator = TiledArray::umpire_allocator_impl( + umpire::ResourceManager::getInstance().getAllocator(allocator_name)); + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::umpire_allocator_impl& allocator) { + ar& allocator.umpire_allocator()->getName(); + } +}; + +template +struct ArchiveLoadImpl> { + static inline void load(const Archive& ar, + TiledArray::default_init_allocator& allocator) { + A base_allocator; + ar& base_allocator; + allocator = TiledArray::default_init_allocator(base_allocator); + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::default_init_allocator& allocator) { + ar& static_cast(allocator); + } +}; + +} // namespace archive +} // namespace madness + +#endif // TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED From 65ddab153fc41b6881bcf3e9bb1f49d4d641d8e9 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 21:07:23 -0400 Subject: [PATCH 063/592] amended ta_dense_cuda to use pinned CUDA memory for input matrices --- examples/cuda/ta_dense_cuda.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git 
a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index 4a035f176b..ab8c118622 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -215,7 +215,12 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, using CUDATile = btas::Tensor; using CUDAMatrix = TA::DistArray>; - using TAMatrix = TA::DistArray>; + using PinnedTile = + btas::Tensor>>; + using PinnedMatrix = TA::DistArray>; + // using TAMatrix = TA::DistArray>; CUDAMatrix c(world, trange_c); auto val_a = 0.03; @@ -224,8 +229,8 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, { // Construct and initialize arrays - TAMatrix a_host(world, trange_a); - TAMatrix b_host(world, trange_b); + PinnedMatrix a_host(world, trange_a); + PinnedMatrix b_host(world, trange_b); a_host.fill(val_a); b_host.fill(val_b); From fb9c761ceaf3ac19b4338a26b86bd7d65ed6e064 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 22:42:41 -0400 Subject: [PATCH 064/592] bump MAD tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/487 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 1d0e8951b6..46b9ff3d39 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 39b4f2603df500891da8a5ce58f1f4a0d4bdc268 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag e4329701d47a0c0416499574e47802a6f775cec1 . 
Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index 2b7629ec14..b836ca31e9 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 58b3e2c623d772f6e4a2e9cf5758073de32ecc50) +set(TA_TRACKED_MADNESS_TAG e4329701d47a0c0416499574e47802a6f775cec1) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 32dec927f5e0c783603df3031ade440aa70d1a5b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 22:59:16 -0400 Subject: [PATCH 065/592] optimize serialization of stateless allocators --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/external/umpire.h | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 46b9ff3d39..60559564de 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 39b4f2603df500891da8a5ce58f1f4a0d4bdc268 . 
If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 5a45699b78d0540b490c8c769b61033bd4d4f49c . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag e4329701d47a0c0416499574e47802a6f775cec1 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index b836ca31e9..3bf9ae249b 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 39b4f2603df500891da8a5ce58f1f4a0d4bdc268) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 561fe1bff7f3374814111a15e28c7a141ab9b67a) +set(TA_TRACKED_BTAS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 39b4f2603df500891da8a5ce58f1f4a0d4bdc268) set(LIBRETT_URL https://github.com/ValeevGroup/Librett) set(TA_TRACKED_LIBRETT_TAG 35db79a1acaa723f468e7a88a8cd17fa67baf09a) diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index 9dba387dc7..71508226a4 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -220,9 +220,11 @@ template struct ArchiveLoadImpl> { static inline void load(const Archive& ar, TiledArray::default_init_allocator& allocator) { - A base_allocator; - ar& base_allocator; - allocator = TiledArray::default_init_allocator(base_allocator); + if constexpr (!std::allocator_traits::is_always_equal::value) { + A base_allocator; + ar& base_allocator; + allocator = TiledArray::default_init_allocator(base_allocator); + } } }; @@ -231,7 +233,9 @@ struct ArchiveStoreImpl> { static inline void store( const Archive& 
ar, const TiledArray::default_init_allocator& allocator) { - ar& static_cast(allocator); + if constexpr (!std::allocator_traits::is_always_equal::value) { + ar& static_cast(allocator); + } } }; From a9947ce6e64c871bc6f3f3610bf2cb2d1c02cac1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 13 Jul 2023 23:00:04 -0400 Subject: [PATCH 066/592] update librett.cpp for the changed librettPlan API --- tests/librett.cpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tests/librett.cpp b/tests/librett.cpp index 91c5b5b8ad..9cc7c7b684 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -69,10 +69,11 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -117,7 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; std::vector extent({B, A}); @@ -126,7 +127,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -175,7 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -183,8 +185,8 @@ 
BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { std::vector perm3{1, 0, 2}; // std::vector perm3{0, 2, 1}; - status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), - 0, a_device, b_device); + status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), + sizeof(int), stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -238,7 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; std::vector extent({A, B, C}); @@ -247,8 +249,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { std::vector perm({1, 0, 2}); TiledArray::permutation_to_col_major(perm); - status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, - a_device, b_device); + status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), + stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -295,7 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; std::vector extent({A, A}); @@ -304,7 +306,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -344,7 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; std::vector extent({B, A}); @@ -353,7 +356,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), 
sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -393,7 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } librettHandle plan; - //librettResult_t status; + librett_gpuStream_t stream; librettResult status; // b(k,i,j) = a(i,j,k) @@ -404,7 +408,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { std::vector perm({2, 0, 1}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), stream); BOOST_CHECK(status == LIBRETT_SUCCESS); From b3a4e00e5dc9b1f857558339548e09e936d8a32a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 16 Jul 2023 17:38:12 -0400 Subject: [PATCH 067/592] bump BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/160 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 60559564de..67a00ff9e7 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 5a45699b78d0540b490c8c769b61033bd4d4f49c . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 3c91f086090390930bba62c6512c4e74a5520e76 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag e4329701d47a0c0416499574e47802a6f775cec1 . 
Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 3bf9ae249b..140096f350 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -24,8 +24,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 39b4f2603df500891da8a5ce58f1f4a0d4bdc268) +set(TA_TRACKED_BTAS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) set(LIBRETT_URL https://github.com/ValeevGroup/Librett) set(TA_TRACKED_LIBRETT_TAG 35db79a1acaa723f468e7a88a8cd17fa67baf09a) From c445f4c04c93448803371b64f902d3ef8aa9e55a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 17 Jul 2023 09:01:37 -0400 Subject: [PATCH 068/592] cuda_pinned_allocator actually uses pinned allocator --- src/TiledArray/cuda/allocators.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/cuda/allocators.h b/src/TiledArray/cuda/allocators.h index d9dba94897..72c5ae3b0e 100644 --- a/src/TiledArray/cuda/allocators.h +++ b/src/TiledArray/cuda/allocators.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED +#ifndef TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED +#define TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED #include @@ -93,7 +93,7 @@ struct get_um_allocator { struct get_pinned_allocator { umpire::Allocator& operator()() { - return cudaEnv::instance()->um_allocator(); + return cudaEnv::instance()->pinned_allocator(); } }; @@ -132,4 +132,4 @@ struct ArchiveStoreImpl Date: Mon, 17 Jul 2023 13:08:04 -0400 Subject: [PATCH 069/592] add missing in random.h to pacify gcc-13 --- src/TiledArray/util/random.h | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/src/TiledArray/util/random.h b/src/TiledArray/util/random.h index b096654bc6..15daf0d716 100644 --- a/src/TiledArray/util/random.h +++ b/src/TiledArray/util/random.h @@ -20,12 +20,13 @@ #ifndef TILEDARRAY_RANDOM_H__INCLUDED #define TILEDARRAY_RANDOM_H__INCLUDED +#include + #include // for std::complex +#include // for std::int64_t #include // for std::rand #include // for true_type, false_type, and enable_if -#include - namespace TiledArray { /// \return reference to the thread-specific random engine used to implement From eb8df99d61a3120f4bd66243272b9320d2ea9565 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 17 Jul 2023 13:33:41 -0400 Subject: [PATCH 070/592] [cmake] to build TA unit tests need Boost 1.74.0 or later when using C++20 or later --- tests/CMakeLists.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32a8e9ee6c..3db1675cc9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -105,6 +105,15 @@ if(CUDA_FOUND) list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) endif() +# if using C++20 must use Boost 1.74 or later: +# - https://en.cppreference.com/w/cpp/io/basic_ostream/operator_ltlt2 +# - https://github.com/boostorg/test/commit/db6b98c72783351e0acd3c558691323a7a103ba9 +if (CMAKE_CXX_STANDARD GREATER_EQUAL 20 AND DEFINED Boost_VERSION) + if (Boost_VERSION VERSION_LESS 1.74.0) + message(FATAL_ERROR "Boost 1.74 or later required to build TA unit tests when using C++20 or higher") + endif() +endif() + # if tiledarray library was compiled without exceptions, use TA header-only (see below) if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT CUDA_FOUND AND FALSE) add_ta_executable(${executable} "${ta_test_src_files}" "MADworld;${TILEDARRAY_PRIVATE_LINK_LIBRARIES}") From f6f5039b54ee5a1180106351cac05e0d33e14c62 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 17 Jul 2023 13:36:32 -0400 
Subject: [PATCH 071/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/489 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 67a00ff9e7..be9afb660f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 3c91f086090390930bba62c6512c4e74a5520e76 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag e4329701d47a0c0416499574e47802a6f775cec1 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 31d803325623de75371774feffb0270c796bea24 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 140096f350..d1f7113c58 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG e4329701d47a0c0416499574e47802a6f775cec1) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) +set(TA_TRACKED_MADNESS_TAG 31d803325623de75371774feffb0270c796bea24) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG e4329701d47a0c0416499574e47802a6f775cec1) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From fbdade7c336c65c4ff3cc7b08784f56f1b0b78f2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 24 Jul 2023 17:03:20 -0400 Subject: [PATCH 072/592] [skip ci] typos --- src/TiledArray/conversions/foreach.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/conversions/foreach.h b/src/TiledArray/conversions/foreach.h index 9d219ac191..20f2d36ec3 100644 --- a/src/TiledArray/conversions/foreach.h +++ b/src/TiledArray/conversions/foreach.h @@ -463,7 +463,7 @@ inline std::enable_if_t, DistArray> foreach ( /// want to modify the elements of the array to be equal to the square /// root of the original value: /// \code -/// foreach(array, [] (TiledArray::TensorD& tile) { +/// foreach_inplace(array, [] (TiledArray::TensorD& tile) { /// tile.inplace_unary([&] (double& value) { value = std::sqrt(value); }); /// }); /// \endcode @@ -561,7 +561,7 @@ inline std::enable_if_t, DistArray> foreach ( /// example, if we want to modify the elements of the array to be equal to the /// square root of the original value: /// \code -/// foreach(array, [] (auto& tile) -> float { +/// foreach_inplace(array, [] (auto& tile) -> float { /// double norm_squared = 0.0; /// 
tile.inplace_unary([&] (double& value) { /// norm_squared += value; // Assume value >= 0 From a5d634819e06783942041494a48a599e94bb0760 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jul 2023 08:51:41 -0400 Subject: [PATCH 073/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/492 ... this should fix ScaLAPACK->DistArray conversion and resolve #410 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 16439f9c77..c7a44dd8f7 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 41324ea8f2c04df687ac2095c9001230db83b5cc . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4785f17bec34e08f10fa4de84c7359f0404a4d78 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 2e92dfa679..b16b5c4c6d 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4d9f0c6c4b5cfdaf6b685c20637c45dbcb117258) +set(TA_TRACKED_MADNESS_TAG 4785f17bec34e08f10fa4de84c7359f0404a4d78) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 907d40c259a28db0e842f999e818f97034bd1e92 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jul 2023 15:05:25 -0400 Subject: [PATCH 074/592] update MADNESS tag to pull in updated https://github.com/m-a-d-n-e-s-s/madness/pull/492 + amend BlockCyclicMatrix::tensor_from_matrix to use Future::get(dowork=false) --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/math/linalg/scalapack/block_cyclic.h | 13 +++++++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c7a44dd8f7..876e50347f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 561fe1bff7f3374814111a15e28c7a141ab9b67a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4785f17bec34e08f10fa4de84c7359f0404a4d78 . 
+- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 3d585293f0094588778dbd3bec24b65e7bbe6a5d . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index b16b5c4c6d..30e1d6c524 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4785f17bec34e08f10fa4de84c7359f0404a4d78) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 41324ea8f2c04df687ac2095c9001230db83b5cc) +set(TA_TRACKED_MADNESS_TAG 3d585293f0094588778dbd3bec24b65e7bbe6a5d) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4785f17bec34e08f10fa4de84c7359f0404a4d78) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/math/linalg/scalapack/block_cyclic.h b/src/TiledArray/math/linalg/scalapack/block_cyclic.h index 902312788b..7140e3b63d 100644 --- a/src/TiledArray/math/linalg/scalapack/block_cyclic.h +++ b/src/TiledArray/math/linalg/scalapack/block_cyclic.h @@ -247,8 +247,10 @@ class BlockCyclicMatrix : public madness::WorldObject> { const auto j_block_end = std::min(n, j_block_begin + nb); // Cut block if necessary to adhere to tile dimensions - const auto i_last = std::min(i_block_end, static_cast(up[0])); - const auto j_last = std::min(j_block_end, static_cast(up[1])); + const auto i_last = + std::min(i_block_end, static_cast(up[0])); + const auto j_last = + std::min(j_block_end, static_cast(up[1])); // Calculate extents of the block to be copied i_extent = i_last - i; @@ -265,20 +267,23 @@ class BlockCyclicMatrix : public 
madness::WorldObject> { } else { std::vector lo{i, j}; std::vector up{i_last, j_last}; + // N.B. send instead of task guarantees progress madness::Future> remtile_fut = world_base_t::send( owner(i, j), &BlockCyclicMatrix::template extract_submatrix>, lo, up); + // N.B. Future::get(dowork=false) since calling from within a task + // and PaRSEC gets sad otherwise if constexpr (TiledArray::detail::is_ta_tensor_v) - tile.block(lo, up) = remtile_fut.get(); + tile.block(lo, up) = remtile_fut.get(/* dowork = */ false); else { auto tile_blk_range = TiledArray::BlockRange( TiledArray::detail::make_ta_range(tile.range()), lo, up); using std::data; auto tile_blk_view = TiledArray::make_map(data(tile), tile_blk_range); - tile_blk_view = remtile_fut.get(); + tile_blk_view = remtile_fut.get(/* dowork = */ false); } } } From 6e739674badc5b83fbfa4ba2b589fc1f04f3fd29 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jul 2023 16:04:50 -0400 Subject: [PATCH 075/592] [ci] upgrade clang-9 -> clang-13 in GitLab CI jobs in response to https://github.com/ValeevGroup/DevOps/commit/0dee980d11c5e5fc7e1caac02d46deaa9545ba0f --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 93850215f1..2981c74a87 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -65,7 +65,7 @@ ubuntu: TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL RUNNER_TAGS: [ linux ] - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] - CXX: [ g++, clang++-9 ] + CXX: [ g++, clang++-13 ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] RUNNER_TAGS: [ linux ] From 03c21f724402add113f544ca171bae7e47ee60d0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jul 2023 16:10:06 -0400 Subject: [PATCH 076/592] [ci] use Ubuntu {20,22}.04 instead of {18,20}.04 in response to https://github.com/ValeevGroup/DevOps/commit/872bd23b3f2a38878851e6f62096740ddb9749e3 --- .gitlab-ci.yml | 6 +++--- 1 file 
changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2981c74a87..b8e7f9eb32 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -56,7 +56,7 @@ ubuntu: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] CXX: [ g++ ] BUILD_TYPE : [ "Release" ] BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] @@ -64,12 +64,12 @@ ubuntu: # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL RUNNER_TAGS: [ linux ] - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] CXX: [ g++, clang++-13 ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] RUNNER_TAGS: [ linux ] - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] CXX: [ g++ ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] From b7c7a9b181ebbece5c62018ca2187ddd85acf651 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jul 2023 08:44:23 -0400 Subject: [PATCH 077/592] [ci] [gitlab] add build/CMakeFiles/CMakeConfigureLog.yaml to artifacts in case cmake 3.26+ is used --- .gitlab-ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b8e7f9eb32..fd9c49aefa 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -52,6 +52,7 @@ ubuntu: - build/CMakeCache.txt - build/CMakeFiles/CMakeOutput.log - build/CMakeFiles/CMakeError.log + - build/CMakeFiles/CMakeConfigureLog.yaml reports: metrics: build/metrics.txt parallel: From 9eec3629cfb08d751193c687bc90a2a0c88e6a11 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jul 2023 08:45:07 -0400 Subject: [PATCH 078/592] [ci] [gitlab] replace libtbb-dev by libtbb2-dev for 22.04 to make MKL usable --- ci/.build-project | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ci/.build-project 
b/ci/.build-project index aeb7c73787..57fc67300b 100755 --- a/ci/.build-project +++ b/ci/.build-project @@ -75,6 +75,14 @@ cmd "source ci/openmpi.env" cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile" if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then cmd "make -C /home/ValeevGroup install/intel-mkl" + # DevOps installs MKL 2020.3 which requires non-OneAPI TBB ... although MKL bundles TBB + # the systemwide TBB package is found first (the MKL's TBB does not bundle + # headers anyway, so it's almost useless for us) + # unfortunately the default, libtbb-dev, package on ubuntu 22.04 is OneAPI, get rid of it and use + # libtbb2-dev instead + if [[ "$vars" =~ \"-DIntelMKL_THREAD_LAYER=tbb ]]; then + cmd "(apt show libtbb2-dev && apt install -y libtbb2-dev) || echo \"no need to install libtbb2-dev\"" + fi cmd "source /opt/intel/mkl/bin/mklvars.sh intel64" cmd "echo MKLROOT=\$MKLROOT" fi From dea24e846d2f7876780666462e9b3e6ab187fb99 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jul 2023 14:36:53 -0400 Subject: [PATCH 079/592] minor fixups in tensor.h to support operation with nonstandard allocators ... addresses https://github.com/ValeevGroup/mpqc4/issues/412 --- src/TiledArray/tensor/tensor.h | 57 ++++++++++++++++++---------------- tests/tensor.cpp | 14 +++++++++ 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index b0141faa19..3c10ba4077 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -60,9 +60,9 @@ template class Tensor { // meaningful error if T& is not assignable, see // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48101 - static_assert( - std::is_assignable, T>::value, - "Tensor: T must be an assignable type (e.g. cannot be const)"); + static_assert(std::is_assignable, T>::value, + "Tensor: T must be an assignable type (e.g. 
" + "cannot be const)"); #ifdef TA_TENSOR_MEM_TRACE template @@ -80,16 +80,17 @@ class Tensor { typedef typename range_type::ordinal_type size_type; ///< Size type (to meet the container concept) typedef Allocator allocator_type; ///< Allocator type - typedef - typename allocator_type::value_type value_type; ///< Array element type - typedef - typename allocator_type::reference reference; ///< Element reference type - typedef typename allocator_type::const_reference - const_reference; ///< Element reference type - typedef typename allocator_type::pointer pointer; ///< Element pointer type - typedef typename allocator_type::const_pointer + typedef typename std::allocator_traits::value_type + value_type; ///< Array element type + typedef std::add_lvalue_reference_t + reference; ///< Element (lvalue) reference type + typedef std::add_lvalue_reference_t> + const_reference; ///< Element (const lvalue) reference type + typedef typename std::allocator_traits::pointer + pointer; ///< Element pointer type + typedef typename std::allocator_traits::const_pointer const_pointer; ///< Element const pointer type - typedef typename allocator_type::difference_type + typedef typename std::allocator_traits::difference_type difference_type; ///< Difference type typedef pointer iterator; ///< Element iterator type typedef const_pointer const_iterator; ///< Element const iterator type @@ -1359,7 +1360,9 @@ class Tensor { auto binary(const Right& right, Op&& op) const { using result_value_type = decltype(op( std::declval(), std::declval&>())); - return Tensor(*this, right, op); + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + return Tensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor @@ -1386,7 +1389,9 @@ class Tensor { if constexpr (!is_tot) { using result_value_type = decltype(op( std::declval(), std::declval&>())); - using ResultTensor = Tensor; + using result_allocator_type = 
typename std::allocator_traits< + Allocator>::template rebind_alloc; + using ResultTensor = Tensor; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation return ResultTensor(*this, right, op, outer(perm)); @@ -1696,7 +1701,7 @@ class Tensor { /// elements of \c this and \c right template , Right>>> + detail::tensors_have_equal_nested_rank_v>> Tensor subt(const Right& right) const { return binary( right, [](const value_type& l, const value_type& r) -> decltype(auto) { @@ -2490,8 +2495,8 @@ std::size_t Tensor::trace_if_larger_than_ = std::numeric_limits::max(); #endif -template -Tensor operator*(const Permutation& p, const Tensor& t) { +template +Tensor operator*(const Permutation& p, const Tensor& t) { return t.permute(p); } @@ -2543,11 +2548,11 @@ template void gemm(Alpha alpha, const Tensor& A, const Tensor& B, Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) { - static_assert( - !detail::is_tensor_of_tensor_v, Tensor, - Tensor>, - "TA::Tensor::gemm without custom element op is only applicable to " - "plain tensors"); + static_assert(!detail::is_tensor_of_tensor_v, Tensor, + Tensor>, + "TA::Tensor::gemm without custom element op is " + "only applicable to " + "plain tensors"); { // Check that tensor C is not empty and has the correct rank TA_ASSERT(!C.empty()); @@ -2705,16 +2710,16 @@ bool operator!=(const Tensor& a, const Tensor& b) { namespace detail { -/// Implements taking the trace of a Tensor (\c T is a numeric type) +/// Implements taking the trace of a Tensor /// /// \tparam T The type of the elements in the tensor. For this specialization /// to be considered must satisfy the concept of numeric type. 
/// \tparam A The type of the allocator for the tensor template struct Trace, detail::enable_if_numeric_t> { - decltype(auto) operator()(const Tensor& t) const { - using size_type = typename Tensor::size_type; - using value_type = typename Tensor::value_type; + decltype(auto) operator()(const Tensor& t) const { + using size_type = typename Tensor::size_type; + using value_type = typename Tensor::value_type; const auto range = t.range(); // Get pointers to the range data diff --git a/tests/tensor.cpp b/tests/tensor.cpp index 1281e5d164..be214ef841 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -724,6 +724,20 @@ BOOST_AUTO_TEST_CASE(block) { #endif } +BOOST_AUTO_TEST_CASE(allocator) { + TensorD x(r, 1.0); + Tensor> y(r, 1.0); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + BOOST_REQUIRE_NO_THROW(x.add_to(y)); + BOOST_REQUIRE_NO_THROW(x.subt_to(y)); + BOOST_REQUIRE_NO_THROW(x.mult_to(y)); +} + BOOST_AUTO_TEST_CASE(rebind) { static_assert( std::is_same_v>, TensorZ>); From c6b987370755605f7d8538c59e5f4cf072a17d3c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jul 2023 16:31:28 -0400 Subject: [PATCH 080/592] [cmake] bump LibreTT tag to pull in https://github.com/victor-anisimov/Librett/pull/8 and https://github.com/victor-anisimov/Librett/pull/9 --- INSTALL.md | 2 +- external/librett.cmake | 4 +++- external/versions.cmake | 3 +-- tests/librett.cpp | 9 +++++++++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index e2518103d2..f265676d6a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. 
Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 35db79a1acaa723f468e7a88a8cd17fa67baf09a). + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag f5ebdbbba9c9689aa4613a5469021db2dacd8e46). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: diff --git a/external/librett.cmake b/external/librett.cmake index a34dbf7869..797bc8e64e 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -73,7 +73,7 @@ else() -DENABLE_NO_ALIGNED_ALLOC=ON -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} - -DENABLE_CUDA=ON + -DENABLE_CUDA=ON ) if (DEFINED CMAKE_CUDA_ARCHITECTURES) list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) @@ -147,6 +147,8 @@ set_target_properties(TiledArray_LIBRETT "$;$" INTERFACE_LINK_LIBRARIES "$;$" + INTERFACE_COMPILE_DEFINITIONS + "LIBRETT_USES_CUDA=1" ) install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray) diff --git a/external/versions.cmake b/external/versions.cmake index 01d99789ba..b1588c3a50 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,7 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) -set(LIBRETT_URL https://github.com/ValeevGroup/Librett) -set(TA_TRACKED_LIBRETT_TAG 35db79a1acaa723f468e7a88a8cd17fa67baf09a) +set(TA_TRACKED_LIBRETT_TAG f5ebdbbba9c9689aa4613a5469021db2dacd8e46) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) diff --git a/tests/librett.cpp b/tests/librett.cpp index 9cc7c7b684..cdb2bcf6ce 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -27,6 +27,8 @@ #include #include "unit_test_config.h" +#include + struct LibreTTFixture { // LibreTTFixture() // : A(100), @@ -70,6 +72,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; status = @@ -119,6 +122,7 @@ 
BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; std::vector extent({B, A}); @@ -178,6 +182,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -241,6 +246,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; std::vector extent({A, B, C}); @@ -298,6 +304,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; std::vector extent({A, A}); @@ -348,6 +355,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; std::vector extent({B, A}); @@ -398,6 +406,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { librettHandle plan; librett_gpuStream_t stream; + cudaCheck(cudaStreamCreate(&stream)); librettResult status; // b(k,i,j) = a(i,j,k) From 843b746ebd1805af8f992b8a41faee7462021347 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 28 Jul 2023 07:06:19 -0400 Subject: [PATCH 081/592] amend https://github.com/ValeevGroup/tiledarray/commit/c6b987370755605f7d8538c59e5f4cf072a17d3c to use TA-allocated streams --- tests/librett.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tests/librett.cpp b/tests/librett.cpp index cdb2bcf6ce..3785071071 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -27,8 +27,6 @@ #include #include "unit_test_config.h" -#include - struct LibreTTFixture { // LibreTTFixture() // : A(100), @@ -71,8 +69,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { 
TiledArray::permutation_to_col_major(perm); librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; status = @@ -121,8 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({B, A}); @@ -181,8 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -245,8 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({A, B, C}); @@ -303,8 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({A, A}); @@ -354,8 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; std::vector extent({B, A}); @@ -405,8 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } librettHandle plan; - librett_gpuStream_t stream; - cudaCheck(cudaStreamCreate(&stream)); + auto stream = 
TiledArray::cudaEnv::instance()->cuda_stream(0); librettResult status; // b(k,i,j) = a(i,j,k) From 04157a1e07a35fee58ccbe4b5380daaa2f888cc4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 1 Aug 2023 14:05:51 -0400 Subject: [PATCH 082/592] [cmake] bump VG cmake kit to refresh lapackpp tags --- external/versions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/versions.cmake b/external/versions.cmake index b1588c3a50..6f6c05c977 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG 3cbfe7c1e2e2667964b737e6abcc44d173fb9775) +set(TA_TRACKED_VGCMAKEKIT_TAG d7d589dddb89bac879ec1df97d8f9510a80055f0) # Boost explicitly downgraded to 1.59 from 1.68 set(TA_TRACKED_BOOST_VERSION 1.59) From 50c122f75fdbe487fc844abdaab9982afa7e8a9a Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Thu, 17 Aug 2023 10:44:11 -0700 Subject: [PATCH 083/592] Fix disambiguation for tile_norm storage in make_array when target is rank-1 --- src/TiledArray/conversions/make_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index cc2216e58a..6f5ada0bba 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -155,7 +155,7 @@ inline Array make_array( int task_count = 0; auto task = [&](const ordinal_type index) -> value_type { value_type tile; - tile_norms[index] = op(tile, trange.make_tile_range(index)); + tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); ++counter; return tile; }; From 28cf631f85d964dfe111629b610a2586e2d9aac3 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Thu, 17 Aug 2023 14:55:50 -0700 Subject: [PATCH 084/592] Fix GNU Makefile parallel build for LibRETT --- 
external/librett.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/librett.cmake b/external/librett.cmake index 797bc8e64e..aa1287d90d 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -133,7 +133,7 @@ else() ") # Add LibreTT dependency to External - add_dependencies(External-tiledarray librett-build) + add_dependencies(External-tiledarray librett) set(_LIBRETT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) From 1338737bc5018ff83c63e267f2bad52252fa6b63 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Thu, 17 Aug 2023 14:56:47 -0700 Subject: [PATCH 085/592] Explicitly link to CUDA::cudart to fix missing symbols --- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b96180ce76..a8b3d6f4fd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -267,7 +267,7 @@ if(CUDA_FOUND) LANGUAGE CUDA) # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) endif(CUDA_FOUND) From 68b87342c64cac3d6099379aa775e85d0fad258c Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Fri, 25 Aug 2023 16:43:36 -0700 Subject: [PATCH 086/592] Fix CUDA compilation with Cray Wrappers by manually passing the implicit include directories --- src/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a8b3d6f4fd..a0dc42e6f5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -255,13 +255,21 @@ set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers # TODO better ways to handle tiledarray cuda dependency if(CUDA_FOUND) - list(APPEND TILEDARRAY_SOURCE_FILES + set(TILEDARRAY_CUDA_SOURCE_FILES TiledArray/cuda/btas_um_tensor.cpp TiledArray/cuda/cpu_cuda_vector.cu 
TiledArray/cuda/kernel/mult_kernel.cu TiledArray/cuda/kernel/reduce_kernel.cu TiledArray/cuda/um_storage.cu) + list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}") + + foreach( f IN LISTS TILEDARRAY_CUDA_SOURCE_FILES ) + set_source_files_properties( "${f}" + PROPERTIES + INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") + endforeach() + set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp PROPERTIES LANGUAGE CUDA) From f044490078c1722f74cdf311ce0675289584dd3e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 14 Sep 2023 16:45:56 -0400 Subject: [PATCH 087/592] move definitions of `TILEDARRAY_INSTALL_*` to address https://github.com/ValeevGroup/tiledarray/pull/404/files#r1208030129 --- CMakeLists.txt | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ec53f6bccd..66c4ad2d74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,6 +76,23 @@ project(TiledArray HOMEPAGE_URL "https://valeevgroup.github.io/tiledarray/") enable_language(C) # C needed even for basic platform introspection +# Set install paths ============================================================ + +set(TILEDARRAY_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" + CACHE PATH "TiledArray binary install directory") +set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" + CACHE PATH "TiledArray INCLUDE install directory") +set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" + CACHE PATH "TiledArray LIB install directory") +set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" + CACHE PATH "TiledArray DOC install directory") +set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" + 
CACHE PATH "TiledArray CMAKE install directory") + # Add module directory and modules ============================================= list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules/) include(CMakePushCheckState) @@ -89,23 +106,6 @@ include(FindPackageRegimport) init_package_regimport() include(LoadFetchContent) -# Set install paths ============================================================ - -set(TILEDARRAY_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" - CACHE PATH "TiledArray binary install directory") -set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" - CACHE PATH "TiledArray INCLUDE install directory") -set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" - CACHE PATH "TiledArray LIB install directory") -set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" - CACHE PATH "TiledArray DOC install directory") -set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" - CACHE PATH "TiledArray CMAKE install directory") - # Load extra CMake features ==================================================== include(CMakeDependentOption) From 3cb0767f82f6237d4907757efca3e2e2af0f9c8b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 15 Sep 2023 08:24:35 -0400 Subject: [PATCH 088/592] redefine TILEDARRAY_INSTALL_SHAREDIR in terms of CMAKE_INSTALL_DATAROOTDIR --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 66c4ad2d74..19400f63ed 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,7 +84,7 @@ set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" CACHE PATH "TiledArray INCLUDE install directory") set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE PATH "TiledArray LIB 
install directory") -set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" +set(TILEDARRAY_INSTALL_SHAREDIR "${CMAKE_INSTALL_DATAROOTDIR}/tiledarray/${TILEDARRAY_EXT_VERSION}" CACHE PATH "TiledArray DATA install directory") set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" CACHE PATH "TiledArray DATA install directory") From fa52e10567de51d6f8093f24eac285433866f7dc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 18:56:52 -0400 Subject: [PATCH 089/592] getting started with HIP support --- CMakeLists.txt | 11 ++++++----- INSTALL.md | 2 +- external/cuda.cmake | 5 +---- external/hip.cmake | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 external/hip.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 19400f63ed..520898704d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,10 +130,8 @@ add_feature_info(TBB ENABLE_TBB "Intel Thread-Building Blocks (TBB) supports pro option(ENABLE_CUDA "Enable use of CUDA with TiledArray" OFF) add_feature_info(CUDA ENABLE_CUDA "NVIDIA CUDA support for GPU") -if(ENABLE_CUDA) - option(ENABLE_CUDA_ERROR_CHECK "TiledArray will always check errors in CUDA calls" ON) - add_feature_info(CUDA_ERROR_CHECK ENABLE_CUDA_ERROR_CHECK "Checks CUDA Error") -endif() +option(ENABLE_HIP "Enable use of HIP with TiledArray" OFF) +add_feature_info(HIP ENABLE_HIP "AMD HIP/ROCm support for GPU") option(ENABLE_GPERFTOOLS "Enable linking with Gperftools" OFF) add_feature_info(GPERFTOOLS ENABLE_GPERFTOOLS "Google Performance Tools provide fast memory allocation and performance profiling") @@ -306,10 +304,13 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src) add_custom_target(External-tiledarray) # required deps: -# 1. CUDA first since others may depend on it +# 1. derive runtime (CUDA/HIP/...) 
first since others may depend on it if(ENABLE_CUDA) include(external/cuda.cmake) endif() +if(ENABLE_HIP) + include(external/hip.cmake) +endif() if (TA_TTG) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) endif(TA_TTG) diff --git a/INSTALL.md b/INSTALL.md index f265676d6a..1aabcf0a5f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -329,7 +329,7 @@ Support for execution on CUDA-enabled hardware is controlled by the following va * `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF]. * `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.). -* `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF]. +* `ENABLE_HIP` -- Set to `ON` to turn on HIP support. [Default=OFF]. * `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install LibreTT. * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not be normally needed; it is strongly recommended to let TiledArray build and install Umpire. 
diff --git a/external/cuda.cmake b/external/cuda.cmake index 49f2cbc558..dd8bef0dee 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -16,10 +16,7 @@ enable_language(CUDA) set(CUDA_FOUND TRUE) set(TILEDARRAY_HAS_CUDA 1 CACHE BOOL "Whether TiledArray has CUDA support") - -if(ENABLE_CUDA_ERROR_CHECK) - set (TILEDARRAY_CHECK_CUDA_ERROR 1) -endif(ENABLE_CUDA_ERROR_CHECK) +set(TILEDARRAY_CHECK_CUDA_ERROR 1 CACHE BOOL "Whether TiledArray will check CUDA errors") # find CUDA toolkit # NB CUDAToolkit does NOT have COMPONENTS diff --git a/external/hip.cmake b/external/hip.cmake new file mode 100644 index 0000000000..53a28a4caa --- /dev/null +++ b/external/hip.cmake @@ -0,0 +1,36 @@ +# cmake 3.21 introduced HIP language support +cmake_minimum_required(VERSION 3.21.0) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) +# N.B. need relaxed constexpr for std::complex +# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions%5B/url%5D: +if (DEFINED CMAKE_HIP_FLAGS) + set(CMAKE_HIP_FLAGS "--expt-relaxed-constexpr ${CMAKE_HIPE_FLAGS}") +else() + set(CMAKE_HIP_FLAGS "--expt-relaxed-constexpr") +endif() +enable_language(HIP) + +set(HIP_FOUND TRUE) +set(TILEDARRAY_HAS_HIP 1 CACHE BOOL "Whether TiledArray has HIP support") +set(TILEDARRAY_CHECK_HIP_ERROR 1 CACHE BOOL "Whether TiledArray will check HIP errors") + +# find HIP components +find_package(hipblas REQUIRED) + +foreach (library hipblas) + if (NOT TARGET roc::${library}) + message(FATAL_ERROR "roc::${library} not found") + endif() +endforeach() + +## +## Umpire +## +include(external/umpire.cmake) + +## +## LibreTT +## +include(external/librett.cmake) From 778f3c42a8f9379e6f5ab41f4d8538ab11d075f5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 20:08:55 -0400 Subject: [PATCH 090/592] --expt-relaxed-constexpr is not supported in HIP ... 
see https://github.com/ROCm-Developer-Tools/HIP/issues/374 --- external/hip.cmake | 7 ------- 1 file changed, 7 deletions(-) diff --git a/external/hip.cmake b/external/hip.cmake index 53a28a4caa..91a9e2cd82 100644 --- a/external/hip.cmake +++ b/external/hip.cmake @@ -3,13 +3,6 @@ cmake_minimum_required(VERSION 3.21.0) set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_EXTENSIONS OFF) set(CMAKE_HIP_STANDARD_REQUIRED ON) -# N.B. need relaxed constexpr for std::complex -# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions%5B/url%5D: -if (DEFINED CMAKE_HIP_FLAGS) - set(CMAKE_HIP_FLAGS "--expt-relaxed-constexpr ${CMAKE_HIPE_FLAGS}") -else() - set(CMAKE_HIP_FLAGS "--expt-relaxed-constexpr") -endif() enable_language(HIP) set(HIP_FOUND TRUE) From 0e4259931cbefb796d2e915a140a7bc73779303d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 20:14:37 -0400 Subject: [PATCH 091/592] librett.cmake: support HIP --- external/librett.cmake | 54 +++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/external/librett.cmake b/external/librett.cmake index aa1287d90d..53ecdaa893 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -37,11 +37,6 @@ else() message("** Will clone LibreTT from ${LIBRETT_URL}") - # need to change the separator of list to avoid issues with ExternalProject parsing -# set(CUDA_FLAGS "${CUDA_NVCC_FLAGS}") -# string(REPLACE ";" "::" CUDA_FLAGS "${CUDA_NVCC_FLAGS}") - #message(STATUS "CUDA_FLAGS: " "${CUDA_FLAGS}") - set(LIBRETT_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} @@ -62,22 +57,37 @@ else() -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS} -DCMAKE_AR=${CMAKE_AR} - -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} - -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} - -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} -DENABLE_UMPIRE=OFF # N.B. 
ThreadSafeUMDynamicPool this no longer exists!!! Must teach LibreTT to take allocate/deallocate methods # from the user code -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR} -DENABLE_NO_ALIGNED_ALLOC=ON - -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} - -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} - -DENABLE_CUDA=ON ) - if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) - endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + if (ENABLE_CUDA) + list(APPEND LIBRETT_CMAKE_ARGS + -DENABLE_CUDA=ON + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} + -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} + ) + if (DEFINED CMAKE_CUDA_ARCHITECTURES) + list(APPEND LIBRETT_CMAKE_ARGS "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + endif() + if (ENABLE_HIP) + list(APPEND LIBRETT_CMAKE_ARGS + -DENABLE_HIP=ON + -DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER} + -DCMAKE_HIP_STANDARD=${CMAKE_HIP_STANDARD} + -DCMAKE_HIP_EXTENSIONS=${CMAKE_HIP_EXTENSIONS} + ) + if (DEFINED CMAKE_HIP_ARCHITECTURES) + list(APPEND LIBRETT_CMAKE_ARGS "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}") + endif(DEFINED CMAKE_HIP_ARCHITECTURES) + endif() if (CMAKE_TOOLCHAIN_FILE) set(LIBRETT_CMAKE_ARGS "${LIBRETT_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") @@ -147,9 +157,21 @@ set_target_properties(TiledArray_LIBRETT "$;$" INTERFACE_LINK_LIBRARIES "$;$" - INTERFACE_COMPILE_DEFINITIONS - "LIBRETT_USES_CUDA=1" ) +if (ENABLE_CUDA) + set_target_properties(TiledArray_LIBRETT + PROPERTIES + INTERFACE_COMPILE_DEFINITIONS + "LIBRETT_USES_CUDA=1" + ) +endif() +if (ENABLE_HIP) + set_target_properties(TiledArray_LIBRETT + PROPERTIES + INTERFACE_COMPILE_DEFINITIONS + 
"LIBRETT_USES_HIP=1" + ) +endif() install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray) From 7765d10d844d975cdd62031d1415aee6f0ca615b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 20:23:50 -0400 Subject: [PATCH 092/592] umpire.cmake: support HIP --- external/umpire.cmake | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/external/umpire.cmake b/external/umpire.cmake index 1ee9dde48b..081afc9d4a 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -102,9 +102,20 @@ else() -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} ) if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND UMPIRE_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + list(APPEND UMPIRE_CMAKE_ARGS "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") endif(DEFINED CMAKE_CUDA_ARCHITECTURES) endif(ENABLE_CUDA) + if (ENABLE_HIP) + list(APPEND UMPIRE_CMAKE_ARGS + -DENABLE_HIP=ON + -DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER} + -DCMAKE_HIP_STANDARD=${CMAKE_HIP_STANDARD} + -DCMAKE_HIP_EXTENSIONS=${CMAKE_HIP_EXTENSIONS} + ) + if (DEFINED CMAKE_HIP_ARCHITECTURES) + list(APPEND UMPIRE_CMAKE_ARGS "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}") + endif(DEFINED CMAKE_HIP_ARCHITECTURES) + endif(ENABLE_HIP) if (CMAKE_TOOLCHAIN_FILE) set(UMPIRE_CMAKE_ARGS "${UMPIRE_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" From 360dd47091d00e392caa14f7a2ee9e187299eb55 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 20:29:06 -0400 Subject: [PATCH 093/592] FindOrFetchBTAS: tell linalg++ to look for CUDA or HIP --- cmake/modules/FindOrFetchBTAS.cmake | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmake/modules/FindOrFetchBTAS.cmake b/cmake/modules/FindOrFetchBTAS.cmake index 57a4b94ac0..35ad3dd200 100644 --- a/cmake/modules/FindOrFetchBTAS.cmake +++ b/cmake/modules/FindOrFetchBTAS.cmake @@ -17,8 +17,14 @@ if (NOT TARGET BTAS::BTAS) 
set(_linalgpp_use_standard_linalg_kits TRUE) endif(DEFINED BLA_VENDOR) - if (NOT TILEDARRAY_HAS_CUDA) - # tell BLAS++/LAPACK++ to ignore CUDA + if (TILEDARRAY_HAS_CUDA) + # tell BLAS++/LAPACK++ to also look for CUDA + set(gpu_backend cuda CACHE STRING "The device backend to use for Linalg++") + elseif (TILEDARRAY_HAS_HIP) + # tell BLAS++/LAPACK++ to also look for HIP + set(gpu_backend hip CACHE STRING "The device backend to use for Linalg++") + else () + # tell BLAS++/LAPACK++ to not look for device backends set(gpu_backend none CACHE STRING "The device backend to use for Linalg++") endif() From 25824469959e856434db2b2c5e00126929cb081b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 17 Sep 2023 20:48:29 -0400 Subject: [PATCH 094/592] bump VRG cmake kit to the latest --- external/versions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/versions.cmake b/external/versions.cmake index 6f6c05c977..89c93da6c5 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG d7d589dddb89bac879ec1df97d8f9510a80055f0) +set(TA_TRACKED_VGCMAKEKIT_TAG e68b3b4e8a57a175bb9d1b4e4cfa7d31b9363de5) # Boost explicitly downgraded to 1.59 from 1.68 set(TA_TRACKED_BOOST_VERSION 1.59) From dd0440a027af619e2088e328a75851aaa0432a43 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 05:17:49 -0400 Subject: [PATCH 095/592] [cmake] bump Librett tag to pull in https://github.com/victor-anisimov/Librett/pull/10 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 1aabcf0a5f..c85985b7f1 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -64,7 +64,7 @@ Compiling BTAS requires the following prerequisites: Optional prerequisites: - [CUDA compiler and 
runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag f5ebdbbba9c9689aa4613a5469021db2dacd8e46). + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 354e0ccee54aeb2f191c3ce2c617ebf437e49d83). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: diff --git a/external/versions.cmake b/external/versions.cmake index 89c93da6c5..4bea408a0d 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) -set(TA_TRACKED_LIBRETT_TAG f5ebdbbba9c9689aa4613a5469021db2dacd8e46) -set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) +set(TA_TRACKED_LIBRETT_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG f5ebdbbba9c9689aa4613a5469021db2dacd8e46) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) From 1f721aad6a28c5d5567c7638c14f5597f4bbc41d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 05:18:11 -0400 Subject: [PATCH 096/592] [cmake] pass through CMAKE_PREFIX_PATH to Librett+Umpire --- external/librett.cmake | 7 ++++++- external/umpire.cmake | 5 +++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/external/librett.cmake b/external/librett.cmake index 53ecdaa893..c04cf56b38 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -35,6 +35,11 @@ else() set(LIBRETT_TAG ${TA_TRACKED_LIBRETT_TAG}) endif (NOT LIBRETT_TAG) + if (CMAKE_PREFIX_PATH) + set(LIBRETT_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}) + endif() + list(APPEND LIBRETT_CMAKE_PREFIX_PATH ${_UMPIRE_INSTALL_DIR}) + message("** Will clone LibreTT from ${LIBRETT_URL}") set(LIBRETT_CMAKE_ARGS @@ -61,7 +66,7 @@ else() # N.B. ThreadSafeUMDynamicPool this no longer exists!!! 
Must teach LibreTT to take allocate/deallocate methods # from the user code -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool - -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR} + -DCMAKE_PREFIX_PATH=${LIBRETT_CMAKE_PREFIX_PATH} -DENABLE_NO_ALIGNED_ALLOC=ON ) if (ENABLE_CUDA) diff --git a/external/umpire.cmake b/external/umpire.cmake index 081afc9d4a..24c9e5e56d 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -55,11 +55,16 @@ else() set(BLT_CXX_STD ${BLT_CXX_STD_MAX}) endif() + if (CMAKE_PREFIX_PATH) + set(UMPIRE_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}) + endif() + set(UMPIRE_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${UMPIRE_CMAKE_PREFIX_PATH} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} From ebc3836fdc134248063e68051b1e3ab686808d8a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 05:39:28 -0400 Subject: [PATCH 097/592] [dox] updated INSTALL.md for HIP/ROCm --- INSTALL.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c85985b7f1..00541a9135 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -32,7 +32,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b See the current [Travis CI matrix](.travis.yml) for the most up-to-date list of compilers that are known to work. -- [CMake](https://cmake.org/), version 3.15 or higher; if CUDA support is needed, CMake 3.18 or higher is required. +- [CMake](https://cmake.org/), version 3.15 or higher; if {CUDA,HIP} support is needed, CMake {3.18,3.21} or higher is required. 
- [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub) - [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing) - [Boost libraries](www.boost.org/), version 1.59 or higher (will be downloaded automatically, if missing). The following principal Boost components are used: @@ -63,8 +63,11 @@ Compiling BTAS requires the following prerequisites: - BLAS and LAPACK libraries Optional prerequisites: -- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 354e0ccee54aeb2f191c3ce2c617ebf437e49d83). +- for execution on GPGPUs: + - device programming runtime: + - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. + - [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably.
+ - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 354e0ccee54aeb2f191c3ce2c617ebf437e49d83). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: @@ -323,17 +326,18 @@ To discover and configure the use of Intel MKL consider these suggestions: Also note that even if OpenMP or TBB backends are used, TiledArray will be default set the number of threads to be used by MKL kernels to 1, regardless of the value of environment variables `MKL_NUM_THREADS`/`OMP_NUM_THREADS`. It is possible to change the number of threads to be used programmatically in your application by calling MKL function `mkl_set_num_threads()`. -## CUDA +## GPGPU support -Support for execution on CUDA-enabled hardware is controlled by the following variables: +Support for execution on NVIDIA and AMD GPGPUs is controlled by the following variables: * `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF]. * `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.). -* `ENABLE_HIP` -- Set to `ON` to turn on HIP support. [Default=OFF]. 
+* `ENABLE_HIP` -- Set to `ON` to turn on HIP/ROCm support. [Default=OFF]. * `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not be normally needed; it is strongly recommended to let TiledArray build and install LibreTT. * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not be normally needed; it is strongly recommended to let TiledArray build and install Umpire. -For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. +- For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. +- For the ROCm platform to be discoverable add its prefix path (e.g., `/opt/rocm`) to `CMAKE_PREFIX_PATH` ## Eigen 3 From b06a17a6b4438b178fdc629dd4db80fc50dc96a6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 06:32:08 -0400 Subject: [PATCH 098/592] removed TILEDARRAY_CHECK_CUDA_ERROR --- external/cuda.cmake | 1 - src/TiledArray/config.h.in | 4 +++- src/TiledArray/cuda/cublas.h | 2 -- src/TiledArray/external/cuda.h | 6 ------ 4 files changed, 3 insertions(+), 10 deletions(-) diff --git a/external/cuda.cmake b/external/cuda.cmake index dd8bef0dee..00a8b17477 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -16,7 +16,6 @@ enable_language(CUDA) set(CUDA_FOUND TRUE) set(TILEDARRAY_HAS_CUDA 1 CACHE BOOL "Whether TiledArray has CUDA support") -set(TILEDARRAY_CHECK_CUDA_ERROR 1 CACHE BOOL "Whether TiledArray will check CUDA errors") # find CUDA toolkit # NB CUDAToolkit does NOT have COMPONENTS diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 0c4d5d5cbc..4cba5ee840 100644 
--- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -76,7 +76,9 @@ /* Define if TiledArray configured with CUDA support */ #cmakedefine TILEDARRAY_HAS_CUDA @TILEDARRAY_HAS_CUDA@ -#cmakedefine TILEDARRAY_CHECK_CUDA_ERROR @TILEDARRAY_CHECK_CUDA_ERROR@ + +/* Define if TiledArray configured with HIP support */ +#cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@ /* Is TA::Tensor memory profiling enabled? */ #cmakedefine TA_TENSOR_MEM_PROFILE 1 diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/cuda/cublas.h index 8d4085eabb..501a0402d1 100644 --- a/src/TiledArray/cuda/cublas.h +++ b/src/TiledArray/cuda/cublas.h @@ -40,14 +40,12 @@ inline void __cublasSafeCall(cublasStatus_t err, const char *file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR if (CUBLAS_STATUS_SUCCESS != err) { std::stringstream ss; ss << "cublasSafeCall() failed at: " << file << "(" << line << ")"; std::string what = ss.str(); throw std::runtime_error(what); } -#endif return; } diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index dd141f6f52..a7d7601915 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -52,27 +52,22 @@ #define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) inline void __cudaSafeCall(cudaError err, const char* file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR if (cudaSuccess != err) { std::stringstream ss; ss << "cudaSafeCall() failed at: " << file << ":" << line; std::string what = ss.str(); throw thrust::system_error(err, thrust::cuda_category(), what); } -#endif } inline void __cudaSafeCallNoThrow(cudaError err, const char* file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR if (cudaSuccess != err) { madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); } -#endif } inline void __cudaCheckError(const char* file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR cudaError err = cudaGetLastError(); if (cudaSuccess != err) { std::stringstream 
ss; @@ -80,7 +75,6 @@ inline void __cudaCheckError(const char* file, const int line) { std::string what = ss.str(); throw thrust::system_error(err, thrust::cuda_category(), what); } -#endif } namespace TiledArray { From 899b8d06d9760cfee66e53c56a8801e1d5058157 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 06:32:45 -0400 Subject: [PATCH 099/592] moved TA::detail::mpi_local_rank_size to external/madness.h --- src/TiledArray/external/cuda.h | 6 ------ src/TiledArray/external/madness.h | 8 ++++++++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index a7d7601915..1d169b1098 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -81,12 +81,6 @@ namespace TiledArray { namespace detail { -inline std::pair mpi_local_rank_size(World& world) { - auto host_comm = - world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0); - return std::make_pair(host_comm.Get_rank(), host_comm.Get_size()); -} - inline int num_cuda_streams() { int num_streams = -1; char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); diff --git a/src/TiledArray/external/madness.h b/src/TiledArray/external/madness.h index 8a8efd6b2e..bf75813c61 100644 --- a/src/TiledArray/external/madness.h +++ b/src/TiledArray/external/madness.h @@ -128,6 +128,14 @@ inline World split(const World& w, int color, int key = 0) { return std::move(comm); } +namespace detail { +inline std::pair mpi_local_rank_size(World& world) { + auto host_comm = + world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0); + return std::make_pair(host_comm.Get_rank(), host_comm.Get_size()); +} +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED From e03b17db417cbd88f3efb76d98fc2a8e1cbb4ef5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 06:38:05 -0400 Subject: [PATCH 100/592] initial version of hip.h --- 
src/CMakeLists.txt | 22 +- src/TiledArray/external/hip.h | 466 ++++++++++++++++++++++++++++++++++ 2 files changed, 485 insertions(+), 3 deletions(-) create mode 100644 src/TiledArray/external/hip.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a0dc42e6f5..e4a3b0211e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -208,10 +208,8 @@ TiledArray/util/vector.h ) if(CUDA_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h - TiledArray/external/librett.h TiledArray/cuda/cublas.h TiledArray/cuda/btas_cublas.h TiledArray/cuda/btas_um_tensor.h @@ -225,9 +223,18 @@ if(CUDA_FOUND) TiledArray/cuda/thrust.h TiledArray/cuda/allocators.h TiledArray/cuda/um_storage.h) +endif(CUDA_FOUND) +if(HIP_FOUND) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/hip.h) endif(CUDA_FOUND) +if(HIP_FOUND OR CUDA_FOUND) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/librett.h) +endif() + set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp TiledArray/tensor/tensor.cpp @@ -275,10 +282,19 @@ if(CUDA_FOUND) LANGUAGE CUDA) # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt) endif(CUDA_FOUND) +if (HIP_FOUND) + list(APPEND _TILEDARRAY_DEPENDENCIES hip::host) +endif() + +# LibreTT needed for either CUDA or HIP +if(CUDA_FOUND OR HIP_FOUND) + list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT) +endif() + if( TARGET TiledArray_SCALAPACK ) list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_SCALAPACK) endif() diff --git a/src/TiledArray/external/hip.h b/src/TiledArray/external/hip.h new file mode 100644 index 0000000000..75dbfc6955 --- /dev/null +++ b/src/TiledArray/external/hip.h @@ -0,0 +1,466 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#ifndef TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#define TILEDARRAY_EXTERNAL_HIP_H__INCLUDED + +#include +#include +#include + +#include + +#ifdef TILEDARRAY_HAS_HIP + +#include + +#include + +#include +#include +#include +#include + +#include + +#define HipSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) +#define HipSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) +#define HipCheckError() __hipCheckError(__FILE__, __LINE__) + +inline void __hipSafeCall(hipError_t err, const char* file, const int line) { + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipSafeCall() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw std::runtime_error(ss.str()); + } +} + +inline void __hipSafeCallNoThrow(hipError_t err, const char* file, + const int line) { + if (hipSuccess != err) { + madness::print_error("hipSafeCallNoThrow() failed at: ", file, ":", line, + ": ", hipGetErrorString(err)); + } +} + +inline void __hipCheckError(const char* file, const int line) { + auto err = hipGetLastError(); + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipCheckError() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw 
std::runtime_error(ss.str()); + } +} + +namespace TiledArray { + +namespace detail { + +inline int num_streams() { + int num_streams = -1; + char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); + /// default num of streams is 3 + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { + num_streams = 3; + } + return num_streams; +} + +inline int num_devices() { + int num_devices = -1; + HipSafeCall(hipGetDeviceCount(&num_devices)); + return num_devices; +} + +inline int current_device_id(World& world) { + int mpi_local_size = -1; + int mpi_local_rank = -1; + std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); + + int num_devices = detail::num_devices(); + + int device_id = -1; + // devices may already be pre-mapped + // if mpi_local_size <= num_devices : all ranks are in same resource set, map + // round robin + if (mpi_local_size <= num_devices) { + device_id = mpi_local_rank % num_devices; + } else { // mpi_local_size > num_devices + char* cvd_cstr = std::getenv("HIP_VISIBLE_DEVICES"); + if (cvd_cstr) { // HIP_VISIBLE_DEVICES is set, assume that pre-mapped + // make sure that there is only 1 device available here + if (num_devices != 1) { + throw std::runtime_error( + std::string( + "HIP_VISIBLE_DEVICES environment variable is set, hence using " + "the provided device-to-rank mapping; BUT TiledArray found ") + + std::to_string(num_devices) + + " HIP devices; only 1 HIP device / MPI process is supported"); + } + device_id = 0; + } else { // not enough devices + devices are not pre-mapped + throw std::runtime_error( + std::string("TiledArray found ") + std::to_string(mpi_local_size) + + " MPI ranks on a node with " + std::to_string(num_devices) + + " HIP devices; only 1 MPI process / HIP device model is currently " + "supported"); + } + } + + return device_id; +} + +inline void HIPRT_CB hip_readyflag_callback(void* userData) { + // convert void * to std::atomic + std::atomic* flag = static_cast*>(userData); + // set 
the flag to be true + flag->store(true); +} + +struct ProbeFlag { + ProbeFlag(std::atomic* f) : flag(f) {} + + bool operator()() const { return flag->load(); } + + std::atomic* flag; +}; + +inline void thread_wait_stream(const hipStream_t& stream) { + std::atomic* flag = new std::atomic(false); + + HipSafeCall(hipLaunchHostFunc(stream, detail::hip_readyflag_callback, flag)); + + detail::ProbeFlag probe(flag); + + // wait with sleep and do not do work + madness::ThreadPool::await(probe, false, true); + // madness::ThreadPool::await(probe, true, true); + + delete flag; +} + +} // namespace detail + +inline const hipStream_t*& tls_stream_accessor() { + static thread_local const hipStream_t* thread_local_stream_ptr{nullptr}; + return thread_local_stream_ptr; +} + +inline void synchronize_stream(const hipStream_t* stream) { + tls_stream_accessor() = stream; +} + +/** + * hipEnv maintains the HIP-related part of the runtime environment, + * such as HIP-specific memory allocators + * + * \note this is a Singleton + */ +class hipEnv { + public: + ~hipEnv() { + // destroy streams on current device + for (auto& stream : streams_) { + HipSafeCallNoThrow(hipStreamDestroy(stream)); + } + } + + hipEnv(const hipEnv&) = delete; + hipEnv(hipEnv&&) = delete; + hipEnv& operator=(const hipEnv&) = delete; + hipEnv& operator=(hipEnv&&) = delete; + + /// access the singleton instance; if not initialized will be + /// initialized via hipEnv::initialize() with the default params + static std::unique_ptr& instance() { + if (!instance_accessor()) { + initialize(); + } + return instance_accessor(); + } + + // clang-format off + /// initialize the instance using explicit params + /// \param world the world to use for initialization + /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least + /// this size (bytes) [default=2^25] + /// \param pinned_alloc_limit the maximum total amount of 
memory (in bytes) that + /// allocator returned by `this->pinned_allocator()` can allocate; + /// this allocator is not used by default [default=0] + // clang-format on + static void initialize(World& world = TiledArray::get_default_world(), + const std::uint64_t page_size = (1ul << 25), + const std::uint64_t pinned_alloc_limit = (1ul << 40)) { + static std::mutex mtx; // to make initialize() reentrant + std::scoped_lock lock{mtx}; + // only the winner of the lock race gets to initialize + if (instance_accessor() == nullptr) { + int num_streams = detail::num_streams(); + int num_devices = detail::num_devices(); + int device_id = detail::current_device_id(world); + // set device for current MPI process .. will be set in the ctor as well + HipSafeCall(hipSetDevice(device_id)); + HipSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); + + // uncomment to debug umpire ops + // + // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( + // umpire::util::message::Debug); + + // make Thread Safe UM Dynamic POOL + + auto& rm = umpire::ResourceManager::getInstance(); + + auto mem_total_free = hipEnv::memory_total_and_free_device(); + + // turn off Umpire introspection for non-Debug builds +#ifndef NDEBUG + constexpr auto introspect = true; +#else + constexpr auto introspect = false; +#endif + + // allocate all currently-free memory for UM pool + auto um_dynamic_pool = + rm.makeAllocator( + "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, + pinned_alloc_limit); + + // allocate zero memory for device pool + auto dev_size_limited_alloc = + rm.makeAllocator( + "size_limited_alloc", rm.getAllocator("DEVICE"), + mem_total_free.first); + auto dev_dynamic_pool = + rm.makeAllocator( + "HIPDynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); + + // allocate pinned_alloc_limit in pinned memory + auto pinned_size_limited_alloc = + rm.makeAllocator( + "SizeLimited_PINNED", rm.getAllocator("PINNED"), + pinned_alloc_limit); + auto pinned_dynamic_pool 
= + rm.makeAllocator( + "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, + page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); + + auto hip_env = std::unique_ptr( + new hipEnv(world, num_devices, device_id, num_streams, + um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); + instance_accessor() = std::move(hip_env); + } + } + + World& world() const { return *world_; } + + int num_devices() const { return num_devices_; } + + int current_device_id() const { return current_device_id_; } + + int num_streams() const { return num_streams_; } + + bool concurrent_managed_access() const { + return device_concurrent_managed_access_; + } + + size_t stream_id(const hipStream_t& stream) const { + auto it = std::find(streams_.begin(), streams_.end(), stream); + if (it == streams_.end()) abort(); + return it - streams_.begin(); + } + + /// @return the total size of all and free device memory on the current device + static std::pair memory_total_and_free_device() { + std::pair result; + // N.B. 
hipMemGetInfo returns {free,total} + HipSafeCall(hipMemGetInfo(&result.second, &result.first)); + return result; + } + + /// Collective call to probe HIP {total,free} memory + + /// @return the total size of all and free device memory on every rank's + /// device + std::vector> memory_total_and_free() const { + auto world_size = world_->size(); + std::vector total_memory(world_size, 0), free_memory(world_size, 0); + auto rank = world_->rank(); + std::tie(total_memory.at(rank), free_memory.at(rank)) = + hipEnv::memory_total_and_free_device(); + world_->gop.sum(total_memory.data(), total_memory.size()); + world_->gop.sum(free_memory.data(), free_memory.size()); + std::vector> result(world_size); + for (int r = 0; r != world_size; ++r) { + result.at(r) = {total_memory.at(r), free_memory.at(r)}; + } + return result; + } + + const hipStream_t& stream(std::size_t i) const { return streams_.at(i); } + + const hipStream_t& stream_h2d() const { return streams_[num_streams_]; } + + const hipStream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } + + /// @return a (non-thread-safe) Umpire allocator for device UM + umpire::Allocator& um_allocator() { return um_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by um_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()` + // clang-format on + std::size_t um_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + um_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + um_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + /// @return a (non-thread-safe) Umpire allocator for device memory + umpire::Allocator& device_allocator() { return device_allocator_; } + + // clang-format off + /// 
@return the max actual amount of memory held by um_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()` + // clang-format on + std::size_t device_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + device_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + device_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + /// @return an Umpire allocator that allocates from a + /// pinned memory pool + /// @warning this is not a thread-safe allocator, should be only used when + /// wrapped into umpire_allocator_impl + umpire::Allocator& pinned_allocator() { return pinned_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by pinned_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()` + // clang-format on + std::size_t pinned_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + pinned_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + pinned_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + protected: + hipEnv(World& world, int num_devices, int device_id, int num_streams, + umpire::Allocator um_alloc, umpire::Allocator device_alloc, + umpire::Allocator pinned_alloc) + : world_(&world), + um_allocator_(um_alloc), + device_allocator_(device_alloc), + pinned_allocator_(pinned_alloc), + num_devices_(num_devices), + current_device_id_(device_id), + num_streams_(num_streams) { + if (num_devices <= 0) { + throw std::runtime_error("No 
HIP-Enabled GPUs Found!\n"); + } + + // set device for current MPI process + HipSafeCall(hipSetDevice(current_device_id_)); + + /// check the capability of HIP device + hipDeviceProp prop; + HipSafeCall(hipGetDeviceProperties(&prop, device_id)); + if (!prop.managedMemory) { + throw std::runtime_error("HIP Device doesn't support managedMemory\n"); + } + int concurrent_managed_access; + HipSafeCall(hipDeviceGetAttribute(&concurrent_managed_access, + hipDeviceAttributeConcurrentManagedAccess, + device_id)); + device_concurrent_managed_access_ = concurrent_managed_access; + if (!device_concurrent_managed_access_) { + std::cout << "\nWarning: HIP Device doesn't support " + "ConcurrentManagedAccess!\n\n"; + } + + // creates streams on current device + streams_.resize(num_streams_ + 2); + for (auto& stream : streams_) { + HipSafeCall(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + std::cout << "created " << num_streams_ << " HIP streams + 2 I/O streams" + << std::endl; + } + + private: + // the world used to initialize this + World* world_; + + /// allocator backed by a (non-thread-safe) dynamically-sized pool for UM + umpire::Allocator um_allocator_; + /// allocator backed by a (non-thread-safe) dynamically-sized pool for device + /// memory + umpire::Allocator device_allocator_; + // allocates from a dynamic, size-limited pinned memory pool + // N.B. 
not thread safe, so must be wrapped into umpire_allocator_impl + umpire::Allocator pinned_allocator_; + + int num_devices_; + int current_device_id_; + bool device_concurrent_managed_access_; + + int num_streams_; + std::vector streams_; + + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; + return instance_; + } +}; + +namespace detail { + +template +const hipStream_t& get_stream_based_on_range(const Range& range) { + // TODO better way to get stream based on the id of tensor + auto stream_id = range.offset() % hipEnv::instance()->num_streams(); + auto& stream = hipEnv::instance()->stream(stream_id); + return stream; +} + +} // namespace detail + +} // namespace TiledArray + +#endif // TILEDARRAY_HAS_HIP + +#endif // TILEDARRAY_EXTERNAL_HIP_H__INCLUDED From 4ae79594fe59f493be628506c0d991fa7429da12 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 18 Sep 2023 17:05:56 -0400 Subject: [PATCH 101/592] phase 1 of cuda -> device conversion - cuda.h -> device.h - {cuda,hip} -> device for many API elements --- examples/cuda/cuda_librett.cpp | 2 +- examples/cuda/cuda_task.cpp | 12 +- examples/cuda/ta_cc_abcd_cuda.cpp | 4 +- examples/cuda/ta_dense_cuda.cpp | 42 +- examples/cuda/ta_reduce_cuda.cpp | 10 +- examples/cuda/ta_vector_cuda.cpp | 12 +- src/CMakeLists.txt | 54 +- src/TiledArray/config.h.in | 11 + src/TiledArray/cuda/btas_um_tensor.cpp | 51 -- src/TiledArray/{cuda => device}/allocators.h | 53 +- src/TiledArray/{cuda => device}/btas_cublas.h | 288 +++++----- src/TiledArray/device/btas_um_tensor.cpp | 52 ++ .../{cuda => device}/btas_um_tensor.h | 210 +++---- .../{cuda => device}/cpu_cuda_vector.cu | 2 +- .../{cuda => device}/cpu_cuda_vector.h | 12 +- src/TiledArray/{cuda => device}/cublas.h | 0 .../device_task_fn.h} | 287 +++++----- .../{cuda => device}/kernel/mult_kernel.cu | 4 +- .../{cuda => device}/kernel/mult_kernel.h | 0 .../kernel/mult_kernel_impl.h | 4 +- .../{cuda => device}/kernel/reduce_kernel.cu | 4 +- 
.../{cuda => device}/kernel/reduce_kernel.h | 0 .../kernel/reduce_kernel_impl.h | 8 +- src/TiledArray/{cuda => device}/platform.h | 8 +- src/TiledArray/{cuda => device}/thrust.h | 0 src/TiledArray/{cuda => device}/um_storage.cu | 16 +- src/TiledArray/{cuda => device}/um_storage.h | 39 +- src/TiledArray/dist_eval/binary_eval.h | 8 +- src/TiledArray/dist_eval/contraction_eval.h | 19 +- src/TiledArray/dist_eval/dist_eval.h | 6 +- src/TiledArray/dist_eval/unary_eval.h | 6 +- src/TiledArray/expressions/expr.h | 38 +- src/TiledArray/external/cuda.h | 512 +----------------- src/TiledArray/external/{hip.h => device.h} | 371 ++++++++++--- src/TiledArray/external/umpire.h | 32 +- src/TiledArray/fwd.h | 39 +- src/TiledArray/host/allocator.h | 9 +- src/TiledArray/host/env.h | 4 +- src/TiledArray/reduce_task.h | 117 ++-- src/TiledArray/tensor/type_traits.h | 10 +- src/TiledArray/tiledarray.cpp | 40 +- tests/CMakeLists.txt | 6 +- tests/expressions_cuda_um.cpp | 6 +- tests/librett.cpp | 18 +- tests/tensor_um.cpp | 2 +- 45 files changed, 1084 insertions(+), 1344 deletions(-) delete mode 100644 src/TiledArray/cuda/btas_um_tensor.cpp rename src/TiledArray/{cuda => device}/allocators.h (63%) rename src/TiledArray/{cuda => device}/btas_cublas.h (66%) create mode 100644 src/TiledArray/device/btas_um_tensor.cpp rename src/TiledArray/{cuda => device}/btas_um_tensor.h (81%) rename src/TiledArray/{cuda => device}/cpu_cuda_vector.cu (98%) rename src/TiledArray/{cuda => device}/cpu_cuda_vector.h (96%) rename src/TiledArray/{cuda => device}/cublas.h (100%) rename src/TiledArray/{cuda/cuda_task_fn.h => device/device_task_fn.h} (72%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel.cu (96%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel.h (100%) rename src/TiledArray/{cuda => device}/kernel/mult_kernel_impl.h (95%) rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.cu (98%) rename src/TiledArray/{cuda => device}/kernel/reduce_kernel.h (100%) rename 
src/TiledArray/{cuda => device}/kernel/reduce_kernel_impl.h (95%) rename src/TiledArray/{cuda => device}/platform.h (93%) rename src/TiledArray/{cuda => device}/thrust.h (100%) rename src/TiledArray/{cuda => device}/um_storage.cu (66%) rename src/TiledArray/{cuda => device}/um_storage.h (78%) rename src/TiledArray/external/{hip.h => device.h} (53%) diff --git a/examples/cuda/cuda_librett.cpp b/examples/cuda/cuda_librett.cpp index c513f41af1..1460f54117 100644 --- a/examples/cuda/cuda_librett.cpp +++ b/examples/cuda/cuda_librett.cpp @@ -23,7 +23,7 @@ #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #include diff --git a/examples/cuda/cuda_task.cpp b/examples/cuda/cuda_task.cpp index a019523b6e..f2b0b2ab1b 100644 --- a/examples/cuda/cuda_task.cpp +++ b/examples/cuda/cuda_task.cpp @@ -2,8 +2,8 @@ // Created by Chong Peng on 11/14/18. // -#include -#include +#include +#include #include using value_type = double; @@ -28,8 +28,8 @@ void verify(const tile_type& tile, value_type value, std::size_t index) { tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, std::size_t index) { - CudaSafeCall( - cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice( + TiledArray::deviceEnv::instance()->current_device_id())); /// make result Tensor using Storage = typename tile_type::tensor_type::storage_type; Storage result_storage; @@ -81,7 +81,7 @@ void process_task(madness::World* world, tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*, std::size_t) = &::scale; - madness::Future scale_future = madness::add_cuda_task( + madness::Future scale_future = madness::add_device_task( *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i); /// this should start until scale_taskfn is finished @@ -98,7 +98,7 @@ int try_main(int argc, char** argv) { std::vector streams(n_stream); for (auto& stream : streams) { // create the streams - CudaSafeCall(cudaStreamCreate(&stream)); + 
DeviceSafeCall(cudaStreamCreate(&stream)); // std::cout << "stream: " << stream << "\n"; } diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/cuda/ta_cc_abcd_cuda.cpp index 0887c90562..b531dee495 100644 --- a/examples/cuda/ta_cc_abcd_cuda.cpp +++ b/examples/cuda/ta_cc_abcd_cuda.cpp @@ -17,7 +17,7 @@ * */ -#include +#include #include #include #include @@ -186,7 +186,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::pow(n_uocc, 4) / std::pow(1024., 3); using CUDATile = - btas::Tensor>; + btas::Tensor>; using CUDAMatrix = TA::DistArray>; // Construct tensors diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp index ab8c118622..864938302c 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/cuda/ta_dense_cuda.cpp @@ -24,8 +24,8 @@ // clang-format off #include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" +#include +#include "TiledArray/device/cpu_cuda_vector.h" #include // clang-format on @@ -98,7 +98,7 @@ void to_host( // do norm on GPU auto tile_norm = norm(tile.tensor()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); return tile_norm; @@ -120,7 +120,7 @@ void to_device( btas::Tensor>> &tile) { auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); return norm(tile.tensor()); @@ -218,7 +218,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, using PinnedTile = btas::Tensor>>; + TiledArray::device_pinned_allocator>>; using PinnedMatrix = TA::DistArray>; // using TAMatrix = TA::DistArray>; @@ -339,7 +339,7 @@ int try_main(int argc, char **argv) { << std::endl << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] " - "[storage type = cuda_um_btas_varray]\n"; + "[storage type = device_um_btas_varray]\n"; return 0; } const long Nm = atol(argv[1]); @@ -376,15 +376,15 @@ int 
try_main(int argc, char **argv) { return 1; } - const auto storage_type = - (argc >= 10) ? std::string(argv[9]) : std::string{"cuda_um_btas_varray"}; + const auto storage_type = (argc >= 10) ? std::string(argv[9]) + : std::string{"device_um_btas_varray"}; - if (storage_type != "cuda_um_btas_varray" && + if (storage_type != "device_um_btas_varray" && storage_type != "cuda_um_thrust_vector" && storage_type != "cpu_cuda_vector") { std::cerr << "Error: invalid storage type: " << storage_type << "\n Valid option includes: cuda_um_vector or " - "cuda_um_btas_varray or cuda_um_thrust_vector " + "device_um_btas_varray or cuda_um_thrust_vector " "or cpu_cuda_vector. \n"; } std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" @@ -407,13 +407,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -440,9 +440,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); @@ -462,19 +462,19 @@ int try_main(int argc, char **argv) { // do_main_body>(world, Nm, Bm, Nn, // Bn, // Nk, Bk, nrepeat); - // } else if 
(storage_type == "cuda_um_btas_varray") { - if (storage_type == "cuda_um_btas_varray") { + // } else if (storage_type == "device_um_btas_varray") { + if (storage_type == "device_um_btas_varray") { if (scalar_type_str == "double") - do_main_body>( + do_main_body>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "float") - do_main_body>(world, Nm, Bm, Nn, - Bn, Nk, Bk, nrepeat); + do_main_body>( + world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "zdouble") - do_main_body>>( + do_main_body>>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else if (scalar_type_str == "zfloat") - do_main_body>>( + do_main_body>>( world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); else { abort(); // unreachable diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/cuda/ta_reduce_cuda.cpp index e453069892..b475ff78ef 100644 --- a/examples/cuda/ta_reduce_cuda.cpp +++ b/examples/cuda/ta_reduce_cuda.cpp @@ -24,7 +24,7 @@ // clang-format off #include -#include +#include // clang-format on template @@ -298,13 +298,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -331,9 +331,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << 
"error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/cuda/ta_vector_cuda.cpp index 1593a68e8b..a82a057807 100644 --- a/examples/cuda/ta_vector_cuda.cpp +++ b/examples/cuda/ta_vector_cuda.cpp @@ -24,8 +24,8 @@ // clang-format off #include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" +#include +#include "TiledArray/device/cpu_cuda_vector.h" #include // clang-format on @@ -316,13 +316,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); if (num_cuda_devices <= 0) { throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -349,9 +349,9 @@ int try_main(int argc, char **argv) { error = cudaDeviceGetAttribute( &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); + error = device::setDevice(cuda_device_id); if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; error = cudaMemGetInfo(&free_mem, &total_mem); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index e4a3b0211e..156bb7b2cf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -207,34 +207,30 @@ TiledArray/util/time.h TiledArray/util/vector.h ) -if(CUDA_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES - TiledArray/external/cuda.h - TiledArray/cuda/cublas.h - TiledArray/cuda/btas_cublas.h - 
TiledArray/cuda/btas_um_tensor.h - TiledArray/cuda/cpu_cuda_vector.h - TiledArray/cuda/cuda_task_fn.h - TiledArray/cuda/kernel/mult_kernel.h - TiledArray/cuda/kernel/mult_kernel_impl.h - TiledArray/cuda/kernel/reduce_kernel.h - TiledArray/cuda/kernel/reduce_kernel_impl.h - TiledArray/cuda/platform.h - TiledArray/cuda/thrust.h - TiledArray/cuda/allocators.h - TiledArray/cuda/um_storage.h) -endif(CUDA_FOUND) - -if(HIP_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES - TiledArray/external/hip.h) -endif(CUDA_FOUND) - if(HIP_FOUND OR CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/device.h TiledArray/external/librett.h) endif() +if(CUDA_FOUND) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/cuda.h + TiledArray/device/cublas.h + TiledArray/device/btas_cublas.h + TiledArray/device/btas_um_tensor.h + TiledArray/device/cpu_cuda_vector.h + TiledArray/device/device_task_fn.h + TiledArray/device/kernel/mult_kernel.h + TiledArray/device/kernel/mult_kernel_impl.h + TiledArray/device/kernel/reduce_kernel.h + TiledArray/device/kernel/reduce_kernel_impl.h + TiledArray/device/platform.h + TiledArray/device/thrust.h + TiledArray/device/allocators.h + TiledArray/device/um_storage.h) +endif(CUDA_FOUND) + set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp TiledArray/tensor/tensor.cpp @@ -263,11 +259,11 @@ set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers if(CUDA_FOUND) set(TILEDARRAY_CUDA_SOURCE_FILES - TiledArray/cuda/btas_um_tensor.cpp - TiledArray/cuda/cpu_cuda_vector.cu - TiledArray/cuda/kernel/mult_kernel.cu - TiledArray/cuda/kernel/reduce_kernel.cu - TiledArray/cuda/um_storage.cu) + TiledArray/device/btas_um_tensor.cpp + TiledArray/device/cpu_cuda_vector.cu + TiledArray/device/kernel/mult_kernel.cu + TiledArray/device/kernel/reduce_kernel.cu + TiledArray/device/um_storage.cu) list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}") @@ -277,7 +273,7 @@ if(CUDA_FOUND) INCLUDE_DIRECTORIES 
"${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") endforeach() - set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp + set_source_files_properties(TiledArray/device/btas_um_tensor.cpp PROPERTIES LANGUAGE CUDA) diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 4cba5ee840..1c38298623 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -80,6 +80,17 @@ /* Define if TiledArray configured with HIP support */ #cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@ +// Umpire and LibreTT limited to 1 device runtime at a time, so is TA +#if defined(TILEDARRAY_HAS_HIP) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME HIP +# define TILEDARRAY_DEVICE_RUNTIME_STR "HIP" +#elif defined(TILEDARRAY_HAS_CUDA) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME CUDA +# define TILEDARRAY_DEVICE_RUNTIME_STR "CUDA" +#endif + /* Is TA::Tensor memory profiling enabled? */ #cmakedefine TA_TENSOR_MEM_PROFILE 1 diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp deleted file mode 100644 index 9423e7563d..0000000000 --- a/src/TiledArray/cuda/btas_um_tensor.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// -// Created by Chong Peng on 7/24/18. -// - -// clang-format off -#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. 
c++14) -#include -// clang-format on - -#ifdef TILEDARRAY_HAS_CUDA - -template class btas::varray>; -template class btas::varray>; -template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; -template class btas::varray, - TiledArray::cuda_um_allocator>>; -template class btas::varray>; -template class btas::varray>; - -template class btas::Tensor>; -template class btas::Tensor>; -template class btas::Tensor< - std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; -template class btas::Tensor< - std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; -template class btas::Tensor>; -template class btas::Tensor>; - -template class TiledArray::Tile>>; -template class TiledArray::Tile>>; -template class TiledArray::Tile< - btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -template class TiledArray::Tile< - btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -template class TiledArray::Tile< - btas::Tensor>>; -template class TiledArray::Tile>>; - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/allocators.h b/src/TiledArray/device/allocators.h similarity index 63% rename from src/TiledArray/cuda/allocators.h rename to src/TiledArray/device/allocators.h index 72c5ae3b0e..ff3ed6a3ac 100644 --- a/src/TiledArray/cuda/allocators.h +++ b/src/TiledArray/device/allocators.h @@ -26,9 +26,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include #include @@ -39,37 +39,40 @@ namespace TiledArray { template -class cuda_allocator_impl : public umpire_allocator_impl { +class umpire_based_allocator + : public umpire_based_allocator_impl { public: - using base_type = umpire_allocator_impl; + using base_type = umpire_based_allocator_impl; using typename base_type::const_pointer; using typename base_type::const_reference; using typename base_type::pointer; using typename base_type::reference; using typename base_type::value_type; - 
cuda_allocator_impl() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} template - cuda_allocator_impl( - const cuda_allocator_impl& + umpire_based_allocator( + const umpire_based_allocator& rhs) noexcept : base_type( - static_cast&>(rhs)) {} + static_cast&>( + rhs)) {} template friend bool operator==( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& + lhs, + const umpire_based_allocator& rhs) noexcept; -}; // class cuda_allocator_impl +}; // class umpire_based_allocator template bool operator==( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& lhs, + const umpire_based_allocator& rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } @@ -77,8 +80,8 @@ bool operator==( template bool operator!=( - const cuda_allocator_impl& lhs, - const cuda_allocator_impl& + const umpire_based_allocator& lhs, + const umpire_based_allocator& rhs) noexcept { return !(lhs == rhs); } @@ -87,13 +90,13 @@ namespace detail { struct get_um_allocator { umpire::Allocator& operator()() { - return cudaEnv::instance()->um_allocator(); + return deviceEnv::instance()->um_allocator(); } }; struct get_pinned_allocator { umpire::Allocator& operator()() { - return cudaEnv::instance()->pinned_allocator(); + return deviceEnv::instance()->pinned_allocator(); } }; @@ -106,30 +109,30 @@ namespace archive { template -struct ArchiveLoadImpl> { static inline void load( const Archive& ar, - TiledArray::cuda_allocator_impl& - allocator) { - allocator = TiledArray::cuda_allocator_impl{}; + TiledArray::umpire_based_allocator& allocator) { + allocator = TiledArray::umpire_based_allocator{}; } }; template -struct ArchiveStoreImpl> { static inline void store( const Archive& ar, - const TiledArray::cuda_allocator_impl< + const TiledArray::umpire_based_allocator< T, StaticLock, UmpireAllocatorAccessor>& allocator) {} }; } 
// namespace archive } // namespace madness -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED diff --git a/src/TiledArray/cuda/btas_cublas.h b/src/TiledArray/device/btas_cublas.h similarity index 66% rename from src/TiledArray/cuda/btas_cublas.h rename to src/TiledArray/device/btas_cublas.h index ea073d0a78..9ac97ce649 100644 --- a/src/TiledArray/cuda/btas_cublas.h +++ b/src/TiledArray/device/btas_cublas.h @@ -24,7 +24,7 @@ #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED -#include +#include #include #ifdef TILEDARRAY_HAS_CUDA @@ -32,16 +32,16 @@ #include #include -#include -#include -#include -#include +#include +#include +#include +#include #include namespace TiledArray { template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_gemm_cuda_impl( const btas::Tensor &left, const btas::Tensor &right, Scalar factor, @@ -78,44 +78,44 @@ btas::Tensor btas_tensor_gemm_cuda_impl( T factor_t = T(factor); T zero(0); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // typedef typename Tensor::storage_type storage_type; auto result_range = gemm_helper.make_result_range(left.range(), right.range()); - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); // the result Tensor type typedef btas::Tensor Tensor; Tensor result; // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); + // auto stream_status = cudaStreamQuery(stream); // if stream is completed, use GPU // if (stream_status == cudaSuccess) { if (true) { Storage result_storage; - make_device_storage(result_storage, result_range.area(), cuda_stream); + make_device_storage(result_storage, result_range.area(), stream); result = Tensor(std::move(result_range), std::move(result_storage)); 
// left and right are readonly!! // cudaMemAdvise(device_data(left), left.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // cudaMemAdvise(device_data(right), right.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), to_cublas_op(gemm_helper.left_op()), n, m, k, @@ -124,40 +124,41 @@ btas::Tensor btas_tensor_gemm_cuda_impl( device_data(result.storage()), n)); // wait for cuda calls to finish - // detail::thread_wait_cuda_stream(cuda_stream); - synchronize_stream(&cuda_stream); + // detail::thread_wait_stream(stream); + device::synchronize_stream(&stream); } // otherwise, use CPU else { Storage result_storage(result_range.area()); result = Tensor(std::move(result_range), std::move(result_storage)); - TiledArray::to_execution_space( - result.storage(), cuda_stream); + TiledArray::to_execution_space( + result.storage(), stream); // left and right are readonly!! 
cudaMemAdvise(device_data(left), left.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); cudaMemAdvise(device_data(right), right.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - zero, result.data(), n); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + + TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), + m, n, k, factor_t, left.data(), lda, + right.data(), ldb, zero, result.data(), n); } return result; } -template >> +template >> void btas_tensor_gemm_cuda_impl( btas::Tensor &result, const btas::Tensor &left, @@ -224,13 +225,13 @@ void btas_tensor_gemm_cuda_impl( const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); T factor_t = T(factor); T one(1); // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); + // auto stream_status = cudaStreamQuery(stream); // if stream is completed, use GPU // if (stream_status == cudaSuccess) { @@ -238,50 +239,50 @@ void btas_tensor_gemm_cuda_impl( // left and right are readonly!! 
// cudaMemAdvise(device_data(left), left.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // cudaMemAdvise(device_data(right), right.size() * sizeof(T), // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); + // deviceEnv::instance()->current_device_id()); // prefetch all data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + TiledArray::to_execution_space( + result.storage(), stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), to_cublas_op(gemm_helper.left_op()), n, m, k, &factor_t, device_data(right.storage()), ldb, device_data(left.storage()), lda, &one, device_data(result.storage()), n)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); - // detail::thread_wait_cuda_stream(cuda_stream); + // detail::thread_wait_stream(stream); } else { // left and right are readonly!! 
cudaMemAdvise(device_data(left), left.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); cudaMemAdvise(device_data(right), right.size() * sizeof(T), cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); + deviceEnv::instance()->current_device_id()); // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - one, result.data(), n); + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + TiledArray::to_execution_space( + result.storage(), stream); + + TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), + m, n, k, factor_t, left.data(), lda, + right.data(), ldb, one, result.data(), n); } } @@ -289,69 +290,69 @@ void btas_tensor_gemm_cuda_impl( template btas::Tensor btas_tensor_clone_cuda_impl( const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); Storage result_storage; auto result_range = arg.range(); - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); - make_device_storage(result_storage, arg.size(), cuda_stream); + make_device_storage(result_storage, arg.size(), stream); btas::Tensor result(std::move(result_range), std::move(result_storage)); // call cublasCopy const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasCopy(handle, 
result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] = a * arg[i] template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_scale_cuda_impl( const btas::Tensor &arg, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(arg.range()); // std::cout << "scale, tile offset: " << arg.range().offset() << " stream: " - // << arg.range().offset() % cudaEnv::instance()->num_cuda_streams() << "\n"; + // << arg.range().offset() % deviceEnv::instance()->num_streams() << "\n"; auto result = btas_tensor_clone_cuda_impl(arg); // call cublasScale const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall( cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] *= a template >> + typename = std::enable_if_t>> void btas_tensor_scale_to_cuda_impl(btas::Tensor &result, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); // call cublasScale const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall( cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - 
synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] - a * arg2[i] template >> + typename = std::enable_if_t>> btas::Tensor btas_tensor_subt_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2, const Scalar a) { @@ -360,12 +361,12 @@ btas::Tensor btas_tensor_subt_cuda_impl( // revert the sign of a auto b = -a; - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); - if (in_memory_space(result.storage())) { + if (in_memory_space(result.storage())) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &b, device_data(arg2.storage()), 1, device_data(result.storage()), 1)); @@ -374,82 +375,85 @@ btas::Tensor btas_tensor_subt_cuda_impl( // btas::axpy(1.0, arg, result); } - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] -= a * arg1[i] -template >> +template >> void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg1, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); // revert the sign of a auto b = -a; const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &b, device_data(arg1.storage()), 1, device_data(result.storage()), 1)); - 
synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] + a * arg2[i] -template >> +template >> btas::Tensor btas_tensor_add_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2, const Scalar a) { auto result = btas_tensor_clone_cuda_impl(arg1); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &a, device_data(arg2.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } /// result[i] += a * arg[i] -template >> +template >> void btas_tensor_add_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); + auto &stream = detail::get_stream_based_on_range(result.range()); - // TiledArray::to_execution_space(result.storage(),cuda_stream); - // TiledArray::to_execution_space(arg.storage(),cuda_stream); + // TiledArray::to_execution_space(result.storage(),stream); + // TiledArray::to_execution_space(arg.storage(),stream); const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasAxpy(handle, result.size(), &a, device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + 
device::synchronize_stream(&stream); } /// result[i] = result[i] * arg[i] template void btas_tensor_mult_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg) { - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); + auto device_id = deviceEnv::instance()->current_device_id(); + auto &stream = detail::get_stream_based_on_range(result.range()); std::size_t n = result.size(); TA_ASSERT(n == arg.size()); - mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id); - synchronize_stream(&cuda_stream); + mult_to_cuda_kernel(result.data(), arg.data(), n, stream, device_id); + device::synchronize_stream(&stream); } /// result[i] = arg1[i] * arg2[i] @@ -461,19 +465,19 @@ btas::Tensor btas_tensor_mult_cuda_impl( TA_ASSERT(arg2.size() == n); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - CudaSafeCall(cudaSetDevice(device_id)); - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); + auto device_id = deviceEnv::instance()->current_device_id(); + DeviceSafeCall(device::setDevice(device_id)); + auto &stream = detail::get_stream_based_on_range(arg1.range()); Storage result_storage; - make_device_storage(result_storage, n, cuda_stream); + make_device_storage(result_storage, n, stream); btas::Tensor result(arg1.range(), std::move(result_storage)); - mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream, + mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -481,24 +485,24 @@ btas::Tensor btas_tensor_mult_cuda_impl( template typename btas::Tensor::value_type btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &cuda_stream = 
detail::get_stream_based_on_range(arg.range()); + auto &stream = detail::get_stream_based_on_range(arg.range()); auto &storage = arg.storage(); using TiledArray::math::blas::integer; integer size = storage.size(); T result = 0; - if (in_memory_space(storage)) { + if (in_memory_space(storage)) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasDot(handle, size, device_data(storage), 1, device_data(storage), 1, &result)); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -507,9 +511,9 @@ template typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( const btas::Tensor &arg1, const btas::Tensor &arg2) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); + auto &stream = detail::get_stream_based_on_range(arg1.range()); using TiledArray::math::blas::integer; integer size = arg1.storage().size(); @@ -517,101 +521,101 @@ typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( TA_ASSERT(size == arg2.storage().size()); T result = 0; - if (in_memory_space(arg1.storage()) && - in_memory_space(arg2.storage())) { + if (in_memory_space(arg1.storage()) && + in_memory_space(arg2.storage())) { const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1, device_data(arg2.storage()), 1, &result)); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - synchronize_stream(&cuda_stream); + 
device::synchronize_stream(&stream); return result; } template T btas_tensor_sum_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = sum_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_product_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = product_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_min_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = min_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_max_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = 
detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = max_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_absmin_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = absmin_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } template T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); + auto &stream = detail::get_stream_based_on_range(arg.range()); + auto device_id = deviceEnv::instance()->current_device_id(); auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id); + auto result = absmax_cuda_kernel(arg.data(), n, stream, device_id); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp new file mode 100644 index 0000000000..270f30aad4 --- /dev/null +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -0,0 +1,52 
@@ +// +// Created by Chong Peng on 7/24/18. +// + +// clang-format off +#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14) +#include +// clang-format on + +#ifdef TILEDARRAY_HAS_CUDA + +template class btas::varray>; +template class btas::varray>; +template class btas::varray< + std::complex, + TiledArray::device_um_allocator>>; +template class btas::varray< + std::complex, TiledArray::device_um_allocator>>; +template class btas::varray>; +template class btas::varray>; + +template class btas::Tensor>; +template class btas::Tensor>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor>; +template class btas::Tensor>; + +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; + +#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h similarity index 81% rename from src/TiledArray/cuda/btas_um_tensor.h rename to src/TiledArray/device/btas_um_tensor.h index 6342c54771..0c448a24b2 100644 --- a/src/TiledArray/cuda/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -27,11 +27,12 @@ #include #include +#include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include -#include +#include +#include #include #include @@ -39,15 +40,15 @@ namespace TiledArray { namespace detail { template -struct is_cuda_tile< - ::btas::Tensor>> +struct is_device_tile< + ::btas::Tensor>> : public std::true_type {}; template -void to_cuda(const TiledArray::btasUMTensorVarray &tile) { - 
cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); +void to_device(const TiledArray::btasUMTensorVarray &tile) { + device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.storage(), stream); } @@ -64,12 +65,12 @@ struct ArchiveLoadImpl> { static inline void load(const Archive &ar, TiledArray::btasUMTensorVarray &t) { TiledArray::Range range{}; - TiledArray::cuda_um_btas_varray store{}; + TiledArray::device_um_btas_varray store{}; ar &range &store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); - // cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); + // device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); // auto &stream = TiledArray::detail::get_stream_based_on_range(range); - // TiledArray::to_execution_space(t.storage(), + // TiledArray::to_execution_space(t.storage(), // stream); } }; @@ -78,11 +79,11 @@ template struct ArchiveStoreImpl> { static inline void store(const Archive &ar, const TiledArray::btasUMTensorVarray &t) { - CudaSafeCall(cudaSetDevice( - TiledArray::cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(TiledArray::device::setDevice( + TiledArray::deviceEnv::instance()->current_device_id())); auto &stream = TiledArray::detail::get_stream_based_on_range(t.range()); - TiledArray::to_execution_space(t.storage(), - stream); + TiledArray::to_execution_space( + t.storage(), stream); ar &t.range() & t.storage(); } }; @@ -135,25 +136,25 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, // shift the range result_range.inplace_shift(range_shift); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // @important select the stream using the shifted range - auto &cuda_stream 
= detail::get_stream_based_on_range(result_range); + auto &stream = detail::get_stream_based_on_range(result_range); typename btasUMTensorVarray::storage_type result_storage; - make_device_storage(result_storage, result_range.volume(), cuda_stream); + make_device_storage(result_storage, result_range.volume(), stream); btasUMTensorVarray result(std::move(result_range), std::move(result_storage)); // call cublasCopy const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); + CublasSafeCall(cublasSetStream(handle, stream)); CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); + device::synchronize_stream(&stream); return result; } @@ -176,7 +177,7 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, const TiledArray::Permutation &perm) { // compute result range auto result_range = perm * arg.range(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // compute the stream to use auto &stream = detail::get_stream_based_on_range(result_range); @@ -192,7 +193,7 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, librett_permute(const_cast(device_data(arg.storage())), device_data(result.storage()), arg.range(), perm, stream); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return result; } @@ -205,7 +206,7 @@ template >> btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_scale_cuda_impl(arg, factor); } @@ -213,7 +214,7 @@ template >> btasUMTensorVarray &scale_to(btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); + detail::to_device(arg); btas_tensor_scale_to_cuda_impl(arg, factor); return arg; } @@ -227,8 +228,8 @@ btasUMTensorVarray scale(const btasUMTensorVarray 
&arg, auto result = scale(arg, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -239,7 +240,7 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, template btasUMTensorVarray neg(const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_scale_cuda_impl(arg, T(-1.0)); } @@ -251,15 +252,15 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, auto result = neg(arg); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } template btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); btas_tensor_scale_to_cuda_impl(arg, T(-1.0)); return arg; } @@ -271,8 +272,8 @@ btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { template btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0)); } @@ -295,8 +296,8 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, auto result = subt(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -311,8 +312,8 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, auto result = subt(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + 
device::streamSynchronize(*stream); return permute(result, perm); } @@ -325,8 +326,8 @@ template btasUMTensorVarray &subt_to( btasUMTensorVarray &result, const btasUMTensorVarray &arg1) { - detail::to_cuda(result); - detail::to_cuda(arg1); + detail::to_device(result); + detail::to_device(arg1); btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0)); return result; } @@ -348,8 +349,8 @@ btasUMTensorVarray &subt_to(btasUMTensorVarray &result, template btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0)); } @@ -373,8 +374,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, auto result = add(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -388,8 +389,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, auto result = add(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -401,8 +402,8 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, template btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); + detail::to_device(result); + detail::to_device(arg); btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); return result; } @@ -424,8 +425,8 @@ template typename btasUMTensorVarray::value_type dot( const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return 
btas_tensor_dot_cuda_impl(arg1, arg2); } @@ -435,8 +436,8 @@ typename btasUMTensorVarray::value_type dot( template btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); + detail::to_device(arg1); + detail::to_device(arg2); return btas_tensor_mult_cuda_impl(arg1, arg2); } @@ -459,8 +460,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, auto result = mult(arg1, arg2); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -475,8 +476,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, auto result = mult(arg1, arg2, factor); // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); + auto stream = device::tls_stream_accessor(); + device::streamSynchronize(*stream); return permute(result, perm); } @@ -487,8 +488,8 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, template btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); + detail::to_device(result); + detail::to_device(arg); btas_tensor_mult_to_cuda_impl(result, arg); return result; } @@ -514,7 +515,7 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, template typename btasUMTensorVarray::value_type squared_norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_squared_norm_cuda_impl(arg); } @@ -525,7 +526,7 @@ typename btasUMTensorVarray::value_type squared_norm( template typename btasUMTensorVarray::value_type norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg)); } @@ -544,7 +545,7 @@ typename btasUMTensorVarray::value_type 
trace( template typename btasUMTensorVarray::value_type sum( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_sum_cuda_impl(arg); } @@ -554,7 +555,7 @@ typename btasUMTensorVarray::value_type sum( template typename btasUMTensorVarray::value_type product( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_product_cuda_impl(arg); } @@ -564,7 +565,7 @@ typename btasUMTensorVarray::value_type product( template typename btasUMTensorVarray::value_type max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_max_cuda_impl(arg); } @@ -574,7 +575,7 @@ typename btasUMTensorVarray::value_type max( template typename btasUMTensorVarray::value_type abs_max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_absmax_cuda_impl(arg); } @@ -584,7 +585,7 @@ typename btasUMTensorVarray::value_type abs_max( template typename btasUMTensorVarray::value_type min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_min_cuda_impl(arg); } @@ -594,7 +595,7 @@ typename btasUMTensorVarray::value_type min( template typename btasUMTensorVarray::value_type abs_min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); + detail::to_device(arg); return btas_tensor_absmin_cuda_impl(arg); } @@ -603,10 +604,11 @@ template void to_host( TiledArray::DistArray, Policy> &um_array) { auto to_host = [](TiledArray::Tile &tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -622,7 +624,7 @@ void to_host( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + 
DeviceSafeCall(cudaDeviceSynchronize()); }; /// to device for UM Array @@ -630,10 +632,11 @@ template void to_device( TiledArray::DistArray, Policy> &um_array) { auto to_device = [](TiledArray::Tile &tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -649,7 +652,7 @@ void to_device( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(device::deviceSynchronize()); }; /// convert array from UMTensor to TiledArray::Tensor @@ -661,12 +664,12 @@ um_tensor_to_ta_tensor( const auto convert_tile_memcpy = [](const UMTensor &tile) { TATensor result(tile.tensor().range()); - auto &stream = cudaEnv::instance()->cuda_stream_d2h(); - CudaSafeCall( + auto &stream = deviceEnv::instance()->stream_d2h(); + DeviceSafeCall( cudaMemcpyAsync(result.data(), tile.data(), tile.size() * sizeof(typename TATensor::value_type), cudaMemcpyDefault, stream)); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return result; }; @@ -676,10 +679,11 @@ um_tensor_to_ta_tensor( using std::begin; const auto n = tile.tensor().size(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); auto &stream = detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); std::copy_n(tile.data(), n, result.data()); @@ -714,28 +718,30 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { auto convert_tile_memcpy = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + 
device::setDevice(deviceEnv::instance()->current_device_id())); using Tensor = typename UMTensor::tensor_type; - auto &stream = cudaEnv::instance()->cuda_stream_h2d(); + auto &stream = deviceEnv::instance()->stream_h2d(); typename Tensor::storage_type storage; make_device_storage(storage, tile.range().area(), stream); Tensor result(tile.range(), std::move(storage)); - CudaSafeCall( + DeviceSafeCall( cudaMemcpyAsync(result.data(), tile.data(), tile.size() * sizeof(typename Tensor::value_type), cudaMemcpyDefault, stream)); - synchronize_stream(&stream); + device::synchronize_stream(&stream); return TiledArray::Tile(std::move(result)); }; auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); using Tensor = typename UMTensor::tensor_type; typename Tensor::storage_type storage(tile.range().area()); @@ -749,7 +755,7 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { auto &stream = detail::get_stream_based_on_range(result.range()); // prefetch data to GPU - TiledArray::to_execution_space( + TiledArray::to_execution_space( result.storage(), stream); return TiledArray::Tile(std::move(result)); @@ -778,47 +784,49 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { #ifndef TILEDARRAY_HEADER_ONLY extern template class btas::varray>; -extern template class btas::varray>; + TiledArray::device_um_allocator>; +extern template class btas::varray>; extern template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; + std::complex, + TiledArray::device_um_allocator>>; extern template class btas::varray< - std::complex, TiledArray::cuda_um_allocator>>; -extern template class btas::varray>; -extern template class btas::varray>; + std::complex, TiledArray::device_um_allocator>>; +extern template class btas::varray>; +extern template class 
btas::varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor< std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; + TiledArray::device_um_btas_varray>>; extern template class btas::Tensor< std::complex, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>; + TiledArray::device_um_btas_varray>>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class TiledArray::Tile>>; + double, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile>>; + float, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile< btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; + TiledArray::device_um_btas_varray>>>; extern template class TiledArray::Tile< btas::Tensor, TiledArray::Range, - TiledArray::cuda_um_btas_varray>>>; -extern template class TiledArray::Tile< - btas::Tensor>>; + TiledArray::device_um_btas_varray>>>; +extern template class TiledArray::Tile>>; extern template class TiledArray::Tile>>; + long, TiledArray::Range, TiledArray::device_um_btas_varray>>; #endif // TILEDARRAY_HEADER_ONLY -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H diff --git a/src/TiledArray/cuda/cpu_cuda_vector.cu b/src/TiledArray/device/cpu_cuda_vector.cu similarity index 98% rename from src/TiledArray/cuda/cpu_cuda_vector.cu rename to src/TiledArray/device/cpu_cuda_vector.cu index 6c58fdd123..639cc56acc 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.cu +++ b/src/TiledArray/device/cpu_cuda_vector.cu @@ -1,5 +1,5 @@ -#include +#include namespace thrust { diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/device/cpu_cuda_vector.h similarity index 96% rename from 
src/TiledArray/cuda/cpu_cuda_vector.h rename to src/TiledArray/device/cpu_cuda_vector.h index 5a6e52beb5..8c7b32900a 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.h +++ b/src/TiledArray/device/cpu_cuda_vector.h @@ -4,8 +4,8 @@ #include -#include -#include +#include +#include #include @@ -165,8 +165,8 @@ template bool in_memory_space( const cpu_cuda_vector& vec) noexcept { - return (vec.on_host() && overlap(MemorySpace::CPU, Space)) || - (vec.on_device() && overlap(MemorySpace::CUDA, Space)); + return (vec.on_host() && overlap(MemorySpace::Host, Space)) || + (vec.on_device() && overlap(MemorySpace::Device, Space)); } template & vec, cudaStream_t stream = 0) { switch (Space) { - case ExecutionSpace::CPU: { + case ExecutionSpace::Host: { vec.to_host(); break; } - case ExecutionSpace::CUDA: { + case ExecutionSpace::Device: { vec.to_device(); break; } diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/device/cublas.h similarity index 100% rename from src/TiledArray/cuda/cublas.h rename to src/TiledArray/device/cublas.h diff --git a/src/TiledArray/cuda/cuda_task_fn.h b/src/TiledArray/device/device_task_fn.h similarity index 72% rename from src/TiledArray/cuda/cuda_task_fn.h rename to src/TiledArray/device/device_task_fn.h index 8de133b3bd..a4b9db92e4 100644 --- a/src/TiledArray/cuda/cuda_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -7,23 +7,22 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include -#include #include namespace TiledArray { namespace detail { template -std::atomic& cuda_callback_duration_ns() { +std::atomic& device_callback_duration_ns() { static std::atomic value{0}; return value; } -inline std::atomic& cuda_taskfn_callback_duration_ns() { +inline std::atomic& device_taskfn_callback_duration_ns() { static std::atomic value{0}; return value; } @@ -34,8 +33,8 @@ inline std::atomic& cuda_taskfn_callback_duration_ns() { namespace madness { /// -/// cudaTaskFn class -/// represent a task that 
calls an async cuda kernel +/// deviceTaskFn class +/// represent a task that calls an async device kernel /// the task must call synchronize_stream function to tell which stream it /// used /// @@ -44,55 +43,55 @@ template -struct cudaTaskFn : public TaskInterface { - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg1T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg2T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg3T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg4T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg5T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg6T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg7T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg8T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg9T cannot be a const " - "or reference type"); +struct deviceTaskFn : public TaskInterface { + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg1T cannot be a const " + "or reference type"); + static_assert( + 
not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg2T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg3T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg4T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg5T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg6T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg7T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg8T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg9T cannot be a const " + "or reference type"); private: /// This class type - typedef cudaTaskFn - cudaTaskFn_; + typedef deviceTaskFn + deviceTaskFn_; friend class AsyncTaskInterface; - /// internal Task structure that wraps the Async cuda function + /// internal Task structure that wraps the Async device function struct AsyncTaskInterface : public madness::TaskInterface { - AsyncTaskInterface(cudaTaskFn_* task, int ndepend = 0, + AsyncTaskInterface(deviceTaskFn_* task, int ndepend = 0, const TaskAttributes attr = TaskAttributes()) : TaskInterface(ndepend, attr), task_(task) {} @@ -105,7 +104,7 @@ struct cudaTaskFn : public TaskInterface { task_->run_async(); // get the stream used by async function - auto stream = 
TiledArray::tls_cudastream_accessor(); + auto stream = TiledArray::device::tls_stream_accessor(); // TA_ASSERT(stream != nullptr); @@ -113,32 +112,33 @@ struct cudaTaskFn : public TaskInterface { if (stream == nullptr) { task_->notify(); } else { - // TODO should we use cuda callback or cuda events?? - // insert cuda callback - cudaLaunchHostFunc(*stream, cuda_callback, task_); + // TODO should we use device callback or device events?? + // insert device callback + TiledArray::device::launchHostFunc(*stream, device_callback, task_); // reset stream to nullptr - TiledArray::synchronize_stream(nullptr); + TiledArray::device::synchronize_stream(nullptr); } } private: - static void CUDART_CB cuda_callback(void* userData) { + static void DEVICERT_CB device_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); // convert void * to AsyncTaskInterface* - auto* callback = static_cast(userData); + auto* callback = static_cast(userData); // std::stringstream address; // address << (void*) callback; - // std::string message = "callback on cudaTaskFn: " + address.str() + + // std::string message = "callback on deviceTaskFn: " + address.str() + // + // '\n'; std::cout << message; callback->notify(); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_taskfn_callback_duration_ns() += + TiledArray::detail::device_taskfn_callback_duration_ns() += TiledArray::duration_in_ns(t0, t1); } - cudaTaskFn_* task_; + deviceTaskFn_* task_; }; public: @@ -160,7 +160,7 @@ struct cudaTaskFn : public TaskInterface { futureT result_; ///< The task Future result const functionT func_; ///< The task function TaskInterface* async_task_; ///< The internal AsyncTaskInterface that wraps - ///< the async cuda function + ///< the async device function futureT async_result_; ///< the future returned from the async task // If the value of the argument is known at the time the @@ -258,7 +258,7 @@ struct cudaTaskFn : public TaskInterface { /// Check 
dependencies and register callbacks where necessary void check_dependencies() { - this->inc(); // the current cudaTaskFn depends on the internal + this->inc(); // the current deviceTaskFn depends on the internal // AsyncTaskInterface, dependency = 1 check_dependency(arg1_); check_dependency(arg2_); @@ -272,13 +272,14 @@ struct cudaTaskFn : public TaskInterface { } // Copies are not allowed. - cudaTaskFn(const cudaTaskFn_&); - cudaTaskFn_ operator=(cudaTaskFn_&); + deviceTaskFn(const deviceTaskFn_&); + deviceTaskFn_ operator=(deviceTaskFn_&); public: #if MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -298,8 +299,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -319,8 +320,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -340,8 +341,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -361,8 +362,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, const TaskAttributes& attr) + 
deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -383,8 +384,8 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -405,8 +406,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -427,9 +429,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -450,9 +452,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -474,9 +476,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& 
a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - a9T&& a9, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + a9T&& a9, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -495,8 +497,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -514,7 +517,8 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } #else // MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -534,8 +538,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -555,8 +559,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const TaskAttributes& attr = TaskAttributes()) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const TaskAttributes& attr = TaskAttributes()) : TaskInterface(attr), result_(result), func_(func), @@ -576,8 +580,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const TaskAttributes& attr) + 
deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -597,9 +601,9 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -620,9 +624,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -643,9 +647,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -666,9 +670,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const TaskAttributes& attr) : TaskInterface(attr), 
result_(result), func_(func), @@ -689,10 +693,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -714,10 +718,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -736,8 +740,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -757,7 +762,7 @@ struct cudaTaskFn : public TaskInterface { #endif // MADNESS_TASKQ_VARIADICS // no need to delete async_task_, as it will be deleted by the TaskQueue - virtual ~cudaTaskFn() = default; + virtual ~deviceTaskFn() = default; const futureT& result() const { return result_; } @@ -770,16 +775,16 @@ struct cudaTaskFn : public TaskInterface { } #else protected: - /// when this cudaTaskFn gets run, it means the AsyncTaskInterface is done + /// when 
this deviceTaskFn gets run, it means the AsyncTaskInterface is done /// set the result with async_result_, which is finished void run(const TaskThreadEnv& env) override { result_.set(std::move(async_result_)); } #endif // HAVE_INTEL_TBB -}; // class cudaTaskFn +}; // class deviceTaskFn -/// add a cudaTaskFn object to World +/// add a deviceTaskFn object to World /// \tparam fnT A function pointer or functor /// \tparam a1T Type of argument 1. /// \tparam a2T Type of argument 2. @@ -794,15 +799,15 @@ struct cudaTaskFn : public TaskInterface { /// \return Description needed. template -typename cudaTaskFn::futureT -add_cuda_taskfn( +typename deviceTaskFn::futureT +add_device_taskfn( madness::World& world, - cudaTaskFn* t) { - typename cudaTaskFn::futureT - res(t->result()); - // add the internal async task in cuda task as well + deviceTaskFn* t) { + typename deviceTaskFn::futureT res(t->result()); + // add the internal async task in device task as well world.taskq.add(t->async_task()); - // add the cuda task + // add the device task world.taskq.add(static_cast(t)); return res; } @@ -815,13 +820,13 @@ template < typename fnT, typename... argsT, typename = std::enable_if_t::value>> typename detail::function_enabler...)>::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = - cudaTaskFn, - std::remove_const_t>...>; + deviceTaskFn, + std::remove_const_t>...>; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)..., TaskAttributes())); } @@ -835,13 +840,13 @@ template < typename = std::enable_if_t::value>> typename meta::drop_last_arg_and_apply_callable< detail::function_enabler, fnT, future_to_ref_t...>::type::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... 
args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = typename meta::drop_last_arg_and_apply< - cudaTaskFn, std::decay_t, + deviceTaskFn, std::decay_t, std::remove_const_t>...>::type; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)...)); } @@ -852,14 +857,14 @@ add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { /// \tparam argsT variadic template for arguments /// \return A future to the result template -typename detail::memfunc_enabler::type add_cuda_task( +typename detail::memfunc_enabler::type add_device_task( madness::World& world, objT&& obj, memfnT memfn, argsT&&... args) { - return add_cuda_task(world, - detail::wrap_mem_fn(std::forward(obj), memfn), - std::forward(args)...); + return add_device_task(world, + detail::wrap_mem_fn(std::forward(obj), memfn), + std::forward(args)...); } } // namespace madness -#endif // TILDARRAY_HAS_CUDA +#endif // TILDARRAY_HAS_DEVICE #endif // TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/device/kernel/mult_kernel.cu similarity index 96% rename from src/TiledArray/cuda/kernel/mult_kernel.cu rename to src/TiledArray/device/kernel/mult_kernel.cu index aa3cadbc72..ca2d86d4b9 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel.cu +++ b/src/TiledArray/device/kernel/mult_kernel.cu @@ -21,8 +21,8 @@ * */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h similarity index 100% rename from src/TiledArray/cuda/kernel/mult_kernel.h rename to src/TiledArray/device/kernel/mult_kernel.h diff --git a/src/TiledArray/cuda/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/mult_kernel_impl.h similarity index 95% rename from 
src/TiledArray/cuda/kernel/mult_kernel_impl.h rename to src/TiledArray/device/kernel/mult_kernel_impl.h index b237dfab1e..a1c217ce3d 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel_impl.h +++ b/src/TiledArray/device/kernel/mult_kernel_impl.h @@ -34,7 +34,7 @@ namespace TiledArray { template void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( @@ -47,7 +47,7 @@ void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, template void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2, std::size_t n, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/device/kernel/reduce_kernel.cu similarity index 98% rename from src/TiledArray/cuda/kernel/reduce_kernel.cu rename to src/TiledArray/device/kernel/reduce_kernel.cu index d24669b920..a09b3f7a41 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel.cu +++ b/src/TiledArray/device/kernel/reduce_kernel.cu @@ -21,8 +21,8 @@ * */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h similarity index 100% rename from src/TiledArray/cuda/kernel/reduce_kernel.h rename to src/TiledArray/device/kernel/reduce_kernel.h diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/device/kernel/reduce_kernel_impl.h similarity index 95% rename from src/TiledArray/cuda/kernel/reduce_kernel_impl.h rename to src/TiledArray/device/kernel/reduce_kernel_impl.h index 9dc6507cca..f03e333dbb 100644 --- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/device/kernel/reduce_kernel_impl.h @@ 
-26,7 +26,7 @@ #include -#include +#include #include #include #include @@ -57,7 +57,7 @@ struct absolute_value template T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); @@ -107,7 +107,7 @@ TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( thrust::maximum max_op; detail::absolute_value abs_op; - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); @@ -125,7 +125,7 @@ TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( thrust::minimum min_op; detail::absolute_value abs_op; - CudaSafeCall(cudaSetDevice(device_id)); + DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); diff --git a/src/TiledArray/cuda/platform.h b/src/TiledArray/device/platform.h similarity index 93% rename from src/TiledArray/cuda/platform.h rename to src/TiledArray/device/platform.h index f94226b39e..9d0cac3cdd 100644 --- a/src/TiledArray/cuda/platform.h +++ b/src/TiledArray/device/platform.h @@ -31,9 +31,9 @@ enum class MemorySpace { // MemorySpace is represented as a bitfield to compute unions and // intersections easier Null = 0b00, - CPU = 0b01, - CUDA = 0b10, - CUDA_UM = CPU | CUDA // union of CPU and CUDA spaces + Host = 0b01, + Device = 0b10, + Device_UM = Host | Device // union of host and device spaces }; // customization point: in_memory_space(O) -> bool @@ -55,7 +55,7 @@ constexpr bool overlap(MemorySpace space1, MemorySpace space2) { } /// enumerates the execution spaces -enum class ExecutionSpace { CPU, CUDA }; +enum class ExecutionSpace { Host, Device }; // customization point: to_execution_space(O) -> void // "moves" O to execution space S diff --git a/src/TiledArray/cuda/thrust.h b/src/TiledArray/device/thrust.h similarity index 100% rename 
from src/TiledArray/cuda/thrust.h rename to src/TiledArray/device/thrust.h diff --git a/src/TiledArray/cuda/um_storage.cu b/src/TiledArray/device/um_storage.cu similarity index 66% rename from src/TiledArray/cuda/um_storage.cu rename to src/TiledArray/device/um_storage.cu index a16811e91b..cc3a1aae55 100644 --- a/src/TiledArray/cuda/um_storage.cu +++ b/src/TiledArray/device/um_storage.cu @@ -22,29 +22,29 @@ */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA namespace thrust { template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } } namespace thrust { -template class device_vector>; -template class device_vector>; +template class device_vector>; +template class device_vector>; } #endif //TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/device/um_storage.h similarity index 78% rename from src/TiledArray/cuda/um_storage.h rename to src/TiledArray/device/um_storage.h index bea591cbb2..e4318e5666 100644 --- a/src/TiledArray/cuda/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -24,15 +24,15 @@ #ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED #define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED -#include -#include +#include +#include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include -#include +#include #include #include @@ -41,40 +41,41 @@ namespace TiledArray { template using cuda_um_thrust_vector = - thrust::device_vector>; + thrust::device_vector>; /// @return true if @c dev_vec is present in space @space template bool in_memory_space(const Storage& vec) noexcept { - return overlap(MemorySpace::CUDA_UM, Space); + return overlap(MemorySpace::Device_UM, Space); } /** * @tparam Space - * @tparam Storage the Storage type of the vector, such as 
cuda_um_btas_varray + * @tparam Storage the Storage type of the vector, such as + * device_um_btas_varray */ template void to_execution_space(Storage& vec, cudaStream_t stream = 0) { switch (Space) { - case ExecutionSpace::CPU: { + case ExecutionSpace::Host: { using std::data; using std::size; using value_type = typename Storage::value_type; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaMemPrefetchAsync(data(vec), - size(vec) * sizeof(value_type), - cudaCpuDeviceId, stream)); + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(cudaMemPrefetchAsync(data(vec), + size(vec) * sizeof(value_type), + cudaCpuDeviceId, stream)); } break; } - case ExecutionSpace::CUDA: { + case ExecutionSpace::Device: { using std::data; using std::size; using value_type = typename Storage::value_type; int device = -1; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaGetDevice(&device)); - CudaSafeCall(cudaMemPrefetchAsync( + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(cudaGetDevice(&device)); + DeviceSafeCall(cudaMemPrefetchAsync( data(vec), size(vec) * sizeof(value_type), device, stream)); } break; @@ -95,8 +96,8 @@ template void make_device_storage(Storage& storage, std::size_t n, const cudaStream_t& stream = 0) { storage = Storage(n); - TiledArray::to_execution_space(storage, - stream); + TiledArray::to_execution_space(storage, + stream); } /** @@ -131,7 +132,7 @@ struct ArchiveLoadImpl> { static inline void load(const Archive& ar, TiledArray::cuda_um_thrust_vector& x) { typename thrust::device_vector< - T, TiledArray::cuda_um_allocator>::size_type n(0); + T, TiledArray::device_um_allocator>::size_type n(0); ar& n; x.resize(n); for (auto& xi : x) ar& xi; diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index a4c203d3dd..fa33d74d9c 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ 
-123,12 +123,12 @@ class BinaryEvalImpl : public DistEvalImpl, private: /// Task function for evaluating tiles -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// \param i The tile index /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { DistEvalImpl_::set_tile(i, op_(left, right)); } @@ -137,11 +137,11 @@ class BinaryEvalImpl : public DistEvalImpl, /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { // TODO avoid copy the Op object auto result_tile = - madness::add_cuda_task(DistEvalImpl_::world(), op_, left, right); + madness::add_device_task(DistEvalImpl_::world(), op_, left, right); DistEvalImpl_::set_tile(i, result_tile); } #else diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index ae3b456bc0..0d07821a4e 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -31,11 +31,11 @@ #include -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 namespace TiledArray { namespace detail { @@ -479,7 +479,7 @@ class Summa static typename std::enable_if< is_lazy_tile::value #ifdef TILEDARRAY_HAS_CUDA - && !detail::is_cuda_tile_v + && !detail::is_device_tile_v #endif , Future>::type @@ -502,13 +502,14 @@ class Summa template 
static typename std::enable_if< is_lazy_tile::value && - detail::is_cuda_tile_v, + detail::is_device_tile_v, Future>::type get_tile(Arg& arg, const typename Arg::ordinal_type index) { auto convert_tile_fn = &Summa_::template convert_tile; - return madness::add_cuda_task(arg.world(), convert_tile_fn, arg.get(index), - madness::TaskAttributes::hipri()); + return madness::add_device_task(arg.world(), convert_tile_fn, + arg.get(index), + madness::TaskAttributes::hipri()); } #endif diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index b1056e0ac1..2fd6329de5 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -25,9 +25,9 @@ #include #include #include -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #endif namespace TiledArray { diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index b3707b92c2..32052d2700 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -115,18 +115,18 @@ class UnaryEvalImpl /// \param i The tile index /// \param tile The tile to be evaluated template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, tile_argument_type tile) { // TODO avoid copy Op object auto result_tile = - madness::add_cuda_task(DistEvalImpl_::world(), op_, tile); + madness::add_device_task(DistEvalImpl_::world(), op_, tile); DistEvalImpl_::set_tile(i, result_tile); } /// \param i The tile index /// \param tile The tile to be evaluated template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, tile_argument_type tile) { DistEvalImpl_::set_tile(i, op_(tile)); } diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 1a7bc2ff05..72ad9a42cd 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -40,9 +40,9 @@ 
#include "TiledArray/tile.h" #include "TiledArray/tile_interface/trace.h" #include "expr_engine.h" -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #endif #include @@ -186,8 +186,8 @@ class Expr { typename A, typename I, typename T, typename std::enable_if::value && is_lazy_tile::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { @@ -195,7 +195,7 @@ class Expr { TiledArray::Cast(), tile)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Set an array tile with a lazy tile /// Spawn a task to evaluate a lazy tile and set the \a array tile at @@ -210,9 +210,9 @@ class Expr { typename std::enable_if< !std::is_same::value && is_lazy_tile::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { - array.set(index, madness::add_cuda_task( + array.set(index, madness::add_device_task( array.world(), TiledArray::Cast(), tile)); } @@ -246,8 +246,8 @@ class Expr { template < typename A, typename I, typename T, typename Op, typename std::enable_if::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, @@ -261,7 +261,7 @@ class Expr { TiledArray::Cast(), op)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Set an array tile with a lazy tile /// Spawn a task to evaluate a lazy tile and set the \a array tile at @@ -275,14 +275,14 @@ class Expr { template ::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void 
set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { auto eval_tile_fn = &Expr_::template eval_tile< typename A::value_type, const T&, TiledArray::Cast, Op>; array.set(index, - madness::add_cuda_task( + madness::add_device_task( array.world(), eval_tile_fn, tile, TiledArray::Cast(), op)); } @@ -303,8 +303,8 @@ class Expr { template < typename A, typename I, typename T, typename Op, typename std::enable_if::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, @@ -317,7 +317,7 @@ class Expr { array.set(index, array.world().taskq.add(eval_tile_fn_ptr, tile, op)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Spawn a task to evaluate a lazy tile and set the \a array tile at /// \c index with the result. @@ -332,7 +332,7 @@ class Expr { template ::value&& ::TiledArray:: - detail::is_cuda_tile_v>::type* = nullptr> + detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { auto eval_tile_fn_ptr = &Expr_::template eval_tile; @@ -340,8 +340,8 @@ class Expr { static_assert(madness::detail::function_traits&)>::value, "ouch"); - array.set(index, madness::add_cuda_task(array.world(), eval_tile_fn_ptr, - tile, op)); + array.set(index, madness::add_device_task(array.world(), eval_tile_fn_ptr, + tile, op)); } #endif diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index 1d169b1098..7a6e4d50e1 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -1,509 +1,3 @@ -/* - * This file is a part of TiledArray. 
- * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * July 23, 2018 - * - */ - -#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED -#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED - -#include -#include -#include - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -#include - -#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__) -#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__) -#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) - -inline void __cudaSafeCall(cudaError err, const char* file, const int line) { - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaSafeCall() failed at: " << file << ":" << line; - std::string what = ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -} - -inline void __cudaSafeCallNoThrow(cudaError err, const char* file, - const int line) { - if (cudaSuccess != err) { - madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); - } -} - -inline void __cudaCheckError(const char* file, const int line) { - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaCheckError() failed at: " << file << ":" << line; - std::string what = 
ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -} - -namespace TiledArray { - -namespace detail { - -inline int num_cuda_streams() { - int num_streams = -1; - char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); - /// default num of streams is 3 - if (num_stream_char) { - num_streams = std::atoi(num_stream_char); - } else { - num_streams = 3; - } - return num_streams; -} - -inline int num_cuda_devices() { - int num_devices = -1; - CudaSafeCall(cudaGetDeviceCount(&num_devices)); - return num_devices; -} - -inline int current_cuda_device_id(World& world) { - int mpi_local_size = -1; - int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); - - int num_devices = detail::num_cuda_devices(); - - int cuda_device_id = -1; - // devices may already be pre-mapped - // if mpi_local_size <= num_devices : all ranks are in same resource set, map - // round robin - if (mpi_local_size <= num_devices) { - cuda_device_id = mpi_local_rank % num_devices; - } else { // mpi_local_size > num_devices - char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES"); - if (cvd_cstr) { // CUDA_VISIBLE_DEVICES is set, assume that pre-mapped - // make sure that there is only 1 device available here - if (num_devices != 1) { - throw std::runtime_error( - std::string( - "CUDA_VISIBLE_DEVICES environment variable is set, hence using " - "the provided device-to-rank mapping; BUT TiledArray found ") + - std::to_string(num_devices) + - " CUDA devices; only 1 CUDA device / MPI process is supported"); - } - cuda_device_id = 0; - } else { // not enough devices + devices are not pre-mapped - throw std::runtime_error( - std::string("TiledArray found ") + std::to_string(mpi_local_size) + - " MPI ranks on a node with " + std::to_string(num_devices) + - " CUDA devices; only 1 MPI process / CUDA device model is currently " - "supported"); - } - } - - return cuda_device_id; -} - -inline void CUDART_CB cuda_readyflag_callback(void* 
userData) { - // convert void * to std::atomic - std::atomic* flag = static_cast*>(userData); - // set the flag to be true - flag->store(true); -} - -struct ProbeFlag { - ProbeFlag(std::atomic* f) : flag(f) {} - - bool operator()() const { return flag->load(); } - - std::atomic* flag; -}; - -inline void thread_wait_cuda_stream(const cudaStream_t& stream) { - std::atomic* flag = new std::atomic(false); - - CudaSafeCall( - cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag)); - - detail::ProbeFlag probe(flag); - - // wait with sleep and do not do work - madness::ThreadPool::await(probe, false, true); - // madness::ThreadPool::await(probe, true, true); - - delete flag; -} - -} // namespace detail - -inline const cudaStream_t*& tls_cudastream_accessor() { - static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr}; - return thread_local_stream_ptr; -} - -inline void synchronize_stream(const cudaStream_t* stream) { - tls_cudastream_accessor() = stream; -} - -/** - * cudaEnv maintains the CUDA-related part of the runtime environment, - * such as CUDA-specific memory allocators - * - * \note this is a Singleton - */ -class cudaEnv { - public: - ~cudaEnv() { - // destroy cuda streams on current device - for (auto& stream : cuda_streams_) { - CudaSafeCallNoThrow(cudaStreamDestroy(stream)); - } - } - - cudaEnv(const cudaEnv&) = delete; - cudaEnv(cudaEnv&&) = delete; - cudaEnv& operator=(const cudaEnv&) = delete; - cudaEnv& operator=(cudaEnv&&) = delete; - - /// access the singleton instance; if not initialized will be - /// initialized via cudaEnv::initialize() with the default params - static std::unique_ptr& instance() { - if (!instance_accessor()) { - initialize(); - } - return instance_accessor(); - } - - // clang-format off - /// initialize the instance using explicit params - /// \param world the world to use for initialization - /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, 
and `this->pinned_allocator()` in chunks of at least - /// this size (bytes) [default=2^25] - /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that - /// allocator returned by `this->pinned_allocator()` can allocate; - /// this allocator is not used by default [default=0] - // clang-format on - static void initialize(World& world = TiledArray::get_default_world(), - const std::uint64_t page_size = (1ul << 25), - const std::uint64_t pinned_alloc_limit = (1ul << 40)) { - static std::mutex mtx; // to make initialize() reentrant - std::scoped_lock lock{mtx}; - // only the winner of the lock race gets to initialize - if (instance_accessor() == nullptr) { - int num_streams = detail::num_cuda_streams(); - int num_devices = detail::num_cuda_devices(); - int device_id = detail::current_cuda_device_id(world); - // set device for current MPI process .. will be set in the ctor as well - CudaSafeCall(cudaSetDevice(device_id)); - CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - - // uncomment to debug umpire ops - // - // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( - // umpire::util::message::Debug); - - // make Thread Safe UM Dynamic POOL - - auto& rm = umpire::ResourceManager::getInstance(); - - auto mem_total_free = cudaEnv::memory_total_and_free_device(); - - // turn off Umpire introspection for non-Debug builds -#ifndef NDEBUG - constexpr auto introspect = true; -#else - constexpr auto introspect = false; -#endif - - // allocate all currently-free memory for UM pool - auto um_dynamic_pool = - rm.makeAllocator( - "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - pinned_alloc_limit); - - // allocate zero memory for device pool - auto dev_size_limited_alloc = - rm.makeAllocator( - "size_limited_alloc", rm.getAllocator("DEVICE"), - mem_total_free.first); - auto dev_dynamic_pool = - rm.makeAllocator( - "CUDADynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); - - // allocate 
pinned_alloc_limit in pinned memory - auto pinned_size_limited_alloc = - rm.makeAllocator( - "SizeLimited_PINNED", rm.getAllocator("PINNED"), - pinned_alloc_limit); - auto pinned_dynamic_pool = - rm.makeAllocator( - "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, - page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - - auto cuda_env = std::unique_ptr( - new cudaEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); - instance_accessor() = std::move(cuda_env); - } - } - - World& world() const { return *world_; } - - int num_cuda_devices() const { return num_cuda_devices_; } - - int current_cuda_device_id() const { return current_cuda_device_id_; } - - int num_cuda_streams() const { return num_cuda_streams_; } - - bool concurrent_managed_access() const { - return cuda_device_concurrent_managed_access_; - } - - size_t stream_id(const cudaStream_t& stream) const { - auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream); - if (it == cuda_streams_.end()) abort(); - return it - cuda_streams_.begin(); - } - - /// @return the total size of all and free device memory on the current device - static std::pair memory_total_and_free_device() { - std::pair result; - // N.B. 
cudaMemGetInfo returns {free,total} - CudaSafeCall(cudaMemGetInfo(&result.second, &result.first)); - return result; - } - - /// Collective call to probe CUDA {total,free} memory - - /// @return the total size of all and free device memory on every rank's - /// device - std::vector> memory_total_and_free() const { - auto world_size = world_->size(); - std::vector total_memory(world_size, 0), free_memory(world_size, 0); - auto rank = world_->rank(); - std::tie(total_memory.at(rank), free_memory.at(rank)) = - cudaEnv::memory_total_and_free_device(); - world_->gop.sum(total_memory.data(), total_memory.size()); - world_->gop.sum(free_memory.data(), free_memory.size()); - std::vector> result(world_size); - for (int r = 0; r != world_size; ++r) { - result.at(r) = {total_memory.at(r), free_memory.at(r)}; - } - return result; - } - - const cudaStream_t& cuda_stream(std::size_t i) const { - return cuda_streams_.at(i); - } - - const cudaStream_t& cuda_stream_h2d() const { - return cuda_streams_[num_cuda_streams_]; - } - - const cudaStream_t& cuda_stream_d2h() const { - return cuda_streams_[num_cuda_streams_ + 1]; - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA UM - umpire::Allocator& um_allocator() { return um_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()` - // clang-format on - std::size_t um_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - um_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - um_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA device memory - umpire::Allocator& device_allocator() { 
return device_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()` - // clang-format on - std::size_t device_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - device_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - device_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - /// @return an Umpire allocator that allocates from a - /// pinned memory pool - /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl - umpire::Allocator& pinned_allocator() { return pinned_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by pinned_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()` - // clang-format on - std::size_t pinned_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - pinned_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - pinned_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - protected: - cudaEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc, - umpire::Allocator pinned_alloc) - : world_(&world), - um_allocator_(um_alloc), - device_allocator_(device_alloc), - pinned_allocator_(pinned_alloc), - num_cuda_devices_(num_devices), - current_cuda_device_id_(device_id), - 
num_cuda_streams_(num_streams) { - if (num_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); - } - - // set device for current MPI process - CudaSafeCall(cudaSetDevice(current_cuda_device_id_)); - - /// check the capability of CUDA device - cudaDeviceProp prop; - CudaSafeCall(cudaGetDeviceProperties(&prop, device_id)); - if (!prop.managedMemory) { - throw std::runtime_error("CUDA Device doesn't support managedMemory\n"); - } - int concurrent_managed_access; - CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access, - cudaDevAttrConcurrentManagedAccess, - device_id)); - cuda_device_concurrent_managed_access_ = concurrent_managed_access; - if (!cuda_device_concurrent_managed_access_) { - std::cout << "\nWarning: CUDA Device doesn't support " - "ConcurrentManagedAccess!\n\n"; - } - - // creates cuda streams on current device - cuda_streams_.resize(num_cuda_streams_ + 2); - for (auto& stream : cuda_streams_) { - CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - } - std::cout << "created " << num_cuda_streams_ - << " CUDA streams + 2 I/O streams" << std::endl; - } - - private: - // the world used to initialize this - World* world_; - - /// allocator backed by a (non-thread-safe) dynamically-sized pool for CUDA UM - umpire::Allocator um_allocator_; - /// allocator backed by a (non-thread-safe) dynamically-sized pool for device - /// memory - umpire::Allocator device_allocator_; - // allocates from a dynamic, size-limited pinned memory pool - // N.B. 
not thread safe, so must be wrapped into umpire_allocator_impl - umpire::Allocator pinned_allocator_; - - int num_cuda_devices_; - int current_cuda_device_id_; - bool cuda_device_concurrent_managed_access_; - - int num_cuda_streams_; - std::vector cuda_streams_; - - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; - return instance_; - } -}; - -namespace detail { - -template -const cudaStream_t& get_stream_based_on_range(const Range& range) { - // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams(); - auto& stream = cudaEnv::instance()->cuda_stream(stream_id); - return stream; -} - -} // namespace detail - -namespace nvidia { - -// Color definitions for nvtxcalls -enum class argbColor : uint32_t { - red = 0xFFFF0000, - blue = 0xFF0000FF, - green = 0xFF008000, - yellow = 0xFFFFFF00, - cyan = 0xFF00FFFF, - magenta = 0xFFFF00FF, - gray = 0xFF808080, - purple = 0xFF800080 -}; - -/// enter a profiling range by calling nvtxRangePushEx -/// \param[in] range_title a char string containing the range title -/// \param[in] range_color the range color -inline void range_push(const char* range_title, argbColor range_color) { - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = static_cast(range_color); - eventAttrib.message.ascii = range_title; - nvtxRangePushEx(&eventAttrib); -} - -/// exits the current profiling range by calling nvtxRangePopEx -inline void range_pop() { nvtxRangePop(); } - -} // namespace nvidia - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED +#warning \ + "This header is deprecated. Please use TiledArray/external/device.h instead." 
+#include diff --git a/src/TiledArray/external/hip.h b/src/TiledArray/external/device.h similarity index 53% rename from src/TiledArray/external/hip.h rename to src/TiledArray/external/device.h index 75dbfc6955..5016d55b23 100644 --- a/src/TiledArray/external/hip.h +++ b/src/TiledArray/external/device.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_EXTERNAL_HIP_H__INCLUDED -#define TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#ifndef TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED +#define TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED #include #include @@ -30,9 +30,15 @@ #include -#ifdef TILEDARRAY_HAS_HIP - +#if defined(TILEDARRAY_HAS_HIP) #include +#elif defined(TILEDARRAY_HAS_CUDA) +#include +#include +#include +#include +#include +#endif #include @@ -43,9 +49,40 @@ #include -#define HipSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) -#define HipSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) -#define HipCheckError() __hipCheckError(__FILE__, __LINE__) +#if defined(TILEDARRAY_HAS_CUDA) + +inline void __DeviceSafeCall(cudaError err, const char* file, const int line) { + if (cudaSuccess != err) { + std::stringstream ss; + ss << "DeviceSafeCall() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +inline void __cudaSafeCallNoThrow(cudaError err, const char* file, + const int line) { + if (cudaSuccess != err) { + madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); + } +} + +inline void __cudaCheckError(const char* file, const int line) { + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + std::stringstream ss; + ss << "cudaCheckError() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +#define DeviceSafeCall(err) __DeviceSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) \ + __cudaSafeCallNoThrow(err, __FILE__, __LINE__) +#define 
DeviceCheckError() __cudaCheckError(__FILE__, __LINE__) + +#elif defined(TILEDARRAY_HAS_HIP) inline void __hipSafeCall(hipError_t err, const char* file, const int line) { if (hipSuccess != err) { @@ -74,34 +111,158 @@ inline void __hipCheckError(const char* file, const int line) { } } +#define DeviceSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) +#define DeviceCheckError() __hipCheckError(__FILE__, __LINE__) + +#endif + namespace TiledArray { +namespace device { + +#if defined(TILEDARRAY_HAS_CUDA) +inline namespace cuda { +using stream_t = cudaStream_t; +using error_t = cudaError_t; +using hostFn_t = cudaHostFn_t; +using deviceProp_t = cudaDeviceProp; +using deviceAttr_t = cudaDeviceAttr; +#define DeviceAttributeConcurrentManagedAccess \ + cudaDevAttrConcurrentManagedAccess +#define DEVICERT_CB CUDART_CB + +enum FuncCache { + FuncCachePreferNone = cudaFuncCachePreferNone, + FuncCachePreferShared = cudaFuncCachePreferShared, + FuncCachePreferL1 = cudaFuncCachePreferL1, + FuncCachePreferEqual = cudaFuncCachePreferEqual +}; -namespace detail { +enum StreamCreateFlags { + StreamDefault = cudaStreamDefault, + StreamNonBlocking = cudaStreamNonBlocking +}; + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return cudaLaunchHostFunc(stream, fn, userData); +} +inline error_t streamDestroy(stream_t stream) { + return cudaStreamDestroy(stream); +} +inline error_t setDevice(int device) { return device::setDevice(device); } +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return cudaDeviceSetCacheConfig(static_cast(cache_config)); +} +inline error_t memGetInfo(size_t* free, size_t* total) { + return cudaMemGetInfo(free, total); +} +inline error_t getDeviceProperties(deviceProp_t* prop, int device) { + return cudaGetDeviceProperties(prop, device); +} +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return 
cudaDeviceGetAttribute(value, attr, device); +} +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return cudaStreamCreateWithFlags(pStream, flags); +} +inline error_t deviceSynchronize() { return cudaDeviceSynchronize(); } +inline error_t streamSynchronize(stream_t stream) { + return cudaStreamSynchronize(stream); +} +} // namespace cuda +#elif defined(TILEDARRAY_HAS_HIP) +inline namespace hip { +using stream_t = hipStream_t; +using error_t = hipError_t; +using hostFn_t = hipHostFn_t; +using deviceProp_t = hipDeviceProp; +using deviceAttr_t = hipDeviceAttr; +#define DeviceAttributeConcurrentManagedAccess \ + hipDeviceAttributeConcurrentManagedAccess +#define DEVICERT_CD HIPRT_CB + +enum FuncCache { + FuncCachePreferNone = hipFuncCachePreferNone, + FuncCachePreferShared = hipFuncCachePreferShared, + FuncCachePreferL1 = hipFuncCachePreferL1, + FuncCachePreferEqual = hipFuncCachePreferEqual +}; + +enum StreamCreateFlags { + StreamDefault = hipStreamDefault, + StreamNonBlocking = hipStreamNonBlocking +}; + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return hipLaunchHostFunc(stream, fn, userData); +} +inline error_t streamDestroy(stream_t stream) { + return hipStreamDestroy(stream); +} +inline error_t setDevice(int device) { return hipSetDevice(device); } +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return hipDeviceSetCacheConfig(static_cast(cache_config)); +} +inline error_t memGetInfo(size_t* free, size_t* total) { + return hipMemGetInfo(free, total); +} +inline error_t getDeviceProperties(deviceProp* prop, int device) { + return hipGetDeviceProperties(prop, device); +} +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return hipDeviceGetAttribute(value, attr, device); +} +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return hipStreamCreateWithFlags(pStream, flags); +} +inline error_t 
deviceSynchronize() { return hipDeviceSynchronize(); } +inline error_t streamSynchronize(stream_t stream) { + return hipStreamSynchronize(stream); +} +} // namespace hip +#endif + +#ifdef TILEDARRAY_HAS_DEVICE inline int num_streams() { int num_streams = -1; - char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); - /// default num of streams is 3 + char* num_stream_char = std::getenv("TA_DEVICE_NUM_STREAMS"); if (num_stream_char) { num_streams = std::atoi(num_stream_char); } else { - num_streams = 3; +#if defined(TILEDARRAY_HAS_CUDA) + char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); +#elif defined(TILEDARRAY_HAS_HIP) + char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); +#endif + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { + /// default num of streams is 3 + num_streams = 3; + } } return num_streams; } inline int num_devices() { int num_devices = -1; - HipSafeCall(hipGetDeviceCount(&num_devices)); +#if defined(TILEDARRAY_HAS_CUDA) + DeviceSafeCall(cudaGetDeviceCount(&num_devices)); +#elif defined(TILEDARRAY_HAS_HIP) + DeviceSafeCall(hipGetDeviceCount(&num_devices)); +#endif return num_devices; } inline int current_device_id(World& world) { int mpi_local_size = -1; int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); + std::tie(mpi_local_rank, mpi_local_size) = detail::mpi_local_rank_size(world); - int num_devices = detail::num_devices(); + int num_devices = device::num_devices(); int device_id = -1; // devices may already be pre-mapped @@ -110,23 +271,25 @@ inline int current_device_id(World& world) { if (mpi_local_size <= num_devices) { device_id = mpi_local_rank % num_devices; } else { // mpi_local_size > num_devices - char* cvd_cstr = std::getenv("HIP_VISIBLE_DEVICES"); - if (cvd_cstr) { // HIP_VISIBLE_DEVICES is set, assume that pre-mapped + const char* vd_cstr = + std::getenv(TILEDARRAY_DEVICE_RUNTIME_STR "_VISIBLE_DEVICES"); + if (vd_cstr) { // 
*_VISIBLE_DEVICES is set, assume that pre-mapped // make sure that there is only 1 device available here if (num_devices != 1) { throw std::runtime_error( std::string( - "HIP_VISIBLE_DEVICES environment variable is set, hence using " + TILEDARRAY_DEVICE_RUNTIME_STR + "_VISIBLE_DEVICES environment variable is set, hence using " "the provided device-to-rank mapping; BUT TiledArray found ") + std::to_string(num_devices) + - " HIP devices; only 1 HIP device / MPI process is supported"); + " devices; only 1 device / MPI process is supported"); } device_id = 0; } else { // not enough devices + devices are not pre-mapped throw std::runtime_error( std::string("TiledArray found ") + std::to_string(mpi_local_size) + " MPI ranks on a node with " + std::to_string(num_devices) + - " HIP devices; only 1 MPI process / HIP device model is currently " + " devices; only 1 MPI process / device model is currently " "supported"); } } @@ -134,7 +297,7 @@ inline int current_device_id(World& world) { return device_id; } -inline void HIPRT_CB hip_readyflag_callback(void* userData) { +inline void DEVICERT_CB readyflag_callback(void* userData) { // convert void * to std::atomic std::atomic* flag = static_cast*>(userData); // set the flag to be true @@ -149,12 +312,12 @@ struct ProbeFlag { std::atomic* flag; }; -inline void thread_wait_stream(const hipStream_t& stream) { +inline void thread_wait_stream(const stream_t& stream) { std::atomic* flag = new std::atomic(false); - HipSafeCall(hipLaunchHostFunc(stream, detail::hip_readyflag_callback, flag)); + DeviceSafeCall(launchHostFunc(stream, readyflag_callback, flag)); - detail::ProbeFlag probe(flag); + ProbeFlag probe(flag); // wait with sleep and do not do work madness::ThreadPool::await(probe, false, true); @@ -163,40 +326,38 @@ inline void thread_wait_stream(const hipStream_t& stream) { delete flag; } -} // namespace detail - -inline const hipStream_t*& tls_stream_accessor() { - static thread_local const hipStream_t* 
thread_local_stream_ptr{nullptr}; +inline const stream_t*& tls_stream_accessor() { + static thread_local const stream_t* thread_local_stream_ptr{nullptr}; return thread_local_stream_ptr; } -inline void synchronize_stream(const hipStream_t* stream) { +inline void synchronize_stream(const stream_t* stream) { tls_stream_accessor() = stream; } /** - * hipEnv maintains the HIP-related part of the runtime environment, - * such as HIP-specific memory allocators + * Env maintains the device-related part of the runtime environment, + * such as specialized data structures like device streams and memory allocators * * \note this is a Singleton */ -class hipEnv { +class Env { public: - ~hipEnv() { + ~Env() { // destroy streams on current device for (auto& stream : streams_) { - HipSafeCallNoThrow(hipStreamDestroy(stream)); + DeviceSafeCallNoThrow(streamDestroy(stream)); } } - hipEnv(const hipEnv&) = delete; - hipEnv(hipEnv&&) = delete; - hipEnv& operator=(const hipEnv&) = delete; - hipEnv& operator=(hipEnv&&) = delete; + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; /// access the singleton instance; if not initialized will be - /// initialized via hipEnv::initialize() with the default params - static std::unique_ptr& instance() { + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { if (!instance_accessor()) { initialize(); } @@ -219,12 +380,12 @@ class hipEnv { std::scoped_lock lock{mtx}; // only the winner of the lock race gets to initialize if (instance_accessor() == nullptr) { - int num_streams = detail::num_streams(); - int num_devices = detail::num_devices(); - int device_id = detail::current_device_id(world); + int num_streams = device::num_streams(); + int num_devices = device::num_devices(); + int device_id = device::current_device_id(world); // set device for current MPI process .. 
will be set in the ctor as well - HipSafeCall(hipSetDevice(device_id)); - HipSafeCall(hipDeviceSetCacheConfig(hipFuncCachePreferShared)); + DeviceSafeCall(setDevice(device_id)); + DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared)); // uncomment to debug umpire ops // @@ -235,7 +396,7 @@ class hipEnv { auto& rm = umpire::ResourceManager::getInstance(); - auto mem_total_free = hipEnv::memory_total_and_free_device(); + auto mem_total_free = Env::memory_total_and_free_device(); // turn off Umpire introspection for non-Debug builds #ifndef NDEBUG @@ -257,7 +418,8 @@ class hipEnv { mem_total_free.first); auto dev_dynamic_pool = rm.makeAllocator( - "HIPDynamicPool", dev_size_limited_alloc, 0, pinned_alloc_limit); + "DEVICEDynamicPool", dev_size_limited_alloc, 0, + pinned_alloc_limit); // allocate pinned_alloc_limit in pinned memory auto pinned_size_limited_alloc = @@ -269,10 +431,10 @@ class hipEnv { "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto hip_env = std::unique_ptr( - new hipEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); - instance_accessor() = std::move(hip_env); + auto env = std::unique_ptr( + new Env(world, num_devices, device_id, num_streams, um_dynamic_pool, + dev_dynamic_pool, pinned_dynamic_pool)); + instance_accessor() = std::move(env); } } @@ -288,7 +450,7 @@ class hipEnv { return device_concurrent_managed_access_; } - size_t stream_id(const hipStream_t& stream) const { + size_t stream_id(const stream_t& stream) const { auto it = std::find(streams_.begin(), streams_.end(), stream); if (it == streams_.end()) abort(); return it - streams_.begin(); @@ -297,12 +459,12 @@ class hipEnv { /// @return the total size of all and free device memory on the current device static std::pair memory_total_and_free_device() { std::pair result; - // N.B. 
hipMemGetInfo returns {free,total} - HipSafeCall(hipMemGetInfo(&result.second, &result.first)); + // N.B. *MemGetInfo returns {free,total} + DeviceSafeCall(memGetInfo(&result.second, &result.first)); return result; } - /// Collective call to probe HIP {total,free} memory + /// Collective call to probe device {total,free} memory /// @return the total size of all and free device memory on every rank's /// device @@ -311,7 +473,7 @@ class hipEnv { std::vector total_memory(world_size, 0), free_memory(world_size, 0); auto rank = world_->rank(); std::tie(total_memory.at(rank), free_memory.at(rank)) = - hipEnv::memory_total_and_free_device(); + Env::memory_total_and_free_device(); world_->gop.sum(total_memory.data(), total_memory.size()); world_->gop.sum(free_memory.data(), free_memory.size()); std::vector> result(world_size); @@ -321,11 +483,11 @@ class hipEnv { return result; } - const hipStream_t& stream(std::size_t i) const { return streams_.at(i); } + const stream_t& stream(std::size_t i) const { return streams_.at(i); } - const hipStream_t& stream_h2d() const { return streams_[num_streams_]; } + const stream_t& stream_h2d() const { return streams_[num_streams_]; } - const hipStream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } + const stream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } /// @return a (non-thread-safe) Umpire allocator for device UM umpire::Allocator& um_allocator() { return um_allocator_; } @@ -362,7 +524,7 @@ class hipEnv { /// @return an Umpire allocator that allocates from a /// pinned memory pool /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl + /// wrapped into umpire_based_allocator_impl umpire::Allocator& pinned_allocator() { return pinned_allocator_; } // clang-format off @@ -379,9 +541,9 @@ class hipEnv { } protected: - hipEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator 
device_alloc, - umpire::Allocator pinned_alloc) + Env(World& world, int num_devices, int device_id, int num_streams, + umpire::Allocator um_alloc, umpire::Allocator device_alloc, + umpire::Allocator pinned_alloc) : world_(&world), um_allocator_(um_alloc), device_allocator_(device_alloc), @@ -390,34 +552,38 @@ class hipEnv { current_device_id_(device_id), num_streams_(num_streams) { if (num_devices <= 0) { - throw std::runtime_error("No HIP-Enabled GPUs Found!\n"); + throw std::runtime_error("No " TILEDARRAY_DEVICE_RUNTIME_STR + " compute devices found!\n"); } // set device for current MPI process - HipSafeCall(hipSetDevice(current_device_id_)); + DeviceSafeCall(setDevice(current_device_id_)); - /// check the capability of HIP device - hipDeviceProp prop; - HipSafeCall(hipGetDeviceProperties(&prop, device_id)); + /// check the capability of device + deviceProp_t prop; + DeviceSafeCall(getDeviceProperties(&prop, device_id)); if (!prop.managedMemory) { - throw std::runtime_error("HIP Device doesn't support managedMemory\n"); + throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR + "device doesn't support managedMemory\n"); } int concurrent_managed_access; - HipSafeCall(hipDeviceGetAttribute(&concurrent_managed_access, - hipDeviceAttributeConcurrentManagedAccess, + DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access, + DeviceAttributeConcurrentManagedAccess, device_id)); device_concurrent_managed_access_ = concurrent_managed_access; if (!device_concurrent_managed_access_) { - std::cout << "\nWarning: HIP Device doesn't support " + std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR + " device doesn't support " "ConcurrentManagedAccess!\n\n"; } // creates streams on current device streams_.resize(num_streams_ + 2); for (auto& stream : streams_) { - HipSafeCall(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking)); } - std::cout << "created " << num_streams_ << " HIP streams + 2 I/O 
streams" + std::cout << "created " << num_streams_ + << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams + 2 I/O streams" << std::endl; } @@ -431,7 +597,7 @@ class hipEnv { /// memory umpire::Allocator device_allocator_; // allocates from a dynamic, size-limited pinned memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl + // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator pinned_allocator_; int num_devices_; @@ -439,28 +605,63 @@ class hipEnv { bool device_concurrent_managed_access_; int num_streams_; - std::vector streams_; + std::vector streams_; - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; return instance_; } }; -namespace detail { +} // namespace device +namespace detail { template -const hipStream_t& get_stream_based_on_range(const Range& range) { +const device::stream_t& get_stream_based_on_range(const Range& range) { // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % hipEnv::instance()->num_streams(); - auto& stream = hipEnv::instance()->stream(stream_id); + auto stream_id = range.offset() % device::Env::instance()->num_streams(); + auto& stream = device::Env::instance()->stream(stream_id); return stream; } - } // namespace detail -} // namespace TiledArray +#endif // TILEDARRAY_HAS_DEVICE + +#ifdef TILEDARRAY_HAS_CUDA +namespace nvidia { + +// Color definitions for nvtxcalls +enum class argbColor : uint32_t { + red = 0xFFFF0000, + blue = 0xFF0000FF, + green = 0xFF008000, + yellow = 0xFFFFFF00, + cyan = 0xFF00FFFF, + magenta = 0xFFFF00FF, + gray = 0xFF808080, + purple = 0xFF800080 +}; + +/// enter a profiling range by calling nvtxRangePushEx +/// \param[in] range_title a char string containing the range title +/// \param[in] range_color the range color +inline void range_push(const char* 
range_title, argbColor range_color) { + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = static_cast(range_color); + eventAttrib.message.ascii = range_title; + nvtxRangePushEx(&eventAttrib); +} -#endif // TILEDARRAY_HAS_HIP +/// exits the current profiling range by calling nvtxRangePopEx +inline void range_pop() { nvtxRangePop(); } + +} // namespace nvidia +#endif // #ifdef TILEDARRAY_HAS_DEVICE + +} // namespace TiledArray -#endif // TILEDARRAY_EXTERNAL_HIP_H__INCLUDED +#endif // TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index 71508226a4..e8d0d48632 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -71,7 +71,7 @@ std::mutex MutexLock::mtx_; /// \tparam StaticLock a type providing static `lock()` and `unlock()` methods ; /// defaults to NullLock which does not lock template -class umpire_allocator_impl { +class umpire_based_allocator_impl { public: using value_type = T; using pointer = value_type*; @@ -89,12 +89,12 @@ class umpire_allocator_impl { typename std::pointer_traits::difference_type; using size_type = std::make_unsigned_t; - umpire_allocator_impl(umpire::Allocator* umpalloc) noexcept + umpire_based_allocator_impl(umpire::Allocator* umpalloc) noexcept : umpalloc_(umpalloc) {} template - umpire_allocator_impl( - const umpire_allocator_impl& rhs) noexcept + umpire_based_allocator_impl( + const umpire_based_allocator_impl& rhs) noexcept : umpalloc_(rhs.umpalloc_) {} /// allocates memory using umpire dynamic pool @@ -140,17 +140,19 @@ class umpire_allocator_impl { private: umpire::Allocator* umpalloc_; -}; // class umpire_allocator_impl +}; // class umpire_based_allocator_impl template -bool operator==(const umpire_allocator_impl& lhs, - const 
umpire_allocator_impl& rhs) noexcept { +bool operator==( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } template -bool operator!=(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +bool operator!=( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { return !(lhs == rhs); } @@ -195,23 +197,23 @@ namespace archive { template struct ArchiveLoadImpl> { + TiledArray::umpire_based_allocator_impl> { static inline void load( const Archive& ar, - TiledArray::umpire_allocator_impl& allocator) { + TiledArray::umpire_based_allocator_impl& allocator) { std::string allocator_name; ar& allocator_name; - allocator = TiledArray::umpire_allocator_impl( + allocator = TiledArray::umpire_based_allocator_impl( umpire::ResourceManager::getInstance().getAllocator(allocator_name)); } }; template -struct ArchiveStoreImpl> { +struct ArchiveStoreImpl< + Archive, TiledArray::umpire_based_allocator_impl> { static inline void store( const Archive& ar, - const TiledArray::umpire_allocator_impl& allocator) { + const TiledArray::umpire_based_allocator_impl& allocator) { ar& allocator.umpire_allocator()->getName(); } }; diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 6c364be113..7f411eaeba 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -81,13 +81,14 @@ typedef Tensor TensorL; typedef Tensor> TensorZ; typedef Tensor> TensorC; -// CUDA tensor -#ifdef TILEDARRAY_HAS_CUDA - -class cudaEnv; +#ifdef TILEDARRAY_HAS_DEVICE +namespace device { +class Env; +} +using deviceEnv = device::Env; template -class cuda_allocator_impl; +class umpire_based_allocator; template > class default_init_allocator; @@ -100,32 +101,32 @@ template class MutexLock; } // namespace detail -/// pooled thread-safe CUDA UM allocator +/// pooled thread-safe unified memory (UM) allocator for device computing template -using 
cuda_um_allocator = - default_init_allocator, - detail::get_um_allocator>>; +using device_um_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_um_allocator>>; -/// pooled thread-safe CUDA-based pinned host memory allocator +/// pooled thread-safe pinned host memory allocator for device computing template -using cuda_pinned_allocator = - default_init_allocator, - detail::get_pinned_allocator>>; +using device_pinned_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_pinned_allocator>>; -/// \brief a vector that lives in CUDA Unified Memory, with most operations +/// \brief a vector that lives in UM, with most operations /// implemented on the CPU template -using cuda_um_btas_varray = ::btas::varray>; +using device_um_btas_varray = + ::btas::varray>; /** - * btas::Tensor with UM storage cuda_um_btas_varray + * btas::Tensor with UM storage device_um_btas_varray */ template using btasUMTensorVarray = - ::btas::Tensor>; + ::btas::Tensor>; -#endif +#endif // TILEDARRAY_HAS_DEVICE template class Tile; diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h index dbb8f53b55..a22613fb38 100644 --- a/src/TiledArray/host/allocator.h +++ b/src/TiledArray/host/allocator.h @@ -39,9 +39,9 @@ namespace TiledArray { /// pooled, thread-safe allocator for host memory template class host_allocator_impl - : public umpire_allocator_impl> { + : public umpire_based_allocator_impl> { public: - using base_type = umpire_allocator_impl>; + using base_type = umpire_based_allocator_impl>; using typename base_type::const_pointer; using typename base_type::const_reference; using typename base_type::pointer; @@ -53,9 +53,8 @@ class host_allocator_impl template host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast< - const umpire_allocator_impl>&>( - rhs)) {} + : base_type(static_cast>&>(rhs)) {} template friend bool operator==(const host_allocator_impl& lhs, diff --git 
a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 5852cf6a20..1b3c4f277f 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -114,7 +114,7 @@ class hostEnv { /// @return an Umpire allocator that allocates from a /// host memory pool /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl + /// wrapped into umpire_based_allocator_impl umpire::Allocator& host_allocator() { return host_allocator_; } // clang-format off @@ -139,7 +139,7 @@ class hostEnv { World* world_; // allocates from a dynamic, size-limited host memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl + // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator host_allocator_; inline static std::unique_ptr& instance_accessor() { diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 753ac5df58..60d536eec9 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -24,9 +24,9 @@ #include #include -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #include #include #endif @@ -304,9 +304,10 @@ class ReduceTask { }; // class ReduceObject -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE - static void CUDART_CB cuda_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -334,15 +335,15 @@ class ReduceTask { }; /// use madness task to call the destroy function, since it might call - /// cuda API + /// device API world->taskq.add(destroy_vector, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<0>() += + TiledArray::detail::device_callback_duration_ns<0>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB 
cuda_dependency_dec_callback(void* userData) { + static void DEVICERT_CB device_dependency_dec_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -361,12 +362,12 @@ class ReduceTask { // " call 2\n"; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<1>() += + TiledArray::detail::device_callback_duration_ns<1>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB - cuda_dependency_dec_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_dependency_dec_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -399,11 +400,11 @@ class ReduceTask { delete objects; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<2>() += + TiledArray::detail::device_callback_duration_ns<2>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB cuda_readyresult_reset_callback(void* userData) { + static void DEVICERT_CB device_readyresult_reset_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -429,7 +430,7 @@ class ReduceTask { world->taskq.add(reset, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<3>() += + TiledArray::detail::device_callback_duration_ns<3>() += TiledArray::duration_in_ns(t0, t1); } @@ -459,10 +460,10 @@ class ReduceTask { op_(*result, ready_object->arg()); // cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); - /// non-CUDA op + /// non-device op if (stream_ptr == nullptr) { ReduceObject::destroy(ready_object); this->dec(); @@ -471,12 +472,12 @@ class ReduceTask { (*callback_object)[0] = &world_; (*callback_object)[1] = this; (*callback_object)[2] = ready_object; - CudaSafeCall( 
- cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_reduceobject_delete_callback, + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_dependency_dec_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " // add 3\n"; } @@ -494,8 +495,8 @@ class ReduceTask { op_(*result, *ready_result); // cleanup the result -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ready_result.reset(); } else { @@ -504,11 +505,12 @@ class ReduceTask { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = ready_result_heap; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_readyresult_reset_callback, callback_object)); - synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_readyresult_reset_callback, + callback_object)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " // add 4\n"; } @@ -534,19 +536,19 @@ class ReduceTask { op_(*result, object->arg()); // Cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ReduceObject::destroy(object); } else { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = const_cast(object); - CudaSafeCall( - 
cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_reduceobject_delete_callback, callback_object)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -557,16 +559,16 @@ class ReduceTask { // Decrement the dependency counter for the argument. This must // be done after the reduce call to avoid a race condition. -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE if (stream_ptr == nullptr) { this->dec(); } else { auto callback_object2 = new std::vector(1); (*callback_object2)[0] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } #else @@ -585,8 +587,8 @@ class ReduceTask { op_(*result, object2->arg()); // Cleanup arguments -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + auto stream_ptr = device::tls_stream_accessor(); if (stream_ptr == nullptr) { ReduceObject::destroy(object1); ReduceObject::destroy(object2); @@ -595,11 +597,12 @@ class ReduceTask { (*callback_object1)[0] = &world_; (*callback_object1)[1] = const_cast(object1); (*callback_object1)[2] = const_cast(object2); - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object1)); - 
synchronize_stream(nullptr); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_reduceobject_delete_callback, + callback_object1)); + device::synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -612,7 +615,7 @@ class ReduceTask { // Decrement the dependency counter for the two arguments. This // must be done after the reduce call to avoid a race condition. -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE if (stream_ptr == nullptr) { this->dec(); this->dec(); @@ -620,10 +623,10 @@ class ReduceTask { auto callback_object2 = new std::vector(2); (*callback_object2)[0] = this; (*callback_object2)[1] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall( + device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::launchHostFunc( + *stream_ptr, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } @@ -633,13 +636,13 @@ class ReduceTask { #endif } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE template - std::enable_if_t, void> internal_run( + std::enable_if_t, void> internal_run( const madness::TaskThreadEnv&) { TA_ASSERT(ready_result_); - auto post_result = madness::add_cuda_task(world_, op_, *ready_result_); + auto post_result = madness::add_device_task(world_, op_, *ready_result_); result_.set(post_result); if (callback_) { @@ -648,7 +651,7 @@ class ReduceTask { } template - std::enable_if_t, void> + std::enable_if_t, void> #else void #endif diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 2903e5e7f7..413b784d22 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -317,17 +317,17 @@ 
constexpr const bool is_reduce_op_v = /// detect cuda tile #ifdef TILEDARRAY_HAS_CUDA template -struct is_cuda_tile : public std::false_type {}; +struct is_device_tile : public std::false_type {}; template -struct is_cuda_tile> : public is_cuda_tile {}; +struct is_device_tile> : public is_device_tile {}; template -struct is_cuda_tile> - : public is_cuda_tile::eval_type> {}; +struct is_device_tile> + : public is_device_tile::eval_type> {}; template -static constexpr const auto is_cuda_tile_v = is_cuda_tile::value; +static constexpr const auto is_device_tile_v = is_device_tile::value; #endif diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 47ccc00d8e..51cfc02825 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -6,9 +6,11 @@ #include +#ifdef TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA -#include -#include +#include +#endif +#include #include #endif @@ -22,28 +24,32 @@ namespace TiledArray { namespace { -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// initialize cuda environment -inline void cuda_initialize() { - /// initialize cudaGlobal - cudaEnv::instance(); - // +inline void device_initialize() { + /// initialize deviceEnv + deviceEnv::instance(); +#if defined(TILEDARRAY_HAS_CUDA) cuBLASHandlePool::handle(); +#endif // initialize LibreTT librettInitialize(); } /// finalize cuda environment -inline void cuda_finalize() { - CudaSafeCall(cudaDeviceSynchronize()); +inline void device_finalize() { + DeviceSafeCall(device::deviceSynchronize()); librettFinalize(); +#if defined(TILEDARRAY_HAS_CUDA) cublasDestroy(cuBLASHandlePool::handle()); delete &cuBLASHandlePool::handle(); - // although TA::cudaEnv is a singleton, must explicitly delete it so - // that CUDA runtime is not finalized before the cudaEnv dtor is called - cudaEnv::instance().reset(nullptr); -} #endif + // although TA::deviceEnv is a singleton, must explicitly delete it so + // that the device runtime is not finalized before the 
deviceEnv dtor is + // called + deviceEnv::instance().reset(nullptr); +} +#endif // TILEDARRAY_HAS_DEVICE inline bool& initialized_madworld_accessor() { static bool flag = false; @@ -102,8 +108,8 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, ? madness::initialize(argc, argv, comm, quiet) : *madness::World::find_instance(comm); TiledArray::set_default_world(default_world); -#ifdef TILEDARRAY_HAS_CUDA - TiledArray::cuda_initialize(); +#ifdef TILEDARRAY_HAS_DEVICE + TiledArray::device_initialize(); #endif TiledArray::max_threads = TiledArray::get_num_threads(); TiledArray::set_num_threads(1); @@ -164,8 +170,8 @@ void TiledArray::finalize() { TiledArray::set_num_threads(TiledArray::max_threads); TiledArray::get_default_world().gop.fence(); // this should ensure no pending // tasks using cuda allocators -#ifdef TILEDARRAY_HAS_CUDA - TiledArray::cuda_finalize(); +#ifdef TILEDARRAY_HAS_DEVICE + TiledArray::device_finalize(); #endif if (initialized_madworld()) { madness::finalize(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 217c522018..d17dd8345b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -102,8 +102,12 @@ set(ta_test_src_files ta_test.cpp cp.cpp ) +if(CUDA_FOUND OR HIP_FOUND) + list(APPEND ta_test_src_files librett.cpp) +endif() + if(CUDA_FOUND) - list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) + list(APPEND ta_test_src_files expressions_cuda_um.cpp tensor_um.cpp) endif() # if using C++20 must use Boost 1.74 or later: diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp index a17b749789..29408c27c2 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_cuda_um.cpp @@ -27,7 +27,7 @@ #ifdef TILEDARRAY_HAS_CUDA -#include +#include #include #include #include "unit_test_config.h" @@ -76,8 +76,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { static UMTensor permute_fn(const madness::Future& tensor_f, const Permutation& perm) { - 
return madness::add_cuda_task(*GlobalFixture::world, permute_task, tensor_f, - perm) + return madness::add_device_task(*GlobalFixture::world, permute_task, + tensor_f, perm) .get(); } diff --git a/tests/librett.cpp b/tests/librett.cpp index 3785071071..ced23239fd 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -22,9 +22,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include "unit_test_config.h" struct LibreTTFixture { @@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; status = @@ -118,7 +118,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({B, A}); @@ -177,7 +177,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -240,7 +240,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({A, B, C}); @@ -297,7 +297,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({A, A}); 
@@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; std::vector extent({B, A}); @@ -397,7 +397,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } librettHandle plan; - auto stream = TiledArray::cudaEnv::instance()->cuda_stream(0); + auto stream = TiledArray::deviceEnv::instance()->stream(0); librettResult status; // b(k,i,j) = a(i,j,k) diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp index 33efbfd7d4..d860b7c813 100644 --- a/tests/tensor_um.cpp +++ b/tests/tensor_um.cpp @@ -18,7 +18,7 @@ * Chong Peng on 9/19/18. */ -#include +#include #include "global_fixture.h" #include "unit_test_config.h" From 125cba5e7260084438bb9d468153c3a2a7a666d3 Mon Sep 17 00:00:00 2001 From: Ed Valeev Date: Mon, 18 Sep 2023 23:50:03 +0000 Subject: [PATCH 102/592] fixup device.h for HIP --- src/TiledArray/external/device.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 5016d55b23..b829963899 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -176,11 +176,11 @@ inline namespace hip { using stream_t = hipStream_t; using error_t = hipError_t; using hostFn_t = hipHostFn_t; -using deviceProp_t = hipDeviceProp; -using deviceAttr_t = hipDeviceAttr; +using deviceProp_t = hipDeviceProp_t; +using deviceAttr_t = hipDeviceAttribute_t; #define DeviceAttributeConcurrentManagedAccess \ hipDeviceAttributeConcurrentManagedAccess -#define DEVICERT_CD HIPRT_CB +#define DEVICERT_CB enum FuncCache { FuncCachePreferNone = hipFuncCachePreferNone, @@ -202,12 +202,12 @@ inline error_t streamDestroy(stream_t stream) { } inline error_t setDevice(int device) { return hipSetDevice(device); } inline error_t deviceSetCacheConfig(FuncCache cache_config) { - return 
hipDeviceSetCacheConfig(static_cast(cache_config)); + return hipDeviceSetCacheConfig(static_cast(cache_config)); } inline error_t memGetInfo(size_t* free, size_t* total) { return hipMemGetInfo(free, total); } -inline error_t getDeviceProperties(deviceProp* prop, int device) { +inline error_t getDeviceProperties(deviceProp_t* prop, int device) { return hipGetDeviceProperties(prop, device); } inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { From 792717fbd0af1e2eed337b911e21b1a5f0983f82 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 06:08:43 -0400 Subject: [PATCH 103/592] phase 2 of cuda -> device conversion --- examples/cuda/cuda_librett.cpp | 2 +- src/TiledArray/device/btas_um_tensor.cpp | 4 ++-- src/TiledArray/dist_eval/array_eval.h | 12 ++++++------ src/TiledArray/dist_eval/contraction_eval.h | 4 ++-- src/TiledArray/dist_eval/unary_eval.h | 4 ++-- src/TiledArray/external/device.h | 4 ++-- src/TiledArray/external/librett.h | 4 ++-- src/TiledArray/tensor/type_traits.h | 2 +- tests/librett.cpp | 2 +- 9 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/cuda/cuda_librett.cpp b/examples/cuda/cuda_librett.cpp index 1460f54117..d56f6362fa 100644 --- a/examples/cuda/cuda_librett.cpp +++ b/examples/cuda/cuda_librett.cpp @@ -21,7 +21,7 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp index 270f30aad4..0bf648ee42 100644 --- a/src/TiledArray/device/btas_um_tensor.cpp +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -7,7 +7,7 @@ #include // clang-format on -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE template class btas::varray>; template class btas::varray>; @@ -49,4 +49,4 @@ template class TiledArray::Tile>>; -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/dist_eval/array_eval.h 
b/src/TiledArray/dist_eval/array_eval.h index c9f3daf195..3bb34742cf 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -59,7 +59,7 @@ class LazyArrayTile { (!Op::is_consumable) && consume_ ? op_->consume(tile_) : (*op_)(tile_))); ///< conversion_type -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE // TODO need a better design on how to manage the lifetime of converted Tile mutable conversion_result_type conversion_tile_; #endif @@ -69,7 +69,7 @@ class LazyArrayTile { : tile_(), op_(), consume_(false) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -83,7 +83,7 @@ class LazyArrayTile { : tile_(other.tile_), op_(other.op_), consume_(other.consume_) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -100,7 +100,7 @@ class LazyArrayTile { : tile_(tile), op_(op), consume_(consume) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -114,7 +114,7 @@ class LazyArrayTile { tile_ = other.tile_; op_ = other.op_; consume_ = other.consume_; -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE conversion_tile_ = other.conversion_tile_; #endif return *this; @@ -126,7 +126,7 @@ class LazyArrayTile { bool is_consumable() const { return consume_ || op_->permutation(); } /// Convert tile to evaluation type using the op object -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE explicit operator conversion_result_type&() const { conversion_tile_ = diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 0d07821a4e..18aac80c57 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -478,7 +478,7 @@ class Summa template static typename std::enable_if< is_lazy_tile::value -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE && !detail::is_device_tile_v #endif , @@ -490,7 +490,7 @@ class Summa 
madness::TaskAttributes::hipri()); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Conversion function /// This function spawns a task that will convert a lazy tile from the diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index 32052d2700..191d247aef 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -111,7 +111,7 @@ class UnaryEvalImpl /// Task function for evaluating tiles -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// \param i The tile index /// \param tile The tile to be evaluated template @@ -167,7 +167,7 @@ class UnaryEvalImpl const auto target_index = DistEvalImpl_::perm_index_to_target(index); // Schedule tile evaluation task -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::template eval_tile<>, target_index, arg_.get(index)); diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 5016d55b23..9321ba7cfe 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -176,7 +176,7 @@ inline namespace hip { using stream_t = hipStream_t; using error_t = hipError_t; using hostFn_t = hipHostFn_t; -using deviceProp_t = hipDeviceProp; +using deviceProp_t = hipDeviceProp_t; using deviceAttr_t = hipDeviceAttr; #define DeviceAttributeConcurrentManagedAccess \ hipDeviceAttributeConcurrentManagedAccess @@ -660,7 +660,7 @@ inline void range_push(const char* range_title, argbColor range_color) { inline void range_pop() { nvtxRangePop(); } } // namespace nvidia -#endif // #ifdef TILEDARRAY_HAS_DEVICE +#endif // TILEDARRAY_HAS_CUDA } // namespace TiledArray diff --git a/src/TiledArray/external/librett.h b/src/TiledArray/external/librett.h index 46d116c45b..8af10d7ecb 100644 --- a/src/TiledArray/external/librett.h +++ b/src/TiledArray/external/librett.h @@ -26,7 +26,7 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE 
#include #include @@ -110,6 +110,6 @@ void librett_permute(T* inData, T* outData, const TiledArray::Range& range, } // namespace TiledArray -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 413b784d22..e9d1681f71 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -315,7 +315,7 @@ constexpr const bool is_reduce_op_v = is_reduce_op_::value; /// detect cuda tile -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE template struct is_device_tile : public std::false_type {}; diff --git a/tests/librett.cpp b/tests/librett.cpp index ced23239fd..666b13e9c4 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -22,7 +22,7 @@ #include -#ifdef TILEDARRAY_HAS_DEVICE +#ifdef TILEDARRAY_HAS_CUDA #include #include "unit_test_config.h" From 75aa6204f69a4b213c47841d86d4af37a98e99e2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 06:10:55 -0400 Subject: [PATCH 104/592] bump LibreTT tag to pull in https://github.com/victor-anisimov/Librett/pull/11 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 00541a9135..c9dd6b2d87 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -67,7 +67,7 @@ Optional prerequisites: - device programming runtime: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. 
Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 354e0ccee54aeb2f191c3ce2c617ebf437e49d83). + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: diff --git a/external/versions.cmake b/external/versions.cmake index 4bea408a0d..40e94c3509 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -27,8 +27,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) set(TA_TRACKED_BTAS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) -set(TA_TRACKED_LIBRETT_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_LIBRETT_PREVIOUS_TAG f5ebdbbba9c9689aa4613a5469021db2dacd8e46) +set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) From a7b0ab3720c110318c6e3e2b05dcae3c68b26964 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 07:58:03 -0400 Subject: [PATCH 105/592] tests/librett: cuda -> device --- src/TiledArray/external/device.h | 125 +++++++++++++++++++++++++++++++ tests/librett.cpp | 106 ++++++++++++++------------ 2 files changed, 182 insertions(+), 49 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 185332bdc4..49ed38425a 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -131,6 +131,20 @@ using deviceAttr_t = cudaDeviceAttr; cudaDevAttrConcurrentManagedAccess #define DEVICERT_CB CUDART_CB +enum MemAttach { + MemAttachGlobal = cudaMemAttachGlobal, + MemAttachHost = cudaMemAttachHost, + MemAttachSingle = cudaMemAttachSingle +}; + +enum MemcpyKind { + MemcpyHostToHost = cudaMemcpyHostToHost, + MemcpyHostToDevice = cudaMemcpyHostToDevice, + MemcpyDeviceToHost = cudaMemcpyDeviceToHost, + MemcpyDeviceToDevice = cudaMemcpyDeviceToDevice, + MemcpyDefault = cudaMemcpyDefault +}; + enum FuncCache { FuncCachePreferNone = 
cudaFuncCachePreferNone, FuncCachePreferShared = cudaFuncCachePreferShared, @@ -146,30 +160,78 @@ enum StreamCreateFlags { inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { return cudaLaunchHostFunc(stream, fn, userData); } + inline error_t streamDestroy(stream_t stream) { return cudaStreamDestroy(stream); } + inline error_t setDevice(int device) { return device::setDevice(device); } + inline error_t deviceSetCacheConfig(FuncCache cache_config) { return cudaDeviceSetCacheConfig(static_cast(cache_config)); } + inline error_t memGetInfo(size_t* free, size_t* total) { return cudaMemGetInfo(free, total); } + inline error_t getDeviceProperties(deviceProp_t* prop, int device) { return cudaGetDeviceProperties(prop, device); } + inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { return cudaDeviceGetAttribute(value, attr, device); } + inline error_t streamCreateWithFlags(stream_t* pStream, StreamCreateFlags flags) { return cudaStreamCreateWithFlags(pStream, flags); } + inline error_t deviceSynchronize() { return cudaDeviceSynchronize(); } inline error_t streamSynchronize(stream_t stream) { return cudaStreamSynchronize(stream); } + +template +inline error_t malloc(T** devPtr, size_t size) { + return cudaMalloc(devPtr, size); +} + +template +inline error_t mallocHost(T** devPtr, size_t size) { + return cudaMallocHost(devPtr, size); +} + +template +inline error_t mallocManaged(T** devPtr, size_t size, + unsigned int flag = MemAttachGlobal) { + return cudaMallocManaged(devPtr, size, flag); +} + +template +error_t free(T* devPtr) { + return cudaFree(devPtr); +} + +template +error_t memcpy(T* dst, const T* src, size_t count, MemcpyKind kind) { + return cudaMemcpy(dst, src, count, static_cast(kind)); +} + +template +error_t memcpyAsync(T* dst, const T* src, size_t count, MemcpyKind kind, + stream_t stream = 0) { + return cudaMemcpyAsync(dst, src, count, static_cast(kind), + stream); +} + +template +error_t 
memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, + stream_t stream = 0) { + return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream); +} + } // namespace cuda #elif defined(TILEDARRAY_HAS_HIP) inline namespace hip { @@ -182,6 +244,20 @@ using deviceAttr_t = hipDeviceAttribute_t; hipDeviceAttributeConcurrentManagedAccess #define DEVICERT_CB +enum MemcpyKind { + MemcpyHostToHost = hipMemcpyHostToHost, + MemcpyHostToDevice = hipMemcpyHostToDevice, + MemcpyDeviceToHost = hipMemcpyDeviceToHost, + MemcpyDeviceToDevice = hipMemcpyDeviceToDevice, + MemcpyDefault = hipMemcpyDefault +}; + +enum MemAttach { + MemAttachGlobal = hipMemAttachGlobal, + MemAttachHost = hipMemAttachHost, + MemAttachSingle = hipMemAttachSingle +}; + enum FuncCache { FuncCachePreferNone = hipFuncCachePreferNone, FuncCachePreferShared = hipFuncCachePreferShared, @@ -197,30 +273,79 @@ enum StreamCreateFlags { inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { return hipLaunchHostFunc(stream, fn, userData); } + inline error_t streamDestroy(stream_t stream) { return hipStreamDestroy(stream); } + inline error_t setDevice(int device) { return hipSetDevice(device); } + inline error_t deviceSetCacheConfig(FuncCache cache_config) { return hipDeviceSetCacheConfig(static_cast(cache_config)); } + inline error_t memGetInfo(size_t* free, size_t* total) { return hipMemGetInfo(free, total); } + inline error_t getDeviceProperties(deviceProp_t* prop, int device) { return hipGetDeviceProperties(prop, device); } + inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { return hipDeviceGetAttribute(value, attr, device); } + inline error_t streamCreateWithFlags(stream_t* pStream, StreamCreateFlags flags) { return hipStreamCreateWithFlags(pStream, flags); } + inline error_t deviceSynchronize() { return hipDeviceSynchronize(); } + inline error_t streamSynchronize(stream_t stream) { return hipStreamSynchronize(stream); } + +template +inline error_t 
malloc(T** devPtr, size_t size) { + return hipMalloc(devPtr, size); +} + +template +inline error_t mallocHost(T** devPtr, size_t size) { + return hipMallocHost(devPtr, size); +} + +template +inline error_t mallocManaged(T** devPtr, size_t size, + unsigned int flag = MemAttachGlobal) { + return hipMallocManaged(devPtr, size, flag); +} + +template +error_t free(T* devPtr) { + return hipFree(devPtr); +} + +template +error_t memcpy(T* dst, const T* src, size_t count, MemcpyKind kind) { + return hipMemcpy(dst, src, count, static_cast(kind)); +} + +template +error_t memcpyAsync(T* dst, const T* src, size_t count, MemcpyKind kind, + stream_t stream = 0) { + return hipMemcpyAsync(dst, src, count, static_cast(kind), + stream); +} + +template +error_t memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, + stream_t stream = 0) { + return hipMemPrefetchAsync(devPtr, count, dstDevice, stream); +} + } // namespace hip #endif diff --git a/tests/librett.cpp b/tests/librett.cpp index 666b13e9c4..de0c771a50 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -22,7 +22,7 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include "unit_test_config.h" @@ -56,11 +56,12 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { } } int* a_device; - cudaMalloc(&a_device, A * A * sizeof(int)); + TiledArray::device::malloc(&a_device, A * A * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * A * sizeof(int)); + TiledArray::device::malloc(&b_device, A * A * sizeof(int)); - cudaMemcpy(a_device, a_host, A * A * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpy(a_device, a_host, A * A * sizeof(int), + TiledArray::device::MemcpyHostToDevice); std::vector extent({A, A}); TiledArray::extent_to_col_major(extent); @@ -82,7 +83,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { BOOST_CHECK(status == LIBRETT_SUCCESS); librettDestroy(plan); - cudaMemcpy(b_host, b_device, A * A * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpy(b_host, 
b_device, A * A * sizeof(int), + TiledArray::device::MemcpyDeviceToHost); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -95,8 +97,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { @@ -111,11 +113,12 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { } int* a_device; - cudaMalloc(&a_device, A * B * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * sizeof(int)); - cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpy(a_device, a_host, A * B * sizeof(int), + TiledArray::device::MemcpyHostToDevice); librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); @@ -137,7 +140,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { BOOST_CHECK(status == LIBRETT_SUCCESS); librettDestroy(plan); - cudaMemcpy(b_host, b_device, A * B * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpy(b_host, b_device, A * B * sizeof(int), + TiledArray::device::MemcpyDeviceToHost); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -150,8 +154,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { @@ -168,11 +172,12 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { } int* a_device; - cudaMalloc(&a_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - cudaMemcpy(a_device, 
a_host, A * B * C * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpy(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice); // b(j,i,k) = a(i,j,k) @@ -194,7 +199,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpy(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost); status = librettDestroy(plan); @@ -213,8 +219,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { @@ -231,11 +237,12 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { } int* a_device; - cudaMalloc(&a_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - cudaMemcpy(a_device, a_host, A * B * C * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpy(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice); // b(j,i,k) = a(i,j,k) @@ -258,7 +265,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpy(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost); status = librettDestroy(plan); @@ -277,16 +285,16 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + 
TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_unified_mem) { int* a_um; - cudaMallocManaged(&a_um, A * A * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * A * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * A * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * A * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -317,7 +325,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { librettDestroy(plan); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -327,16 +335,16 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { int* a_um; - cudaMallocManaged(&a_um, A * B * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * B * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * B * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * B * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -366,7 +374,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { BOOST_CHECK(status == LIBRETT_SUCCESS); librettDestroy(plan); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -375,16 +383,16 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { iter++; } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { int* a_um; - cudaMallocManaged(&a_um, A * B * C * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * B * C * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * B * C * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * B * C * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -418,7 +426,7 @@ 
BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { BOOST_CHECK(status == LIBRETT_SUCCESS); librettDestroy(plan); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -429,8 +437,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_um_tensor) { @@ -455,7 +463,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < A; j++) { @@ -487,7 +495,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_nonsym) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < B; i++) { for (std::size_t j = 0; j < A; j++) { @@ -521,7 +529,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < B; j++) { @@ -539,7 +547,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < B; j++) { @@ -584,7 +592,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -605,7 +613,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ 
-659,7 +667,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -684,7 +692,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -705,4 +713,4 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { } BOOST_AUTO_TEST_SUITE_END() -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE From b0fc8a31a6ef36e584367a43678f902a389c4235 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 08:06:46 -0400 Subject: [PATCH 106/592] updated header guards in device/ --- src/TiledArray/device/allocators.h | 6 +++--- src/TiledArray/device/btas_um_tensor.h | 6 +++--- src/TiledArray/device/cpu_cuda_vector.h | 6 +++--- src/TiledArray/device/device_task_fn.h | 6 +++--- src/TiledArray/device/kernel/mult_kernel.h | 6 +++--- src/TiledArray/device/kernel/mult_kernel_impl.h | 6 +++--- src/TiledArray/device/kernel/reduce_kernel.h | 6 +++--- src/TiledArray/device/kernel/reduce_kernel_impl.h | 6 +++--- src/TiledArray/device/platform.h | 6 +++--- src/TiledArray/device/thrust.h | 6 +++--- src/TiledArray/device/um_storage.h | 6 +++--- 11 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/TiledArray/device/allocators.h b/src/TiledArray/device/allocators.h index ff3ed6a3ac..2bda79e768 100644 --- a/src/TiledArray/device/allocators.h +++ b/src/TiledArray/device/allocators.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED -#define TILEDARRAY_CUDA_ALLOCATORS_H___INCLUDED +#ifndef TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED +#define TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED #include @@ -135,4 +135,4 @@ struct ArchiveStoreImpl @@ -829,4 +829,4 @@ extern template class TiledArray::Tile @@ 
-231,4 +231,4 @@ struct ArchiveStoreImpl> { } // namespace archive } // namespace madness -#endif // TILEDARRAY_CUDA_CPU_CUDA_VECTOR_H__INCLUDED +#endif // TILEDARRAY_DEVICE_CPU_CUDA_VECTOR_H__INCLUDED diff --git a/src/TiledArray/device/device_task_fn.h b/src/TiledArray/device/device_task_fn.h index a4b9db92e4..8d2ab0e248 100644 --- a/src/TiledArray/device/device_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -2,8 +2,8 @@ // Created by Chong Peng on 2019-03-20. // -#ifndef TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED -#define TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED +#define TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED #include @@ -867,4 +867,4 @@ typename detail::memfunc_enabler::type add_device_task( } // namespace madness #endif // TILDARRAY_HAS_DEVICE -#endif // TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED +#endif // TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED diff --git a/src/TiledArray/device/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h index 0c5c3f7822..1ea7b9de6d 100644 --- a/src/TiledArray/device/kernel/mult_kernel.h +++ b/src/TiledArray/device/kernel/mult_kernel.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED -#define TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED #include @@ -74,4 +74,4 @@ void mult_cuda_kernel(std::complex *result, #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/mult_kernel_impl.h index a1c217ce3d..1d2a35e862 100644 --- a/src/TiledArray/device/kernel/mult_kernel_impl.h +++ b/src/TiledArray/device/kernel/mult_kernel_impl.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED +#ifndef 
TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED +#define TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED #include #include @@ -58,4 +58,4 @@ void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2, } // namespace TiledArray -#endif // TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h index 1bcf526ee4..8910da8b69 100644 --- a/src/TiledArray/device/kernel/reduce_kernel.h +++ b/src/TiledArray/device/kernel/reduce_kernel.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED -#define TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED #include @@ -128,4 +128,4 @@ std::complex absmin_cuda_kernel(const std::complex *arg, #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/reduce_kernel_impl.h b/src/TiledArray/device/kernel/reduce_kernel_impl.h index f03e333dbb..6daae446bf 100644 --- a/src/TiledArray/device/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/device/kernel/reduce_kernel_impl.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED +#define TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED #include @@ -136,4 +136,4 @@ TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( } // namespace TiledArray -#endif // TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED diff --git a/src/TiledArray/device/platform.h b/src/TiledArray/device/platform.h index 9d0cac3cdd..d30a204fb4 100644 --- a/src/TiledArray/device/platform.h +++ b/src/TiledArray/device/platform.h @@ -21,8 
+21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_PLATFORM_H__INCLUDED -#define TILEDARRAY_CUDA_PLATFORM_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED +#define TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED namespace TiledArray { @@ -62,4 +62,4 @@ enum class ExecutionSpace { Host, Device }; } // namespace TiledArray -#endif // TILEDARRAY_CUDA_PLATFORM_H__INCLUDED +#endif // TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED diff --git a/src/TiledArray/device/thrust.h b/src/TiledArray/device/thrust.h index fe9d02c529..6007c7a4de 100644 --- a/src/TiledArray/device/thrust.h +++ b/src/TiledArray/device/thrust.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_THRUST_H__INCLUDED -#define TILEDARRAY_CUDA_THRUST_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_THRUST_H__INCLUDED +#define TILEDARRAY_DEVICE_THRUST_H__INCLUDED #include @@ -59,4 +59,4 @@ void resize(thrust::device_vector& dev_vec, size_t size); #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_CUDA_THRUST_H__INCLUDED +#endif // TILEDARRAY_DEVICE_THRUST_H__INCLUDED diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index e4318e5666..a4cc2dcc76 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED -#define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED +#define TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED #include #include @@ -153,4 +153,4 @@ struct ArchiveStoreImpl> { #endif // TILEDARRAY_HAS_CUDA -#endif // TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED +#endif // TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED From bbbdf25fd976df2d25583aabee07a9592116cf02 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 08:15:28 -0400 Subject: [PATCH 107/592] more cuda -> device in um_storage and btas_um_tensor --- src/TiledArray/device/btas_um_tensor.h | 14 +++++++------- src/TiledArray/device/um_storage.h | 22 ++++++++++++++-------- src/TiledArray/external/device.h | 16 
+++++++++++++++- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index 8d2c8f17c8..3c2a120fd3 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -624,7 +624,7 @@ void to_host( } world.gop.fence(); - DeviceSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(device::deviceSynchronize()); }; /// to device for UM Array @@ -666,9 +666,9 @@ um_tensor_to_ta_tensor( auto &stream = deviceEnv::instance()->stream_d2h(); DeviceSafeCall( - cudaMemcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename TATensor::value_type), - cudaMemcpyDefault, stream)); + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename TATensor::value_type), + device::MemcpyDefault, stream)); device::synchronize_stream(&stream); return result; @@ -729,9 +729,9 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { Tensor result(tile.range(), std::move(storage)); DeviceSafeCall( - cudaMemcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename Tensor::value_type), - cudaMemcpyDefault, stream)); + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename Tensor::value_type), + device::MemcpyDefault, stream)); device::synchronize_stream(&stream); return TiledArray::Tile(std::move(result)); diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index a4cc2dcc76..9dd3242f45 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -39,9 +39,11 @@ namespace TiledArray { +#ifdef TILEDARRAY_HAS_CUDA template using cuda_um_thrust_vector = thrust::device_vector>; +#endif // TILEDARRAY_HAS_CUDA /// @return true if @c dev_vec is present in space @space template @@ -55,16 +57,16 @@ bool in_memory_space(const Storage& vec) noexcept { * device_um_btas_varray */ template -void to_execution_space(Storage& vec, cudaStream_t stream = 0) 
{ +void to_execution_space(Storage& vec, device::stream_t stream = 0) { switch (Space) { case ExecutionSpace::Host: { using std::data; using std::size; using value_type = typename Storage::value_type; if (deviceEnv::instance()->concurrent_managed_access()) { - DeviceSafeCall(cudaMemPrefetchAsync(data(vec), - size(vec) * sizeof(value_type), - cudaCpuDeviceId, stream)); + DeviceSafeCall(device::memPrefetchAsync(data(vec), + size(vec) * sizeof(value_type), + device::CpuDeviceId, stream)); } break; } @@ -74,8 +76,8 @@ void to_execution_space(Storage& vec, cudaStream_t stream = 0) { using value_type = typename Storage::value_type; int device = -1; if (deviceEnv::instance()->concurrent_managed_access()) { - DeviceSafeCall(cudaGetDevice(&device)); - DeviceSafeCall(cudaMemPrefetchAsync( + DeviceSafeCall(device::getDevice(&device)); + DeviceSafeCall(device::memPrefetchAsync( data(vec), size(vec) * sizeof(value_type), device, stream)); } break; @@ -90,11 +92,11 @@ void to_execution_space(Storage& vec, cudaStream_t stream = 0) { * * @param storage UM Storage type object * @param n size of um storage object - * @param stream cuda stream used to perform prefetch + * @param stream device stream used to perform prefetch */ template void make_device_storage(Storage& storage, std::size_t n, - const cudaStream_t& stream = 0) { + const device::stream_t& stream = 0) { storage = Storage(n); TiledArray::to_execution_space(storage, stream); @@ -127,6 +129,8 @@ const typename Storage::value_type* device_data(const Storage& storage) { namespace madness { namespace archive { +#ifdef TILEDARRAY_HAS_CUDA + template struct ArchiveLoadImpl> { static inline void load(const Archive& ar, @@ -148,6 +152,8 @@ struct ArchiveStoreImpl> { } }; +#endif // TILEDARRAY_HAS_CUDA + } // namespace archive } // namespace madness diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 49ed38425a..3d02192c2d 100644 --- a/src/TiledArray/external/device.h +++ 
b/src/TiledArray/external/device.h @@ -131,6 +131,11 @@ using deviceAttr_t = cudaDeviceAttr; cudaDevAttrConcurrentManagedAccess #define DEVICERT_CB CUDART_CB +enum DeviceId { + CpuDeviceId = cudaCpuDeviceId, + InvalidDeviceId = cudaInvalidDeviceId +}; + enum MemAttach { MemAttachGlobal = cudaMemAttachGlobal, MemAttachHost = cudaMemAttachHost, @@ -165,7 +170,9 @@ inline error_t streamDestroy(stream_t stream) { return cudaStreamDestroy(stream); } -inline error_t setDevice(int device) { return device::setDevice(device); } +inline error_t setDevice(int device) { return cudaSetDevice(device); } + +inline error_t getDevice(int* device) { return cudaGetDevice(device); } inline error_t deviceSetCacheConfig(FuncCache cache_config) { return cudaDeviceSetCacheConfig(static_cast(cache_config)); @@ -244,6 +251,11 @@ using deviceAttr_t = hipDeviceAttribute_t; hipDeviceAttributeConcurrentManagedAccess #define DEVICERT_CB +enum DeviceId { + CpuDeviceId = hipCpuDeviceId, + InvalidDeviceId = hipInvalidDeviceId +}; + enum MemcpyKind { MemcpyHostToHost = hipMemcpyHostToHost, MemcpyHostToDevice = hipMemcpyHostToDevice, @@ -280,6 +292,8 @@ inline error_t streamDestroy(stream_t stream) { inline error_t setDevice(int device) { return hipSetDevice(device); } +inline error_t getDevice(int* device) { return hipGetDevice(device); } + inline error_t deviceSetCacheConfig(FuncCache cache_config) { return hipDeviceSetCacheConfig(static_cast(cache_config)); } From 7a9687e58f63feb3b2c99ab695226ac3690b60ed Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 08:35:36 -0400 Subject: [PATCH 108/592] external/librett.hpp: cuda -> device --- src/TiledArray/external/librett.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/external/librett.h b/src/TiledArray/external/librett.h index 8af10d7ecb..b6b6cee3bc 100644 --- a/src/TiledArray/external/librett.h +++ b/src/TiledArray/external/librett.h @@ -74,11 +74,12 @@ inline void 
permutation_to_col_major(std::vector& perm) { * @param outData pointer to data in output Tensor, must be accessible on GPU * @param range the Range of input Tensor inData * @param perm the permutation object - * @param stream the CUDA stream this permutation will be submitted to + * @param stream the device stream this permutation will be submitted to */ template void librett_permute(T* inData, T* outData, const TiledArray::Range& range, - const TiledArray::Permutation& perm, cudaStream_t stream) { + const TiledArray::Permutation& perm, + device::stream_t stream) { auto extent = range.extent(); std::vector extent_int(extent.begin(), extent.end()); From 5383dc548db64f2457fcec7070a9d56a8e31d23f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 08:53:05 -0400 Subject: [PATCH 109/592] first attempt to introduce rocthrust --- external/hip.cmake | 4 +++- src/CMakeLists.txt | 2 +- src/TiledArray/device/thrust.h | 9 +++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/external/hip.cmake b/external/hip.cmake index 91a9e2cd82..a76f543454 100644 --- a/external/hip.cmake +++ b/external/hip.cmake @@ -11,8 +11,10 @@ set(TILEDARRAY_CHECK_HIP_ERROR 1 CACHE BOOL "Whether TiledArray will check HIP e # find HIP components find_package(hipblas REQUIRED) +find_package(rocprim REQUIRED) # for rocthrust, per https://github.com/ROCmSoftwarePlatform/rocThrust#using-rocthrust-in-a-project +find_package(rocthrust REQUIRED) -foreach (library hipblas) +foreach (library hipblas;rocthrust) if (NOT TARGET roc::${library}) message(FATAL_ERROR "roc::${library} not found") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 156bb7b2cf..0cc651a06e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -283,7 +283,7 @@ if(CUDA_FOUND) endif(CUDA_FOUND) if (HIP_FOUND) - list(APPEND _TILEDARRAY_DEPENDENCIES hip::host) + list(APPEND _TILEDARRAY_DEPENDENCIES hip::host roc::rocthrust) endif() # LibreTT needed for either CUDA or HIP diff --git 
a/src/TiledArray/device/thrust.h b/src/TiledArray/device/thrust.h index 6007c7a4de..9a3ec8f23b 100644 --- a/src/TiledArray/device/thrust.h +++ b/src/TiledArray/device/thrust.h @@ -26,9 +26,12 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE +#ifdef TILEDARRAY_HAS_CUDA #include +#endif + #include #include @@ -38,10 +41,12 @@ namespace thrust { // thrust::device_malloc_allocator name changed to device_allocator after // version 10 +#ifdef TILEDARRAY_HAS_CUDA #if CUDART_VERSION < 10000 template using device_allocator = thrust::device_malloc_allocator; #endif +#endif // TILEDARRAY_HAS_CUDA template const T* data(const thrust::device_vector& dev_vec) { @@ -57,6 +62,6 @@ template void resize(thrust::device_vector& dev_vec, size_t size); } // namespace thrust -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_DEVICE_THRUST_H__INCLUDED From 6a858e9e4f90cc7e6ed25850baacb66ac2ba23fa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 15:07:54 -0400 Subject: [PATCH 110/592] examples/cuda -> examples/device ta_dense_cuda -> ta_dense_device + eliminated support for thrust-based storage --- examples/CMakeLists.txt | 2 +- examples/{cuda => device}/CMakeLists.txt | 16 +- examples/{cuda => device}/cuda_librett.cpp | 0 examples/{cuda => device}/cuda_task.cpp | 0 examples/{cuda => device}/ta_cc_abcd_cuda.cpp | 0 .../ta_dense_device.cpp} | 206 ++++-------------- examples/{cuda => device}/ta_reduce_cuda.cpp | 0 examples/{cuda => device}/ta_vector_cuda.cpp | 0 src/TiledArray/external/device.h | 45 +++- 9 files changed, 90 insertions(+), 179 deletions(-) rename examples/{cuda => device}/CMakeLists.txt (74%) rename examples/{cuda => device}/cuda_librett.cpp (100%) rename examples/{cuda => device}/cuda_task.cpp (100%) rename examples/{cuda => device}/ta_cc_abcd_cuda.cpp (100%) rename examples/{cuda/ta_dense_cuda.cpp => device/ta_dense_device.cpp} (65%) rename examples/{cuda => device}/ta_reduce_cuda.cpp (100%) rename 
examples/{cuda => device}/ta_vector_cuda.cpp (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f74d35345a..99edd4e33b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -29,7 +29,7 @@ endif() # Add Subdirectories add_subdirectory (cc) -add_subdirectory (cuda) +add_subdirectory (device) add_subdirectory (dgemm) add_subdirectory (demo) add_subdirectory (scalapack) diff --git a/examples/cuda/CMakeLists.txt b/examples/device/CMakeLists.txt similarity index 74% rename from examples/cuda/CMakeLists.txt rename to examples/device/CMakeLists.txt index 5d7f56c86e..81339a3606 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -25,7 +25,7 @@ if(CUDA_FOUND) - foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec cuda_librett cuda_task ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") @@ -33,4 +33,16 @@ if(CUDA_FOUND) endforeach() -endif(CUDA_FOUND) +endif() + +if(CUDA_FOUND OR HIP_FOUND) + + foreach(_exec ta_dense_device) + + # Add executable + add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") + add_dependencies(examples-tiledarray ${_exec}) + + endforeach() + +endif() diff --git a/examples/cuda/cuda_librett.cpp b/examples/device/cuda_librett.cpp similarity index 100% rename from examples/cuda/cuda_librett.cpp rename to examples/device/cuda_librett.cpp diff --git a/examples/cuda/cuda_task.cpp b/examples/device/cuda_task.cpp similarity index 100% rename from examples/cuda/cuda_task.cpp rename to examples/device/cuda_task.cpp diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/device/ta_cc_abcd_cuda.cpp similarity index 100% rename from examples/cuda/ta_cc_abcd_cuda.cpp rename to examples/device/ta_cc_abcd_cuda.cpp diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/device/ta_dense_device.cpp similarity index 65% rename from examples/cuda/ta_dense_cuda.cpp 
rename to examples/device/ta_dense_device.cpp index 864938302c..4b14085624 100644 --- a/examples/cuda/ta_dense_cuda.cpp +++ b/examples/device/ta_dense_device.cpp @@ -25,113 +25,12 @@ #include #include -#include "TiledArray/device/cpu_cuda_vector.h" #include // clang-format on +#ifdef TILEDARRAY_HAS_CUDA #include - -namespace TiledArray { - -/// -/// cuda gemm interface function on left*right -/// - -template -btas::Tensor> gemm( - const btas::Tensor> &left, - const btas::Tensor> &right, - T factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper); -} - -/// -/// cuda gemm interface function on result = left*right -/// - -template -void gemm(btas::Tensor> &result, - const btas::Tensor> &left, - const btas::Tensor> &right, - T factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper); -} - -/// -/// cuda axpy interface function -/// - -template -void add_to(btas::Tensor> &result, - const btas::Tensor> &arg) { - btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); -} - -/// -/// cuda dot interface function -/// - -template -typename btas::Tensor>::value_type -squared_norm( - const btas::Tensor> &arg) { - return btas_tensor_squared_norm_cuda_impl(arg); -} - -template -typename btas::Tensor>::value_type -norm(const btas::Tensor> &arg) { - return std::sqrt(squared_norm(arg)); -} - -/// to host for CPU GPU Array -template -void to_host( - TiledArray::DistArray>>, - Policy> &cpu_cuda_array) { - auto to_host = - [](TiledArray::Tile< - btas::Tensor>> &tile) { - auto &stream = detail::get_stream_based_on_range(tile.range()); - - // do norm on GPU - auto tile_norm = norm(tile.tensor()); - - TiledArray::to_execution_space( - tile.tensor().storage(), stream); - - return tile_norm; - }; - - foreach_inplace(cpu_cuda_array, to_host); - cpu_cuda_array.world().gop.fence(); - cudaDeviceSynchronize(); -}; - -/// to device for CPU GPU array 
-template -void to_device( - TiledArray::DistArray>>, - Policy> &cpu_gpu_array) { - auto to_device = - [](TiledArray::Tile< - btas::Tensor>> &tile) { - auto &stream = detail::get_stream_based_on_range(tile.range()); - - TiledArray::to_execution_space( - tile.tensor().storage(), stream); - - return norm(tile.tensor()); - }; - - foreach_inplace(cpu_gpu_array, to_device); - cpu_gpu_array.world().gop.fence(); - cudaDeviceSynchronize(); -}; - -} // namespace TiledArray +#endif template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, @@ -213,8 +112,8 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, TiledArray::TiledRange // TRange for b trange_b(blocking_B.begin(), blocking_B.end()); - using CUDATile = btas::Tensor; - using CUDAMatrix = TA::DistArray>; + using DeviceTile = btas::Tensor; + using DeviceMatrix = TA::DistArray>; using PinnedTile = btas::Tensor>; // using TAMatrix = TA::DistArray>; - CUDAMatrix c(world, trange_c); + DeviceMatrix c(world, trange_c); auto val_a = 0.03; auto val_b = 0.02; @@ -234,8 +133,8 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, a_host.fill(val_a); b_host.fill(val_b); - CUDAMatrix a = TA::ta_tensor_to_um_tensor>(a_host); - CUDAMatrix b = TA::ta_tensor_to_um_tensor>(b_host); + DeviceMatrix a = TA::ta_tensor_to_um_tensor>(a_host); + DeviceMatrix b = TA::ta_tensor_to_um_tensor>(b_host); world.gop.fence(); @@ -244,8 +143,10 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, // c("m,n") = a("m,k") * b("k,n"); +#ifdef TILEDARRAY_HAS_CUDA // start profiler cudaProfilerStart(); +#endif // TILEDARRAY_HAS_CUDA double total_time = 0.0; double total_gflop_rate = 0.0; @@ -271,8 +172,10 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, // Stop clock const double wall_time_stop = madness::wall_time(); +#ifdef TILEDARRAY_HAS_CUDA // stop profiler cudaProfilerStop(); +#endif // TILEDARRAY_HAS_CUDA if (world.rank() == 0) 
std::cout << "Average wall time = " << total_time / double(nrepeat) @@ -290,7 +193,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, result = dot_length * val_a * val_b; auto verify = [&world, &threshold, &result, - &dot_length](TA::Tile &tile) { + &dot_length](TA::Tile &tile) { auto n_elements = tile.size(); for (std::size_t i = 0; i < n_elements; i++) { double abs_err = std::abs(tile[i] - result); @@ -379,13 +282,10 @@ int try_main(int argc, char **argv) { const auto storage_type = (argc >= 10) ? std::string(argv[9]) : std::string{"device_um_btas_varray"}; - if (storage_type != "device_um_btas_varray" && - storage_type != "cuda_um_thrust_vector" && - storage_type != "cpu_cuda_vector") { + if (storage_type != "device_um_btas_varray") { std::cerr << "Error: invalid storage type: " << storage_type - << "\n Valid option includes: cuda_um_vector or " - "device_um_btas_varray or cuda_um_thrust_vector " - "or cpu_cuda_vector. \n"; + << "\n Valid option includes: " + "device_um_btas_varray \n"; } std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" << std::endl; @@ -395,25 +295,25 @@ int try_main(int argc, char **argv) { // }; int driverVersion, runtimeVersion; - auto error = cudaDriverGetVersion(&driverVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaDriverGetVersion) = " << error << std::endl; + auto error = TiledArray::device::driverVersion(&driverVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(DriverGetVersion) = " << error << std::endl; } - error = cudaRuntimeGetVersion(&runtimeVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl; + error = TiledArray::device::runtimeVersion(&runtimeVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(RuntimeGetVersion) = " << error << std::endl; } - std::cout << "CUDA {driver,runtime} versions = " << driverVersion << "," + std::cout << "device 
{driver,runtime} versions = " << driverVersion << "," << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); + int num_devices = TA::deviceEnv::instance()->num_devices(); - if (num_cuda_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); + if (num_devices <= 0) { + throw std::runtime_error("No GPUs Found!\n"); } - int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); + int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); @@ -423,29 +323,28 @@ int try_main(int argc, char **argv) { std::cout << "CUDA Device Information for MPI Process Rank: " << mpi_rank << std::endl; cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; + auto error = cudaGetDeviceProperties(&prop, device_id); + if (error != TiledArray::device::Success) { + std::cout << "error(GetDeviceProperties) = " << error << std::endl; } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl + std::cout << "Device #" << device_id << ": " << prop.name << std::endl << " managedMemory = " << prop.managedMemory << std::endl << " singleToDoublePrecisionPerfRatio = " << prop.singleToDoublePrecisionPerfRatio << std::endl; int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrUnifiedAddressing, device_id); std::cout << " attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrConcurrentManagedAccess, + device_id); std::cout << " attrConcurrentManagedAccess = " << 
result << std::endl; - error = device::setDevice(cuda_device_id); - if (error != cudaSuccess) { + error = TiledArray::device::setDevice(device_id); + if (error != TiledArray::device::Success) { std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); + error = TiledArray::device::memGetInfo(&free_mem, &total_mem); std::cout << " {total,free} memory = {" << total_mem << "," << free_mem << "}" << std::endl; } @@ -453,16 +352,6 @@ int try_main(int argc, char **argv) { } } // print device properties - // if (storage_type == "cpu_cuda_vector") { - // if (scalar_type_str == "double") - // do_main_body>(world, Nm, Bm, Nn, - // Bn, - // Nk, Bk, nrepeat); - // else - // do_main_body>(world, Nm, Bm, Nn, - // Bn, - // Nk, Bk, nrepeat); - // } else if (storage_type == "device_um_btas_varray") { if (storage_type == "device_um_btas_varray") { if (scalar_type_str == "double") do_main_body>( @@ -479,16 +368,7 @@ int try_main(int argc, char **argv) { else { abort(); // unreachable } - } - // else if (storage_type == "cuda_um_thrust_vector") { - // if (scalar_type_str == "double") - // do_main_body>( - // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - // else - // do_main_body>( - // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - // } - else { + } else { throw std::runtime_error("Invalid storage type!\n"); } @@ -498,15 +378,13 @@ int try_main(int argc, char **argv) { int main(int argc, char *argv[]) { try { try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { + } catch (std::exception &ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << free_mem << "}" << std::endl; - } catch (std::exception 
&ex) { - std::cout << ex.what() << std::endl; } catch (...) { std::cerr << "unknown exception" << std::endl; } diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/device/ta_reduce_cuda.cpp similarity index 100% rename from examples/cuda/ta_reduce_cuda.cpp rename to examples/device/ta_reduce_cuda.cpp diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/device/ta_vector_cuda.cpp similarity index 100% rename from examples/cuda/ta_vector_cuda.cpp rename to examples/device/ta_vector_cuda.cpp diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 3d02192c2d..745697faaa 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -131,6 +131,8 @@ using deviceAttr_t = cudaDeviceAttr; cudaDevAttrConcurrentManagedAccess #define DEVICERT_CB CUDART_CB +enum Error { Success = cudaSuccess }; + enum DeviceId { CpuDeviceId = cudaCpuDeviceId, InvalidDeviceId = cudaInvalidDeviceId @@ -162,13 +164,13 @@ enum StreamCreateFlags { StreamNonBlocking = cudaStreamNonBlocking }; -inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { - return cudaLaunchHostFunc(stream, fn, userData); -} +constexpr inline auto DevAttrUnifiedAddressing = cudaDevAttrUnifiedAddressing; +constexpr inline auto DevAttrConcurrentManagedAccess = + cudaDevAttrConcurrentManagedAccess; -inline error_t streamDestroy(stream_t stream) { - return cudaStreamDestroy(stream); -} +inline error_t driverVersion(int* v) { return cudaDriverGetVersion(v); } + +inline error_t runtimeVersion(int* v) { return cudaRuntimeGetVersion(v); } inline error_t setDevice(int device) { return cudaSetDevice(device); } @@ -239,6 +241,14 @@ error_t memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream); } +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return cudaLaunchHostFunc(stream, fn, userData); +} + +inline error_t streamDestroy(stream_t stream) { + 
return cudaStreamDestroy(stream); +} + } // namespace cuda #elif defined(TILEDARRAY_HAS_HIP) inline namespace hip { @@ -251,6 +261,8 @@ using deviceAttr_t = hipDeviceAttribute_t; hipDeviceAttributeConcurrentManagedAccess #define DEVICERT_CB +enum Error { Success = hipSuccess }; + enum DeviceId { CpuDeviceId = hipCpuDeviceId, InvalidDeviceId = hipInvalidDeviceId @@ -282,13 +294,14 @@ enum StreamCreateFlags { StreamNonBlocking = hipStreamNonBlocking }; -inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { - return hipLaunchHostFunc(stream, fn, userData); -} +constexpr inline auto DevAttrUnifiedAddressing = + hipDeviceAttributeUnifiedAddressing; +constexpr inline auto DevAttrConcurrentManagedAccess = + hipDeviceAttributeConcurrentManagedAccess; -inline error_t streamDestroy(stream_t stream) { - return hipStreamDestroy(stream); -} +inline error_t driverVersion(int* v) { return hipDriverGetVersion(v); } + +inline error_t runtimeVersion(int* v) { return hipRuntimeGetVersion(v); } inline error_t setDevice(int device) { return hipSetDevice(device); } @@ -360,6 +373,14 @@ error_t memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, return hipMemPrefetchAsync(devPtr, count, dstDevice, stream); } +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return hipLaunchHostFunc(stream, fn, userData); +} + +inline error_t streamDestroy(stream_t stream) { + return hipStreamDestroy(stream); +} + } // namespace hip #endif From f44ebb5f5d307e47f74e5343eb409e5f8caa0786 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 15:09:23 -0400 Subject: [PATCH 111/592] um_storage no longer depends on thrust --- src/TiledArray/device/um_storage.h | 38 ------------------------------ 1 file changed, 38 deletions(-) diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index 9dd3242f45..c940bca45c 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ 
-25,7 +25,6 @@ #define TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED #include -#include #ifdef TILEDARRAY_HAS_DEVICE @@ -39,12 +38,6 @@ namespace TiledArray { -#ifdef TILEDARRAY_HAS_CUDA -template -using cuda_um_thrust_vector = - thrust::device_vector>; -#endif // TILEDARRAY_HAS_CUDA - /// @return true if @c dev_vec is present in space @space template bool in_memory_space(const Storage& vec) noexcept { @@ -126,37 +119,6 @@ const typename Storage::value_type* device_data(const Storage& storage) { } // namespace TiledArray -namespace madness { -namespace archive { - -#ifdef TILEDARRAY_HAS_CUDA - -template -struct ArchiveLoadImpl> { - static inline void load(const Archive& ar, - TiledArray::cuda_um_thrust_vector& x) { - typename thrust::device_vector< - T, TiledArray::device_um_allocator>::size_type n(0); - ar& n; - x.resize(n); - for (auto& xi : x) ar& xi; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void store(const Archive& ar, - const TiledArray::cuda_um_thrust_vector& x) { - ar& x.size(); - for (const auto& xi : x) ar& xi; - } -}; - -#endif // TILEDARRAY_HAS_CUDA - -} // namespace archive -} // namespace madness - #endif // TILEDARRAY_HAS_CUDA #endif // TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED From e477ad4166ab9e55cc3499fea54bd1c9ec28c702 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 19 Sep 2023 19:31:13 -0400 Subject: [PATCH 112/592] converted cublas calls to blaspp --- examples/device/cuda_task.cpp | 7 +- src/CMakeLists.txt | 3 +- src/TiledArray/device/blas.cpp | 60 ++++++ src/TiledArray/device/{cublas.h => blas.h} | 167 +++------------- src/TiledArray/device/btas_cublas.h | 218 ++++++--------------- src/TiledArray/device/btas_um_tensor.h | 11 +- src/TiledArray/tiledarray.cpp | 11 +- 7 files changed, 168 insertions(+), 309 deletions(-) create mode 100644 src/TiledArray/device/blas.cpp rename src/TiledArray/device/{cublas.h => blas.h} (71%) diff --git a/examples/device/cuda_task.cpp b/examples/device/cuda_task.cpp index 
f2b0b2ab1b..e5519b8b85 100644 --- a/examples/device/cuda_task.cpp +++ b/examples/device/cuda_task.cpp @@ -28,7 +28,7 @@ void verify(const tile_type& tile, value_type value, std::size_t index) { tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, std::size_t index) { - DeviceSafeCall(device::setDevice( + DeviceSafeCall(TiledArray::device::setDevice( TiledArray::deviceEnv::instance()->current_device_id())); /// make result Tensor using Storage = typename tile_type::tensor_type::storage_type; @@ -40,8 +40,7 @@ tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, std::move(result_storage)); /// copy the original Tensor - const auto& handle = TiledArray::cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, *stream)); + const auto& handle = TiledArray::BLASQueuePool::handle(*stream); CublasSafeCall(TiledArray::cublasCopy(handle, result.size(), arg.data(), 1, device_data(result.storage()), 1)); @@ -51,7 +50,7 @@ tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, // cudaStreamSynchronize(stream); - TiledArray::synchronize_stream(stream); + TiledArray::device::synchronize_stream(stream); // std::stringstream stream_str; // stream_str << *stream; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0cc651a06e..606cf0df2d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -216,7 +216,8 @@ endif() if(CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h - TiledArray/device/cublas.h + TiledArray/device/blas.cpp + TiledArray/device/blas.h TiledArray/device/btas_cublas.h TiledArray/device/btas_um_tensor.h TiledArray/device/cpu_cuda_vector.h diff --git a/src/TiledArray/device/blas.cpp b/src/TiledArray/device/blas.cpp new file mode 100644 index 0000000000..ea8eb00faf --- /dev/null +++ b/src/TiledArray/device/blas.cpp @@ -0,0 +1,60 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * Sept 19, 2023 + * + */ + +#include + +namespace TiledArray { + +std::vector> BLASQueuePool::queues_; + +bool BLASQueuePool::initialized() { return !queues_.empty(); } + +void BLASQueuePool::initialize() { + if (initialized()) return; + queues_.reserve(deviceEnv::instance()->num_streams()); + for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams(); + ++sidx) { + auto stream = deviceEnv::instance()->stream( + sidx); // blaspp forsome reason wants non-const lvalue ref to stream + queues_.emplace_back(std::make_unique(0, stream)); + } +} + +void BLASQueuePool::finalize() { queues_.clear(); } + +blas::Queue& BLASQueuePool::queue(std::size_t ordinal) { + TA_ASSERT(initialized()); + TA_ASSERT(ordinal < deviceEnv::instance()->num_streams()); + return *(queues_[ordinal]); +} + +blas::Queue& BLASQueuePool::queue(device::stream_t const& stream) { + TA_ASSERT(initialized()); + for (auto&& q : queues_) { + if (q->stream() == stream) return *q; + } + throw TiledArray::Exception( + "no matching device stream found in the BLAS queue pool"); +} + +} // namespace TiledArray diff --git a/src/TiledArray/device/cublas.h b/src/TiledArray/device/blas.h similarity index 71% rename from src/TiledArray/device/cublas.h rename to 
src/TiledArray/device/blas.h index 501a0402d1..da9d594bdc 100644 --- a/src/TiledArray/device/cublas.h +++ b/src/TiledArray/device/blas.h @@ -21,35 +21,20 @@ * */ -#ifndef TILEDARRAY_MATH_CUBLAS_H__INCLUDED -#define TILEDARRAY_MATH_CUBLAS_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_BLAS_H__INCLUDED +#define TILEDARRAY_DEVICE_BLAS_H__INCLUDED #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE + +#include #include #include -#include -#include -#include #include -#define CublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__) - -inline void __cublasSafeCall(cublasStatus_t err, const char *file, - const int line) { - if (CUBLAS_STATUS_SUCCESS != err) { - std::stringstream ss; - ss << "cublasSafeCall() failed at: " << file << "(" << line << ")"; - std::string what = ss.str(); - throw std::runtime_error(what); - } - - return; -} - namespace TiledArray { namespace detail { @@ -76,87 +61,32 @@ auto cublasPointer(T *std_complex_ptr) { */ /** - * cuBLASHandlePool - * - * assign 1 cuBLAS handle / thread, use thread-local storage to manage - * + * BLASQueuePool is a singleton controlling a pool of blas::Queue objects: + * - queues map to stream 1-to-1, so do not call Queue::set_stream to maintain + * this invariant + * - can access queues by the corresponding stream ordinal a la + * deviceEnv::stream() */ -class cuBLASHandlePool { - public: - static const cublasHandle_t &handle() { - static thread_local cublasHandle_t *handle_{nullptr}; - if (handle_ == nullptr) { - handle_ = new cublasHandle_t; - CublasSafeCall(cublasCreate(handle_)); - CublasSafeCall(cublasSetPointerMode(*handle_, CUBLAS_POINTER_MODE_HOST)); - } - return *handle_; - } -}; -// thread_local cublasHandle_t *cuBLASHandlePool::handle_; - -inline cublasOperation_t to_cublas_op(math::blas::Op cblas_op) { - cublasOperation_t result{}; - switch (cblas_op) { - case math::blas::Op::NoTrans: - result = CUBLAS_OP_N; - break; - case math::blas::Op::Trans: - result = CUBLAS_OP_T; - break; - case 
math::blas::Op::ConjTrans: - result = CUBLAS_OP_C; - break; - } - return result; -} +struct BLASQueuePool { + static bool initialized(); + static void initialize(); + static void finalize(); -/// GEMM interface functions + static blas::Queue &queue(std::size_t ordinal = 0); + static blas::Queue &queue(const device::stream_t &stream); -template -cublasStatus_t cublasGemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc); -template <> -inline cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, float *C, int ldc) { - return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); -} -template <> -inline cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, double *C, int ldc) { - return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); -} -template <> -inline cublasStatus_t cublasGemm>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const std::complex *alpha, - const std::complex *A, int lda, const std::complex *B, - int ldb, const std::complex *beta, std::complex *C, int ldc) { - using detail::cublasPointer; - return cublasCgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), - cublasPointer(A), lda, cublasPointer(B), ldb, - cublasPointer(beta), cublasPointer(C), ldc); -} -template <> -inline cublasStatus_t cublasGemm>( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const std::complex *alpha, - const std::complex *A, int lda, 
const std::complex *B, - int ldb, const std::complex *beta, std::complex *C, - int ldc) { - using detail::cublasPointer; - return cublasZgemm(handle, transa, transb, m, n, k, cublasPointer(alpha), - cublasPointer(A), lda, cublasPointer(B), ldb, - cublasPointer(beta), cublasPointer(C), ldc); + private: + static std::vector> queues_; +}; + +namespace detail { +template +blas::Queue &get_blasqueue_based_on_range(const Range &range) { + // TODO better way to get stream based on the id of tensor + auto stream_ord = range.offset() % device::Env::instance()->num_streams(); + return BLASQueuePool::queue(stream_ord); } +} // namespace detail /// AXPY interface functions @@ -314,26 +244,6 @@ inline cublasStatus_t cublasAxpy>( return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); } -/// DOT interface functions - -template -cublasStatus_t cublasDot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result); -template <> -inline cublasStatus_t cublasDot(cublasHandle_t handle, int n, - const float *x, int incx, const float *y, - int incy, float *result) { - return cublasSdot(handle, n, x, incx, y, incy, result); -} - -template <> -inline cublasStatus_t cublasDot(cublasHandle_t handle, int n, - const double *x, int incx, - const double *y, int incy, - double *result) { - return cublasDdot(handle, n, x, incx, y, incy, result); -} - /// SCAL interface function template cublasStatus_t cublasScal(cublasHandle_t handle, int n, const Scalar *alpha, @@ -468,25 +378,8 @@ inline cublasStatus_t cublasScal>( return cublasDscal(handle, n, &alpha_double, x, incx); } -/// COPY inerface function -template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy); - -template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy) { - return cublasScopy(handle, n, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int 
n, const double *x, - int incx, double *y, int incy) { - return cublasDcopy(handle, n, x, incx, y, incy); -} - -} // end of namespace TiledArray +} // namespace TiledArray -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE -#endif // TILEDARRAY_MATH_CUBLAS_H__INCLUDED +#endif // TILEDARRAY_DEVICE_BLAS_H__INCLUDED diff --git a/src/TiledArray/device/btas_cublas.h b/src/TiledArray/device/btas_cublas.h index 9ac97ce649..96d9edd3ea 100644 --- a/src/TiledArray/device/btas_cublas.h +++ b/src/TiledArray/device/btas_cublas.h @@ -24,12 +24,12 @@ #ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED #define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED -#include +#include #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include #include @@ -84,75 +84,33 @@ btas::Tensor btas_tensor_gemm_cuda_impl( auto result_range = gemm_helper.make_result_range(left.range(), right.range()); - auto &stream = detail::get_stream_based_on_range(result_range); + auto &queue = detail::get_blasqueue_based_on_range(result_range); + auto &stream = queue.stream(); // the result Tensor type typedef btas::Tensor Tensor; Tensor result; - // check if stream is busy - // auto stream_status = cudaStreamQuery(stream); - - // if stream is completed, use GPU - // if (stream_status == cudaSuccess) { if (true) { Storage result_storage; make_device_storage(result_storage, result_range.area(), stream); result = Tensor(std::move(result_range), std::move(result_storage)); - // left and right are readonly!! 
- // cudaMemAdvise(device_data(left), left.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // deviceEnv::instance()->current_device_id()); - // cudaMemAdvise(device_data(right), right.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // deviceEnv::instance()->current_device_id()); - // prefetch data TiledArray::to_execution_space( left.storage(), stream); TiledArray::to_execution_space( right.storage(), stream); - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - - CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), - to_cublas_op(gemm_helper.left_op()), n, m, k, - &factor_t, device_data(right.storage()), ldb, - device_data(left.storage()), lda, &zero, - device_data(result.storage()), n)); + static_assert(btas::boxrange_iteration_order::value == + btas::boxrange_iteration_order::row_major); + blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), + gemm_helper.left_op(), n, m, k, factor_t, + device_data(right.storage()), ldb, device_data(left.storage()), + lda, zero, device_data(result.storage()), n, queue); - // wait for cuda calls to finish - // detail::thread_wait_stream(stream); device::synchronize_stream(&stream); } - // otherwise, use CPU - else { - Storage result_storage(result_range.area()); - result = Tensor(std::move(result_range), std::move(result_storage)); - - TiledArray::to_execution_space( - result.storage(), stream); - - // left and right are readonly!! 
- cudaMemAdvise(device_data(left), left.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - deviceEnv::instance()->current_device_id()); - cudaMemAdvise(device_data(right), right.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - deviceEnv::instance()->current_device_id()); - - // prefetch data - TiledArray::to_execution_space( - left.storage(), stream); - TiledArray::to_execution_space( - right.storage(), stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), - m, n, k, factor_t, left.data(), lda, - right.data(), ldb, zero, result.data(), n); - } return result; } @@ -226,24 +184,12 @@ void btas_tensor_gemm_cuda_impl( (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); T factor_t = T(factor); T one(1); - // check if stream is busy - // auto stream_status = cudaStreamQuery(stream); - - // if stream is completed, use GPU - // if (stream_status == cudaSuccess) { if (true) { - // left and right are readonly!! 
- // cudaMemAdvise(device_data(left), left.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // deviceEnv::instance()->current_device_id()); - // cudaMemAdvise(device_data(right), right.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // deviceEnv::instance()->current_device_id()); - // prefetch all data TiledArray::to_execution_space( left.storage(), stream); @@ -252,37 +198,13 @@ void btas_tensor_gemm_cuda_impl( TiledArray::to_execution_space( result.storage(), stream); - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), - to_cublas_op(gemm_helper.left_op()), n, m, k, - &factor_t, device_data(right.storage()), ldb, - device_data(left.storage()), lda, &one, - device_data(result.storage()), n)); + static_assert(btas::boxrange_iteration_order::value == + btas::boxrange_iteration_order::row_major); + blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), + gemm_helper.left_op(), n, m, k, factor_t, + device_data(right.storage()), ldb, device_data(left.storage()), + lda, one, device_data(result.storage()), n, queue); device::synchronize_stream(&stream); - - // detail::thread_wait_stream(stream); - - } else { - // left and right are readonly!! 
- cudaMemAdvise(device_data(left), left.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - deviceEnv::instance()->current_device_id()); - cudaMemAdvise(device_data(right), right.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - deviceEnv::instance()->current_device_id()); - - // prefetch data - TiledArray::to_execution_space( - left.storage(), stream); - TiledArray::to_execution_space( - right.storage(), stream); - TiledArray::to_execution_space( - result.storage(), stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), - m, n, k, factor_t, left.data(), lda, - right.data(), ldb, one, result.data(), n); } } @@ -294,18 +216,15 @@ btas::Tensor btas_tensor_clone_cuda_impl( Storage result_storage; auto result_range = arg.range(); - auto &stream = detail::get_stream_based_on_range(result_range); + auto &queue = detail::get_blasqueue_based_on_range(result_range); + auto &stream = queue.stream(); make_device_storage(result_storage, arg.size(), stream); btas::Tensor result(std::move(result_range), std::move(result_storage)); - // call cublasCopy - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - - CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), - 1, device_data(result.storage()), 1)); + blas::copy(result.size(), device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); device::synchronize_stream(&stream); return result; @@ -317,17 +236,17 @@ template btas_tensor_scale_cuda_impl( const btas::Tensor &arg, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(arg.range()); - // std::cout << "scale, tile offset: " << arg.range().offset() << " stream: " - // << arg.range().offset() % deviceEnv::instance()->num_streams() << "\n"; + auto &queue = detail::get_blasqueue_based_on_range(arg.range()); + auto &stream = queue.stream(); auto result = 
btas_tensor_clone_cuda_impl(arg); - // call cublasScale - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall( - cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); + if constexpr (std::is_same_v || + std::is_same_v) { + blas::scal(result.size(), a, device_data(result.storage()), 1, queue); + } else { + abort(); // not yet implemented + } device::synchronize_stream(&stream); @@ -340,12 +259,14 @@ template &result, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); - // call cublasScale - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall( - cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); + if constexpr (std::is_same_v || + std::is_same_v) { + blas::scal(result.size(), a, device_data(result.storage()), 1, queue); + } else { + abort(); // not yet implemented + } device::synchronize_stream(&stream); } @@ -362,17 +283,14 @@ btas::Tensor btas_tensor_subt_cuda_impl( auto b = -a; DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); if (in_memory_space(result.storage())) { - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &b, - device_data(arg2.storage()), 1, - device_data(result.storage()), 1)); + blas::axpy(result.size(), b, device_data(arg2.storage()), 1, + device_data(result.storage()), 1, queue); } else { TA_ASSERT(false); - // btas::axpy(1.0, arg, result); } 
device::synchronize_stream(&stream); @@ -386,16 +304,14 @@ void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg1, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); // revert the sign of a auto b = -a; - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &b, - device_data(arg1.storage()), 1, - device_data(result.storage()), 1)); + blas::axpy(result.size(), b, device_data(arg1.storage()), 1, + device_data(result.storage()), 1, queue); device::synchronize_stream(&stream); } @@ -408,13 +324,11 @@ btas::Tensor btas_tensor_add_cuda_impl( auto result = btas_tensor_clone_cuda_impl(arg1); DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &a, - device_data(arg2.storage()), 1, - device_data(result.storage()), 1)); + blas::axpy(result.size(), a, device_data(arg2.storage()), 1, + device_data(result.storage()), 1, queue); device::synchronize_stream(&stream); return result; @@ -427,16 +341,14 @@ void btas_tensor_add_to_cuda_impl(btas::Tensor &result, const btas::Tensor &arg, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = detail::get_blasqueue_based_on_range(result.range()); + auto &stream = queue.stream(); // 
TiledArray::to_execution_space(result.storage(),stream); // TiledArray::to_execution_space(arg.storage(),stream); - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &a, - device_data(arg.storage()), 1, - device_data(result.storage()), 1)); + blas::axpy(result.size(), a, device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); device::synchronize_stream(&stream); } @@ -487,17 +399,16 @@ typename btas::Tensor::value_type btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(arg.range()); + auto &queue = detail::get_blasqueue_based_on_range(arg.range()); + auto &stream = queue.stream(); auto &storage = arg.storage(); using TiledArray::math::blas::integer; integer size = storage.size(); T result = 0; if (in_memory_space(storage)) { - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasDot(handle, size, device_data(storage), 1, - device_data(storage), 1, &result)); + blas::dot(size, device_data(storage), 1, device_data(storage), 1, &result, + queue); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); @@ -513,7 +424,8 @@ typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( const btas::Tensor &arg2) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(arg1.range()); + auto &queue = detail::get_blasqueue_based_on_range(arg1.range()); + auto &stream = queue.stream(); using TiledArray::math::blas::integer; integer size = arg1.storage().size(); @@ -523,10 +435,8 @@ typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( T result = 0; if (in_memory_space(arg1.storage()) && in_memory_space(arg2.storage())) { - const 
auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1, - device_data(arg2.storage()), 1, &result)); + blas::dot(size, device_data(arg1.storage()), 1, device_data(arg2.storage()), + 1, &result, queue); } else { TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); @@ -621,6 +531,6 @@ T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { } // namespace TiledArray -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index 3c2a120fd3..05bbca4ce6 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -139,7 +139,8 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); // @important select the stream using the shifted range - auto &stream = detail::get_stream_based_on_range(result_range); + auto &queue = detail::get_blasqueue_based_on_range(result_range); + auto &stream = queue.stream(); typename btasUMTensorVarray::storage_type result_storage; @@ -147,12 +148,8 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, btasUMTensorVarray result(std::move(result_range), std::move(result_storage)); - // call cublasCopy - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, stream)); - - CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), - 1, device_data(result.storage()), 1)); + blas::copy(result.size(), device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); device::synchronize_stream(&stream); return result; diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 51cfc02825..a24e72c5f9 100644 --- a/src/TiledArray/tiledarray.cpp +++ 
b/src/TiledArray/tiledarray.cpp @@ -8,7 +8,7 @@ #ifdef TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA -#include +#include #endif #include #include @@ -29,8 +29,8 @@ namespace { inline void device_initialize() { /// initialize deviceEnv deviceEnv::instance(); -#if defined(TILEDARRAY_HAS_CUDA) - cuBLASHandlePool::handle(); +#if defined(TILEDARRAY_HAS_DEVICE) + BLASQueuePool::initialize(); #endif // initialize LibreTT librettInitialize(); @@ -40,9 +40,8 @@ inline void device_initialize() { inline void device_finalize() { DeviceSafeCall(device::deviceSynchronize()); librettFinalize(); -#if defined(TILEDARRAY_HAS_CUDA) - cublasDestroy(cuBLASHandlePool::handle()); - delete &cuBLASHandlePool::handle(); +#if defined(TILEDARRAY_HAS_DEVICE) + BLASQueuePool::finalize(); #endif // although TA::deviceEnv is a singleton, must explicitly delete it so // that the device runtime is not finalized before the deviceEnv dtor is From 664cc6fe37283ee00a4f41a1f82ba13e31a79530 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Sep 2023 14:23:50 -0400 Subject: [PATCH 113/592] btas_um_tensor is now CUDA-free --- src/CMakeLists.txt | 76 ++++----- .../device/{btas_cublas.h => btas.h} | 153 ++++++++++-------- src/TiledArray/device/btas_um_tensor.cpp | 3 - src/TiledArray/device/btas_um_tensor.h | 63 ++++---- src/TiledArray/type_traits.h | 33 +++- 5 files changed, 187 insertions(+), 141 deletions(-) rename src/TiledArray/device/{btas_cublas.h => btas.h} (79%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 606cf0df2d..126902862e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -213,24 +213,27 @@ if(HIP_FOUND OR CUDA_FOUND) TiledArray/external/librett.h) endif() -if(CUDA_FOUND) +if(CUDA_FOUND OR HIP_FOUND) list(APPEND TILEDARRAY_HEADER_FILES - TiledArray/external/cuda.h - TiledArray/device/blas.cpp - TiledArray/device/blas.h - TiledArray/device/btas_cublas.h - TiledArray/device/btas_um_tensor.h - TiledArray/device/cpu_cuda_vector.h - 
TiledArray/device/device_task_fn.h - TiledArray/device/kernel/mult_kernel.h - TiledArray/device/kernel/mult_kernel_impl.h - TiledArray/device/kernel/reduce_kernel.h - TiledArray/device/kernel/reduce_kernel_impl.h - TiledArray/device/platform.h - TiledArray/device/thrust.h - TiledArray/device/allocators.h - TiledArray/device/um_storage.h) -endif(CUDA_FOUND) + TiledArray/device/blas.cpp + TiledArray/device/blas.h + TiledArray/device/btas.h + TiledArray/device/btas_um_tensor.h + TiledArray/device/device_task_fn.h + TiledArray/device/kernel/mult_kernel.h + TiledArray/device/kernel/mult_kernel_impl.h + TiledArray/device/kernel/reduce_kernel.h + TiledArray/device/kernel/reduce_kernel_impl.h + TiledArray/device/platform.h + TiledArray/device/thrust.h + TiledArray/device/allocators.h + TiledArray/device/um_storage.h) + if(CUDA_FOUND) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/cuda.h + TiledArray/device/cpu_cuda_vector.h) + endif(CUDA_FOUND) +endif(CUDA_FOUND OR HIP_FOUND) set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp @@ -256,41 +259,40 @@ set_source_files_properties( # when FetchContent umpire: set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers umpire) set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE) -# TODO better ways to handle tiledarray cuda dependency -if(CUDA_FOUND) +if(CUDA_FOUND OR HIP_FOUND) - set(TILEDARRAY_CUDA_SOURCE_FILES + set(TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/btas_um_tensor.cpp + ) + + if(CUDA_FOUND) + + list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/cpu_cuda_vector.cu TiledArray/device/kernel/mult_kernel.cu TiledArray/device/kernel/reduce_kernel.cu TiledArray/device/um_storage.cu) - list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_CUDA_SOURCE_FILES}") + list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_DEVICE_SOURCE_FILES}") - foreach( f IN LISTS TILEDARRAY_CUDA_SOURCE_FILES ) - set_source_files_properties( "${f}" + 
foreach( f IN LISTS TILEDARRAY_DEVICE_SOURCE_FILES ) + set_source_files_properties( "${f}" PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") - endforeach() + endforeach() - set_source_files_properties(TiledArray/device/btas_um_tensor.cpp - PROPERTIES - LANGUAGE CUDA) + # the list of libraries on which TiledArray depends on + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt) - # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt) + endif(CUDA_FOUND) -endif(CUDA_FOUND) + if (HIP_FOUND) + list(APPEND _TILEDARRAY_DEPENDENCIES hip::host roc::rocthrust) + endif() -if (HIP_FOUND) - list(APPEND _TILEDARRAY_DEPENDENCIES hip::host roc::rocthrust) -endif() - -# LibreTT needed for either CUDA or HIP -if(CUDA_FOUND OR HIP_FOUND) + # LibreTT needed for either CUDA or HIP list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT) -endif() +endif(CUDA_FOUND OR HIP_FOUND) if( TARGET TiledArray_SCALAPACK ) list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_SCALAPACK) diff --git a/src/TiledArray/device/btas_cublas.h b/src/TiledArray/device/btas.h similarity index 79% rename from src/TiledArray/device/btas_cublas.h rename to src/TiledArray/device/btas.h index 96d9edd3ea..28c3eb2f4a 100644 --- a/src/TiledArray/device/btas_cublas.h +++ b/src/TiledArray/device/btas.h @@ -21,14 +21,16 @@ * */ -#ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED -#define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_BTAS_H__INCLUDED +#define TILEDARRAY_DEVICE_BTAS_H__INCLUDED + +#include -#include #include #ifdef TILEDARRAY_HAS_DEVICE +#include #include #include @@ -40,11 +42,15 @@ namespace TiledArray { +namespace device { + +namespace btas { + template >> -btas::Tensor btas_tensor_gemm_cuda_impl( - const btas::Tensor &left, - const btas::Tensor &right, Scalar factor, +::btas::Tensor gemm( + const ::btas::Tensor &left, + const ::btas::Tensor &right, 
Scalar factor, const TiledArray::math::GemmHelper &gemm_helper) { // Check that the arguments are not empty and have the correct ranks TA_ASSERT(!left.empty()); @@ -88,7 +94,7 @@ btas::Tensor btas_tensor_gemm_cuda_impl( auto &stream = queue.stream(); // the result Tensor type - typedef btas::Tensor Tensor; + typedef ::btas::Tensor Tensor; Tensor result; if (true) { @@ -102,8 +108,8 @@ btas::Tensor btas_tensor_gemm_cuda_impl( TiledArray::to_execution_space( right.storage(), stream); - static_assert(btas::boxrange_iteration_order::value == - btas::boxrange_iteration_order::row_major); + static_assert(::btas::boxrange_iteration_order::value == + ::btas::boxrange_iteration_order::row_major); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), @@ -117,11 +123,10 @@ btas::Tensor btas_tensor_gemm_cuda_impl( template >> -void btas_tensor_gemm_cuda_impl( - btas::Tensor &result, - const btas::Tensor &left, - const btas::Tensor &right, Scalar factor, - const TiledArray::math::GemmHelper &gemm_helper) { +void gemm(::btas::Tensor &result, + const ::btas::Tensor &left, + const ::btas::Tensor &right, Scalar factor, + const TiledArray::math::GemmHelper &gemm_helper) { // Check that the result is not empty and has the correct rank TA_ASSERT(!result.empty()); TA_ASSERT(result.range().rank() == gemm_helper.result_rank()); @@ -198,8 +203,8 @@ void btas_tensor_gemm_cuda_impl( TiledArray::to_execution_space( result.storage(), stream); - static_assert(btas::boxrange_iteration_order::value == - btas::boxrange_iteration_order::row_major); + static_assert(::btas::boxrange_iteration_order::value == + ::btas::boxrange_iteration_order::row_major); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), @@ -210,8 +215,8 @@ void btas_tensor_gemm_cuda_impl( /// result[i] = 
arg[i] template -btas::Tensor btas_tensor_clone_cuda_impl( - const btas::Tensor &arg) { +::btas::Tensor clone( + const ::btas::Tensor &arg) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); Storage result_storage; @@ -220,8 +225,8 @@ btas::Tensor btas_tensor_clone_cuda_impl( auto &stream = queue.stream(); make_device_storage(result_storage, arg.size(), stream); - btas::Tensor result(std::move(result_range), - std::move(result_storage)); + ::btas::Tensor result(std::move(result_range), + std::move(result_storage)); blas::copy(result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1, queue); @@ -233,19 +238,28 @@ btas::Tensor btas_tensor_clone_cuda_impl( /// result[i] = a * arg[i] template >> -btas::Tensor btas_tensor_scale_cuda_impl( - const btas::Tensor &arg, const Scalar a) { +::btas::Tensor scale( + const ::btas::Tensor &arg, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(arg.range()); auto &stream = queue.stream(); - auto result = btas_tensor_clone_cuda_impl(arg); + auto result = clone(arg); - if constexpr (std::is_same_v || - std::is_same_v) { + if constexpr (detail::is_blas_numeric_v || + std::is_arithmetic_v) { blas::scal(result.size(), a, device_data(result.storage()), 1, queue); } else { - abort(); // not yet implemented + if constexpr (detail::is_complex_v) { + abort(); // fused conjugation requires custom kernels, not yet supported + } else { + if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { + blas::scal(result.size(), static_cast(-1), + device_data(result.storage()), 1, queue); + } + } } device::synchronize_stream(&stream); @@ -256,16 +270,25 @@ btas::Tensor btas_tensor_scale_cuda_impl( /// result[i] *= a template >> -void btas_tensor_scale_to_cuda_impl(btas::Tensor &result, - const Scalar a) { +void scale_to(::btas::Tensor &result, const Scalar a) { 
DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(result.range()); auto &stream = queue.stream(); - if constexpr (std::is_same_v || - std::is_same_v) { + + if constexpr (detail::is_blas_numeric_v || + std::is_arithmetic_v) { blas::scal(result.size(), a, device_data(result.storage()), 1, queue); } else { - abort(); // not yet implemented + if constexpr (detail::is_complex_v) { + abort(); // fused conjugation requires custom kernels, not yet supported + } else { + if constexpr (std::is_same_v>) { + } else if constexpr (std::is_same_v>) { + blas::scal(result.size(), static_cast(-1), + device_data(result.storage()), 1, queue); + } + } } device::synchronize_stream(&stream); @@ -274,10 +297,10 @@ void btas_tensor_scale_to_cuda_impl(btas::Tensor &result, /// result[i] = arg1[i] - a * arg2[i] template >> -btas::Tensor btas_tensor_subt_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2, const Scalar a) { - auto result = btas_tensor_clone_cuda_impl(arg1); +::btas::Tensor subt( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2, const Scalar a) { + auto result = clone(arg1); // revert the sign of a auto b = -a; @@ -300,9 +323,8 @@ btas::Tensor btas_tensor_subt_cuda_impl( /// result[i] -= a * arg1[i] template >> -void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg1, - const Scalar a) { +void subt_to(::btas::Tensor &result, + const ::btas::Tensor &arg1, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(result.range()); auto &stream = queue.stream(); @@ -318,10 +340,10 @@ void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, /// result[i] = arg1[i] + a * arg2[i] template >> -btas::Tensor btas_tensor_add_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2, const Scalar a) { - auto result = btas_tensor_clone_cuda_impl(arg1); 
+::btas::Tensor add( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2, const Scalar a) { + auto result = clone(arg1); DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(result.range()); @@ -337,9 +359,8 @@ btas::Tensor btas_tensor_add_cuda_impl( /// result[i] += a * arg[i] template >> -void btas_tensor_add_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg, - const Scalar a) { +void add_to(::btas::Tensor &result, + const ::btas::Tensor &arg, const Scalar a) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(result.range()); auto &stream = queue.stream(); @@ -355,8 +376,8 @@ void btas_tensor_add_to_cuda_impl(btas::Tensor &result, /// result[i] = result[i] * arg[i] template -void btas_tensor_mult_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg) { +void mult_to(::btas::Tensor &result, + const ::btas::Tensor &arg) { auto device_id = deviceEnv::instance()->current_device_id(); auto &stream = detail::get_stream_based_on_range(result.range()); @@ -370,9 +391,9 @@ void btas_tensor_mult_to_cuda_impl(btas::Tensor &result, /// result[i] = arg1[i] * arg2[i] template -btas::Tensor btas_tensor_mult_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2) { +::btas::Tensor mult( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2) { std::size_t n = arg1.size(); TA_ASSERT(arg2.size() == n); @@ -383,8 +404,8 @@ btas::Tensor btas_tensor_mult_cuda_impl( Storage result_storage; make_device_storage(result_storage, n, stream); - btas::Tensor result(arg1.range(), - std::move(result_storage)); + ::btas::Tensor result(arg1.range(), + std::move(result_storage)); mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, stream, device_id); @@ -395,8 +416,8 @@ btas::Tensor btas_tensor_mult_cuda_impl( // foreach(i) result += arg[i] * arg[i] template -typename btas::Tensor::value_type 
-btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { +typename ::btas::Tensor::value_type squared_norm( + const ::btas::Tensor &arg) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(arg.range()); @@ -419,9 +440,9 @@ btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { // foreach(i) result += arg1[i] * arg2[i] template -typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2) { +typename ::btas::Tensor::value_type dot( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2) { DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); auto &queue = detail::get_blasqueue_based_on_range(arg1.range()); @@ -446,7 +467,7 @@ typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( } template -T btas_tensor_sum_cuda_impl(const btas::Tensor &arg) { +T sum(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = deviceEnv::instance()->current_device_id(); @@ -460,7 +481,7 @@ T btas_tensor_sum_cuda_impl(const btas::Tensor &arg) { } template -T btas_tensor_product_cuda_impl(const btas::Tensor &arg) { +T product(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = deviceEnv::instance()->current_device_id(); @@ -474,7 +495,7 @@ T btas_tensor_product_cuda_impl(const btas::Tensor &arg) { } template -T btas_tensor_min_cuda_impl(const btas::Tensor &arg) { +T min(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = deviceEnv::instance()->current_device_id(); @@ -488,7 +509,7 @@ T btas_tensor_min_cuda_impl(const btas::Tensor &arg) { } template -T btas_tensor_max_cuda_impl(const btas::Tensor &arg) { +T max(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = 
deviceEnv::instance()->current_device_id(); @@ -502,7 +523,7 @@ T btas_tensor_max_cuda_impl(const btas::Tensor &arg) { } template -T btas_tensor_absmin_cuda_impl(const btas::Tensor &arg) { +T absmin(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = deviceEnv::instance()->current_device_id(); @@ -516,7 +537,7 @@ T btas_tensor_absmin_cuda_impl(const btas::Tensor &arg) { } template -T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { +T absmax(const ::btas::Tensor &arg) { auto &stream = detail::get_stream_based_on_range(arg.range()); auto device_id = deviceEnv::instance()->current_device_id(); @@ -529,8 +550,12 @@ T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { return result; } +} // namespace btas + +} // namespace device + } // namespace TiledArray #endif // TILEDARRAY_HAS_DEVICE -#endif // TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED +#endif // TILEDARRAY_DEVICE_BTAS_H__INCLUDED diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp index 0bf648ee42..0b442620cc 100644 --- a/src/TiledArray/device/btas_um_tensor.cpp +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -2,10 +2,7 @@ // Created by Chong Peng on 7/24/18. // -// clang-format off -#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. 
c++14) #include -// clang-format on #ifdef TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index 05bbca4ce6..af4899f4ee 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -26,12 +26,13 @@ #include +#include #include #include #ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include #include #include @@ -102,7 +103,7 @@ btasUMTensorVarray gemm( const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper); + return device::btas::gemm(left, right, factor, gemm_helper); } template &result, const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper); + return device::btas::gemm(result, left, right, factor, gemm_helper); } /// @@ -122,7 +123,7 @@ template btasUMTensorVarray clone(const btasUMTensorVarray &arg) { // TODO how to copy Unified Memory? from CPU or GPU? 
currently // always copy on GPU, but need to investigate - return btas_tensor_clone_cuda_impl(arg); + return device::btas::clone(arg); } /// @@ -204,7 +205,7 @@ template scale(const btasUMTensorVarray &arg, const Scalar factor) { detail::to_device(arg); - return btas_tensor_scale_cuda_impl(arg, factor); + return device::btas::scale(arg, factor); } template &scale_to(btasUMTensorVarray &arg, const Scalar factor) { detail::to_device(arg); - btas_tensor_scale_to_cuda_impl(arg, factor); + device::btas::scale_to(arg, factor); return arg; } @@ -238,7 +239,7 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, template btasUMTensorVarray neg(const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_scale_cuda_impl(arg, T(-1.0)); + return device::btas::scale(arg, T(-1.0)); } template < @@ -258,7 +259,7 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, template btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { detail::to_device(arg); - btas_tensor_scale_to_cuda_impl(arg, T(-1.0)); + device::btas::scale_to(arg, T(-1.0)); return arg; } @@ -271,7 +272,7 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { detail::to_device(arg1); detail::to_device(arg2); - return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0)); + return device::btas::subt(arg1, arg2, T(1.0)); } template subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = subt(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -325,7 +326,7 @@ btasUMTensorVarray &subt_to( const btasUMTensorVarray &arg1) { detail::to_device(result); detail::to_device(arg1); - btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0)); + device::btas::subt_to(result, arg1, T(1.0)); return result; } @@ -335,7 +336,7 @@ btasUMTensorVarray &subt_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg1, const Scalar factor) { subt_to(result, arg1); - 
btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -348,7 +349,7 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { detail::to_device(arg1); detail::to_device(arg2); - return btas_tensor_add_cuda_impl(arg1, arg2, T(1.0)); + return device::btas::add(arg1, arg2, T(1.0)); } template add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = add(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -401,7 +402,7 @@ btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { detail::to_device(result); detail::to_device(arg); - btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); + device::btas::add_to(result, arg, T(1.0)); return result; } @@ -411,7 +412,7 @@ btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg, const Scalar factor) { add_to(result, arg); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -424,7 +425,7 @@ typename btasUMTensorVarray::value_type dot( const btasUMTensorVarray &arg2) { detail::to_device(arg1); detail::to_device(arg2); - return btas_tensor_dot_cuda_impl(arg1, arg2); + return device::btas::dot(arg1, arg2); } /// @@ -435,7 +436,7 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { detail::to_device(arg1); detail::to_device(arg2); - return btas_tensor_mult_cuda_impl(arg1, arg2); + return device::btas::mult(arg1, arg2); } template mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = mult(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -487,7 +488,7 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { 
detail::to_device(result); detail::to_device(arg); - btas_tensor_mult_to_cuda_impl(result, arg); + device::btas::mult_to(result, arg); return result; } @@ -497,7 +498,7 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg, const Scalar factor) { mult_to(result, arg); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -513,7 +514,7 @@ template typename btasUMTensorVarray::value_type squared_norm( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_squared_norm_cuda_impl(arg); + return device::btas::squared_norm(arg); } /// @@ -524,7 +525,7 @@ template typename btasUMTensorVarray::value_type norm( const btasUMTensorVarray &arg) { detail::to_device(arg); - return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg)); + return std::sqrt(device::btas::squared_norm(arg)); } /// @@ -543,7 +544,7 @@ template typename btasUMTensorVarray::value_type sum( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_sum_cuda_impl(arg); + return device::btas::sum(arg); } /// @@ -553,7 +554,7 @@ template typename btasUMTensorVarray::value_type product( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_product_cuda_impl(arg); + return device::btas::product(arg); } /// @@ -563,7 +564,7 @@ template typename btasUMTensorVarray::value_type max( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_max_cuda_impl(arg); + return device::btas::max(arg); } /// @@ -573,7 +574,7 @@ template typename btasUMTensorVarray::value_type abs_max( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_absmax_cuda_impl(arg); + return device::btas::absmax(arg); } /// @@ -583,7 +584,7 @@ template typename btasUMTensorVarray::value_type min( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_min_cuda_impl(arg); + return device::btas::min(arg); } /// @@ -593,7 +594,7 @@ 
template typename btasUMTensorVarray::value_type abs_min( const btasUMTensorVarray &arg) { detail::to_device(arg); - return btas_tensor_absmin_cuda_impl(arg); + return device::btas::absmin(arg); } /// to host for UM Array @@ -689,7 +690,7 @@ um_tensor_to_ta_tensor( }; const char *use_legacy_conversion = - std::getenv("TA_CUDA_LEGACY_UM_CONVERSION"); + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); auto ta_array = use_legacy_conversion ? to_new_tile_type(um_array, convert_tile_um) : to_new_tile_type(um_array, convert_tile_memcpy); @@ -759,7 +760,7 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { }; const char *use_legacy_conversion = - std::getenv("TA_CUDA_LEGACY_UM_CONVERSION"); + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); auto um_array = use_legacy_conversion ? to_new_tile_type(array, convert_tile_um) : to_new_tile_type(array, convert_tile_memcpy); diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index 47c90f0130..428ad63716 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -248,9 +248,10 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) -> decltype( \ - std::add_pointer_t()...))>{}, \ - Yes{}); \ + static auto func(void*) \ + -> decltype(std::add_pointer_t< \ + decltype(Function(std::declval()...))>{}, \ + Yes{}); \ template \ static No func(...); \ \ @@ -695,6 +696,25 @@ struct is_scalar> : public std::false_type {}; template constexpr const bool is_scalar_v = is_scalar::value; +template +struct is_blas_numeric : public std::false_type {}; + +template <> +struct is_blas_numeric : public std::true_type {}; + +template <> +struct is_blas_numeric : public std::true_type {}; + +template <> +struct is_blas_numeric> : public std::true_type {}; + +template <> +struct is_blas_numeric> : public std::true_type {}; + +/// \c is_blas_numeric_v is an alias for \c is_blas_numeric::value +template +constexpr const bool is_blas_numeric_v = 
is_blas_numeric::value; + /// Detect tiles used by \c ArrayEvalImpl /// \c is_array_tile evaluates to \c std::true_type when \c T is a \c @@ -826,9 +846,10 @@ struct is_strictly_ordered_helper { using Yes = char; using No = int; template - static auto test(void*) -> decltype( - std::add_pointer_t() < std::declval())>{}, - Yes{}); + static auto test(void*) + -> decltype(std::add_pointer_t() < + std::declval())>{}, + Yes{}); template static No test(...); From b417f96d22df6982188a01ed2afc0e97fe4f7af5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Sep 2023 14:52:01 -0400 Subject: [PATCH 114/592] reorganize device/kernel + device/btas.h is now CUDA-free --- src/CMakeLists.txt | 17 +- src/TiledArray/device/btas.h | 18 +- src/TiledArray/device/kernel/mult_kernel.cu | 86 --------- src/TiledArray/device/kernel/mult_kernel.h | 59 +++--- src/TiledArray/device/kernel/reduce_kernel.cu | 172 ------------------ src/TiledArray/device/kernel/reduce_kernel.h | 123 ++++++------- .../device/kernel/thrust/mult_kernel.cu | 81 +++++++++ .../mult_kernel.h} | 25 +-- .../device/kernel/thrust/mult_kernel.hip | 1 + .../device/kernel/thrust/reduce_kernel.cu | 167 +++++++++++++++++ .../reduce_kernel.h} | 58 +++--- .../device/kernel/thrust/reduce_kernel.hip | 1 + src/TiledArray/device/thrust.h | 10 + 13 files changed, 412 insertions(+), 406 deletions(-) delete mode 100644 src/TiledArray/device/kernel/mult_kernel.cu delete mode 100644 src/TiledArray/device/kernel/reduce_kernel.cu create mode 100644 src/TiledArray/device/kernel/thrust/mult_kernel.cu rename src/TiledArray/device/kernel/{mult_kernel_impl.h => thrust/mult_kernel.h} (66%) create mode 100644 src/TiledArray/device/kernel/thrust/mult_kernel.hip create mode 100644 src/TiledArray/device/kernel/thrust/reduce_kernel.cu rename src/TiledArray/device/kernel/{reduce_kernel_impl.h => thrust/reduce_kernel.h} (58%) create mode 100644 src/TiledArray/device/kernel/thrust/reduce_kernel.hip diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index 126902862e..431785dc54 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -221,9 +221,9 @@ if(CUDA_FOUND OR HIP_FOUND) TiledArray/device/btas_um_tensor.h TiledArray/device/device_task_fn.h TiledArray/device/kernel/mult_kernel.h - TiledArray/device/kernel/mult_kernel_impl.h TiledArray/device/kernel/reduce_kernel.h - TiledArray/device/kernel/reduce_kernel_impl.h + TiledArray/device/kernel/thrust/mult_kernel.h + TiledArray/device/kernel/thrust/reduce_kernel.h TiledArray/device/platform.h TiledArray/device/thrust.h TiledArray/device/allocators.h @@ -269,12 +269,10 @@ if(CUDA_FOUND OR HIP_FOUND) list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/cpu_cuda_vector.cu - TiledArray/device/kernel/mult_kernel.cu - TiledArray/device/kernel/reduce_kernel.cu + TiledArray/device/kernel/thrust/mult_kernel.cu + TiledArray/device/kernel/thrust/reduce_kernel.cu TiledArray/device/um_storage.cu) - list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_DEVICE_SOURCE_FILES}") - foreach( f IN LISTS TILEDARRAY_DEVICE_SOURCE_FILES ) set_source_files_properties( "${f}" PROPERTIES @@ -287,11 +285,18 @@ if(CUDA_FOUND OR HIP_FOUND) endif(CUDA_FOUND) if (HIP_FOUND) + list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES + TiledArray/device/kernel/thrust/mult_kernel.hip + TiledArray/device/kernel/thrust/reduce_kernel.hip + ) + list(APPEND _TILEDARRAY_DEPENDENCIES hip::host roc::rocthrust) endif() # LibreTT needed for either CUDA or HIP list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT) + + list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_DEVICE_SOURCE_FILES}") endif(CUDA_FOUND OR HIP_FOUND) if( TARGET TiledArray_SCALAPACK ) diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h index 28c3eb2f4a..576cbe9e37 100644 --- a/src/TiledArray/device/btas.h +++ b/src/TiledArray/device/btas.h @@ -385,7 +385,7 @@ void mult_to(::btas::Tensor &result, TA_ASSERT(n == arg.size()); - mult_to_cuda_kernel(result.data(), arg.data(), n, stream, device_id); + 
device::mult_to_kernel(result.data(), arg.data(), n, stream, device_id); device::synchronize_stream(&stream); } @@ -407,8 +407,8 @@ ::btas::Tensor mult( ::btas::Tensor result(arg1.range(), std::move(result_storage)); - mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, stream, - device_id); + device::mult_kernel(result.data(), arg1.data(), arg2.data(), n, stream, + device_id); device::synchronize_stream(&stream); return result; @@ -474,7 +474,7 @@ T sum(const ::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = sum_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::sum_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; @@ -488,7 +488,7 @@ T product(const ::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = product_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::product_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; @@ -502,7 +502,7 @@ T min(const ::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = min_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::min_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; @@ -516,7 +516,7 @@ T max(const ::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = max_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::max_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; @@ -530,7 +530,7 @@ T absmin(const ::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmin_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::absmin_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; @@ -544,7 +544,7 @@ T absmax(const 
::btas::Tensor &arg) { auto &storage = arg.storage(); auto n = storage.size(); - auto result = absmax_cuda_kernel(arg.data(), n, stream, device_id); + auto result = device::absmax_kernel(arg.data(), n, stream, device_id); device::synchronize_stream(&stream); return result; diff --git a/src/TiledArray/device/kernel/mult_kernel.cu b/src/TiledArray/device/kernel/mult_kernel.cu deleted file mode 100644 index ca2d86d4b9..0000000000 --- a/src/TiledArray/device/kernel/mult_kernel.cu +++ /dev/null @@ -1,86 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * Aug 21, 2018 - * - */ - -#include -#include - - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -/// result[i] = result[i] * arg[i] -void mult_to_cuda_kernel(int *result, const int *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(std::complex *result, const std::complex *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -/// result[i] = arg1[i] * arg2[i] -void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void 
mult_cuda_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/device/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h index 1ea7b9de6d..e21c33e2e9 100644 --- a/src/TiledArray/device/kernel/mult_kernel.h +++ b/src/TiledArray/device/kernel/mult_kernel.h @@ -26,52 +26,51 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include -namespace TiledArray { +#include + +namespace TiledArray::device { /// result[i] = result[i] * arg[i] -void mult_to_cuda_kernel(int *result, const int *arg, std::size_t n, - cudaStream_t stream, int device_id); +void mult_to_kernel(int *result, const int *arg, std::size_t n, stream_t stream, + int device_id); -void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, - cudaStream_t stream, int device_id); +void mult_to_kernel(float *result, const float *arg, std::size_t n, + stream_t stream, int device_id); -void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, - cudaStream_t stream, int device_id); +void mult_to_kernel(double *result, const double *arg, std::size_t n, + stream_t stream, int device_id); -void mult_to_cuda_kernel(std::complex *result, - const std::complex *arg, std::size_t n, - cudaStream_t stream, int device_id); +void mult_to_kernel(std::complex *result, const std::complex *arg, + std::size_t n, stream_t stream, int device_id); -void mult_to_cuda_kernel(std::complex *result, - const std::complex *arg, std::size_t n, - cudaStream_t stream, int device_id); +void mult_to_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + stream_t stream, int device_id); /// result[i] = arg1[i] * arg2[i] -void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, - std::size_t n, cudaStream_t stream, int 
device_id); +void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, + stream_t stream, int device_id); -void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, - std::size_t n, cudaStream_t stream, int device_id); +void mult_kernel(float *result, const float *arg1, const float *arg2, + std::size_t n, stream_t stream, int device_id); -void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, - std::size_t n, cudaStream_t stream, int device_id); +void mult_kernel(double *result, const double *arg1, const double *arg2, + std::size_t n, stream_t stream, int device_id); -void mult_cuda_kernel(std::complex *result, - const std::complex *arg1, - const std::complex *arg2, std::size_t n, - cudaStream_t stream, int device_id); +void mult_kernel(std::complex *result, const std::complex *arg1, + const std::complex *arg2, std::size_t n, + stream_t stream, int device_id); -void mult_cuda_kernel(std::complex *result, - const std::complex *arg1, - const std::complex *arg2, std::size_t n, - cudaStream_t stream, int device_id); +void mult_kernel(std::complex *result, const std::complex *arg1, + const std::complex *arg2, std::size_t n, + stream_t stream, int device_id); -} // namespace TiledArray +} // namespace TiledArray::device -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/reduce_kernel.cu b/src/TiledArray/device/kernel/reduce_kernel.cu deleted file mode 100644 index a09b3f7a41..0000000000 --- a/src/TiledArray/device/kernel/reduce_kernel.cu +++ /dev/null @@ -1,172 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * May 8, 2019 - * - */ - -#include -#include - - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -// foreach(i) result *= arg[i] -int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex product_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result += arg[i] -int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - 
-std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex sum_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = max(result, arg[i]) -int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float max_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double max_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = min(result, arg[i]) -int min_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float min_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double min_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = max(result, abs(arg[i])) -int absmax_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - 
-std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex absmax_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = min(result, abs(arg[i])) -int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -std::complex absmin_cuda_kernel(const std::complex *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/device/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h index 8910da8b69..89f3600ba6 100644 --- a/src/TiledArray/device/kernel/reduce_kernel.h +++ b/src/TiledArray/device/kernel/reduce_kernel.h @@ -26,106 +26,101 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include -namespace TiledArray { +#include + +namespace TiledArray::device { // foreach(i) result *= arg[i] -int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int product_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id); -float 
product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float product_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double product_cuda_kernel(const double *arg, std::size_t n, - cudaStream_t stream, int device_id); +double product_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); -std::complex product_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex product_kernel(const std::complex *arg, + std::size_t n, stream_t stream, + int device_id); -std::complex product_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex product_kernel(const std::complex *arg, + std::size_t n, stream_t stream, + int device_id); // foreach(i) result += arg[i] -int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int sum_kernel(const int *arg, std::size_t n, stream_t stream, int device_id); -float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float sum_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); +double sum_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); -std::complex sum_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex sum_kernel(const std::complex *arg, std::size_t n, + stream_t stream, int device_id); -std::complex sum_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex sum_kernel(const std::complex *arg, std::size_t n, + stream_t stream, int device_id); // foreach(i) result = max(result, arg[i]) -int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int max_kernel(const int *arg, std::size_t n, stream_t 
stream, int device_id); -float max_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float max_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double max_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); +double max_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); // foreach(i) result = min(result, arg[i]) -int min_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int min_kernel(const int *arg, std::size_t n, stream_t stream, int device_id); -float min_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float min_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double min_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); +double min_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); // foreach(i) result = max(result, abs(arg[i])) -int absmax_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int absmax_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id); -float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float absmax_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); +double absmax_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); -std::complex absmax_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex absmax_kernel(const std::complex *arg, std::size_t n, + stream_t stream, int device_id); -std::complex absmax_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex absmax_kernel(const std::complex *arg, + std::size_t n, stream_t stream, + int 
device_id); // foreach(i) result = min(result, abs(arg[i])) -int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); +int absmin_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id); -float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); +float absmin_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id); -double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); +double absmin_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id); -std::complex absmin_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex absmin_kernel(const std::complex *arg, std::size_t n, + stream_t stream, int device_id); -std::complex absmin_cuda_kernel(const std::complex *arg, - std::size_t n, cudaStream_t stream, - int device_id); +std::complex absmin_kernel(const std::complex *arg, + std::size_t n, stream_t stream, + int device_id); -} // namespace TiledArray +} // namespace TiledArray::device -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.cu b/src/TiledArray/device/kernel/thrust/mult_kernel.cu new file mode 100644 index 0000000000..bac60041f3 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.cu @@ -0,0 +1,81 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * Aug 21, 2018 + * + */ + +#include +#include + +namespace TiledArray::device { + +/// result[i] = result[i] * arg[i] +void mult_to_kernel(int *result, const int *arg, std::size_t n, + stream_t stream, int device_id) { + mult_to_kernel_thrust(result, arg, n, stream, device_id); +} + +void mult_to_kernel(float *result, const float *arg, std::size_t n, + stream_t stream, int device_id) { + mult_to_kernel_thrust(result, arg, n, stream, device_id); +} + +void mult_to_kernel(double *result, const double *arg, std::size_t n, + stream_t stream, int device_id) { + mult_to_kernel_thrust(result, arg, n, stream, device_id); +} + +void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, + stream_t stream, int device_id) { + mult_to_kernel_thrust(result, arg, n, stream, device_id); +} + +void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, + stream_t stream, int device_id) { + mult_to_kernel_thrust(result, arg, n, stream, device_id); +} + +/// result[i] = arg1[i] * arg2[i] +void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, + stream_t stream, int device_id){ + mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); +} + +void mult_kernel(float *result, const float *arg1, const float *arg2, std::size_t n, + stream_t stream, int device_id){ + mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); +} + +void mult_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, + stream_t stream, int device_id){ + 
mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); +} + +void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + stream_t stream, int device_id){ + mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); +} + +void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + stream_t stream, int device_id){ + mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); +} + +} // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/thrust/mult_kernel.h similarity index 66% rename from src/TiledArray/device/kernel/mult_kernel_impl.h rename to src/TiledArray/device/kernel/thrust/mult_kernel.h index 1d2a35e862..08e07efa54 100644 --- a/src/TiledArray/device/kernel/mult_kernel_impl.h +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.h @@ -21,41 +21,42 @@ * */ -#ifndef TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED -#include +#include +#include #include #include -namespace TiledArray { +namespace TiledArray::device { /// result[i] = result[i] * arg[i] template -void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { +void mult_to_kernel_thrust(T *result, const T *arg, std::size_t n, + stream_t stream, int device_id) { DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( - thrust::cuda::par.on(stream), thrust::device_pointer_cast(arg), + thrust_system::par.on(stream), thrust::device_pointer_cast(arg), thrust::device_pointer_cast(arg) + n, thrust::device_pointer_cast(result), thrust::device_pointer_cast(result), mul_op); } /// result[i] = arg1[i] * arg2[i] template -void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2, 
- std::size_t n, cudaStream_t stream, int device_id) { +void mult_kernel_thrust(T *result, const T *arg1, const T *arg2, std::size_t n, + stream_t stream, int device_id) { DeviceSafeCall(device::setDevice(device_id)); thrust::multiplies mul_op; thrust::transform( - thrust::cuda::par.on(stream), thrust::device_pointer_cast(arg1), + thrust_system::par.on(stream), thrust::device_pointer_cast(arg1), thrust::device_pointer_cast(arg1) + n, thrust::device_pointer_cast(arg2), thrust::device_pointer_cast(result), mul_op); } -} // namespace TiledArray +} // namespace TiledArray::device -#endif // TILEDARRAY_DEVICE_MULT_KERNEL_IMPL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.hip b/src/TiledArray/device/kernel/thrust/mult_kernel.hip new file mode 100644 index 0000000000..f0788eb5e2 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.hip @@ -0,0 +1 @@ +#include diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.cu b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu new file mode 100644 index 0000000000..bfca9f2e64 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu @@ -0,0 +1,167 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * May 8, 2019 + * + */ + +#include +#include + +namespace TiledArray::device { + +// foreach(i) result *= arg[i] +int product_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return product_reduce_kernel_thrust(arg, n, stream, device_id); +} + +float product_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ + return product_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double product_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + + return product_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex product_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return product_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex product_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + + return product_reduce_kernel_thrust(arg, n, stream, device_id); +} + +// foreach(i) result += arg[i] +int sum_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return sum_reduce_kernel_thrust(arg, n, stream, device_id); +} + +float sum_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ + return sum_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double sum_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + return sum_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex sum_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return sum_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex sum_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return sum_reduce_kernel_thrust(arg, n, stream, device_id); +} + +// foreach(i) result = max(result, arg[i]) +int max_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return max_reduce_kernel_thrust(arg, 
n, stream, device_id); +} + +float max_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ + return max_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double max_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + return max_reduce_kernel_thrust(arg, n, stream, device_id); +} + +// foreach(i) result = min(result, arg[i]) +int min_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return min_reduce_kernel_thrust(arg, n, stream, device_id); +} + +float min_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ + return min_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double min_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + return min_reduce_kernel_thrust(arg, n, stream, device_id); +} + +// foreach(i) result = max(result, abs(arg[i])) +int absmax_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +} + +float absmax_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ + return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double absmax_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex absmax_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex absmax_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +} + +// foreach(i) result = min(result, abs(arg[i])) +int absmin_kernel(const int *arg, std::size_t n, stream_t stream, + int device_id){ + return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +} + +float absmin_kernel(const float *arg, std::size_t n, stream_t stream, + int device_id){ 
+ return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +} + +double absmin_kernel(const double *arg, std::size_t n, stream_t stream, + int device_id){ + return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex absmin_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +} + +std::complex absmin_kernel(const std::complex *arg, std::size_t n, stream_t stream, + int device_id){ + return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +} + +} // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/reduce_kernel_impl.h b/src/TiledArray/device/kernel/thrust/reduce_kernel.h similarity index 58% rename from src/TiledArray/device/kernel/reduce_kernel_impl.h rename to src/TiledArray/device/kernel/thrust/reduce_kernel.h index 6daae446bf..8ee8e57b29 100644 --- a/src/TiledArray/device/kernel/reduce_kernel_impl.h +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.h @@ -21,11 +21,12 @@ * */ -#ifndef TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED #include +#include #include #include #include @@ -34,10 +35,9 @@ #include #include -namespace TiledArray { +namespace TiledArray::device { namespace detail { - template struct absolute_value : public thrust::unary_function> { @@ -55,53 +55,55 @@ struct absolute_value /// T = reduce(T* arg) template -T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init, - cudaStream_t stream, int device_id) { +T reduce_kernel_thrust(ReduceOp &&op, const T *arg, std::size_t n, T init, + stream_t stream, int device_id) { DeviceSafeCall(device::setDevice(device_id)); auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::reduce(thrust::cuda::par.on(stream), arg_p, arg_p + n, + auto 
result = thrust::reduce(thrust_system::par.on(stream), arg_p, arg_p + n, init, std::forward(op)); return result; } template -T product_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { +T product_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, + int device_id) { T init(1); thrust::multiplies mul_op; - return reduce_cuda_kernel_impl(mul_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(mul_op, arg, n, init, stream, device_id); } template -T sum_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { +T sum_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, + int device_id) { T init(0); thrust::plus plus_op; - return reduce_cuda_kernel_impl(plus_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(plus_op, arg, n, init, stream, device_id); } template -T max_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { +T max_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, + int device_id) { T init = std::numeric_limits::lowest(); thrust::maximum max_op; - return reduce_cuda_kernel_impl(max_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(max_op, arg, n, init, stream, device_id); } template -T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { +T min_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, + int device_id) { T init = std::numeric_limits::max(); thrust::minimum min_op; - return reduce_cuda_kernel_impl(min_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(min_op, arg, n, init, stream, device_id); } template -TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( - const T *arg, std::size_t n, cudaStream_t stream, int device_id) { +TiledArray::detail::scalar_t absmax_reduce_kernel_thrust(const T *arg, + std::size_t n, + stream_t stream, + int device_id) { using 
TR = TiledArray::detail::scalar_t; TR init(0); thrust::maximum max_op; @@ -111,15 +113,17 @@ TiledArray::detail::scalar_t absmax_reduce_cuda_kernel_impl( auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p, + auto result = thrust::transform_reduce(thrust_system::par.on(stream), arg_p, arg_p + n, abs_op, init, max_op); return result; } template -TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( - const T *arg, std::size_t n, cudaStream_t stream, int device_id) { +TiledArray::detail::scalar_t absmin_reduce_kernel_thrust(const T *arg, + std::size_t n, + stream_t stream, + int device_id) { using TR = TiledArray::detail::scalar_t; TR init = std::numeric_limits::max(); thrust::minimum min_op; @@ -129,11 +133,11 @@ TiledArray::detail::scalar_t absmin_reduce_cuda_kernel_impl( auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p, + auto result = thrust::transform_reduce(thrust_system::par.on(stream), arg_p, arg_p + n, abs_op, init, min_op); return result; } -} // namespace TiledArray +} // namespace TiledArray::device -#endif // TILEDARRAY_DEVICE_REDUCE_KERNEL_IMPL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.hip b/src/TiledArray/device/kernel/thrust/reduce_kernel.hip new file mode 100644 index 0000000000..5be5002c84 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.hip @@ -0,0 +1 @@ +#include diff --git a/src/TiledArray/device/thrust.h b/src/TiledArray/device/thrust.h index 9a3ec8f23b..b98e425a46 100644 --- a/src/TiledArray/device/thrust.h +++ b/src/TiledArray/device/thrust.h @@ -62,6 +62,16 @@ template void resize(thrust::device_vector& dev_vec, size_t size); } // namespace thrust +namespace TiledArray::device { + +#ifdef TILEDARRAY_HAS_CUDA +namespace thrust_system = thrust::cuda; +#elif TILEDARRAY_HAS_HIP 
+namespace thrust_system = thrust::hip; +#endif + +} // namespace TiledArray::device + #endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_DEVICE_THRUST_H__INCLUDED From ef3ad689f2a0068807af8642479e58ec1df152b0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Sep 2023 16:57:51 -0400 Subject: [PATCH 115/592] device/blas.h is now CUDA-free --- src/TiledArray/device/blas.h | 309 ---------------------------------- src/TiledArray/tiledarray.cpp | 2 - 2 files changed, 311 deletions(-) diff --git a/src/TiledArray/device/blas.h b/src/TiledArray/device/blas.h index da9d594bdc..a41bb9b908 100644 --- a/src/TiledArray/device/blas.h +++ b/src/TiledArray/device/blas.h @@ -37,25 +37,6 @@ namespace TiledArray { -namespace detail { - -template -auto cublasPointer(T *std_complex_ptr) { - using Scalar = TiledArray::detail::scalar_t; - static_assert(std::is_same_v || - std::is_same_v); - constexpr bool DP = std::is_same_v; - using cuT = std::conditional_t, - cuDoubleComplex, cuComplex>; - if constexpr (std::is_const_v< - std::remove_pointer_t>) { - return reinterpret_cast(std_complex_ptr); - } else - return reinterpret_cast(std_complex_ptr); -}; - -} // namespace detail - /* * cuBLAS interface functions */ @@ -88,296 +69,6 @@ blas::Queue &get_blasqueue_based_on_range(const Range &range) { } } // namespace detail -/// AXPY interface functions - -template -cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, const Scalar *alpha, - const T *x, int incx, T *y, int incy); -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const float *alpha, - const float *x, int incx, - float *y, int incy) { - return cublasSaxpy(handle, n, alpha, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const double *alpha, - const double *x, int incx, - double *y, int incy) { - return cublasDaxpy(handle, n, alpha, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy, std::complex>( - cublasHandle_t 
handle, int n, const std::complex *alpha, - const std::complex *x, int incx, std::complex *y, int incy) { - using detail::cublasPointer; - return cublasCaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, - cublasPointer(y), incy); -} - -template <> -inline cublasStatus_t cublasAxpy, std::complex>( - cublasHandle_t handle, int n, const std::complex *alpha, - const std::complex *x, int incx, std::complex *y, - int incy) { - using detail::cublasPointer; - return cublasZaxpy(handle, n, cublasPointer(alpha), cublasPointer(x), incx, - cublasPointer(y), incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const int *alpha, const float *x, - int incx, float *y, int incy) { - const float alpha_float = float(*alpha); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const double *alpha, - const float *x, int incx, - float *y, int incy) { - const float alpha_float = float(*alpha); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const int *alpha, const double *x, - int incx, double *y, int incy) { - const double alpha_double = double(*alpha); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const float *alpha, - const double *x, int incx, - double *y, int incy) { - const double alpha_double = double(*alpha); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasAxpy>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, - const float *x, int incx, 
float *y, int incy) { - const float alpha_float = float(-1.0); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasAxpy>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(-1.0); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double 
alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -/// SCAL interface function -template -cublasStatus_t cublasScal(cublasHandle_t handle, int n, const Scalar *alpha, - T *x, int incx); - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const float *alpha, float *x, - int incx) { - return cublasSscal(handle, n, alpha, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const double *alpha, double *x, - int incx) { - return cublasDscal(handle, n, alpha, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const int *alpha, float *x, - int incx) { - const float alpha_float = float(*alpha); - return cublasSscal(handle, n, &alpha_float, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const double *alpha, float *x, - int incx) { - const float alpha_float = float(*alpha); - return cublasSscal(handle, n, &alpha_float, x, incx); -}; - -// -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const int *alpha, double *x, - int incx) { - const double alpha_double = double(*alpha); - return cublasDscal(handle, n, &alpha_double, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const float *alpha, double *x, - int incx) { - const double alpha_double = double(*alpha); - return cublasDscal(handle, n, &alpha_double, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - return 
CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasScal>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, float *x, - int incx) { - const float alpha_float = float(-1.0); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasScal>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, double *x, - int incx) { - const double alpha_double = double(-1.0); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, 
incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - } // namespace TiledArray #endif // TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index a24e72c5f9..ae5a8662ac 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -7,9 +7,7 @@ #include #ifdef TILEDARRAY_HAS_DEVICE -#ifdef TILEDARRAY_HAS_CUDA #include -#endif #include #include #endif From 4e38a44f2336dc1740553a38b78207a871d0a660 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Sep 2023 17:09:15 -0400 Subject: [PATCH 116/592] no need to link to rocthrust --- src/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 431785dc54..55227c2093 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -290,7 +290,11 @@ if(CUDA_FOUND OR HIP_FOUND) TiledArray/device/kernel/thrust/reduce_kernel.hip ) - list(APPEND _TILEDARRAY_DEPENDENCIES hip::host roc::rocthrust) + list(APPEND _TILEDARRAY_DEPENDENCIES hip::host + # N.B. linking to rocthrust makes all files in tiledarray target compiled as HIP ... 
+ # seemingly (like with CUDA thrust) linking to this target is not needed + # roc::rocthrust + ) endif() # LibreTT needed for either CUDA or HIP From 598f65aa193bb1dcaadfb27b18b57c1623c74844 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 Sep 2023 08:28:19 -0400 Subject: [PATCH 117/592] removed obsolete (and nonfunctional) cuda_librett.cpp --- examples/device/CMakeLists.txt | 2 +- examples/device/cuda_librett.cpp | 47 -------------------------------- 2 files changed, 1 insertion(+), 48 deletions(-) delete mode 100644 examples/device/cuda_librett.cpp diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt index 81339a3606..40b1aabdfb 100644 --- a/examples/device/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -25,7 +25,7 @@ if(CUDA_FOUND) - foreach(_exec cuda_librett cuda_task ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec cuda_task ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") diff --git a/examples/device/cuda_librett.cpp b/examples/device/cuda_librett.cpp deleted file mode 100644 index d56f6362fa..0000000000 --- a/examples/device/cuda_librett.cpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Created by Chong Peng on 7/19/18. 
- * - */ - -#include - -#ifdef TILEDARRAY_HAS_DEVICE - -#include -#include - -#include - -/** - * Test LibreTT - */ - -const std::size_t N = 100; -using namespace TiledArray; - -int main(int argc, char* argv[]) { - TA_SCOPED_INITIALIZE(argc, argv); - - std::vector extent{N, N}; - std::vector perm{1, 0}; - - return 0; -} - -#endif From 4dad689a1d79b69a434237331277df180669d8fb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 Sep 2023 08:28:28 -0400 Subject: [PATCH 118/592] fixup --- doc/devsamp/wiki/user-guide-2.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/devsamp/wiki/user-guide-2.cpp b/doc/devsamp/wiki/user-guide-2.cpp index da7664c8d4..ebef5be776 100644 --- a/doc/devsamp/wiki/user-guide-2.cpp +++ b/doc/devsamp/wiki/user-guide-2.cpp @@ -36,6 +36,7 @@ TA::Tensor make_tile2(const TA::Range& range, const double v) { // Fill array x with value v void init_array(TA::TArrayD& x, const double v) { + using std::begin, std::end; // Add local tiles to a for (auto it = begin(x); it != end(x); ++it) { // Construct a tile using a MADNESS task. 
From 98dcff321f75f69ac2b97afc05fea8843a6a09a6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 Sep 2023 08:28:57 -0400 Subject: [PATCH 119/592] [examples] cuda_task -> device_task --- examples/device/CMakeLists.txt | 4 +- .../device/{cuda_task.cpp => device_task.cpp} | 57 ++++++++----------- src/TiledArray/external/device.h | 8 +++ 3 files changed, 33 insertions(+), 36 deletions(-) rename examples/device/{cuda_task.cpp => device_task.cpp} (64%) diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt index 40b1aabdfb..e185e26b41 100644 --- a/examples/device/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -25,7 +25,7 @@ if(CUDA_FOUND) - foreach(_exec cuda_task ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") @@ -37,7 +37,7 @@ endif() if(CUDA_FOUND OR HIP_FOUND) - foreach(_exec ta_dense_device) + foreach(_exec device_task ta_dense_device) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") diff --git a/examples/device/cuda_task.cpp b/examples/device/device_task.cpp similarity index 64% rename from examples/device/cuda_task.cpp rename to examples/device/device_task.cpp index e5519b8b85..49225ae664 100644 --- a/examples/device/cuda_task.cpp +++ b/examples/device/device_task.cpp @@ -2,8 +2,10 @@ // Created by Chong Peng on 11/14/18. 
// +#include #include #include + #include using value_type = double; @@ -26,8 +28,8 @@ void verify(const tile_type& tile, value_type value, std::size_t index) { } } -tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, - std::size_t index) { +tile_type scale(const tile_type& arg, value_type a, + const TiledArray::device::stream_t* stream, std::size_t index) { DeviceSafeCall(TiledArray::device::setDevice( TiledArray::deviceEnv::instance()->current_device_id())); /// make result Tensor @@ -40,35 +42,33 @@ tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream, std::move(result_storage)); /// copy the original Tensor - const auto& handle = TiledArray::BLASQueuePool::handle(*stream); + auto& queue = TiledArray::BLASQueuePool::queue(*stream); - CublasSafeCall(TiledArray::cublasCopy(handle, result.size(), arg.data(), 1, - device_data(result.storage()), 1)); + blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1, + queue); - CublasSafeCall(TiledArray::cublasScal(handle, result.size(), &a, - device_data(result.storage()), 1)); + blas::scal(result.size(), a, device_data(result.storage()), 1, queue); - // cudaStreamSynchronize(stream); + // std::stringstream stream_str; + // stream_str << *stream; + // std::string message = "run scale on Tensor: " + std::to_string(index) + + // "on stream: " + stream_str.str() + '\n'; + // std::cout << message; TiledArray::device::synchronize_stream(stream); - // std::stringstream stream_str; - // stream_str << *stream; - // std::string message = "run scale on Tensor: " + std::to_string(index) + " - // on stream: " + stream_str.str() +'\n'; std::cout << message; return tile_type(std::move(result)); } -void process_task(madness::World* world, - const std::vector* streams, std::size_t ntask) { +void process_task(madness::World* world, std::size_t ntask) { const std::size_t iter = 50; const std::size_t M = 1000; const std::size_t N = 1000; - std::size_t n_stream = 
streams->size(); + std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams(); for (std::size_t i = 0; i < iter; i++) { - auto& stream = (*streams)[i % n_stream]; + auto& stream = TiledArray::deviceEnv::instance()->stream(i % n_stream); TiledArray::Range range{M, N}; @@ -77,8 +77,9 @@ void process_task(madness::World* world, const double scale_factor = 2.0; // function pointer to the scale function to call - tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*, - std::size_t) = &::scale; + tile_type (*scale_fn)(const tile_type&, double, + const TiledArray::device::stream_t*, std::size_t) = + &::scale; madness::Future scale_future = madness::add_device_task( *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i); @@ -91,27 +92,15 @@ void process_task(madness::World* world, int try_main(int argc, char** argv) { auto& world = TiledArray::get_default_world(); - const std::size_t n_stream = 5; const std::size_t n_tasks = 5; - std::vector streams(n_stream); - for (auto& stream : streams) { - // create the streams - DeviceSafeCall(cudaStreamCreate(&stream)); - // std::cout << "stream: " << stream << "\n"; - } - // add process_task to different tasks/threads for (auto i = 0; i < n_tasks; i++) { - world.taskq.add(process_task, &world, &streams, i); + world.taskq.add(process_task, &world, i); } world.gop.fence(); - for (auto& stream : streams) { - // create the streams - cudaStreamDestroy(stream); - } return 0; } @@ -120,12 +109,12 @@ int main(int argc, char* argv[]) { try { // Initialize runtime try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc& ex) { + } catch (std::exception& ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << 
free_mem << "}" << std::endl; } catch (...) { std::cerr << "unknown exception" << std::endl; diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 745697faaa..6d53dc73a3 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -192,6 +192,10 @@ inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { return cudaDeviceGetAttribute(value, attr, device); } +inline error_t streamCreate(stream_t* pStream) { + return cudaStreamCreate(pStream); +} + inline error_t streamCreateWithFlags(stream_t* pStream, StreamCreateFlags flags) { return cudaStreamCreateWithFlags(pStream, flags); @@ -323,6 +327,10 @@ inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { return hipDeviceGetAttribute(value, attr, device); } +inline error_t streamCreate(stream_t* pStream) { + return hipStreamCreate(pStream); +} + inline error_t streamCreateWithFlags(stream_t* pStream, StreamCreateFlags flags) { return hipStreamCreateWithFlags(pStream, flags); From 45463d673b13e5422ddb3c8a913cf2fd13ce4b89 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 21 Sep 2023 14:31:26 -0400 Subject: [PATCH 120/592] ta_dense_device: more de-cudaization - removed cudaGetDeviceProperties and cudaDeviceProp - guard CUDA_API_PER_THREAD_DEFAULT_STREAM by TILEDARRAY_HAS_CUDA --- examples/device/ta_dense_device.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index 4b14085624..16e0dc9a79 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -17,7 +17,9 @@ * */ +#ifdef TILEDARRAY_HAS_CUDA #define CUDA_API_PER_THREAD_DEFAULT_STREAM +#endif // TILEDARRAY_HAS_CUDA #include @@ -30,7 +32,7 @@ #ifdef TILEDARRAY_HAS_CUDA #include -#endif +#endif // TILEDARRAY_HAS_CUDA template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, @@ -320,10 +322,10 @@ 
int try_main(int argc, char **argv) { for (int i = 0; i < mpi_size; i++) { if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, device_id); + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TiledArray::device::deviceProp_t prop; + auto error = TiledArray::device::getDeviceProperties(&prop, device_id); if (error != TiledArray::device::Success) { std::cout << "error(GetDeviceProperties) = " << error << std::endl; } From d917179620460a88d62b16213c11f66512ae22b1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 22 Sep 2023 07:56:26 -0400 Subject: [PATCH 121/592] decudaized the rest of device examples (vector, reduce, cc_abcd) --- examples/device/CMakeLists.txt | 14 +--- ...cc_abcd_cuda.cpp => ta_cc_abcd_device.cpp} | 12 +-- examples/device/ta_dense_device.cpp | 6 -- ...a_reduce_cuda.cpp => ta_reduce_device.cpp} | 76 ++++++++----------- ...a_vector_cuda.cpp => ta_vector_device.cpp} | 76 ++++++++----------- src/TiledArray/external/device.h | 4 +- 6 files changed, 74 insertions(+), 114 deletions(-) rename examples/device/{ta_cc_abcd_cuda.cpp => ta_cc_abcd_device.cpp} (97%) rename examples/device/{ta_reduce_cuda.cpp => ta_reduce_device.cpp} (84%) rename examples/device/{ta_vector_cuda.cpp => ta_vector_device.cpp} (84%) diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt index e185e26b41..e2376c4eae 100644 --- a/examples/device/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -23,21 +23,9 @@ # -if(CUDA_FOUND) - - foreach(_exec ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) - - # Add executable - add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") - add_dependencies(examples-tiledarray ${_exec}) - - endforeach() - -endif() - if(CUDA_FOUND OR HIP_FOUND) - foreach(_exec device_task ta_dense_device) + foreach(_exec device_task ta_dense_device ta_cc_abcd_device 
ta_vector_device ta_reduce_device) # Add executable add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") diff --git a/examples/device/ta_cc_abcd_cuda.cpp b/examples/device/ta_cc_abcd_device.cpp similarity index 97% rename from examples/device/ta_cc_abcd_cuda.cpp rename to examples/device/ta_cc_abcd_device.cpp index b531dee495..7a2046a5ef 100644 --- a/examples/device/ta_cc_abcd_cuda.cpp +++ b/examples/device/ta_cc_abcd_device.cpp @@ -185,14 +185,14 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double n_gflop = flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / std::pow(1024., 3); - using CUDATile = + using deviceTile = btas::Tensor>; - using CUDAMatrix = TA::DistArray>; + using deviceMatrix = TA::DistArray>; // Construct tensors - CUDAMatrix t2(world, trange_oovv); - CUDAMatrix v(world, trange_vvvv); - CUDAMatrix t2_v; + deviceMatrix t2(world, trange_oovv); + deviceMatrix v(world, trange_vvvv); + deviceMatrix t2_v; // To validate, fill input tensors with random data, otherwise just with 1s // if (do_validate) { // rand_fill_array(t2); @@ -245,7 +245,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, auto result = dot_length * 0.2 * 0.3; auto verify = [&world, &threshold, &result, - &dot_length](const TA::Tile& tile) { + &dot_length](const TA::Tile& tile) { auto n_elements = tile.size(); for (std::size_t i = 0; i < n_elements; i++) { double abs_err = fabs(tile[i] - result); diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index 16e0dc9a79..348b0a2d39 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -17,12 +17,6 @@ * */ -#ifdef TILEDARRAY_HAS_CUDA -#define CUDA_API_PER_THREAD_DEFAULT_STREAM -#endif // TILEDARRAY_HAS_CUDA - -#include - // clang-format off #include diff --git a/examples/device/ta_reduce_cuda.cpp b/examples/device/ta_reduce_device.cpp similarity index 84% rename from examples/device/ta_reduce_cuda.cpp rename to 
examples/device/ta_reduce_device.cpp index b475ff78ef..41f84c126c 100644 --- a/examples/device/ta_reduce_cuda.cpp +++ b/examples/device/ta_reduce_device.cpp @@ -17,15 +17,8 @@ * */ -#define CUDA_API_PER_THREAD_DEFAULT_STREAM - -#include - -// clang-format off - -#include #include -// clang-format on +#include template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, @@ -237,7 +230,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } template -using cudaTile = TiledArray::Tile>; +using deviceTile = TiledArray::Tile>; int try_main(int argc, char **argv) { // Initialize runtime @@ -286,57 +279,56 @@ int try_main(int argc, char **argv) { } int driverVersion, runtimeVersion; - auto error = cudaDriverGetVersion(&driverVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaDriverGetVersion) = " << error << std::endl; + auto error = TiledArray::device::driverVersion(&driverVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(driverVersion) = " << error << std::endl; } - error = cudaRuntimeGetVersion(&runtimeVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl; + error = TiledArray::device::runtimeVersion(&runtimeVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(runtimeVersion) = " << error << std::endl; } - std::cout << "CUDA {driver,runtime} versions = " << driverVersion << "," + std::cout << "device {driver,runtime} versions = " << driverVersion << "," << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); + int num_devices = TA::deviceEnv::instance()->num_devices(); - if (num_cuda_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); + if (num_devices <= 0) { + throw std::runtime_error("No GPUs Found!\n"); } - int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); + int device_id = 
TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); for (int i = 0; i < mpi_size; i++) { if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TiledArray::device::deviceProp_t prop; + auto error = TiledArray::device::getDeviceProperties(&prop, device_id); + if (error != TiledArray::device::Success) { + std::cout << "error(getDeviceProperties) = " << error << std::endl; } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl + std::cout << "Device #" << device_id << ": " << prop.name << std::endl << " managedMemory = " << prop.managedMemory << std::endl << " singleToDoublePrecisionPerfRatio = " << prop.singleToDoublePrecisionPerfRatio << std::endl; int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrUnifiedAddressing, device_id); std::cout << " attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrConcurrentManagedAccess, + device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = device::setDevice(cuda_device_id); - if (error != cudaSuccess) { + error = TiledArray::device::setDevice(device_id); + if (error != TiledArray::device::Success) { std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); + error = 
TiledArray::device::memGetInfo(&free_mem, &total_mem); std::cout << " {total,free} memory = {" << total_mem << "," << free_mem << "}" << std::endl; } @@ -348,7 +340,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -359,7 +351,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -373,15 +365,13 @@ int try_main(int argc, char **argv) { int main(int argc, char *argv[]) { try { try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { + } catch (std::exception &ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << free_mem << "}" << std::endl; - } catch (std::exception &ex) { - std::cout << ex.what() << std::endl; } catch (...) 
{ std::cerr << "unknown exception" << std::endl; } diff --git a/examples/device/ta_vector_cuda.cpp b/examples/device/ta_vector_device.cpp similarity index 84% rename from examples/device/ta_vector_cuda.cpp rename to examples/device/ta_vector_device.cpp index a82a057807..bc128f1e95 100644 --- a/examples/device/ta_vector_cuda.cpp +++ b/examples/device/ta_vector_device.cpp @@ -17,17 +17,9 @@ * */ -#define CUDA_API_PER_THREAD_DEFAULT_STREAM - -#include - -// clang-format off - -#include #include -#include "TiledArray/device/cpu_cuda_vector.h" #include -// clang-format on +#include template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, @@ -255,7 +247,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } template -using cudaTile = TiledArray::Tile>; +using deviceTile = TiledArray::Tile>; int try_main(int argc, char **argv) { // Initialize runtime @@ -304,57 +296,55 @@ int try_main(int argc, char **argv) { } int driverVersion, runtimeVersion; - auto error = cudaDriverGetVersion(&driverVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaDriverGetVersion) = " << error << std::endl; + auto error = TA::device::driverVersion(&driverVersion); + if (error != TA::device::Success) { + std::cout << "error(DriverGetVersion) = " << error << std::endl; } - error = cudaRuntimeGetVersion(&runtimeVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl; + error = TA::device::runtimeVersion(&runtimeVersion); + if (error != TA::device::Success) { + std::cout << "error(RuntimeGetVersion) = " << error << std::endl; } - std::cout << "CUDA {driver,runtime} versions = " << driverVersion << "," + std::cout << "device {driver,runtime} versions = " << driverVersion << "," << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::deviceEnv::instance()->num_cuda_devices(); + int num_devices = TA::deviceEnv::instance()->num_devices(); - if 
(num_cuda_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); + if (num_devices <= 0) { + throw std::runtime_error("No GPUs Found!\n"); } - int cuda_device_id = TA::deviceEnv::instance()->current_device_id(); + int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); for (int i = 0; i < mpi_size; i++) { if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TA::device::deviceProp_t prop; + auto error = TA::device::getDeviceProperties(&prop, device_id); + if (error != TA::device::Success) { + std::cout << "error(GetDeviceProperties) = " << error << std::endl; } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl + std::cout << "Device #" << device_id << ": " << prop.name << std::endl << " managedMemory = " << prop.managedMemory << std::endl << " singleToDoublePrecisionPerfRatio = " << prop.singleToDoublePrecisionPerfRatio << std::endl; int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); + error = TA::device::deviceGetAttribute( + &result, TA::device::DevAttrUnifiedAddressing, device_id); std::cout << " attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); + error = TA::device::deviceGetAttribute( + &result, TA::device::DevAttrConcurrentManagedAccess, device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = device::setDevice(cuda_device_id); - if (error != cudaSuccess) { + error = TA::device::setDevice(device_id); + if (error != 
TA::device::Success) { std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); + error = TA::device::memGetInfo(&free_mem, &total_mem); std::cout << " {total,free} memory = {" << total_mem << "," << free_mem << "}" << std::endl; } @@ -366,7 +356,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -377,7 +367,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -391,15 +381,13 @@ int try_main(int argc, char **argv) { int main(int argc, char *argv[]) { try { try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { + } catch (std::exception &ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TA::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << free_mem << "}" << std::endl; - } catch (std::exception &ex) { - std::cout << ex.what() << std::endl; } catch (...) 
{ std::cerr << "unknown exception" << std::endl; } diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 6d53dc73a3..21923e4598 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -131,7 +131,7 @@ using deviceAttr_t = cudaDeviceAttr; cudaDevAttrConcurrentManagedAccess #define DEVICERT_CB CUDART_CB -enum Error { Success = cudaSuccess }; +const inline auto Success = cudaSuccess; enum DeviceId { CpuDeviceId = cudaCpuDeviceId, @@ -265,7 +265,7 @@ using deviceAttr_t = hipDeviceAttribute_t; hipDeviceAttributeConcurrentManagedAccess #define DEVICERT_CB -enum Error { Success = hipSuccess }; +const inline auto Success = hipSuccess; enum DeviceId { CpuDeviceId = hipCpuDeviceId, From 604e11974fa103f81fa2bccdb29ccf30bd09f73a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 22 Sep 2023 08:02:11 -0400 Subject: [PATCH 122/592] dox++ --- src/TiledArray/external/device.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 21923e4598..bfb0610b17 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -499,6 +499,10 @@ inline const stream_t*& tls_stream_accessor() { return thread_local_stream_ptr; } +/// must call this before exiting the device task executed via +/// the MADNESS runtime (namely, via madness::add_device_task ) +/// to inform the runtime which stream the task +/// launched its kernels into inline void synchronize_stream(const stream_t* stream) { tls_stream_accessor() = stream; } From 49f480189ff5a4a7c90b18c724735f6ca0e35628 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 27 Sep 2023 10:39:00 -0400 Subject: [PATCH 123/592] loosen equality tolerance in um_expressions_suite, dot_permute fails occasionally ... 
resolves https://github.com/ValeevGroup/tiledarray/issues/423 --- tests/expressions_cuda_um.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_cuda_um.cpp index 29408c27c2..81dcf29c47 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_cuda_um.cpp @@ -2489,6 +2489,9 @@ BOOST_AUTO_TEST_CASE(dot) { } BOOST_AUTO_TEST_CASE(dot_permute) { + // loosen the default tolerance + constexpr auto tolerance = 5e-13; + Permutation perm({2, 1, 0}); // Test the dot expression function double result = 0; From f7b7b422970d5e3127c7fed16c1389cbfc08a9b5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 25 Sep 2023 11:16:59 -0400 Subject: [PATCH 124/592] relax deviceEnv::current_device_id to support multiple devices per rank --- src/TiledArray/external/device.h | 41 +++++--------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index bfb0610b17..5219de705f 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -426,41 +426,12 @@ inline int num_devices() { } inline int current_device_id(World& world) { - int mpi_local_size = -1; - int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = detail::mpi_local_rank_size(world); - - int num_devices = device::num_devices(); - - int device_id = -1; - // devices may already be pre-mapped - // if mpi_local_size <= num_devices : all ranks are in same resource set, map - // round robin - if (mpi_local_size <= num_devices) { - device_id = mpi_local_rank % num_devices; - } else { // mpi_local_size > num_devices - const char* vd_cstr = - std::getenv(TILEDARRAY_DEVICE_RUNTIME_STR "_VISIBLE_DEVICES"); - if (vd_cstr) { // *_VISIBLE_DEVICES is set, assume that pre-mapped - // make sure that there is only 1 device available here - if (num_devices != 1) { - throw std::runtime_error( - std::string( - TILEDARRAY_DEVICE_RUNTIME_STR - 
"_VISIBLE_DEVICES environment variable is set, hence using " - "the provided device-to-rank mapping; BUT TiledArray found ") + - std::to_string(num_devices) + - " devices; only 1 device / MPI process is supported"); - } - device_id = 0; - } else { // not enough devices + devices are not pre-mapped - throw std::runtime_error( - std::string("TiledArray found ") + std::to_string(mpi_local_size) + - " MPI ranks on a node with " + std::to_string(num_devices) + - " devices; only 1 MPI process / device model is currently " - "supported"); - } - } + static const std::tuple local_rank_size = + detail::mpi_local_rank_size(world); + const auto& [mpi_local_rank, mpi_local_size] = local_rank_size; + static const int num_devices = device::num_devices(); + // map ranks to default device round robin + static const int device_id = mpi_local_rank % num_devices; return device_id; } From 0b2254b19840285466f47b05ad29d258f2ebc7a4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 25 Sep 2023 22:37:48 -0400 Subject: [PATCH 125/592] jumbo "multidevice" bundle - initial support for multiple devices - introduced device::Stream - sticky handling of streams should allow device tasks with multiple ops (e.g., scale+permute) to work correctly --- doc/dox/dev/Optimization-Guide.md | 10 +- examples/device/device_task.cpp | 27 +- examples/device/ta_dense_device.cpp | 4 +- examples/device/ta_reduce_device.cpp | 7 +- examples/device/ta_vector_device.cpp | 4 +- src/TiledArray/device/blas.cpp | 14 +- src/TiledArray/device/blas.h | 33 +- src/TiledArray/device/btas.h | 171 +++++----- src/TiledArray/device/btas_um_tensor.cpp | 4 +- src/TiledArray/device/btas_um_tensor.h | 103 ++---- src/TiledArray/device/device_task_fn.h | 14 +- src/TiledArray/device/kernel/mult_kernel.h | 22 +- src/TiledArray/device/kernel/reduce_kernel.h | 87 ++---- .../device/kernel/thrust/mult_kernel.cu | 40 +-- .../device/kernel/thrust/mult_kernel.h | 12 +- .../device/kernel/thrust/reduce_kernel.cu | 130 ++++---- 
.../device/kernel/thrust/reduce_kernel.h | 43 ++- src/TiledArray/device/um_storage.h | 12 +- src/TiledArray/external/device.h | 294 ++++++++++++------ src/TiledArray/reduce_task.h | 73 +++-- tests/librett.cpp | 31 +- 21 files changed, 564 insertions(+), 571 deletions(-) diff --git a/doc/dox/dev/Optimization-Guide.md b/doc/dox/dev/Optimization-Guide.md index 229cf82d0f..49fefb196e 100644 --- a/doc/dox/dev/Optimization-Guide.md +++ b/doc/dox/dev/Optimization-Guide.md @@ -18,10 +18,8 @@ is devoted to communication. [Default = number of cores reported by ] ## MPI -## CUDA +## GPU/Device compute runtimes -In addition to [the environment variables that control the CUDA runtime behavior](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars), several environment variables control specifically the execution of TiledArray on CUDA devices: -* `TA_CUDA_NUM_STREAMS` -- The number of [CUDA streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of others. For small tasks this may need to be increased. [Default=3] -* `CUDA_VISIBLE_DEVICES` -- This CUDA runtime environment variable is queried by TiledArray to determine whether CUDA devices on a multi-GPU node have been pre-mapped to MPI ranks. - * By default (i.e. when # of MPI ranks on a node <= # of _available_ CUDA devices) TiledArray will map 1 device (in the order of increasing rank) to each MPI rank. - * If # of available CUDA devices < # of MPI ranks on a node _and_ `CUDA_VISIBLE_DEVICES` is set TiledArray will assume that the user mapped the devices to the MPI ranks appropriately (e.g. using a resource manager like `jsrun`) and only checks that each rank has access to 1 CUDA device. 
+In addition to the environment variables that control the runtime behavior of [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) and [HIP/ROCm](https://rocm.docs.amd.com/en/latest/search.html?q=environment+variables), several environment variables control specifically the execution of TiledArray on compute devices: +* `TA_DEVICE_NUM_STREAMS` -- The number of [compute streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of others. For small tasks this may need to be increased. In addition to the streams for compute tasks TiledArray also creates 2 dedicated streams for data transfers to/from each device. [Default=3] +* `CUDA_VISIBLE_DEVICES`/`HIP_VISIBLE_DEVICES` -- These runtime environment variables can be used to map CUDA/HIP devices, respectively, on a multi-device node to MPI ranks. It is usually the responsibility of the resource manager to control this mapping, thus normally it should not be needed. By default TiledArray will assign compute devices on a multidevice node round robin to each MPI rank. 
diff --git a/examples/device/device_task.cpp b/examples/device/device_task.cpp index 49225ae664..bfd75ac51c 100644 --- a/examples/device/device_task.cpp +++ b/examples/device/device_task.cpp @@ -15,8 +15,8 @@ using tile_type = TA::Tile; /// verify the elements in tile is equal to value void verify(const tile_type& tile, value_type value, std::size_t index) { // const auto size = tile.size(); - std::string message = "verify Tensor: " + std::to_string(index) + '\n'; - std::cout << message; + // std::string message = "verify Tensor: " + std::to_string(index) + '\n'; + // std::cout << message; for (auto& num : tile) { if (num != value) { std::string error("Error: " + std::to_string(num) + " " + @@ -29,20 +29,18 @@ void verify(const tile_type& tile, value_type value, std::size_t index) { } tile_type scale(const tile_type& arg, value_type a, - const TiledArray::device::stream_t* stream, std::size_t index) { - DeviceSafeCall(TiledArray::device::setDevice( - TiledArray::deviceEnv::instance()->current_device_id())); + TiledArray::device::Stream stream, std::size_t index) { /// make result Tensor using Storage = typename tile_type::tensor_type::storage_type; Storage result_storage; auto result_range = arg.range(); - make_device_storage(result_storage, arg.size(), *stream); + make_device_storage(result_storage, arg.size(), stream); typename tile_type::tensor_type result(std::move(result_range), std::move(result_storage)); /// copy the original Tensor - auto& queue = TiledArray::BLASQueuePool::queue(*stream); + auto& queue = TiledArray::BLASQueuePool::queue(stream); blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1, queue); @@ -50,12 +48,12 @@ tile_type scale(const tile_type& arg, value_type a, blas::scal(result.size(), a, device_data(result.storage()), 1, queue); // std::stringstream stream_str; - // stream_str << *stream; + // stream_str << stream; // std::string message = "run scale on Tensor: " + std::to_string(index) + // "on stream: " + 
stream_str.str() + '\n'; // std::cout << message; - TiledArray::device::synchronize_stream(stream); + TiledArray::device::sync_madness_task_with(stream); return tile_type(std::move(result)); } @@ -65,10 +63,10 @@ void process_task(madness::World* world, std::size_t ntask) { const std::size_t M = 1000; const std::size_t N = 1000; - std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams(); + std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams_total(); for (std::size_t i = 0; i < iter; i++) { - auto& stream = TiledArray::deviceEnv::instance()->stream(i % n_stream); + auto stream = TiledArray::deviceEnv::instance()->stream(i % n_stream); TiledArray::Range range{M, N}; @@ -77,12 +75,11 @@ void process_task(madness::World* world, std::size_t ntask) { const double scale_factor = 2.0; // function pointer to the scale function to call - tile_type (*scale_fn)(const tile_type&, double, - const TiledArray::device::stream_t*, std::size_t) = - &::scale; + tile_type (*scale_fn)(const tile_type&, double, TiledArray::device::Stream, + std::size_t) = &::scale; madness::Future scale_future = madness::add_device_task( - *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i); + *world, ::scale, tensor, scale_factor, stream, ntask * iter + i); /// this should start until scale_taskfn is finished world->taskq.add(verify, scale_future, scale_factor, ntask * iter + i); diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index 348b0a2d39..d30bf5079c 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -303,13 +303,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_devices = TA::deviceEnv::instance()->num_devices(); + int num_devices = TA::deviceEnv::instance()->num_visible_devices(); if (num_devices <= 0) { throw std::runtime_error("No GPUs Found!\n"); } - int device_id = 
TA::deviceEnv::instance()->current_device_id(); + const int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); diff --git a/examples/device/ta_reduce_device.cpp b/examples/device/ta_reduce_device.cpp index 41f84c126c..96d1bdbda4 100644 --- a/examples/device/ta_reduce_device.cpp +++ b/examples/device/ta_reduce_device.cpp @@ -17,9 +17,10 @@ * */ -#include #include +#include + template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, const long Nn, const long Bn, const long nrepeat) { @@ -291,13 +292,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_devices = TA::deviceEnv::instance()->num_devices(); + int num_devices = TA::deviceEnv::instance()->num_visible_devices(); if (num_devices <= 0) { throw std::runtime_error("No GPUs Found!\n"); } - int device_id = TA::deviceEnv::instance()->current_device_id(); + const int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); diff --git a/examples/device/ta_vector_device.cpp b/examples/device/ta_vector_device.cpp index bc128f1e95..4507ee64f7 100644 --- a/examples/device/ta_vector_device.cpp +++ b/examples/device/ta_vector_device.cpp @@ -308,13 +308,13 @@ int try_main(int argc, char **argv) { << runtimeVersion << std::endl; { // print device properties - int num_devices = TA::deviceEnv::instance()->num_devices(); + int num_devices = TA::deviceEnv::instance()->num_visible_devices(); if (num_devices <= 0) { throw std::runtime_error("No GPUs Found!\n"); } - int device_id = TA::deviceEnv::instance()->current_device_id(); + const int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); diff --git a/src/TiledArray/device/blas.cpp b/src/TiledArray/device/blas.cpp index ea8eb00faf..cedd694241 100644 --- a/src/TiledArray/device/blas.cpp +++ 
b/src/TiledArray/device/blas.cpp @@ -31,12 +31,12 @@ bool BLASQueuePool::initialized() { return !queues_.empty(); } void BLASQueuePool::initialize() { if (initialized()) return; - queues_.reserve(deviceEnv::instance()->num_streams()); - for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams(); + queues_.reserve(deviceEnv::instance()->num_streams_total()); + for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams_total(); ++sidx) { - auto stream = deviceEnv::instance()->stream( + auto q = deviceEnv::instance()->stream( sidx); // blaspp forsome reason wants non-const lvalue ref to stream - queues_.emplace_back(std::make_unique(0, stream)); + queues_.emplace_back(std::make_unique(q.device, q.stream)); } } @@ -44,14 +44,14 @@ void BLASQueuePool::finalize() { queues_.clear(); } blas::Queue& BLASQueuePool::queue(std::size_t ordinal) { TA_ASSERT(initialized()); - TA_ASSERT(ordinal < deviceEnv::instance()->num_streams()); + TA_ASSERT(ordinal < deviceEnv::instance()->num_streams_total()); return *(queues_[ordinal]); } -blas::Queue& BLASQueuePool::queue(device::stream_t const& stream) { +blas::Queue& BLASQueuePool::queue(device::Stream const& stream) { TA_ASSERT(initialized()); for (auto&& q : queues_) { - if (q->stream() == stream) return *q; + if (q->device() == stream.device && q->stream() == stream.stream) return *q; } throw TiledArray::Exception( "no matching device stream found in the BLAS queue pool"); diff --git a/src/TiledArray/device/blas.h b/src/TiledArray/device/blas.h index a41bb9b908..bd905a528e 100644 --- a/src/TiledArray/device/blas.h +++ b/src/TiledArray/device/blas.h @@ -28,19 +28,13 @@ #ifdef TILEDARRAY_HAS_DEVICE -#include - #include +#include #include - -#include +#include namespace TiledArray { -/* - * cuBLAS interface functions - */ - /** * BLASQueuePool is a singleton controlling a pool of blas::Queue objects: * - queues map to stream 1-to-1, so do not call Queue::set_stream to maintain @@ -54,20 +48,29 @@ struct 
BLASQueuePool { static void finalize(); static blas::Queue &queue(std::size_t ordinal = 0); - static blas::Queue &queue(const device::stream_t &stream); + static blas::Queue &queue(const device::Stream &s); private: static std::vector> queues_; }; -namespace detail { +/// maps a (tile) Range to blas::Queue; if had already pushed work into a +/// device::Stream (as indicated by madness_task_current_stream() ) +/// will return that Stream instead +/// @param[in] range will determine the device::Stream to compute an object +/// associated with this Range object +/// @return the device::Stream to use for creating tasks generating work +/// associated with Range \p range template -blas::Queue &get_blasqueue_based_on_range(const Range &range) { - // TODO better way to get stream based on the id of tensor - auto stream_ord = range.offset() % device::Env::instance()->num_streams(); - return BLASQueuePool::queue(stream_ord); +blas::Queue &blasqueue_for(const Range &range) { + auto stream_opt = device::madness_task_current_stream(); + if (!stream_opt) { + auto stream_ord = + range.offset() % device::Env::instance()->num_streams_total(); + return BLASQueuePool::queue(stream_ord); + } else + return BLASQueuePool::queue(*stream_opt); } -} // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h index 576cbe9e37..acd42341fd 100644 --- a/src/TiledArray/device/btas.h +++ b/src/TiledArray/device/btas.h @@ -31,6 +31,7 @@ #ifdef TILEDARRAY_HAS_DEVICE #include + #include #include @@ -84,14 +85,15 @@ ::btas::Tensor gemm( T factor_t = T(factor); T zero(0); - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - // typedef typename Tensor::storage_type storage_type; auto result_range = gemm_helper.make_result_range(left.range(), right.range()); - auto &queue = detail::get_blasqueue_based_on_range(result_range); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result_range); + const auto 
device = queue.device(); + const auto str = queue.stream(); + const device::Stream stream(device, str); + DeviceSafeCall(device::setDevice(device)); // the result Tensor type typedef ::btas::Tensor Tensor; @@ -115,7 +117,7 @@ ::btas::Tensor gemm( device_data(right.storage()), ldb, device_data(left.storage()), lda, zero, device_data(result.storage()), n, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); } return result; @@ -188,9 +190,9 @@ void gemm(::btas::Tensor &result, const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const auto stream = device::Stream(queue.device(), queue.stream()); + DeviceSafeCall(device::setDevice(stream.device)); T factor_t = T(factor); T one(1); @@ -209,7 +211,7 @@ void gemm(::btas::Tensor &result, gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), lda, one, device_data(result.storage()), n, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); } } @@ -217,12 +219,10 @@ void gemm(::btas::Tensor &result, template ::btas::Tensor clone( const ::btas::Tensor &arg) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - Storage result_storage; auto result_range = arg.range(); - auto &queue = detail::get_blasqueue_based_on_range(result_range); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result_range); + const auto stream = Stream{queue.device(), queue.stream()}; make_device_storage(result_storage, arg.size(), stream); ::btas::Tensor result(std::move(result_range), @@ -231,7 +231,7 @@ ::btas::Tensor clone( blas::copy(result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1, 
queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -240,29 +240,31 @@ template >> ::btas::Tensor scale( const ::btas::Tensor &arg, const Scalar a) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(arg.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(arg.range()); + const device::Stream stream(queue.device(), queue.stream()); auto result = clone(arg); - if constexpr (detail::is_blas_numeric_v || + if constexpr (TiledArray::detail::is_blas_numeric_v || std::is_arithmetic_v) { blas::scal(result.size(), a, device_data(result.storage()), 1, queue); } else { - if constexpr (detail::is_complex_v) { + if constexpr (TiledArray::detail::is_complex_v) { abort(); // fused conjugation requires custom kernels, not yet supported } else { - if constexpr (std::is_same_v>) { - } else if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v< + Scalar, TiledArray::detail::ComplexConjugate>) { + } else if constexpr (std::is_same_v< + Scalar, + TiledArray::detail::ComplexConjugate< + TiledArray::detail::ComplexNegTag>>) { blas::scal(result.size(), static_cast(-1), device_data(result.storage()), 1, queue); } } } - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -271,27 +273,29 @@ ::btas::Tensor scale( template >> void scale_to(::btas::Tensor &result, const Scalar a) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); - if constexpr (detail::is_blas_numeric_v || + if constexpr (TiledArray::detail::is_blas_numeric_v || std::is_arithmetic_v) { blas::scal(result.size(), a, device_data(result.storage()), 1, queue); } else { - if 
constexpr (detail::is_complex_v) { + if constexpr (TiledArray::detail::is_complex_v) { abort(); // fused conjugation requires custom kernels, not yet supported } else { - if constexpr (std::is_same_v>) { - } else if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v< + Scalar, TiledArray::detail::ComplexConjugate>) { + } else if constexpr (std::is_same_v< + Scalar, + TiledArray::detail::ComplexConjugate< + TiledArray::detail::ComplexNegTag>>) { blas::scal(result.size(), static_cast(-1), device_data(result.storage()), 1, queue); } } } - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); } /// result[i] = arg1[i] - a * arg2[i] @@ -305,9 +309,8 @@ ::btas::Tensor subt( // revert the sign of a auto b = -a; - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); if (in_memory_space(result.storage())) { blas::axpy(result.size(), b, device_data(arg2.storage()), 1, @@ -316,7 +319,7 @@ ::btas::Tensor subt( TA_ASSERT(false); } - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -325,16 +328,15 @@ template >> void subt_to(::btas::Tensor &result, const ::btas::Tensor &arg1, const Scalar a) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); // revert the sign of a auto b = -a; blas::axpy(result.size(), b, device_data(arg1.storage()), 1, device_data(result.storage()), 1, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); } /// result[i] = arg1[i] + a * arg2[i] @@ -345,14 +347,13 @@ 
::btas::Tensor add( const ::btas::Tensor &arg2, const Scalar a) { auto result = clone(arg1); - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); blas::axpy(result.size(), a, device_data(arg2.storage()), 1, device_data(result.storage()), 1, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -361,9 +362,8 @@ template >> void add_to(::btas::Tensor &result, const ::btas::Tensor &arg, const Scalar a) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - auto &queue = detail::get_blasqueue_based_on_range(result.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); // TiledArray::to_execution_space(result.storage(),stream); // TiledArray::to_execution_space(arg.storage(),stream); @@ -371,22 +371,22 @@ void add_to(::btas::Tensor &result, blas::axpy(result.size(), a, device_data(arg.storage()), 1, device_data(result.storage()), 1, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); } /// result[i] = result[i] * arg[i] template void mult_to(::btas::Tensor &result, const ::btas::Tensor &arg) { - auto device_id = deviceEnv::instance()->current_device_id(); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); std::size_t n = result.size(); TA_ASSERT(n == arg.size()); - device::mult_to_kernel(result.data(), arg.data(), n, stream, device_id); - device::synchronize_stream(&stream); + device::mult_to_kernel(result.data(), arg.data(), n, stream); + device::sync_madness_task_with(stream); } /// 
result[i] = arg1[i] * arg2[i] @@ -398,19 +398,16 @@ ::btas::Tensor mult( TA_ASSERT(arg2.size() == n); - auto device_id = deviceEnv::instance()->current_device_id(); - DeviceSafeCall(device::setDevice(device_id)); - auto &stream = detail::get_stream_based_on_range(arg1.range()); + auto stream = stream_for(arg1.range()); Storage result_storage; make_device_storage(result_storage, n, stream); ::btas::Tensor result(arg1.range(), std::move(result_storage)); - device::mult_kernel(result.data(), arg1.data(), arg2.data(), n, stream, - device_id); + device::mult_kernel(result.data(), arg1.data(), arg2.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -418,10 +415,8 @@ ::btas::Tensor mult( template typename ::btas::Tensor::value_type squared_norm( const ::btas::Tensor &arg) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - - auto &queue = detail::get_blasqueue_based_on_range(arg.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(arg.range()); + const device::Stream stream(queue.device(), queue.stream()); auto &storage = arg.storage(); using TiledArray::math::blas::integer; @@ -434,7 +429,7 @@ typename ::btas::Tensor::value_type squared_norm( TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -443,10 +438,8 @@ template typename ::btas::Tensor::value_type dot( const ::btas::Tensor &arg1, const ::btas::Tensor &arg2) { - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - - auto &queue = detail::get_blasqueue_based_on_range(arg1.range()); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(arg1.range()); + const device::Stream stream(queue.device(), queue.stream()); using TiledArray::math::blas::integer; integer size = arg1.storage().size(); @@ -462,91 +455,85 @@ typename 
::btas::Tensor::value_type dot( TA_ASSERT(false); // result = TiledArray::math::dot(size, storage.data(), storage.data()); } - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T sum(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); auto n = storage.size(); - auto result = device::sum_kernel(arg.data(), n, stream, device_id); + auto result = device::sum_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T product(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); auto n = storage.size(); - auto result = device::product_kernel(arg.data(), n, stream, device_id); + auto result = device::product_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T min(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); auto n = storage.size(); - auto result = device::min_kernel(arg.data(), n, stream, device_id); + auto result = device::min_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T max(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); 
auto n = storage.size(); - auto result = device::max_kernel(arg.data(), n, stream, device_id); + auto result = device::max_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T absmin(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); auto n = storage.size(); - auto result = device::absmin_kernel(arg.data(), n, stream, device_id); + auto result = device::absmin_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } template T absmax(const ::btas::Tensor &arg) { - auto &stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = deviceEnv::instance()->current_device_id(); + auto stream = device::stream_for(arg.range()); auto &storage = arg.storage(); auto n = storage.size(); - auto result = device::absmax_kernel(arg.data(), n, stream, device_id); + auto result = device::absmax_kernel(arg.data(), n, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp index 0b442620cc..a4d2167812 100644 --- a/src/TiledArray/device/btas_um_tensor.cpp +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -2,10 +2,12 @@ // Created by Chong Peng on 7/24/18. 
// -#include +#include #ifdef TILEDARRAY_HAS_DEVICE +#include + template class btas::varray>; template class btas::varray>; template class btas::varray< diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index af4899f4ee..45f9b63731 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -32,6 +32,7 @@ #ifdef TILEDARRAY_HAS_DEVICE +#include #include #include #include @@ -47,8 +48,7 @@ struct is_device_tile< template void to_device(const TiledArray::btasUMTensorVarray &tile) { - device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); - auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); TiledArray::to_execution_space( tile.storage(), stream); } @@ -69,8 +69,8 @@ struct ArchiveLoadImpl> { TiledArray::device_um_btas_varray store{}; ar &range &store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); - // device::setDevice(TiledArray::deviceEnv::instance()->current_device_id()); - // auto &stream = TiledArray::detail::get_stream_based_on_range(range); + // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id()); + // auto &stream = device::stream_for(range); // TiledArray::to_execution_space(t.storage(), // stream); } @@ -80,9 +80,7 @@ template struct ArchiveStoreImpl> { static inline void store(const Archive &ar, const TiledArray::btasUMTensorVarray &t) { - DeviceSafeCall(TiledArray::device::setDevice( - TiledArray::deviceEnv::instance()->current_device_id())); - auto &stream = TiledArray::detail::get_stream_based_on_range(t.range()); + auto stream = TiledArray::device::stream_for(t.range()); TiledArray::to_execution_space( t.storage(), stream); ar &t.range() & t.storage(); @@ -137,11 +135,9 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, // shift the range result_range.inplace_shift(range_shift); - 
DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - // @important select the stream using the shifted range - auto &queue = detail::get_blasqueue_based_on_range(result_range); - auto &stream = queue.stream(); + auto &queue = blasqueue_for(result_range); + const auto stream = device::Stream(queue.device(), queue.stream()); typename btasUMTensorVarray::storage_type result_storage; @@ -152,7 +148,7 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, blas::copy(result.size(), device_data(arg.storage()), 1, device_data(result.storage()), 1, queue); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } @@ -175,10 +171,8 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, const TiledArray::Permutation &perm) { // compute result range auto result_range = perm * arg.range(); - DeviceSafeCall(device::setDevice(deviceEnv::instance()->current_device_id())); - // compute the stream to use - auto &stream = detail::get_stream_based_on_range(result_range); + auto stream = device::stream_for(result_range); // allocate result memory typename btasUMTensorVarray::storage_type storage; @@ -191,11 +185,22 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, librett_permute(const_cast(device_data(arg.storage())), device_data(result.storage()), arg.range(), perm, stream); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } +// WARNING omitting this overload dispatches to the base CPU implementation in +// external/btas.h + +template +btasUMTensorVarray permute( + const btasUMTensorVarray &arg, + const TiledArray::BipartitePermutation &perm) { + TA_ASSERT(inner_size(perm) == 0); // this must be a plain permutation + return permute(arg, outer(perm)); +} + /// /// scale /// @@ -225,10 +230,6 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor, const Perm &perm) { auto result = scale(arg, factor); - // wait to finish before 
switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -249,10 +250,6 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, const Perm &perm) { auto result = neg(arg); - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -292,11 +289,6 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = subt(arg1, arg2); - - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -308,11 +300,6 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = subt(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -370,11 +357,6 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = add(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -385,11 +367,6 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = add(arg1, arg2); - - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -456,11 +433,6 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = mult(arg1, arg2); - - // wait to finish before switch stream - auto stream = 
device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -472,11 +444,6 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = mult(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = device::tls_stream_accessor(); - device::streamSynchronize(*stream); - return permute(result, perm); } @@ -602,9 +569,7 @@ template void to_host( TiledArray::DistArray, Policy> &um_array) { auto to_host = [](TiledArray::Tile &tile) { - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); TiledArray::to_execution_space( tile.tensor().storage(), stream); @@ -630,9 +595,7 @@ template void to_device( TiledArray::DistArray, Policy> &um_array) { auto to_device = [](TiledArray::Tile &tile) { - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); TiledArray::to_execution_space( tile.tensor().storage(), stream); @@ -662,12 +625,12 @@ um_tensor_to_ta_tensor( const auto convert_tile_memcpy = [](const UMTensor &tile) { TATensor result(tile.tensor().range()); - auto &stream = deviceEnv::instance()->stream_d2h(); + auto stream = device::stream_for(result.range()); DeviceSafeCall( device::memcpyAsync(result.data(), tile.data(), tile.size() * sizeof(typename TATensor::value_type), device::MemcpyDefault, stream)); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; }; @@ -677,9 +640,7 @@ um_tensor_to_ta_tensor( using std::begin; const auto n = tile.tensor().size(); - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - auto &stream = 
detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); TiledArray::to_execution_space( tile.tensor().storage(), stream); @@ -716,12 +677,9 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { auto convert_tile_memcpy = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - using Tensor = typename UMTensor::tensor_type; - auto &stream = deviceEnv::instance()->stream_h2d(); + auto stream = device::stream_for(tile.range()); typename Tensor::storage_type storage; make_device_storage(storage, tile.range().area(), stream); Tensor result(tile.range(), std::move(storage)); @@ -731,16 +689,13 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { tile.size() * sizeof(typename Tensor::value_type), device::MemcpyDefault, stream)); - device::synchronize_stream(&stream); + device::sync_madness_task_with(stream); return TiledArray::Tile(std::move(result)); }; auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - using Tensor = typename UMTensor::tensor_type; typename Tensor::storage_type storage(tile.range().area()); @@ -750,7 +705,7 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { std::copy_n(tile.data(), n, result.data()); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto stream = device::stream_for(result.range()); // prefetch data to GPU TiledArray::to_execution_space( diff --git a/src/TiledArray/device/device_task_fn.h b/src/TiledArray/device/device_task_fn.h index 8d2ab0e248..f08e5d44b5 100644 --- a/src/TiledArray/device/device_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -35,7 +35,7 @@ namespace madness { /// /// deviceTaskFn class /// represent a task that calls an async device kernel -/// the task must call synchronize_stream 
function to tell which stream it +/// the task must call sync_madness_task_with function to tell which stream it /// used /// @@ -104,19 +104,17 @@ struct deviceTaskFn : public TaskInterface { task_->run_async(); // get the stream used by async function - auto stream = TiledArray::device::tls_stream_accessor(); - - // TA_ASSERT(stream != nullptr); + auto stream_opt = TiledArray::device::detail::tls_stream_accessor(); // WARNING, need to handle NoOp - if (stream == nullptr) { + if (!stream_opt) { task_->notify(); } else { // TODO should we use device callback or device events?? // insert device callback - TiledArray::device::launchHostFunc(*stream, device_callback, task_); - // reset stream to nullptr - TiledArray::device::synchronize_stream(nullptr); + TiledArray::device::launchHostFunc(*stream_opt, device_callback, task_); + // processed sync, clear state + TiledArray::device::detail::tls_stream_accessor() = {}; } } diff --git a/src/TiledArray/device/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h index e21c33e2e9..38a854000a 100644 --- a/src/TiledArray/device/kernel/mult_kernel.h +++ b/src/TiledArray/device/kernel/mult_kernel.h @@ -35,39 +35,39 @@ namespace TiledArray::device { /// result[i] = result[i] * arg[i] -void mult_to_kernel(int *result, const int *arg, std::size_t n, stream_t stream, - int device_id); +void mult_to_kernel(int *result, const int *arg, std::size_t n, + const Stream &stream); void mult_to_kernel(float *result, const float *arg, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); void mult_to_kernel(double *result, const double *arg, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); void mult_to_kernel(std::complex *result, const std::complex *arg, - std::size_t n, stream_t stream, int device_id); + std::size_t n, const Stream &stream); void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); 
/// result[i] = arg1[i] * arg2[i] void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); void mult_kernel(float *result, const float *arg1, const float *arg2, - std::size_t n, stream_t stream, int device_id); + std::size_t n, const Stream &stream); void mult_kernel(double *result, const double *arg1, const double *arg2, - std::size_t n, stream_t stream, int device_id); + std::size_t n, const Stream &stream); void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - stream_t stream, int device_id); + const Stream &stream); } // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h index 89f3600ba6..5af88c58be 100644 --- a/src/TiledArray/device/kernel/reduce_kernel.h +++ b/src/TiledArray/device/kernel/reduce_kernel.h @@ -35,89 +35,70 @@ namespace TiledArray::device { // foreach(i) result *= arg[i] -int product_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id); +int product_kernel(const int* arg, std::size_t n, const Stream& stream); -float product_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float product_kernel(const float* arg, std::size_t n, const Stream& stream); -double product_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double product_kernel(const double* arg, std::size_t n, const Stream& stream); -std::complex product_kernel(const std::complex *arg, - std::size_t n, stream_t stream, - int device_id); +std::complex product_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); -std::complex product_kernel(const std::complex *arg, - std::size_t n, stream_t stream, - int device_id); 
+std::complex product_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); // foreach(i) result += arg[i] -int sum_kernel(const int *arg, std::size_t n, stream_t stream, int device_id); +int sum_kernel(const int* arg, std::size_t n, const Stream& stream); -float sum_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float sum_kernel(const float* arg, std::size_t n, const Stream& stream); -double sum_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double sum_kernel(const double* arg, std::size_t n, const Stream& stream); -std::complex sum_kernel(const std::complex *arg, std::size_t n, - stream_t stream, int device_id); +std::complex sum_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); -std::complex sum_kernel(const std::complex *arg, std::size_t n, - stream_t stream, int device_id); +std::complex sum_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); // foreach(i) result = max(result, arg[i]) -int max_kernel(const int *arg, std::size_t n, stream_t stream, int device_id); +int max_kernel(const int* arg, std::size_t n, const Stream& stream); -float max_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float max_kernel(const float* arg, std::size_t n, const Stream& stream); -double max_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double max_kernel(const double* arg, std::size_t n, const Stream& stream); // foreach(i) result = min(result, arg[i]) -int min_kernel(const int *arg, std::size_t n, stream_t stream, int device_id); +int min_kernel(const int* arg, std::size_t n, const Stream& stream); -float min_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float min_kernel(const float* arg, std::size_t n, const Stream& stream); -double min_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double min_kernel(const double* arg, std::size_t n, const 
Stream& stream); // foreach(i) result = max(result, abs(arg[i])) -int absmax_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id); +int absmax_kernel(const int* arg, std::size_t n, const Stream& stream); -float absmax_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float absmax_kernel(const float* arg, std::size_t n, const Stream& stream); -double absmax_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double absmax_kernel(const double* arg, std::size_t n, const Stream& stream); -std::complex absmax_kernel(const std::complex *arg, std::size_t n, - stream_t stream, int device_id); +std::complex absmax_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); -std::complex absmax_kernel(const std::complex *arg, - std::size_t n, stream_t stream, - int device_id); +std::complex absmax_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); // foreach(i) result = min(result, abs(arg[i])) -int absmin_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id); +int absmin_kernel(const int* arg, std::size_t n, const Stream& stream); -float absmin_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id); +float absmin_kernel(const float* arg, std::size_t n, const Stream& stream); -double absmin_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id); +double absmin_kernel(const double* arg, std::size_t n, const Stream& stream); -std::complex absmin_kernel(const std::complex *arg, std::size_t n, - stream_t stream, int device_id); +std::complex absmin_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); -std::complex absmin_kernel(const std::complex *arg, - std::size_t n, stream_t stream, - int device_id); +std::complex absmin_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); } // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.cu 
b/src/TiledArray/device/kernel/thrust/mult_kernel.cu index bac60041f3..e28ccd757a 100644 --- a/src/TiledArray/device/kernel/thrust/mult_kernel.cu +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.cu @@ -28,54 +28,54 @@ namespace TiledArray::device { /// result[i] = result[i] * arg[i] void mult_to_kernel(int *result, const int *arg, std::size_t n, - stream_t stream, int device_id) { - mult_to_kernel_thrust(result, arg, n, stream, device_id); + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); } void mult_to_kernel(float *result, const float *arg, std::size_t n, - stream_t stream, int device_id) { - mult_to_kernel_thrust(result, arg, n, stream, device_id); + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); } void mult_to_kernel(double *result, const double *arg, std::size_t n, - stream_t stream, int device_id) { - mult_to_kernel_thrust(result, arg, n, stream, device_id); + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); } void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, - stream_t stream, int device_id) { - mult_to_kernel_thrust(result, arg, n, stream, device_id); + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); } void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, - stream_t stream, int device_id) { - mult_to_kernel_thrust(result, arg, n, stream, device_id); + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); } /// result[i] = arg1[i] * arg2[i] void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, - stream_t stream, int device_id){ - mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); + const Stream& stream){ + mult_kernel_thrust(result,arg1,arg2,n,stream); } void mult_kernel(float *result, const float *arg1, const float *arg2, std::size_t n, - stream_t stream, int device_id){ - mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); + const Stream& 
stream){ + mult_kernel_thrust(result,arg1,arg2,n,stream); } void mult_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, - stream_t stream, int device_id){ - mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); + const Stream& stream){ + mult_kernel_thrust(result,arg1,arg2,n,stream); } void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - stream_t stream, int device_id){ - mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); + const Stream& stream){ + mult_kernel_thrust(result,arg1,arg2,n,stream); } void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, - stream_t stream, int device_id){ - mult_kernel_thrust(result,arg1,arg2,n,stream,device_id); + const Stream& stream){ + mult_kernel_thrust(result,arg1,arg2,n,stream); } } // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.h b/src/TiledArray/device/kernel/thrust/mult_kernel.h index 08e07efa54..8a48493cf0 100644 --- a/src/TiledArray/device/kernel/thrust/mult_kernel.h +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.h @@ -34,12 +34,12 @@ namespace TiledArray::device { /// result[i] = result[i] * arg[i] template void mult_to_kernel_thrust(T *result, const T *arg, std::size_t n, - stream_t stream, int device_id) { - DeviceSafeCall(device::setDevice(device_id)); + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); thrust::multiplies mul_op; thrust::transform( - thrust_system::par.on(stream), thrust::device_pointer_cast(arg), + thrust_system::par.on(s.stream), thrust::device_pointer_cast(arg), thrust::device_pointer_cast(arg) + n, thrust::device_pointer_cast(result), thrust::device_pointer_cast(result), mul_op); } @@ -47,12 +47,12 @@ void mult_to_kernel_thrust(T *result, const T *arg, std::size_t n, /// result[i] = arg1[i] * arg2[i] template void mult_kernel_thrust(T *result, const T *arg1, const T *arg2, std::size_t n, - 
stream_t stream, int device_id) { - DeviceSafeCall(device::setDevice(device_id)); + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); thrust::multiplies mul_op; thrust::transform( - thrust_system::par.on(stream), thrust::device_pointer_cast(arg1), + thrust_system::par.on(s.stream), thrust::device_pointer_cast(arg1), thrust::device_pointer_cast(arg1) + n, thrust::device_pointer_cast(arg2), thrust::device_pointer_cast(result), mul_op); } diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.cu b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu index bfca9f2e64..08145ef0b4 100644 --- a/src/TiledArray/device/kernel/thrust/reduce_kernel.cu +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu @@ -27,141 +27,115 @@ namespace TiledArray::device { // foreach(i) result *= arg[i] -int product_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return product_reduce_kernel_thrust(arg, n, stream, device_id); +int product_kernel(const int *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, stream); } -float product_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return product_reduce_kernel_thrust(arg, n, stream, device_id); +float product_kernel(const float *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, stream); } -double product_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ +double product_kernel(const double *arg, std::size_t n, const Stream& stream){ - return product_reduce_kernel_thrust(arg, n, stream, device_id); + return product_reduce_kernel_thrust(arg, n, stream); } -std::complex product_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return product_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex product_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, 
stream); } -std::complex product_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ +std::complex product_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ - return product_reduce_kernel_thrust(arg, n, stream, device_id); + return product_reduce_kernel_thrust(arg, n, stream); } // foreach(i) result += arg[i] -int sum_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return sum_reduce_kernel_thrust(arg, n, stream, device_id); +int sum_kernel(const int *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); } -float sum_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return sum_reduce_kernel_thrust(arg, n, stream, device_id); +float sum_kernel(const float *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); } -double sum_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ - return sum_reduce_kernel_thrust(arg, n, stream, device_id); +double sum_kernel(const double *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); } -std::complex sum_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return sum_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex sum_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); } -std::complex sum_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return sum_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex sum_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); } // foreach(i) result = max(result, arg[i]) -int max_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return max_reduce_kernel_thrust(arg, n, stream, device_id); +int max_kernel(const 
int *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); } -float max_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return max_reduce_kernel_thrust(arg, n, stream, device_id); +float max_kernel(const float *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); } -double max_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ - return max_reduce_kernel_thrust(arg, n, stream, device_id); +double max_kernel(const double *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); } // foreach(i) result = min(result, arg[i]) -int min_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return min_reduce_kernel_thrust(arg, n, stream, device_id); +int min_kernel(const int *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); } -float min_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return min_reduce_kernel_thrust(arg, n, stream, device_id); +float min_kernel(const float *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); } -double min_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ - return min_reduce_kernel_thrust(arg, n, stream, device_id); +double min_kernel(const double *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); } // foreach(i) result = max(result, abs(arg[i])) -int absmax_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +int absmax_kernel(const int *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); } -float absmax_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return absmax_reduce_kernel_thrust(arg, n, stream, device_id); 
+float absmax_kernel(const float *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); } -double absmax_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ - return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +double absmax_kernel(const double *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); } -std::complex absmax_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex absmax_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); } -std::complex absmax_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return absmax_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex absmax_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); } // foreach(i) result = min(result, abs(arg[i])) -int absmin_kernel(const int *arg, std::size_t n, stream_t stream, - int device_id){ - return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +int absmin_kernel(const int *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } -float absmin_kernel(const float *arg, std::size_t n, stream_t stream, - int device_id){ - return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +float absmin_kernel(const float *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } -double absmin_kernel(const double *arg, std::size_t n, stream_t stream, - int device_id){ - return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +double absmin_kernel(const double *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } -std::complex absmin_kernel(const 
std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex absmin_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } -std::complex absmin_kernel(const std::complex *arg, std::size_t n, stream_t stream, - int device_id){ - return absmin_reduce_kernel_thrust(arg, n, stream, device_id); +std::complex absmin_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } } // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.h b/src/TiledArray/device/kernel/thrust/reduce_kernel.h index 8ee8e57b29..e5137ffb21 100644 --- a/src/TiledArray/device/kernel/thrust/reduce_kernel.h +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.h @@ -56,64 +56,60 @@ struct absolute_value /// T = reduce(T* arg) template T reduce_kernel_thrust(ReduceOp &&op, const T *arg, std::size_t n, T init, - stream_t stream, int device_id) { - DeviceSafeCall(device::setDevice(device_id)); + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::reduce(thrust_system::par.on(stream), arg_p, arg_p + n, - init, std::forward(op)); + auto result = thrust::reduce(thrust_system::par.on(s.stream), arg_p, + arg_p + n, init, std::forward(op)); return result; } template -T product_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, - int device_id) { +T product_reduce_kernel_thrust(const T *arg, std::size_t n, + const Stream &stream) { T init(1); thrust::multiplies mul_op; - return reduce_kernel_thrust(mul_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(mul_op, arg, n, init, stream); } template -T sum_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, - int device_id) { +T sum_reduce_kernel_thrust(const T *arg, 
std::size_t n, const Stream &stream) { T init(0); thrust::plus plus_op; - return reduce_kernel_thrust(plus_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(plus_op, arg, n, init, stream); } template -T max_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, - int device_id) { +T max_reduce_kernel_thrust(const T *arg, std::size_t n, const Stream &stream) { T init = std::numeric_limits::lowest(); thrust::maximum max_op; - return reduce_kernel_thrust(max_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(max_op, arg, n, init, stream); } template -T min_reduce_kernel_thrust(const T *arg, std::size_t n, stream_t stream, - int device_id) { +T min_reduce_kernel_thrust(const T *arg, std::size_t n, const Stream &stream) { T init = std::numeric_limits::max(); thrust::minimum min_op; - return reduce_kernel_thrust(min_op, arg, n, init, stream, device_id); + return reduce_kernel_thrust(min_op, arg, n, init, stream); } template TiledArray::detail::scalar_t absmax_reduce_kernel_thrust(const T *arg, std::size_t n, - stream_t stream, - int device_id) { + const Stream &s) { using TR = TiledArray::detail::scalar_t; TR init(0); thrust::maximum max_op; detail::absolute_value abs_op; - DeviceSafeCall(device::setDevice(device_id)); + DeviceSafeCall(device::setDevice(s.device)); auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::transform_reduce(thrust_system::par.on(stream), arg_p, + auto result = thrust::transform_reduce(thrust_system::par.on(s.stream), arg_p, arg_p + n, abs_op, init, max_op); return result; @@ -122,18 +118,17 @@ TiledArray::detail::scalar_t absmax_reduce_kernel_thrust(const T *arg, template TiledArray::detail::scalar_t absmin_reduce_kernel_thrust(const T *arg, std::size_t n, - stream_t stream, - int device_id) { + const Stream &s) { using TR = TiledArray::detail::scalar_t; TR init = std::numeric_limits::max(); thrust::minimum min_op; detail::absolute_value abs_op; - 
DeviceSafeCall(device::setDevice(device_id)); + DeviceSafeCall(device::setDevice(s.device)); auto arg_p = thrust::device_pointer_cast(arg); - auto result = thrust::transform_reduce(thrust_system::par.on(stream), arg_p, + auto result = thrust::transform_reduce(thrust_system::par.on(s.stream), arg_p, arg_p + n, abs_op, init, min_op); return result; } diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index c940bca45c..d151a3c316 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -50,7 +50,7 @@ bool in_memory_space(const Storage& vec) noexcept { * device_um_btas_varray */ template -void to_execution_space(Storage& vec, device::stream_t stream = 0) { +void to_execution_space(Storage& vec, const device::Stream& s) { switch (Space) { case ExecutionSpace::Host: { using std::data; @@ -59,7 +59,7 @@ void to_execution_space(Storage& vec, device::stream_t stream = 0) { if (deviceEnv::instance()->concurrent_managed_access()) { DeviceSafeCall(device::memPrefetchAsync(data(vec), size(vec) * sizeof(value_type), - device::CpuDeviceId, stream)); + device::CpuDeviceId, s.stream)); } break; } @@ -67,11 +67,9 @@ void to_execution_space(Storage& vec, device::stream_t stream = 0) { using std::data; using std::size; using value_type = typename Storage::value_type; - int device = -1; if (deviceEnv::instance()->concurrent_managed_access()) { - DeviceSafeCall(device::getDevice(&device)); DeviceSafeCall(device::memPrefetchAsync( - data(vec), size(vec) * sizeof(value_type), device, stream)); + data(vec), size(vec) * sizeof(value_type), s.device, s.stream)); } break; } @@ -89,10 +87,10 @@ void to_execution_space(Storage& vec, device::stream_t stream = 0) { */ template void make_device_storage(Storage& storage, std::size_t n, - const device::stream_t& stream = 0) { + const device::Stream& s) { storage = Storage(n); TiledArray::to_execution_space(storage, - stream); + s); } /** diff --git 
a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 5219de705f..d3aca3cae0 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -176,6 +177,10 @@ inline error_t setDevice(int device) { return cudaSetDevice(device); } inline error_t getDevice(int* device) { return cudaGetDevice(device); } +inline error_t getDeviceCount(int* num_devices) { + return cudaGetDeviceCount(num_devices); +} + inline error_t deviceSetCacheConfig(FuncCache cache_config) { return cudaDeviceSetCacheConfig(static_cast(cache_config)); } @@ -311,6 +316,10 @@ inline error_t setDevice(int device) { return hipSetDevice(device); } inline error_t getDevice(int* device) { return hipGetDevice(device); } +inline error_t getDeviceCount(int* num_devices) { + return hipGetDeviceCount(num_devices); +} + inline error_t deviceSetCacheConfig(FuncCache cache_config) { return hipDeviceSetCacheConfig(static_cast(cache_config)); } @@ -394,7 +403,7 @@ inline error_t streamDestroy(stream_t stream) { #ifdef TILEDARRAY_HAS_DEVICE -inline int num_streams() { +inline int num_streams_per_device() { int num_streams = -1; char* num_stream_char = std::getenv("TA_DEVICE_NUM_STREAMS"); if (num_stream_char) { @@ -415,27 +424,6 @@ inline int num_streams() { return num_streams; } -inline int num_devices() { - int num_devices = -1; -#if defined(TILEDARRAY_HAS_CUDA) - DeviceSafeCall(cudaGetDeviceCount(&num_devices)); -#elif defined(TILEDARRAY_HAS_HIP) - DeviceSafeCall(hipGetDeviceCount(&num_devices)); -#endif - return num_devices; -} - -inline int current_device_id(World& world) { - static const std::tuple local_rank_size = - detail::mpi_local_rank_size(world); - const auto& [mpi_local_rank, mpi_local_size] = local_rank_size; - static const int num_devices = device::num_devices(); - // map ranks to default device round robin - static const int device_id = mpi_local_rank % num_devices; - - return device_id; -} - 
inline void DEVICERT_CB readyflag_callback(void* userData) { // convert void * to std::atomic std::atomic* flag = static_cast*>(userData); @@ -465,18 +453,19 @@ inline void thread_wait_stream(const stream_t& stream) { delete flag; } -inline const stream_t*& tls_stream_accessor() { - static thread_local const stream_t* thread_local_stream_ptr{nullptr}; - return thread_local_stream_ptr; -} +/// Stream is a `{device, stream_t}` pair, i.e. the analog of blas::Queue. +/// It exists as a syntactic sugar around stream_t, and to avoid the need +/// to deduce the device from stream +/// \internal did not name it queue to avoid naming dichotomies +/// all over the place +struct Stream { + int device; + stream_t stream; + Stream(int device, stream_t stream) : device(device), stream(stream) {} -/// must call this before exiting the device task executed via -/// the MADNESS runtime (namely, via madness::add_device_task ) -/// to inform the runtime which stream the task -/// launched its kernels into -inline void synchronize_stream(const stream_t* stream) { - tls_stream_accessor() = stream; -} + /// Stream is implicitly convertible to stream + operator stream_t() const { return stream; } +}; /** * Env maintains the device-related part of the runtime environment, @@ -488,7 +477,7 @@ class Env { public: ~Env() { // destroy streams on current device - for (auto& stream : streams_) { + for (auto& [device, stream] : streams_) { DeviceSafeCallNoThrow(streamDestroy(stream)); } } @@ -523,12 +512,34 @@ class Env { std::scoped_lock lock{mtx}; // only the winner of the lock race gets to initialize if (instance_accessor() == nullptr) { - int num_streams = device::num_streams(); - int num_devices = device::num_devices(); - int device_id = device::current_device_id(world); - // set device for current MPI process .. 
will be set in the ctor as well - DeviceSafeCall(setDevice(device_id)); - DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared)); + int num_streams_per_device = device::num_streams_per_device(); + const int num_visible_devices = []() { + int num_visible_devices = -1; + DeviceSafeCall(getDeviceCount(&num_visible_devices)); + return num_visible_devices; + }(); + const auto compute_devices = [num_visible_devices](World& world) { + std::vector compute_devices; + static const std::tuple local_rank_size = + TiledArray::detail::mpi_local_rank_size(world); + const auto& [mpi_local_rank, mpi_local_size] = local_rank_size; + // map ranks to default device round robin + int device_id = mpi_local_rank % num_visible_devices; + while (device_id < num_visible_devices) { + compute_devices.push_back(device_id); + device_id += mpi_local_size; + } + + return compute_devices; + }(world); + + // configure devices for this rank + for (auto device : compute_devices) { + DeviceSafeCall(setDevice(device)); + DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared)); + } + // use the first device as default: + DeviceSafeCall(setDevice(compute_devices[0])); // uncomment to debug umpire ops // @@ -574,20 +585,33 @@ class Env { "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto env = std::unique_ptr( - new Env(world, num_devices, device_id, num_streams, um_dynamic_pool, - dev_dynamic_pool, pinned_dynamic_pool)); + auto env = std::unique_ptr(new Env( + world, num_visible_devices, compute_devices, num_streams_per_device, + um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); instance_accessor() = std::move(env); } } World& world() const { return *world_; } - int num_devices() const { return num_devices_; } + /// @return the number of devices visible to this rank + int num_visible_devices() const { return num_devices_visible_; } + + /// @return the number of compute devices assigned to this rank + int 
num_compute_devices() const { return compute_devices_.size(); } - int current_device_id() const { return current_device_id_; } + /// @return the device pointed to by the currently-active device runtime + /// context + int current_device_id() const { + TA_ASSERT(num_compute_devices() > 0); + int current_device = -1; + DeviceSafeCall(getDevice(¤t_device)); + return current_device; + } - int num_streams() const { return num_streams_; } + /// @return the total number of compute streams (for all devices) + /// visible to this rank + int num_streams_total() const { return streams_.size(); } bool concurrent_managed_access() const { return device_concurrent_managed_access_; @@ -626,11 +650,13 @@ class Env { return result; } - const stream_t& stream(std::size_t i) const { return streams_.at(i); } - - const stream_t& stream_h2d() const { return streams_[num_streams_]; } - - const stream_t& stream_d2h() const { return streams_[num_streams_ + 1]; } + /// @param[in] i compute stream ordinal + /// @pre `inum_streams_total()); + return streams_[i]; + } /// @return a (non-thread-safe) Umpire allocator for device UM umpire::Allocator& um_allocator() { return um_allocator_; } @@ -684,50 +710,57 @@ class Env { } protected: - Env(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc, - umpire::Allocator pinned_alloc) + Env(World& world, int num_visible_devices, std::vector compute_devices, + int num_streams_per_device, umpire::Allocator um_alloc, + umpire::Allocator device_alloc, umpire::Allocator pinned_alloc) : world_(&world), um_allocator_(um_alloc), device_allocator_(device_alloc), pinned_allocator_(pinned_alloc), - num_devices_(num_devices), - current_device_id_(device_id), - num_streams_(num_streams) { - if (num_devices <= 0) { + num_devices_visible_(num_visible_devices), + compute_devices_(std::move(compute_devices)), + num_streams_per_device_(num_streams_per_device) { + if (compute_devices_.size() <= 0) { 
throw std::runtime_error("No " TILEDARRAY_DEVICE_RUNTIME_STR " compute devices found!\n"); } - // set device for current MPI process - DeviceSafeCall(setDevice(current_device_id_)); - - /// check the capability of device - deviceProp_t prop; - DeviceSafeCall(getDeviceProperties(&prop, device_id)); - if (!prop.managedMemory) { - throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR - "device doesn't support managedMemory\n"); - } - int concurrent_managed_access; - DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access, - DeviceAttributeConcurrentManagedAccess, - device_id)); - device_concurrent_managed_access_ = concurrent_managed_access; - if (!device_concurrent_managed_access_) { - std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR - " device doesn't support " - "ConcurrentManagedAccess!\n\n"; + streams_.reserve(num_streams_per_device_ * compute_devices_.size()); + + /// ensure the desired capabilities of each device + for (auto device : compute_devices_) { + deviceProp_t prop; + DeviceSafeCall(getDeviceProperties(&prop, device)); + if (!prop.managedMemory) { + throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR + "device doesn't support managedMemory\n"); + } + int concurrent_managed_access; + DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access, + DeviceAttributeConcurrentManagedAccess, + device)); + device_concurrent_managed_access_ = + device_concurrent_managed_access_ && concurrent_managed_access; + if (!device_concurrent_managed_access_) { + std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR + " device doesn't support " + "ConcurrentManagedAccess!\n\n"; + } + + // creates streams on current device + DeviceSafeCall(setDevice(device)); + for (int s = 0; s != num_streams_per_device_; ++s) { + stream_t stream; + DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking)); + streams_.emplace_back(device, stream); + } } - // creates streams on current device - streams_.resize(num_streams_ + 2); - for (auto& stream : streams_) { 
- DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking)); - } - std::cout << "created " << num_streams_ - << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams + 2 I/O streams" - << std::endl; + std::cout << "created " << streams_.size() + << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams" << std::endl; + + // lastly, set default device for current MPI process's (main) thread + DeviceSafeCall(setDevice(compute_devices_.front())); } private: @@ -743,12 +776,15 @@ class Env { // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator pinned_allocator_; - int num_devices_; - int current_device_id_; - bool device_concurrent_managed_access_; + int num_devices_visible_; // total number of devices visible to this rank + std::vector + compute_devices_; // list of devices assigned to this rank, + // compute_devices_.size()<=num_devices_visible_ + bool device_concurrent_managed_access_ = true; - int num_streams_; - std::vector streams_; + int num_streams_per_device_; + std::vector streams_; // streams_.size() == (num_streams_per_device_) + // * compute_devices_.size() inline static std::unique_ptr& instance_accessor() { static std::unique_ptr instance_{nullptr}; @@ -756,18 +792,80 @@ class Env { } }; -} // namespace device - namespace detail { -template -const device::stream_t& get_stream_based_on_range(const Range& range) { - // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % device::Env::instance()->num_streams(); - auto& stream = device::Env::instance()->stream(stream_id); - return stream; +inline std::optional& tls_stream_accessor() { + static thread_local std::optional tls_stream; + return tls_stream; } } // namespace detail +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p s +/// before task completion +/// \param s the stream to synchronize this task with +inline void 
sync_madness_task_with(const Stream& s) { + if (!detail::tls_stream_accessor()) + detail::tls_stream_accessor() = s; + else { + TA_ASSERT(*detail::tls_stream_accessor() == s); + } +} + +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p stream associated with device \p device +/// on the *current* device before task completion +/// \param device the device associated with \p stream +/// \param stream the stream to synchronize this task with +inline void sync_madness_task_with(int device, stream_t stream) { + sync_madness_task_with(Stream{device, stream}); +} + +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p stream on the *current* device +/// before task completion +/// \param stream the stream to synchronize this task with +inline void sync_madness_task_with(stream_t stream) { + TA_ASSERT(stream != nullptr); + int current_device = -1; + DeviceSafeCall(getDevice(¤t_device)); + sync_madness_task_with(current_device, stream); +} + +/// @return the optional Stream with which this task will be synced +inline std::optional madness_task_current_stream() { + return detail::tls_stream_accessor(); +} + +/// should call this within a task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to cancel the previous calls to sync_madness_task_with() +/// if, e.g., it synchronized with any work performed +/// before exiting +inline void cancel_madness_task_sync() { detail::tls_stream_accessor() = {}; } + +/// maps a (tile) Range to device::Stream; if had already pushed work into a +/// device::Stream (as indicated by madness_task_current_stream() ) +/// will return that Stream instead +/// @param[in] range will determine the device::Stream to compute an object +/// associated with this Range object +/// @return the device::Stream to use for creating tasks generating work 
+/// associated with Range \p range +template +device::Stream stream_for(const Range& range) { + auto stream_opt = madness_task_current_stream(); + if (!stream_opt) { + auto stream_ord = + range.offset() % device::Env::instance()->num_streams_total(); + return device::Env::instance()->stream(stream_ord); + } else + return *stream_opt; +} + +} // namespace device + #endif // TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 60d536eec9..34a2fef9ea 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -461,10 +461,10 @@ class ReduceTask { // cleanup the argument #ifdef TILEDARRAY_HAS_DEVICE - auto stream_ptr = device::tls_stream_accessor(); + auto& stream_opt = device::detail::tls_stream_accessor(); - /// non-device op - if (stream_ptr == nullptr) { + // need to sync with a device stream? + if (!stream_opt) { // no ReduceObject::destroy(ready_object); this->dec(); } else { @@ -472,12 +472,12 @@ class ReduceTask { (*callback_object)[0] = &world_; (*callback_object)[1] = this; (*callback_object)[2] = ready_object; - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::setDevice(stream_opt->device)); DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_dependency_dec_reduceobject_delete_callback, + stream_opt->stream, + device_dependency_dec_reduceobject_delete_callback, callback_object)); - device::synchronize_stream(nullptr); + device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " // add 3\n"; } @@ -496,21 +496,21 @@ class ReduceTask { // cleanup the result #ifdef TILEDARRAY_HAS_DEVICE - auto stream_ptr = device::tls_stream_accessor(); - if (stream_ptr == nullptr) { + auto queue_opt = device::detail::tls_stream_accessor(); + // need to sync with a stream? 
+ if (!queue_opt) { // no ready_result.reset(); - } else { + } else { // yes auto ready_result_heap = new std::shared_ptr(ready_result); auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = ready_result_heap; - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); + auto& [device, stream] = *queue_opt; + DeviceSafeCall(device::setDevice(device)); DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_readyresult_reset_callback, - callback_object)); - device::synchronize_stream(nullptr); + stream, device_readyresult_reset_callback, callback_object)); + device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " // add 4\n"; } @@ -537,18 +537,18 @@ class ReduceTask { // Cleanup the argument #ifdef TILEDARRAY_HAS_DEVICE - auto stream_ptr = device::tls_stream_accessor(); - if (stream_ptr == nullptr) { + auto& stream_opt = device::detail::tls_stream_accessor(); + if (!stream_opt) { ReduceObject::destroy(object); } else { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = const_cast(object); - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::setDevice(stream_opt->device)); DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_reduceobject_delete_callback, callback_object)); - device::synchronize_stream(nullptr); + stream_opt->stream, device_reduceobject_delete_callback, + callback_object)); + device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -560,15 +560,15 @@ class ReduceTask { // Decrement the dependency counter for the argument. This must // be done after the reduce call to avoid a race condition. 
#ifdef TILEDARRAY_HAS_DEVICE - if (stream_ptr == nullptr) { + if (!stream_opt) { this->dec(); } else { auto callback_object2 = new std::vector(1); (*callback_object2)[0] = this; - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_dependency_dec_callback, callback_object2)); + DeviceSafeCall(device::setDevice(stream_opt->device)); + DeviceSafeCall(device::launchHostFunc(stream_opt->stream, + device_dependency_dec_callback, + callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } #else @@ -588,8 +588,8 @@ class ReduceTask { // Cleanup arguments #ifdef TILEDARRAY_HAS_DEVICE - auto stream_ptr = device::tls_stream_accessor(); - if (stream_ptr == nullptr) { + auto& stream_opt = device::detail::tls_stream_accessor(); + if (!stream_opt) { ReduceObject::destroy(object1); ReduceObject::destroy(object2); } else { @@ -597,12 +597,11 @@ class ReduceTask { (*callback_object1)[0] = &world_; (*callback_object1)[1] = const_cast(object1); (*callback_object1)[2] = const_cast(object2); - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); + DeviceSafeCall(device::setDevice(stream_opt->device)); DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_reduceobject_delete_callback, + stream_opt->stream, device_reduceobject_delete_callback, callback_object1)); - device::synchronize_stream(nullptr); + device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -616,17 +615,17 @@ class ReduceTask { // Decrement the dependency counter for the two arguments. This // must be done after the reduce call to avoid a race condition. 
#ifdef TILEDARRAY_HAS_DEVICE - if (stream_ptr == nullptr) { + if (!stream_opt) { this->dec(); this->dec(); } else { auto callback_object2 = new std::vector(2); (*callback_object2)[0] = this; (*callback_object2)[1] = this; - DeviceSafeCall( - device::setDevice(deviceEnv::instance()->current_device_id())); - DeviceSafeCall(device::launchHostFunc( - *stream_ptr, device_dependency_dec_callback, callback_object2)); + DeviceSafeCall(device::setDevice(stream_opt->device)); + DeviceSafeCall(device::launchHostFunc(stream_opt->stream, + device_dependency_dec_callback, + callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } diff --git a/tests/librett.cpp b/tests/librett.cpp index de0c771a50..3b76e1106e 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -71,10 +71,11 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; - status = - librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), + stream.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -122,6 +123,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({B, A}); @@ -130,8 +132,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = - librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), + stream.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -183,6 +185,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { librettHandle plan; auto stream = 
TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -191,7 +194,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // std::vector perm3{0, 2, 1}; status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), - sizeof(int), stream, a_device, b_device); + sizeof(int), stream.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -248,6 +251,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({A, B, C}); @@ -257,7 +261,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { TiledArray::permutation_to_col_major(perm); status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), - stream, a_device, b_device); + stream.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -306,6 +310,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({A, A}); @@ -314,8 +319,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = - librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), + stream.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -356,6 +361,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({B, A}); @@ -364,8 +370,8 
@@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = - librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), stream); + status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), + stream.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -406,6 +412,7 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { librettHandle plan; auto stream = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; // b(k,i,j) = a(i,j,k) @@ -416,8 +423,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { std::vector perm({2, 0, 1}); TiledArray::permutation_to_col_major(perm); - status = - librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), stream); + status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), + stream.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); From 23ba34ee413aba2f60a3b9f5d12357c84e6d9ed9 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 26 Sep 2023 14:20:28 -0400 Subject: [PATCH 126/592] UM tensor/expression unit tests build for HIP also --- tests/CMakeLists.txt | 6 +----- .../{expressions_cuda_um.cpp => expressions_device_um.cpp} | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) rename tests/{expressions_cuda_um.cpp => expressions_device_um.cpp} (99%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d17dd8345b..118440cf3b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -103,11 +103,7 @@ set(ta_test_src_files ta_test.cpp ) if(CUDA_FOUND OR HIP_FOUND) - list(APPEND ta_test_src_files librett.cpp) -endif() - -if(CUDA_FOUND) - list(APPEND ta_test_src_files expressions_cuda_um.cpp tensor_um.cpp) + list(APPEND ta_test_src_files librett.cpp expressions_device_um.cpp tensor_um.cpp) endif() # if using C++20 must use Boost 1.74 or later: diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_device_um.cpp 
similarity index 99% rename from tests/expressions_cuda_um.cpp rename to tests/expressions_device_um.cpp index 81dcf29c47..e624756561 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_device_um.cpp @@ -25,7 +25,7 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include From 1d59a369d59240d8480a2e25280703133ff88118 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 26 Sep 2023 14:23:08 -0400 Subject: [PATCH 127/592] decudaify header guard in device_task_fn.h --- src/TiledArray/device/device_task_fn.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/device/device_task_fn.h b/src/TiledArray/device/device_task_fn.h index f08e5d44b5..6b7105c550 100644 --- a/src/TiledArray/device/device_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -2,8 +2,8 @@ // Created by Chong Peng on 2019-03-20. // -#ifndef TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED -#define TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED +#define TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED #include @@ -865,4 +865,4 @@ typename detail::memfunc_enabler::type add_device_task( } // namespace madness #endif // TILDARRAY_HAS_DEVICE -#endif // TILEDARRAY_DEVICE_CUDA_TASK_FN_H__INCLUDED +#endif // TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED From 75c39beac1aa8bb487608937c736b2e18a844d2e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 26 Sep 2023 14:23:33 -0400 Subject: [PATCH 128/592] run unit tests with multiple device streams --- tests/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 118440cf3b..85b20d6bde 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -165,10 +165,9 @@ if(ENABLE_MPI) $ --log_level=unit_scope ${${executable}_np_${p}_args} ${MPIEXEC_POSTFLAGS} ) - # N.B. 
some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "MAD_NUM_THREADS=2" ) if (p GREATER 1) @@ -178,9 +177,8 @@ if(ENABLE_MPI) else() add_test(NAME tiledarray/unit/run-np-1 COMMAND ${executable}) - # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-1 PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "MAD_NUM_THREADS=2" ) endif() From 7bf04631296fdca9b60a0087c3be45fb0004eaa5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 26 Sep 2023 18:29:08 -0400 Subject: [PATCH 129/592] try reverting "run unit tests with multiple device streams" This reverts commit b4b59978b054ef242e9279ff31362e7f2a0dd9fd. --- tests/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 85b20d6bde..118440cf3b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -165,9 +165,10 @@ if(ENABLE_MPI) $ --log_level=unit_scope ${${executable}_np_${p}_args} ${MPIEXEC_POSTFLAGS} ) + # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2" + ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" ) if (p GREATER 1) @@ -177,8 +178,9 @@ if(ENABLE_MPI) else() add_test(NAME tiledarray/unit/run-np-1 COMMAND ${executable}) + # N.B. 
some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-1 PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2" + ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" ) endif() From 957e5cc1dba097383959cc3a2af3445495edaf99 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 27 Sep 2023 11:29:42 -0400 Subject: [PATCH 130/592] can probe whether TA was initialized to be quiet --- src/TiledArray/initialize.h | 4 ++++ src/TiledArray/tiledarray.cpp | 23 +++++++---------------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/src/TiledArray/initialize.h b/src/TiledArray/initialize.h index 324f772ccf..7d75d33c0d 100644 --- a/src/TiledArray/initialize.h +++ b/src/TiledArray/initialize.h @@ -17,6 +17,10 @@ bool initialized(); /// @return true if TiledArray has been finalized at least once bool finalized(); +/// @return true if TiledArray (and, necessarily, MADWorld runtime) was +/// initialized to be quiet +bool initialized_to_be_quiet(); + // clang-format off /// @name TiledArray initialization. /// These functions initialize TiledArray and (if needed) MADWorld diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index ae5a8662ac..74244e59bd 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -62,28 +62,20 @@ inline bool& finalized_accessor() { return flag; } +inline bool& quiet_accessor() { + static bool quiet = false; + return quiet; +} + } // namespace } // namespace TiledArray -/// @return true if TiledArray (and, necessarily, MADWorld runtime) is in an -/// initialized state bool TiledArray::initialized() { return initialized_accessor(); } -/// @return true if TiledArray has been finalized at least once bool TiledArray::finalized() { return finalized_accessor(); } -/// @name TiledArray initialization. -/// These functions initialize TiledArray and (if needed) MADWorld -/// runtime. 
-/// @note the default World object is set to the object returned by these. -/// @warning MADWorld can only be initialized/finalized once, hence if -/// TiledArray initializes MADWorld -/// it can also be initialized/finalized only once. - -/// @{ +bool TiledArray::initialized_to_be_quiet() { return quiet_accessor(); } -/// @throw TiledArray::Exception if TiledArray initialized MADWorld and -/// TiledArray::finalize() had been called TiledArray::World& TiledArray::initialize(int& argc, char**& argv, const SafeMPI::Intracomm& comm, bool quiet) { @@ -112,6 +104,7 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, TiledArray::set_num_threads(1); madness::print_meminfo_disable(); initialized_accessor() = true; + quiet_accessor() = quiet; // if have TTG initialize it also #if TILEDARRAY_HAS_TTG @@ -155,8 +148,6 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, throw Exception("TiledArray already initialized"); } -/// Finalizes TiledArray (and MADWorld runtime, if it had not been initialized -/// when TiledArray::initialize was called). 
void TiledArray::finalize() { // finalize in the reverse order of initialize #if TILEDARRAY_HAS_TTG From 5ea446befbdb7007273cb2fa7c8c37c45df0166b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 27 Sep 2023 11:33:36 -0400 Subject: [PATCH 131/592] device initialization informational messages only logged if initialized with quiet=false --- src/TiledArray/external/device.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index d3aca3cae0..685c804dff 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -49,6 +49,7 @@ #include #include +#include #if defined(TILEDARRAY_HAS_CUDA) @@ -741,7 +742,7 @@ class Env { device)); device_concurrent_managed_access_ = device_concurrent_managed_access_ && concurrent_managed_access; - if (!device_concurrent_managed_access_) { + if (!initialized_to_be_quiet() && !device_concurrent_managed_access_) { std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR " device doesn't support " "ConcurrentManagedAccess!\n\n"; @@ -756,8 +757,12 @@ class Env { } } - std::cout << "created " << streams_.size() - << " " TILEDARRAY_DEVICE_RUNTIME_STR " streams" << std::endl; + if (!initialized_to_be_quiet() && world.rank() == 0) { + auto nstreams = streams_.size(); + std::cout << "created " << nstreams + << " " TILEDARRAY_DEVICE_RUNTIME_STR " stream" + << (nstreams == 1 ? 
"" : "s") << std::endl; + } // lastly, set default device for current MPI process's (main) thread DeviceSafeCall(setDevice(compute_devices_.front())); From fdaf8bca619a40c5b4628502b7bee20ccadd715b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 27 Sep 2023 18:42:20 -0400 Subject: [PATCH 132/592] convert librett unit tests to use default stream provided by TA runtime, not CUDA runtime --- tests/librett.cpp | 120 +++++++++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/tests/librett.cpp b/tests/librett.cpp index 3b76e1106e..58093d0e06 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -55,13 +55,18 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { iter++; } } + + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; TiledArray::device::malloc(&a_device, A * A * sizeof(int)); int* b_device; TiledArray::device::malloc(&b_device, A * A * sizeof(int)); - TiledArray::device::memcpy(a_device, a_host, A * A * sizeof(int), - TiledArray::device::MemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * A * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); std::vector extent({A, A}); TiledArray::extent_to_col_major(extent); @@ -70,22 +75,23 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), - stream.stream); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); status = librettExecute(plan, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); - librettDestroy(plan); - TiledArray::device::memcpy(b_host, b_device, A * A * sizeof(int), - 
TiledArray::device::MemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * A * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); + + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -113,17 +119,19 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; TiledArray::device::malloc(&a_device, A * B * sizeof(int)); int* b_device; TiledArray::device::malloc(&b_device, A * B * sizeof(int)); - TiledArray::device::memcpy(a_device, a_host, A * B * sizeof(int), - TiledArray::device::MemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({B, A}); @@ -132,18 +140,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), - stream.stream); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); status = librettExecute(plan, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); - librettDestroy(plan); - TiledArray::device::memcpy(b_host, b_device, A * B * sizeof(int), - TiledArray::device::MemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); + + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -173,19 +184,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { 
} } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - TiledArray::device::memcpy(a_device, a_host, A * B * C * sizeof(int), - TiledArray::device::MemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -194,7 +207,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { // std::vector perm3{0, 2, 1}; status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), - sizeof(int), stream.stream, a_device, b_device); + sizeof(int), q.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -202,8 +215,10 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - TiledArray::device::memcpy(b_host, b_device, A * B * C * sizeof(int), - TiledArray::device::MemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); status = librettDestroy(plan); @@ -239,19 +254,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - TiledArray::device::memcpy(a_device, a_host, A * B * C * sizeof(int), - 
TiledArray::device::MemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); // b(j,i,k) = a(i,j,k) librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({A, B, C}); @@ -261,7 +278,7 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { TiledArray::permutation_to_col_major(perm); status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), - stream.stream, a_device, b_device); + q.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -269,8 +286,10 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - TiledArray::device::memcpy(b_host, b_device, A * B * C * sizeof(int), - TiledArray::device::MemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); status = librettDestroy(plan); @@ -308,9 +327,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({A, A}); @@ -319,8 +339,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), - stream.stream); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -328,9 +348,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { BOOST_CHECK(status == 
LIBRETT_SUCCESS); - librettDestroy(plan); + TiledArray::device::streamSynchronize(q.stream); - TiledArray::device::deviceSynchronize(); + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -359,9 +379,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; std::vector extent({B, A}); @@ -370,8 +391,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), - stream.stream); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -379,8 +400,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { BOOST_CHECK(status == LIBRETT_SUCCESS); + TiledArray::device::streamSynchronize(q.stream); + librettDestroy(plan); - TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -410,9 +432,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - auto stream = TiledArray::deviceEnv::instance()->stream(0); - DeviceSafeCall(TiledArray::device::setDevice(stream.device)); librettResult status; // b(k,i,j) = a(i,j,k) @@ -423,8 +446,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { std::vector perm({2, 0, 1}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), - stream.stream); + status = + librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ 
-432,8 +455,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { BOOST_CHECK(status == LIBRETT_SUCCESS); + TiledArray::device::streamSynchronize(q.stream); + librettDestroy(plan); - TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { From 37ee4488f658cad4d5dfdfedb323c36a9b20df62 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 28 Sep 2023 01:06:21 -0400 Subject: [PATCH 133/592] stream to use for syncing by madness tasks is no longer stored in TLS, but in task body so that streams are per-task, not per thread in case a task recursively executes other tasks by doing Future::get(dowork=true) --- src/TiledArray/device/device_task_fn.h | 20 +++++-- src/TiledArray/external/device.h | 23 ++++--- src/TiledArray/reduce_task.h | 83 +++++++++++++++++--------- 3 files changed, 84 insertions(+), 42 deletions(-) diff --git a/src/TiledArray/device/device_task_fn.h b/src/TiledArray/device/device_task_fn.h index 6b7105c550..fada332c63 100644 --- a/src/TiledArray/device/device_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -99,22 +99,31 @@ struct deviceTaskFn : public TaskInterface { protected: void run(const TaskThreadEnv& env) override { + TA_ASSERT(!stream_); + TA_ASSERT( + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + // tell the task to report stream to be synced with to this->stream_ + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() = + &this->stream_; + // run the async function, the function must call synchronize_stream() to // set the stream it used!! task_->run_async(); - // get the stream used by async function - auto stream_opt = TiledArray::device::detail::tls_stream_accessor(); + // clear ptr to stream_ + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() = + nullptr; // WARNING, need to handle NoOp - if (!stream_opt) { + if (!stream_) { task_->notify(); } else { // TODO should we use device callback or device events?? 
// insert device callback - TiledArray::device::launchHostFunc(*stream_opt, device_callback, task_); + TiledArray::device::launchHostFunc(*stream_, device_callback, task_); // processed sync, clear state - TiledArray::device::detail::tls_stream_accessor() = {}; + stream_ = {}; } } @@ -137,6 +146,7 @@ struct deviceTaskFn : public TaskInterface { } deviceTaskFn_* task_; + std::optional stream_; // stream to sync with }; public: diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 685c804dff..133bb11c56 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -798,9 +798,14 @@ class Env { }; namespace detail { -inline std::optional& tls_stream_accessor() { - static thread_local std::optional tls_stream; - return tls_stream; +inline std::optional*& madness_task_stream_opt_ptr_accessor() { + static thread_local std::optional* stream_opt_ptr = nullptr; + return stream_opt_ptr; +} + +inline std::optional& madness_task_stream_opt_accessor() { + TA_ASSERT(madness_task_stream_opt_ptr_accessor() != nullptr); + return *madness_task_stream_opt_ptr_accessor(); } } // namespace detail @@ -810,10 +815,10 @@ inline std::optional& tls_stream_accessor() { /// before task completion /// \param s the stream to synchronize this task with inline void sync_madness_task_with(const Stream& s) { - if (!detail::tls_stream_accessor()) - detail::tls_stream_accessor() = s; + if (!detail::madness_task_stream_opt_accessor()) + detail::madness_task_stream_opt_accessor() = s; else { - TA_ASSERT(*detail::tls_stream_accessor() == s); + TA_ASSERT(*detail::madness_task_stream_opt_accessor() == s); } } @@ -841,7 +846,7 @@ inline void sync_madness_task_with(stream_t stream) { /// @return the optional Stream with which this task will be synced inline std::optional madness_task_current_stream() { - return detail::tls_stream_accessor(); + return detail::madness_task_stream_opt_accessor(); } /// should call this within a task submitted to @@ 
-849,7 +854,9 @@ inline std::optional madness_task_current_stream() { /// to cancel the previous calls to sync_madness_task_with() /// if, e.g., it synchronized with any work performed /// before exiting -inline void cancel_madness_task_sync() { detail::tls_stream_accessor() = {}; } +inline void cancel_madness_task_sync() { + detail::madness_task_stream_opt_accessor() = {}; +} /// maps a (tile) Range to device::Stream; if had already pushed work into a /// device::Stream (as indicated by madness_task_current_stream() ) diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 34a2fef9ea..2a5813ff10 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -456,15 +456,21 @@ class ReduceTask { ready_object_ = nullptr; lock_.unlock(); // <<< End critical section +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the argument that was held by ready_object_ op_(*result, ready_object->arg()); // cleanup the argument #ifdef TILEDARRAY_HAS_DEVICE - auto& stream_opt = device::detail::tls_stream_accessor(); + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; // need to sync with a device stream? 
- if (!stream_opt) { // no + if (!stream_) { // no ReduceObject::destroy(ready_object); this->dec(); } else { @@ -472,12 +478,11 @@ class ReduceTask { (*callback_object)[0] = &world_; (*callback_object)[1] = this; (*callback_object)[2] = ready_object; - DeviceSafeCall(device::setDevice(stream_opt->device)); + DeviceSafeCall(device::setDevice(stream_->device)); DeviceSafeCall(device::launchHostFunc( - stream_opt->stream, + stream_->stream, device_dependency_dec_reduceobject_delete_callback, callback_object)); - device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " // add 3\n"; } @@ -491,14 +496,21 @@ class ReduceTask { ready_result_.reset(); lock_.unlock(); // <<< End critical section +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the result that was held by ready_result_ op_(*result, *ready_result); // cleanup the result #ifdef TILEDARRAY_HAS_DEVICE - auto queue_opt = device::detail::tls_stream_accessor(); + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + // need to sync with a stream? 
- if (!queue_opt) { // no + if (!stream_) { // no ready_result.reset(); } else { // yes auto ready_result_heap = @@ -506,11 +518,10 @@ class ReduceTask { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = ready_result_heap; - auto& [device, stream] = *queue_opt; + auto& [device, stream] = *stream_; DeviceSafeCall(device::setDevice(device)); DeviceSafeCall(device::launchHostFunc( stream, device_readyresult_reset_callback, callback_object)); - device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " // add 4\n"; } @@ -532,43 +543,49 @@ class ReduceTask { /// \param object The reduction argument to be reduced void reduce_result_object(std::shared_ptr result, const ReduceObject* object) { +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the argument op_(*result, object->arg()); // Cleanup the argument #ifdef TILEDARRAY_HAS_DEVICE - auto& stream_opt = device::detail::tls_stream_accessor(); - if (!stream_opt) { + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + + if (!stream_) { ReduceObject::destroy(object); } else { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = const_cast(object); - DeviceSafeCall(device::setDevice(stream_opt->device)); + DeviceSafeCall(device::setDevice(stream_->device)); DeviceSafeCall(device::launchHostFunc( - stream_opt->stream, device_reduceobject_delete_callback, + stream_->stream, device_reduceobject_delete_callback, callback_object)); - device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else ReduceObject::destroy(object); #endif + // Check for more reductions reduce(result); // Decrement the dependency counter for the argument. This must // be done after the reduce call to avoid a race condition. 
#ifdef TILEDARRAY_HAS_DEVICE - if (!stream_opt) { + if (!stream_) { this->dec(); } else { auto callback_object2 = new std::vector(1); (*callback_object2)[0] = this; - DeviceSafeCall(device::setDevice(stream_opt->device)); - DeviceSafeCall(device::launchHostFunc(stream_opt->stream, - device_dependency_dec_callback, - callback_object2)); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } #else @@ -582,14 +599,21 @@ class ReduceTask { // Construct an empty result object auto result = std::make_shared(op_()); +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the two arguments op_(*result, object1->arg()); op_(*result, object2->arg()); // Cleanup arguments #ifdef TILEDARRAY_HAS_DEVICE - auto& stream_opt = device::detail::tls_stream_accessor(); - if (!stream_opt) { + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + + if (!stream_) { ReduceObject::destroy(object1); ReduceObject::destroy(object2); } else { @@ -597,11 +621,10 @@ class ReduceTask { (*callback_object1)[0] = &world_; (*callback_object1)[1] = const_cast(object1); (*callback_object1)[2] = const_cast(object2); - DeviceSafeCall(device::setDevice(stream_opt->device)); + DeviceSafeCall(device::setDevice(stream_->device)); DeviceSafeCall(device::launchHostFunc( - stream_opt->stream, device_reduceobject_delete_callback, + stream_->stream, device_reduceobject_delete_callback, callback_object1)); - device::cancel_madness_task_sync(); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -615,17 +638,16 @@ class ReduceTask { // Decrement the dependency counter for the two arguments. This // must be done after the reduce call to avoid a race condition. 
#ifdef TILEDARRAY_HAS_DEVICE - if (!stream_opt) { + if (!stream_) { this->dec(); this->dec(); } else { auto callback_object2 = new std::vector(2); (*callback_object2)[0] = this; (*callback_object2)[1] = this; - DeviceSafeCall(device::setDevice(stream_opt->device)); - DeviceSafeCall(device::launchHostFunc(stream_opt->stream, - device_dependency_dec_callback, - callback_object2)); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } @@ -671,6 +693,9 @@ class ReduceTask { madness::Spinlock lock_; ///< Task lock madness::CallbackInterface* callback_; ///< The completion callback int task_id_; ///< Task id +#ifdef TILEDARRAY_HAS_DEVICE + std::optional stream_; +#endif public: /// Implementation constructor From dfa3f76ebd58c05c64acf6e57cbdc85e90c880a8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 28 Sep 2023 13:35:56 -0400 Subject: [PATCH 134/592] bump Umpire tag to bump up to its latest commit that provides C++20 support + MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/501 --- INSTALL.md | 4 ++-- external/umpire.cmake | 9 +++++---- external/versions.cmake | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c9dd6b2d87..720943ac33 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 3c91f086090390930bba62c6512c4e74a5520e76 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 3d585293f0094588778dbd3bec24b65e7bbe6a5d . 
+- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 1f307ebbe6604539493e165a7a2b00b366711fd8 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. @@ -68,7 +68,7 @@ Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite diff --git a/external/umpire.cmake b/external/umpire.cmake index 24c9e5e56d..e61fe832e5 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -48,9 +48,9 @@ else() set(enable_umpire_asserts ON) endif() - # as of now BLT only supports up to C++17, so limit CMAKE_CXX_STANDARD + # as of now BLT only supports up to C++20, so limit CMAKE_CXX_STANDARD set(BLT_CXX_STD ${CMAKE_CXX_STANDARD}) - set(BLT_CXX_STD_MAX 17) + set(BLT_CXX_STD_MAX 20) if (BLT_CXX_STD GREATER ${BLT_CXX_STD_MAX}) set(BLT_CXX_STD ${BLT_CXX_STD_MAX}) endif() @@ -161,7 +161,8 @@ else() ) # TiledArray_UMPIRE target depends on existence of these directories to be usable from the build tree at configure time - execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src/umpire/tpl/camp/include") + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include") + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include") execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_BUILD_DIR}/include") # do install of Umpire as part of building TiledArray's install target @@ -190,7 +191,7 @@ set_target_properties( TiledArray_UMPIRE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$;$;$" + "$;$;$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" ) diff --git a/external/versions.cmake b/external/versions.cmake index 40e94c3509..c9e6149311 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) 
set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 3d585293f0094588778dbd3bec24b65e7bbe6a5d) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4785f17bec34e08f10fa4de84c7359f0404a4d78) +set(TA_TRACKED_MADNESS_TAG 1f307ebbe6604539493e165a7a2b00b366711fd8) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 3d585293f0094588778dbd3bec24b65e7bbe6a5d) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) @@ -30,8 +30,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) +set(TA_TRACKED_UMPIRE_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) From 089dcc6666bc27efffb8cd6afa64f6a56f398746 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 28 Sep 2023 17:31:25 -0400 Subject: [PATCH 135/592] ReduceTask choose device stream in round-robin fashion to avoid dynamic decisions and need for locking --- src/TiledArray/external/device.h | 16 +++++++++++++--- src/TiledArray/reduce_task.h | 27 +++++++++++++++++++-------- 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 133bb11c56..dcf286c443 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -798,14 +798,24 @@ class Env { }; namespace detail { +// in a madness device task point to its local optional stream to use by +// madness_task_stream_opt; set to nullptr after task callable finished inline std::optional*& madness_task_stream_opt_ptr_accessor() { static thread_local 
std::optional* stream_opt_ptr = nullptr; return stream_opt_ptr; } +inline std::optional& tls_stream_opt_accessor() { + static thread_local std::optional stream_opt = + device::Env::instance()->stream(0); + return stream_opt; +} + inline std::optional& madness_task_stream_opt_accessor() { - TA_ASSERT(madness_task_stream_opt_ptr_accessor() != nullptr); - return *madness_task_stream_opt_ptr_accessor(); + if (madness_task_stream_opt_ptr_accessor() != nullptr) // in a device task? + return *madness_task_stream_opt_ptr_accessor(); + else + return tls_stream_opt_accessor(); } } // namespace detail @@ -867,7 +877,7 @@ inline void cancel_madness_task_sync() { /// associated with Range \p range template device::Stream stream_for(const Range& range) { - auto stream_opt = madness_task_current_stream(); + const auto stream_opt = madness_task_current_stream(); if (!stream_opt) { auto stream_ord = range.offset() % device::Env::instance()->num_streams_total(); diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 2a5813ff10..7d8924b0c3 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -29,6 +29,7 @@ #include #include #include +inline std::atomic global_reduce_task_counter(0); #endif namespace TiledArray { @@ -596,15 +597,15 @@ class ReduceTask { /// Reduce two reduction arguments void reduce_object_object(const ReduceObject* object1, const ReduceObject* object2) { - // Construct an empty result object - auto result = std::make_shared(op_()); - #ifdef TILEDARRAY_HAS_DEVICE TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == nullptr); device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; #endif + // Construct an empty result object + auto result = std::make_shared(op_()); + // Reduce the two arguments op_(*result, object1->arg()); op_(*result, object2->arg()); @@ -692,9 +693,9 @@ class ReduceTask { Future result_; ///< The result of the reduction task madness::Spinlock lock_; ///< Task lock 
madness::CallbackInterface* callback_; ///< The completion callback - int task_id_; ///< Task id + std::int64_t task_id_; ///< Task id #ifdef TILEDARRAY_HAS_DEVICE - std::optional stream_; + std::optional stream_; // round-robined by task_id #endif public: @@ -706,7 +707,7 @@ class ReduceTask { /// has completed /// \param task_id the task id (for debugging) ReduceTaskImpl(World& world, opT op, madness::CallbackInterface* callback, - int task_id = -1) + std::int64_t task_id = -1) : madness::TaskInterface(1, TaskAttributes::hipri()), world_(world), op_(op), @@ -715,7 +716,16 @@ class ReduceTask { result_(), lock_(), callback_(callback), - task_id_(task_id) {} + task_id_(task_id) { +#ifdef TILEDARRAY_HAS_DEVICE + if (task_id_ == -1) { + task_id_ = global_reduce_task_counter++; + const std::size_t stream_ord = + task_id_ % device::Env::instance()->num_streams_total(); + stream_ = device::Env::instance()->stream(stream_ord); + } +#endif + } virtual ~ReduceTaskImpl() {} @@ -780,7 +790,8 @@ class ReduceTask { /// this task is complete /// \param task_id the task id (for debugging) ReduceTask(World& world, const opT& op = opT(), - madness::CallbackInterface* callback = nullptr, int task_id = -1) + madness::CallbackInterface* callback = nullptr, + std::int64_t task_id = -1) : pimpl_(new ReduceTaskImpl(world, op, callback, task_id)), count_(0ul) {} /// Move constructor From 4b79b5a79ec65cbfdd6c0d4adadcb4c0a28fe0d3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 28 Sep 2023 17:31:55 -0400 Subject: [PATCH 136/592] clone(DistArray) supports device-based arrays --- src/TiledArray/conversions/clone.h | 32 +++++++++++++++++++++++------ src/TiledArray/external/device.h | 1 + src/TiledArray/tensor/type_traits.h | 5 +---- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/conversions/clone.h b/src/TiledArray/conversions/clone.h index b8c05df840..910d86e21d 100644 --- a/src/TiledArray/conversions/clone.h +++ b/src/TiledArray/conversions/clone.h @@ 
-26,6 +26,10 @@ #ifndef TILEDARRAY_CONVERSIONS_CLONE_H__INCLUDED #define TILEDARRAY_CONVERSIONS_CLONE_H__INCLUDED +#ifdef TILEDARRAY_HAS_DEVICE +#include "TiledArray/device/device_task_fn.h" +#endif + namespace TiledArray { /// Forward declarations @@ -53,12 +57,28 @@ inline DistArray clone(const DistArray& arg) { if (arg.is_zero(index)) continue; // Spawn a task to clone the tiles - Future tile = world.taskq.add( - [](const value_type& tile) -> value_type { - using TiledArray::clone; - return clone(tile); - }, - arg.find(index)); + + Future tile; + if constexpr (!detail::is_device_tile_v) { + tile = world.taskq.add( + [](const value_type& tile) -> value_type { + using TiledArray::clone; + return clone(tile); + }, + arg.find(index)); + } else { +#ifdef TILEDARRAY_HAS_DEVICE + tile = madness::add_device_task( + world, + [](const value_type& tile) -> value_type { + using TiledArray::clone; + return clone(tile); + }, + arg.find(index)); +#else + abort(); // unreachable +#endif + } // Store result tile result.set(index, tile); diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index dcf286c443..44d9c77a68 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -807,6 +807,7 @@ inline std::optional*& madness_task_stream_opt_ptr_accessor() { inline std::optional& tls_stream_opt_accessor() { static thread_local std::optional stream_opt = + device::Env::instance()->stream(0); return stream_opt; } diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index e9d1681f71..eed84c6026 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -314,8 +314,7 @@ template constexpr const bool is_reduce_op_v = is_reduce_op_::value; -/// detect cuda tile -#ifdef TILEDARRAY_HAS_DEVICE +/// detect device tile types template struct is_device_tile : public std::false_type {}; @@ -329,8 +328,6 @@ struct is_device_tile> template static constexpr const auto 
is_device_tile_v = is_device_tile::value; -#endif - template struct default_permutation; From 90a0e01334d240be66565cef8347551a22d08d66 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 2 Oct 2023 14:46:25 -0400 Subject: [PATCH 137/592] [ci] update path to OneAPI MKL vars.sh script in response to VG images updated via https://github.com/ValeevGroup/DevOps/commit/24e2702e2102cc2cd1aad8a9875407d7ab541380 --- ci/.build-project | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/ci/.build-project b/ci/.build-project index 57fc67300b..455f76f131 100755 --- a/ci/.build-project +++ b/ci/.build-project @@ -75,15 +75,14 @@ cmd "source ci/openmpi.env" cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile" if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then cmd "make -C /home/ValeevGroup install/intel-mkl" - # DevOps installs MKL 2020.3 which requires non-OneAPI TBB ... although MKL bundles TBB + # DevOps installs OneAPI MKL which requires OneAPI TBB ... 
although MKL bundles TBB # the systemwide TBB package is found first (the MKL's TBB does not bundle # headers anyway, so it's almost useless for us) - # unfortunately the default, libtbb-dev, package on ubuntu 22.04 is OneAPI, get rid of it and use - # libtbb2-dev instead + # unfortunately the default, libtbb-dev, package on ubuntu 20.04 is pre-OneAPI, so get rid of it if [[ "$vars" =~ \"-DIntelMKL_THREAD_LAYER=tbb ]]; then - cmd "(apt show libtbb2-dev && apt install -y libtbb2-dev) || echo \"no need to install libtbb2-dev\"" + cmd "(apt show libtbb-dev && apt remove -y libtbb-dev) || echo \"no need to install libtbb-dev\"" fi - cmd "source /opt/intel/mkl/bin/mklvars.sh intel64" + cmd "source /opt/intel/oneapi/mkl/latest/env/vars.sh" cmd "echo MKLROOT=\$MKLROOT" fi if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then From a0f4c5e6f38f4db48cef90bcddb5d25c0b392d8a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 2 Oct 2023 20:54:48 -0400 Subject: [PATCH 138/592] [ci] install OneAPI TBB dev package if using OneAPI MKL --- ci/.build-project | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ci/.build-project b/ci/.build-project index 455f76f131..f7a7033755 100755 --- a/ci/.build-project +++ b/ci/.build-project @@ -75,15 +75,19 @@ cmd "source ci/openmpi.env" cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile" if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then cmd "make -C /home/ValeevGroup install/intel-mkl" + cmd "source /opt/intel/oneapi/mkl/latest/env/vars.sh" + cmd "echo MKLROOT=\$MKLROOT" # DevOps installs OneAPI MKL which requires OneAPI TBB ... 
although MKL bundles TBB - # the systemwide TBB package is found first (the MKL's TBB does not bundle - # headers anyway, so it's almost useless for us) - # unfortunately the default, libtbb-dev, package on ubuntu 20.04 is pre-OneAPI, so get rid of it + # the systemwide TBB package is found first + the MKL's TBB does not bundle + # so we can't discover it properly anyway + # unfortunately the default, libtbb-dev, package on ubuntu 20.04 is pre-OneAPI, so let's + # get rid of it + install "full" OneAPI TBB if [[ "$vars" =~ \"-DIntelMKL_THREAD_LAYER=tbb ]]; then - cmd "(apt show libtbb-dev && apt remove -y libtbb-dev) || echo \"no need to install libtbb-dev\"" + cmd "(apt show libtbb2 && apt remove -y libtbb2) || echo \"no need to install libtbb2\"" fi - cmd "source /opt/intel/oneapi/mkl/latest/env/vars.sh" - cmd "echo MKLROOT=\$MKLROOT" + cmd "apt-get -yq install intel-oneapi-tbb-devel" + cmd "source /opt/intel/oneapi/tbb/latest/env/vars.sh" + cmd "echo TBBROOT=\$TBBROOT" fi if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then cmd "make -C /home/ValeevGroup install/cuda" From 6c14bc5721d44eb46b9b761b412be454316278d8 Mon Sep 17 00:00:00 2001 From: David Williams-Young Date: Wed, 18 Oct 2023 09:13:57 -0700 Subject: [PATCH 139/592] Move GNUInstallDirs before setting TILEDARRAY_INSTALL_XYZ --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 520898704d..97b3d736e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,6 +78,7 @@ enable_language(C) # C needed even for basic platform introspection # Set install paths ============================================================ +include(GNUInstallDirs) set(TILEDARRAY_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH "TiledArray binary install directory") set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" @@ -96,7 +97,6 @@ set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" # Add module directory 
and modules ============================================= list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules/) include(CMakePushCheckState) -include(GNUInstallDirs) include(AppendFlags) include(RedefaultableOption) include(DetectMADNESSConfig) From 172a37a9e20fc8f9bad56b506eb703bbdb962f6a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Oct 2023 15:52:02 -0400 Subject: [PATCH 140/592] [skip ci] removed unused TILEDARRAY_INSTALL_{SHARE,DATA}DIR --- CMakeLists.txt | 4 ---- 1 file changed, 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97b3d736e9..17f2b480a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,10 +85,6 @@ set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" CACHE PATH "TiledArray INCLUDE install directory") set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE PATH "TiledArray LIB install directory") -set(TILEDARRAY_INSTALL_SHAREDIR "${CMAKE_INSTALL_DATAROOTDIR}/tiledarray/${TILEDARRAY_EXT_VERSION}" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" - CACHE PATH "TiledArray DATA install directory") set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" CACHE PATH "TiledArray DOC install directory") set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" From c1316185fba266ead1bd9bfb5c0a0547994233fd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 22 Oct 2023 10:28:37 -0400 Subject: [PATCH 141/592] amend https://github.com/ValeevGroup/tiledarray/commit/172a37a9e20fc8f9bad56b506eb703bbdb962f6a --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 17f2b480a8..604db1ee85 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,9 @@ set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" CACHE PATH "TiledArray INCLUDE install directory") set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" CACHE 
PATH "TiledArray LIB install directory") -set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" +set(TILEDARRAY_INSTALL_DATADIR "${CMAKE_INSTALL_DATAROOTDIR}/tiledarray/${TILEDARRAY_EXT_VERSION}" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_DATADIR}/doc" CACHE PATH "TiledArray DOC install directory") set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" CACHE PATH "TiledArray CMAKE install directory") From edcb91b533ea7bf187da04f2cedf01233c8a0043 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 31 Oct 2023 09:04:42 -0400 Subject: [PATCH 142/592] [unit] set MAD_NUM_THREADS when running w >1 rank --- tests/CMakeLists.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 118440cf3b..e4f5bfe213 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -153,6 +153,9 @@ endif (TARGET range-v3::range-v3) add_test(tiledarray/unit/build "${CMAKE_COMMAND}" --build ${PROJECT_BINARY_DIR} --target ${executable}) set_tests_properties(tiledarray/unit/build PROPERTIES FIXTURES_SETUP TA_UNIT_TESTS_EXEC) +# N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now +set(TA_UNIT_TESTS_ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1") + # Add a test(s) if(ENABLE_MPI) set (${executable}_np_1_args --run_test=!@distributed) @@ -165,22 +168,19 @@ if(ENABLE_MPI) $ --log_level=unit_scope ${${executable}_np_${p}_args} ${MPIEXEC_POSTFLAGS} ) - # N.B. 
some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now + if (p GREATER 1) + set(TA_UNIT_TESTS_ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT};TA_UT_DISTRIBUTED=1") + endif() set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}" ) - - if (p GREATER 1) - set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES ENVIRONMENT TA_UT_DISTRIBUTED=1) - endif() endforeach(p) else() add_test(NAME tiledarray/unit/run-np-1 COMMAND ${executable}) - # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-1 PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}" ) endif() From 9568d8a67896b93c65ca6da2379884df2e85a024 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 26 Oct 2023 08:46:02 -0400 Subject: [PATCH 143/592] [cmake] bump VG kit, BTAS, and MADNESS tags - refresh VG kit to include several linalgpp fixes - refresh BTAS to pull in https://github.com/ValeevGroup/BTAS/pull/166 - refresh MADNESS to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/507 --- INSTALL.md | 4 ++-- external/versions.cmake | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 720943ac33..6841ba0ca2 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,9 +40,9 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 3c91f086090390930bba62c6512c4e74a5520e76 . 
If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 1f307ebbe6604539493e165a7a2b00b366711fd8 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4f7d30b0a738621037b96bb5b820029835753667 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index c9e6149311..ff7aad41ac 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG e68b3b4e8a57a175bb9d1b4e4cfa7d31b9363de5) +set(TA_TRACKED_VGCMAKEKIT_TAG d6746098e63deab4032309c4455bb084a17ff51a) # Boost explicitly downgraded to 1.59 from 1.68 set(TA_TRACKED_BOOST_VERSION 1.59) @@ -19,13 +19,13 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 1f307ebbe6604539493e165a7a2b00b366711fd8) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 3d585293f0094588778dbd3bec24b65e7bbe6a5d) +set(TA_TRACKED_MADNESS_TAG 4f7d30b0a738621037b96bb5b820029835753667) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 1f307ebbe6604539493e165a7a2b00b366711fd8) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 
3c91f086090390930bba62c6512c4e74a5520e76) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 5a45699b78d0540b490c8c769b61033bd4d4f49c) +set(TA_TRACKED_BTAS_TAG bf0c376d5cdd6f668174b2a4c67b19634d1c0da7) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 75950bef52adcc1166d988037f470fb4af1c85eb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 27 Oct 2023 16:15:55 -0400 Subject: [PATCH 144/592] [cmake] bump TTG tag to sync with most recent master --- external/versions.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/external/versions.cmake b/external/versions.cmake index ff7aad41ac..9aa1db064a 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) -set(TA_TRACKED_TTG_PREVIOUS_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) +set(TA_TRACKED_TTG_TAG 4643df546d52481c7ad4bf5b7cce45289dc6b222) +set(TA_TRACKED_TTG_PREVIOUS_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) From 8b06a678fb14065a9bc312fa7ba36791dfa6d475 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 27 Oct 2023 16:20:43 -0400 Subject: [PATCH 145/592] automate TTG tag synchronization between versions.cmake and INSTALL.md --- INSTALL.md | 2 +- bin/admin/dependency-versions-update-hook.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 6841ba0ca2..cfcaefaa06 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -75,7 +75,7 @@ Optional prerequisites: - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test 
(optionally-built) Python bindings - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. -- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs. +- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 4643df546d52481c7ad4bf5b7cce45289dc6b222). Many of these dependencies can be installed with a package manager, such as Homebrew on OS X or apt-get on Debian Linux distributions; diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py index 686b98b49a..739049f834 100755 --- a/bin/admin/dependency-versions-update-hook.py +++ b/bin/admin/dependency-versions-update-hook.py @@ -126,6 +126,11 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' scalapackpp_old_tag = tokens[2] else: scalapackpp_new_tag = tokens[2] + elif tokens[1].find('TTG') != -1: + if tokens[1].find('PREVIOUS') != -1: + ttg_old_tag = tokens[2] + else: + ttg_new_tag = tokens[2] any_files_changed = False @@ -155,6 +160,9 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' # SCALAPACKPP tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'SCALAPACKPP', scalapackpp_old_tag, scalapackpp_new_tag, '', '') +# TTG tag in INSTALL.md +any_files_changed |= replace_dep_id(topsrc, 'md', 'TTG', ttg_old_tag, ttg_new_tag, '', '') + if any_files_changed: sys.exit(1) else: From b9466a10b7b79ef13557a0a5956d5e5e01ac2f41 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 27 Oct 2023 16:22:13 -0400 Subject: [PATCH 146/592] [ci] test TTG Cholesky when using 
PaRSEC as backend (GHA only) --- .github/workflows/ci.yml | 1 + src/TiledArray/math/linalg/ttg/cholesky.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7753a3436d..44210103e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,6 +29,7 @@ jobs: -DCMAKE_PREFIX_PATH="/usr/local/opt/bison;/usr/local/opt/scalapack" -DTA_ASSERT_POLICY=TA_ASSERT_THROW -DENABLE_SCALAPACK=ON + -DTA_TTG=${{ matrix.task_backend == 'PaRSEC' }} steps: - uses: actions/checkout@v2 diff --git a/src/TiledArray/math/linalg/ttg/cholesky.h b/src/TiledArray/math/linalg/ttg/cholesky.h index 66a67a8034..0017d1ae1e 100644 --- a/src/TiledArray/math/linalg/ttg/cholesky.h +++ b/src/TiledArray/math/linalg/ttg/cholesky.h @@ -86,7 +86,7 @@ auto cholesky(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(potrf_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); @@ -175,7 +175,7 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(trtri_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); From 97655c08d667425b234158dd7f912700f810e24b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 27 Oct 2023 16:50:35 -0400 Subject: [PATCH 147/592] [ci] bump TTG tag again so that it pulls in latest MADNESS --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index cfcaefaa06..9f71cd221a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -75,7 +75,7 @@ Optional prerequisites: - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that 
served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. -- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 4643df546d52481c7ad4bf5b7cce45289dc6b222). +- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 18ecba5ddfa2953698642d75ec509a0735e0cd3e). Many of these dependencies can be installed with a package manager, such as Homebrew on OS X or apt-get on Debian Linux distributions; diff --git a/external/versions.cmake b/external/versions.cmake index 9aa1db064a..0e68e87eb2 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG 4643df546d52481c7ad4bf5b7cce45289dc6b222) -set(TA_TRACKED_TTG_PREVIOUS_TAG 0adff52aa1ebdad013ab3843a7a68c2bb06b60a8) +set(TA_TRACKED_TTG_TAG 18ecba5ddfa2953698642d75ec509a0735e0cd3e) +set(TA_TRACKED_TTG_PREVIOUS_TAG 4643df546d52481c7ad4bf5b7cce45289dc6b222) From 3526ac9ac2c49527f4f820a7ec194164e0b5291b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 31 Oct 2023 13:00:17 -0400 Subject: [PATCH 148/592] [ci] bump TTG tag to pull in https://github.com/TESSEorg/ttg/commit/d3668525151f91c8da341f1780d5e623e2cf6c2b --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 9f71cd221a..7ecbfa3607 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -75,7 +75,7 @@ Optional prerequisites: - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a 
modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. -- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 18ecba5ddfa2953698642d75ec509a0735e0cd3e). +- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 26da9b40872660b864794658d4fdeee1a95cb4d6). Many of these dependencies can be installed with a package manager, such as Homebrew on OS X or apt-get on Debian Linux distributions; diff --git a/external/versions.cmake b/external/versions.cmake index 0e68e87eb2..325735c821 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG 18ecba5ddfa2953698642d75ec509a0735e0cd3e) -set(TA_TRACKED_TTG_PREVIOUS_TAG 4643df546d52481c7ad4bf5b7cce45289dc6b222) +set(TA_TRACKED_TTG_TAG 26da9b40872660b864794658d4fdeee1a95cb4d6) +set(TA_TRACKED_TTG_PREVIOUS_TAG 18ecba5ddfa2953698642d75ec509a0735e0cd3e) From 5980cc6152d4f21f0983d76bab798a5ef7aa966e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 31 Oct 2023 14:24:45 -0400 Subject: [PATCH 149/592] [ci] bump TTG tag to pull in https://github.com/TESSEorg/ttg/commit/3fe4a06dbf4b05091269488aab38223da1f8cb8e --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md 
b/INSTALL.md index 7ecbfa3607..b188ba19f5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -75,7 +75,7 @@ Optional prerequisites: - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. -- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 26da9b40872660b864794658d4fdeee1a95cb4d6). +- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 3fe4a06dbf4b05091269488aab38223da1f8cb8e). Many of these dependencies can be installed with a package manager, such as Homebrew on OS X or apt-get on Debian Linux distributions; diff --git a/external/versions.cmake b/external/versions.cmake index 325735c821..b1e615e117 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -40,5 +40,5 @@ set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG 26da9b40872660b864794658d4fdeee1a95cb4d6) -set(TA_TRACKED_TTG_PREVIOUS_TAG 18ecba5ddfa2953698642d75ec509a0735e0cd3e) +set(TA_TRACKED_TTG_TAG 3fe4a06dbf4b05091269488aab38223da1f8cb8e) +set(TA_TRACKED_TTG_PREVIOUS_TAG 26da9b40872660b864794658d4fdeee1a95cb4d6) From b6617fd3efbc48bb3b3fc2e0a4c3ce1bd3c3587a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 1 Nov 2023 09:30:52 -0400 Subject: [PATCH 150/592] [ci] bump MAD tag to pull in 
https://github.com/m-a-d-n-e-s-s/madness/pull/508 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index b188ba19f5..1181e2d570 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 4f7d30b0a738621037b96bb5b820029835753667 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 03c82cf2780d9e96298cc9140ac128c73eacd3b1 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index b1e615e117..b5c6309e6f 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 4f7d30b0a738621037b96bb5b820029835753667) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 1f307ebbe6604539493e165a7a2b00b366711fd8) +set(TA_TRACKED_MADNESS_TAG 03c82cf2780d9e96298cc9140ac128c73eacd3b1) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4f7d30b0a738621037b96bb5b820029835753667) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 5a619b5ed5280700e88a053fc771c37960a6babe Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 1 Nov 2023 09:33:02 -0400 Subject: [PATCH 151/592] initialize MADWorld before TTG to have them share context + tell PaRSEC context to use full (WORLD) comm if TTG is used --- CMakeLists.txt | 2 +- src/TiledArray/tiledarray.cpp | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 604db1ee85..b0b5aac514 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,10 +309,10 @@ endif() if(ENABLE_HIP) include(external/hip.cmake) endif() +include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake) if (TA_TTG) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) endif(TA_TTG) -include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake) detect_MADNESS_configuration() include(external/eigen.cmake) # the FetchContent-based version will not work due to BLT target name conflicts diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 74244e59bd..7d58434979 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -106,8 +106,16 @@ TiledArray::World& 
TiledArray::initialize(int& argc, char**& argv, initialized_accessor() = true; quiet_accessor() = quiet; - // if have TTG initialize it also + // if have TTG, initialize it also #if TILEDARRAY_HAS_TTG + // MADNESS/PaRSEC creates PaRSEC context that uses MPI_COMM_SELF to avoid + // creation of a PaRSEC comm thread to be able to use TTG/PaRSEC need to + // tell PaRSEC context to use the full communicator + if (madness::ParsecRuntime::context()->nb_nodes != default_world.size()) { + auto default_world_comm = default_world.mpi.comm().Get_mpi_comm(); + parsec_remote_dep_set_ctx(madness::ParsecRuntime::context(), + (intptr_t)default_world_comm); + } ttg::initialize(argc, argv, -1, madness::ParsecRuntime::context()); #endif From 765ba8c398e6ccf9c051814a28dc4281a95d83f3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 1 Nov 2023 14:16:27 -0400 Subject: [PATCH 152/592] FindOrFetchMADWorld: make sure MADWorld uses PaRSEC as backend if configured with TA_TTG --- cmake/modules/FindOrFetchMADWorld.cmake | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cmake/modules/FindOrFetchMADWorld.cmake b/cmake/modules/FindOrFetchMADWorld.cmake index 7be76bac5a..5961a4f05d 100644 --- a/cmake/modules/FindOrFetchMADWorld.cmake +++ b/cmake/modules/FindOrFetchMADWorld.cmake @@ -12,7 +12,15 @@ if (NOT TARGET MADworld) # TA-specific configuration set(MADNESS_BUILD_MADWORLD_ONLY ON CACHE BOOL "Whether to build MADNESS runtime only") - set(ENABLE_PARSEC OFF CACHE BOOL "Whether to use PaRSEC as the task backend of MADWorld") + if (TA_TTG) + if (NOT DEFINED MADNESS_TASK_BACKEND) + set(MADNESS_TASK_BACKEND PaRSEC CACHE STRING "The task backend to use for MADNESS tasks") + else () + if (NOT(${MADNESS_TASK_BACKEND} STREQUAL PaRSEC)) + message(FATAL_ERROR "must set MADNESS_TASK_BACKEND=PaRSEC if configuring with TA_TTG=ON") + endif() + endif() + endif() set(MPI_THREAD "multiple" CACHE INTERNAL "MADNESS requires MPI_THREAD_MULTIPLE") set(MADNESS_ASSUMES_ASLR_DISABLED 
${TA_ASSUMES_ASLR_DISABLED} CACHE BOOL "Whether MADNESS assumes ASLR to be disabled") set(MPI_CXX_SKIP_MPICXX ON CACHE BOOL "Whether to disable search for C++ MPI-2 bindings") From 7e412dfb46c1f22dc52aa416480acab655a791e9 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 1 Nov 2023 14:17:11 -0400 Subject: [PATCH 153/592] Revert testing TTG Cholesky in CI until initialization issues between MADNESS and TTG are resolved. MADNESS needs to create PaRSEC taskpool lazily so that both MADNESS and TTG can be initialized with empty PaRSEC context created by TA This reverts commit b9466a10b7b79ef13557a0a5956d5e5e01ac2f41. --- .github/workflows/ci.yml | 1 - src/TiledArray/math/linalg/ttg/cholesky.h | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 44210103e8..7753a3436d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,6 @@ jobs: -DCMAKE_PREFIX_PATH="/usr/local/opt/bison;/usr/local/opt/scalapack" -DTA_ASSERT_POLICY=TA_ASSERT_THROW -DENABLE_SCALAPACK=ON - -DTA_TTG=${{ matrix.task_backend == 'PaRSEC' }} steps: - uses: actions/checkout@v2 diff --git a/src/TiledArray/math/linalg/ttg/cholesky.h b/src/TiledArray/math/linalg/ttg/cholesky.h index 0017d1ae1e..66a67a8034 100644 --- a/src/TiledArray/math/linalg/ttg/cholesky.h +++ b/src/TiledArray/math/linalg/ttg/cholesky.h @@ -86,7 +86,7 @@ auto cholesky(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(potrf_ttg.get()); // uncomment to trace - //::ttg::trace_on(); + ::ttg::trace_on(); // start ::ttg::execute(); @@ -175,7 +175,7 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(trtri_ttg.get()); // uncomment to trace - //::ttg::trace_on(); + ::ttg::trace_on(); // start ::ttg::execute(); From 30f9eb540e39b04b5a76b80ec0dbca0489d8bc43 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 1 Nov 2023 
14:20:05 -0400 Subject: [PATCH 154/592] disable trace output in TTG Cholesky --- src/TiledArray/math/linalg/ttg/cholesky.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/math/linalg/ttg/cholesky.h b/src/TiledArray/math/linalg/ttg/cholesky.h index 66a67a8034..0017d1ae1e 100644 --- a/src/TiledArray/math/linalg/ttg/cholesky.h +++ b/src/TiledArray/math/linalg/ttg/cholesky.h @@ -86,7 +86,7 @@ auto cholesky(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(potrf_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); @@ -175,7 +175,7 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(trtri_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); From 4ceb416130733f9a01fb342e7436f759284a8633 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 8 Nov 2023 10:09:27 -0500 Subject: [PATCH 155/592] [unit] enabled tot x t test, does not compile @bimalgaudel will fix --- src/TiledArray/einsum/tiledarray.h | 6 +++--- tests/einsum.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c248956066..7d4aca0425 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -422,9 +422,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { - static_assert(std::is_same::value); - using E = expressions::TsrExpr; - return Einsum::einsum(E(A), E(B), Einsum::idx(cs), world); + using ECT = expressions::TsrExpr; + using ECU = expressions::TsrExpr; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 
ee06cf099f..45c4d3e399 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -765,7 +765,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From 65f437492715caa61d7177cb82b9bf6013662f58 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 13 Nov 2023 12:28:02 -0500 Subject: [PATCH 156/592] [WIP] T x ToT overload of einsum: first attempt. --- src/TiledArray/einsum/tiledarray.h | 225 +++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7d4aca0425..52dab7477e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -283,6 +283,231 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; +} // namespace + +template < + typename ArrayT_, typename ArrayToT_, typename... 
Indices, + typename = std::enable_if_t && IsArrayToT>> +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, + std::tuple, Indices...> cs, + World &world) { + using ArrayT = std::remove_cv_t; + using ArrayToT = std::remove_cv_t; + using Shape = typename ArrayToT::shape_type; + using T = typename ArrayT::value_type; + using ToT = typename ArrayToT::value_type; + + auto a = std::get<0>(Einsum::idx(A)); + auto b = std::get<0>(Einsum::idx(B)); + Einsum::Index c = std::get<0>(cs); + + struct { + std::string a, b, c; + } inner; + if constexpr (std::tuple_size::value == 2) { + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.c = ";" + (std::string)std::get<1>(cs); + } + + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + auto e = (a ^ b); + // contracted indices + auto i = (a & b) - h; + + // cannot be hadamard reduction type operation for this overload + TA_ASSERT(e); + + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + + // maps Index to TiledRange1 + // (asserts same index maps to the same TR1 in A, and B) + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + + using ::Einsum::index::permutation; + using TiledArray::Permutation; + + auto arrayTermA = ArrayTerm{A.array(), a}; + auto arrayTermB = ArrayTerm{B.array(), b}; + + { + auto ei = (e + i & arrayTermA.idx); + if (arrayTermA.idx != h + ei) + arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); + arrayTermA.expr = ei; + } + + { + auto ei = (e + i & arrayTermB.idx); + if (arrayTermB.idx != h + ei) + arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); + arrayTermB.expr = ei; + } + + ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { + C.tiles *= Range(range_map[idx].tiles_range()); + } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; + + struct { + 
RangeProduct tiles; + std::vector> batch; + } H; + + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); + } + } + + using Index = Einsum::Index; + + // generalized contraction + { + auto ei = (e + i & arrayTermA.idx); + arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); + } + + { + auto ei = (e + i & arrayTermB.idx); + arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); + } + + std::vector> worlds; + std::vector> local_tiles; + + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &A = arrayTermA; + auto &B = arrayTermB; + + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + + { + arrayTermA.local_tiles.clear(); + const Permutation &P = arrayTermA.permutation; + + for (Index ei : arrayTermA.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermA.array.is_local(idx)) continue; + if (arrayTermA.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermA.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermA.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermA.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermA.array.pmap()->is_replicated(); + arrayTermA.ei = TiledArray::make_array( + *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), + arrayTermA.local_tiles.end(), replicated); + } + + { + arrayTermB.local_tiles.clear(); + const Permutation &P = arrayTermB.permutation; + + for (Index ei : 
arrayTermB.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermB.array.is_local(idx)) continue; + if (arrayTermB.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermB.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermB.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermB.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermB.array.pmap()->is_replicated(); + arrayTermB.ei = TiledArray::make_array( + *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), + arrayTermB.local_tiles.end(), replicated); + } + + // todo + // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayT(); + B.ei = ArrayToT(); + // why omitting this fence leads to deadlock? + owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; + // TODO no need for immediate evaluation + auto tile = C.ei.find_local(e).get(); + assert(tile.batch_size() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); + if (P) tile = tile.permute(P); + local_tiles.push_back({c, tile}); + } + // mark for lazy deletion + C.ei = ArrayToT(); + } + + if constexpr (!Shape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + Shape shape(world, tile_norms, tiled_range); + C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + } + + for (auto &[index, tile] : local_tiles) { + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); + } + + for (auto &w : worlds) { + w->gop.fence(); + } + + return C.array; +} + +template && IsArrayToT>> +auto 
einsum(expressions::TsrExpr B, expressions::TsrExpr A, + std::tuple, Indices...> cs, + World &world) { + return einsum(A, B, cs, world); +} + /// Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. From ab0698dc9f95fe0609ac52a3b428408bccef7ba2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 14 Nov 2023 14:34:05 -0500 Subject: [PATCH 157/592] tiny step towards supporting T*ToT in expr --- src/TiledArray/tensor/type_traits.h | 7 ++++--- src/TiledArray/tile_op/contract_reduce.h | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index eed84c6026..fd197c8cdf 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -114,7 +114,7 @@ struct is_nested_tensor { /// @c is_nested_tensor_v is an alias for @c /// is_nested_tensor::value template -constexpr const bool is_nested_tensor_v = is_nested_tensor::value; +inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -150,7 +150,7 @@ struct is_tensor { /// @tparam Ts a parameter pack /// @c is_tensor_v is an alias for @c is_tensor::value template -constexpr const bool is_tensor_v = is_tensor::value; +inline constexpr const bool is_tensor_v = is_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -172,7 +172,8 @@ struct is_tensor_of_tensor { /// @c is_tensor_of_tensor_v is an alias for @c /// is_tensor_of_tensor::value template -constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +inline constexpr const bool is_tensor_of_tensor_v = + is_tensor_of_tensor::value; //////////////////////////////////////////////////////////////////////////////// diff --git 
a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 48b7936d26..d9d87d59c8 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -64,17 +64,20 @@ class ContractReduceBase { using elem_muladd_op_type = void(result_value_type&, const left_value_type&, const right_value_type&); - static_assert( - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v, - "ContractReduce can only handle plain tensors or nested tensors " - "(tensors-of-tensors); mixed contractions are not supported"); static constexpr bool plain_tensors = - !(TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v); + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v; + static constexpr bool nested_tensors = + TiledArray::detail::is_nested_tensor_v; + static constexpr bool mixed_tensors = !plain_tensors && !nested_tensors; + static_assert(!mixed_tensors || + (mixed_tensors && + TiledArray::detail::is_nested_tensor_v), + "ContractReduce applied to 1 plain tensor and 1 nested tensor " + "must produce a nested tensor " + "(tensors-of-tensors)"); private: struct Impl { From e14ba1c3e22efb400c3e11b3efd8edf5ba100ee7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 17 Nov 2023 15:29:11 -0500 Subject: [PATCH 158/592] print/log feature summary only if not a subproject --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0b5aac514..0c1efb0292 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -479,8 +479,11 @@ ADD_CUSTOM_TARGET(release COMMENT "Switch CMAKE_BUILD_TYPE to Release" ) -feature_summary(WHAT ALL - DESCRIPTION "=== TiledArray Package/Feature Info ===") +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + 
feature_summary(WHAT ALL + DESCRIPTION "=== TiledArray Package/Feature Info ===") + feature_summary(FILENAME ${CMAKE_CURRENT_BINARY_DIR}/features.log WHAT ALL) +endif() option(TA_PYTHON "Build TA python module" OFF) if (TA_PYTHON) From 24e07eb6592d4c8ed4b0ed8753e19d7c95f689bc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 18 Nov 2023 08:47:40 -0500 Subject: [PATCH 159/592] [unit] introduced `BOOST_{WARN,CHECK,REQUIRE}_TA_ASSERT` macros to use in place of `BOOST_{WARN,CHECK,REQUIRE}_THROW` --- tests/unit_test_config.h.in | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit_test_config.h.in b/tests/unit_test_config.h.in index bd23b8414b..59e485634f 100644 --- a/tests/unit_test_config.h.in +++ b/tests/unit_test_config.h.in @@ -61,4 +61,24 @@ #define TA_UT_LABEL_DISTRIBUTED *boost::unit_test::label("distributed") #define TA_UT_LABEL_SERIAL *boost::unit_test::label("serial") +#if (TA_ASSERT_POLICY == TA_ASSERT_THROW) + +#define BOOST_WARN_TA_ASSERT( S, E ) \ + BOOST_WARN_THROW( S, E ) +#define BOOST_CHECK_TA_ASSERT( S, E ) \ + BOOST_CHECK_THROW( S, E ) +#define BOOST_REQUIRE_TA_ASSERT( S, E ) \ + BOOST_REQUIRE_THROW( S, E ) + +#else + +#define BOOST_WARN_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_WARN_TA_ASSERT(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) +#define BOOST_CHECK_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_CHECK_THROW(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) +#define BOOST_REQUIRE_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_REQUIRE_THROW(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) + +#endif + #endif // TILEDARRAY_CONFIG_H__INCLUDED From 1ba4fbbda0db19613e96aa65fafbf23eadd51012 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 18 Nov 2023 09:25:19 -0500 Subject: [PATCH 160/592] [unit] convert all uses of 
`BOOST_{WARN,CHECK,REQUIRE}_THROW` to `BOOST_{WARN,CHECK,REQUIRE}_TA_ASSERT` so that unit tests can be built/run regardless of how `introduced `BOOST_{WARN,CHECK,REQUIRE}_TA_ASSERT` macros to use in place of ` is set --- CMakeLists.txt | 16 ++++---- tests/annotation.cpp | 2 +- tests/bipartite_index_list.cpp | 22 +++++------ tests/bitset.cpp | 4 +- tests/conversions.cpp | 12 +++--- tests/cyclic_pmap.cpp | 42 ++++++++++----------- tests/dist_op_group.cpp | 14 +++---- tests/distributed_storage.cpp | 6 +-- tests/index_list.cpp | 18 ++++----- tests/initializer_list.cpp | 2 +- tests/perm_index.cpp | 5 +-- tests/range.cpp | 4 +- tests/sparse_shape.cpp | 62 +++++++++++++++---------------- tests/ta_test.cpp | 3 -- tests/tile_op_contract_reduce.cpp | 2 +- tests/tiled_range1.cpp | 18 ++++----- tests/tot_dist_array_part1.cpp | 12 +++--- tests/tot_dist_array_part2.cpp | 58 ++++++++++++++--------------- 18 files changed, 150 insertions(+), 152 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c1efb0292..9a47fbd989 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,6 +109,7 @@ include(LoadFetchContent) include(CMakeDependentOption) include(CMakePackageConfigHelpers) include(FeatureSummary) +include(CTest) # testing, defined BUILD_TESTING set(MPI_CXX_SKIP_MPICXX TRUE CACHE BOOL "MPI_CXX_SKIP_MPICXX") @@ -284,13 +285,6 @@ set_property( CACHE TA_ASSERT_POLICY PROPERTY STRINGS TA_ASSERT_THROW TA_ASSERT_ABORT TA_ASSERT_IGNORE) -# if building unit tests default to throw to be able to test TA_ASSERT statements -if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW)) - if (BUILD_TESTING) - message(FATAL_ERROR "TA_ASSERT_POLICY=${TA_ASSERT_POLICY} requires BUILD_TESTING=OFF") - endif(BUILD_TESTING) -endif() - ########################## # Include source dirctories ########################## @@ -368,7 +362,7 @@ add_subdirectory(doc) ########################## # checking/testing ########################## -include(CTest) +# N.B. 
CTest was included above if (BUILD_TESTING) set(_ctest_args -V -R "tiledarray/unit/run-np.*") set(_ctest_args_serial -V -R "tiledarray/unit/run-np-1") @@ -376,6 +370,12 @@ if (BUILD_TESTING) list(APPEND _ctest_args --timeout ${TA_UT_CTEST_TIMEOUT}) list(APPEND _ctest_args_serial --timeout ${TA_UT_CTEST_TIMEOUT}) endif(DEFINED TA_UT_CTEST_TIMEOUT) + + # if building unit tests need to configure with TA_ASSERT_POLICY=TA_ASSERT_THROW to be able to test TA_ASSERT statements + if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW)) + message(WARNING "BUILD_TESTING=ON requires configuring with TA_ASSERT_POLICY=TA_ASSERT_THROW to engage REQUIRE_THROWS() tests; will skip these tests") + endif() + add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args}) add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args_serial}) add_subdirectory(tests) diff --git a/tests/annotation.cpp b/tests/annotation.cpp index f3494b5ac9..48acaa189c 100644 --- a/tests/annotation.cpp +++ b/tests/annotation.cpp @@ -201,7 +201,7 @@ BOOST_AUTO_TEST_SUITE(split_index_fxn) BOOST_AUTO_TEST_CASE(invalid_idx) { if (TiledArray::get_default_world().nproc() == 1) - BOOST_CHECK_THROW(split_index("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(split_index("i,"), TiledArray::Exception); } BOOST_AUTO_TEST_CASE(non_tot) { diff --git a/tests/bipartite_index_list.cpp b/tests/bipartite_index_list.cpp index 71025297af..364f894659 100644 --- a/tests/bipartite_index_list.cpp +++ b/tests/bipartite_index_list.cpp @@ -122,7 +122,7 @@ BOOST_AUTO_TEST_CASE(default_ctor) { */ BOOST_AUTO_TEST_CASE(string_ctor) { if (world.nproc() == 1) { - BOOST_CHECK_THROW(BipartiteIndexList("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(BipartiteIndexList("i,"), TiledArray::Exception); } for (auto&& [str, idx] : idxs) { @@ -192,7 +192,7 @@ BOOST_AUTO_TEST_CASE(copy_assignment) { BOOST_AUTO_TEST_CASE(string_assignment) { if 
(world.nproc() == 1) { BipartiteIndexList v1; - BOOST_CHECK_THROW(v1.operator=("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.operator=("i,"), TiledArray::Exception); } for (auto&& [str, idx] : idxs) { @@ -282,7 +282,7 @@ BOOST_AUTO_TEST_CASE(permute_in_place) { if (world.nproc() == 1) { BipartiteIndexList v0; Permutation p{0, 1}; - BOOST_CHECK_THROW(v0 *= p, TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v0 *= p, TiledArray::Exception); } Permutation p({1, 2, 3, 0}); @@ -335,13 +335,13 @@ BOOST_AUTO_TEST_CASE(end_itr) { BOOST_AUTO_TEST_CASE(at_member) { for (auto&& [str, idx] : idxs) { if (world.nproc() == 1) { - BOOST_CHECK_THROW(idx.at(idx.size()), + BOOST_CHECK_TA_ASSERT(idx.at(idx.size()), #ifdef BOOST_CONTAINER_USE_STD_EXCEPTIONS - std::out_of_range + std::out_of_range #else - boost::container::out_of_range + boost::container::out_of_range #endif - ); + ); } auto [outer, inner] = detail::split_index(str); for (size_type i = 0; i < outer.size(); ++i) @@ -498,23 +498,23 @@ BOOST_AUTO_TEST_CASE(permutation_fxn) { { // not both ToT BipartiteIndexList v1("i;j"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // wrong size BipartiteIndexList v1("i"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // not a permutation BipartiteIndexList v1("i, a"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // ToTs mix outer and inner BipartiteIndexList v1("i,j;k,l"); BipartiteIndexList v2("i,k;j,l"); - BOOST_CHECK_THROW(v1.permutation(v2), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v2), TiledArray::Exception); } } diff --git a/tests/bitset.cpp b/tests/bitset.cpp index 289a47c295..0ffcf56114 100644 --- a/tests/bitset.cpp +++ b/tests/bitset.cpp @@ -102,8 +102,8 @@ 
BOOST_AUTO_TEST_CASE(accessor) { // Check that exceptions are thrown when accessing an element that is out of // range. - BOOST_CHECK_THROW(set[set.size()], Exception); - BOOST_CHECK_THROW(set[set.size() + 1], Exception); + BOOST_CHECK_TA_ASSERT(set[set.size()], Exception); + BOOST_CHECK_TA_ASSERT(set[set.size() + 1], Exception); } BOOST_AUTO_TEST_CASE(set_bit) { diff --git a/tests/conversions.cpp b/tests/conversions.cpp index e9ae430bbb..107a383c00 100644 --- a/tests/conversions.cpp +++ b/tests/conversions.cpp @@ -530,12 +530,12 @@ BOOST_AUTO_TEST_CASE(concat) { } } // ranges of non-concatted dims must match - BOOST_CHECK_THROW((TiledArray::concat( - {a, b_t}, std::vector{false, true})), - TiledArray::Exception); - BOOST_CHECK_THROW((TiledArray::concat( - {a, b_t}, std::vector{true, false})), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT((TiledArray::concat( + {a, b_t}, std::vector{false, true})), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT((TiledArray::concat( + {a, b_t}, std::vector{true, false})), + TiledArray::Exception); }; do_test(static_cast(nullptr)); diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp index 509b9f92bf..4d8d76da1f 100644 --- a/tests/cyclic_pmap.cpp +++ b/tests/cyclic_pmap.cpp @@ -60,28 +60,28 @@ BOOST_AUTO_TEST_CASE(constructor) { ProcessID size = GlobalFixture::world->size(); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 0ul, 10ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 0ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 10ul, 0, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 10ul, 1, 0), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, size * 2, 1), - TiledArray::Exception); - 
BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, 1, size * 2), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 0ul, 10ul, 1, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 0ul, 1, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 0, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 1, 0), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, size * 2, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 1, size * 2), + TiledArray::Exception); if (size > 1) { - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, size, size), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, size, size), + TiledArray::Exception); } } diff --git a/tests/dist_op_group.cpp b/tests/dist_op_group.cpp index 8027eab7e1..b4846716f4 100644 --- a/tests/dist_op_group.cpp +++ b/tests/dist_op_group.cpp @@ -56,14 +56,14 @@ BOOST_AUTO_TEST_CASE(constructor_empty) { #if defined(MADNESS_ASSERTIONS_THROW) // Check that accessing group data throws exceptions for an empty group. 
- BOOST_CHECK_THROW(empty_group.id(), madness::MadnessException); - BOOST_CHECK_THROW(empty_group.get_world(), madness::MadnessException); - BOOST_CHECK_THROW(empty_group.rank(), madness::MadnessException); - BOOST_CHECK_THROW(empty_group.rank(0), madness::MadnessException); - BOOST_CHECK_THROW(empty_group.world_rank(0), madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.id(), madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.get_world(), madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.rank(), madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.rank(0), madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.world_rank(0), madness::MadnessException); ProcessID parent, child1, child2; - BOOST_CHECK_THROW(empty_group.make_tree(0, parent, child1, child2), - madness::MadnessException); + BOOST_CHECK_TA_ASSERT(empty_group.make_tree(0, parent, child1, child2), + madness::MadnessException); #endif // MADNESS_ASSERTIONS_THROW } diff --git a/tests/distributed_storage.cpp b/tests/distributed_storage.cpp index 9dec84f967..895b734911 100644 --- a/tests/distributed_storage.cpp +++ b/tests/distributed_storage.cpp @@ -79,8 +79,8 @@ BOOST_AUTO_TEST_CASE(set_value) { BOOST_CHECK_EQUAL(n, t.max_size()); // Check throw for an out-of-range set. - BOOST_CHECK_THROW(t.set(t.max_size(), 1), TiledArray::Exception); - BOOST_CHECK_THROW(t.set(t.max_size() + 2, 1), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.set(t.max_size(), 1), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.set(t.max_size() + 2, 1), TiledArray::Exception); } BOOST_AUTO_TEST_CASE(array_operator) { @@ -97,7 +97,7 @@ BOOST_AUTO_TEST_CASE(array_operator) { BOOST_CHECK_EQUAL(n, t.max_size()); // Check throw for an out-of-range set. 
- BOOST_CHECK_THROW(t.get(t.max_size()), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.get(t.max_size()), TiledArray::Exception); BOOST_CHECK_THROW(t.get(t.max_size() + 2), TiledArray::Exception); } diff --git a/tests/index_list.cpp b/tests/index_list.cpp index bf75aaffac..c53bdd9de7 100644 --- a/tests/index_list.cpp +++ b/tests/index_list.cpp @@ -135,11 +135,11 @@ BOOST_AUTO_TEST_CASE(accessors) { BOOST_CHECK_EQUAL(v.at(3), "d"); // check last variable access BOOST_CHECK_EQUAL(v[0], "a"); // check 1st variable access BOOST_CHECK_EQUAL(v[3], "d"); // check last variable access - BOOST_CHECK_THROW(v.at(4), + BOOST_CHECK_TA_ASSERT(v.at(4), #ifdef BOOST_CONTAINER_USE_STD_EXCEPTIONS - std::out_of_range + std::out_of_range #else - boost::container::out_of_range + boost::container::out_of_range #endif ); // check for out of range throw. } @@ -175,11 +175,11 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(v10.at(2), "c"); BOOST_CHECK_EQUAL(v10.at(3), "d"); - BOOST_CHECK_THROW(IndexList v3(",a,b,c"), - Exception); // check invalid input - BOOST_CHECK_THROW(IndexList v4("a,,b,c"), Exception); - BOOST_CHECK_THROW(IndexList v5(" ,a,b"), Exception); - BOOST_CHECK_THROW(IndexList v6("a, b, , c"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v3(",a,b,c"), + Exception); // check invalid input + BOOST_CHECK_TA_ASSERT(IndexList v4("a,,b,c"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v5(" ,a,b"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v6("a, b, , c"), Exception); IndexList v7(" a , b, c, d , e e ,f f, g10,h, i "); // check input with // various spacings. 
@@ -193,7 +193,7 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(v7.at(7), "h"); BOOST_CHECK_EQUAL(v7.at(8), "i"); - BOOST_REQUIRE_THROW( + BOOST_REQUIRE_TA_ASSERT( IndexList v11(""), TiledArray::Exception); // Empty string is not permitted constructor } diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 884f5c61fd..4d051f957d 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -198,7 +198,7 @@ BOOST_AUTO_TEST_CASE(scalar) { BOOST_AUTO_TEST_CASE(empty_vector) { vector_il il{}; if (world.rank() == 0) // only rank 0 does the work - BOOST_CHECK_THROW(tiled_range_from_il(il), Exception); + BOOST_CHECK_TA_ASSERT(tiled_range_from_il(il), Exception); } BOOST_AUTO_TEST_CASE(vector) { diff --git a/tests/perm_index.cpp b/tests/perm_index.cpp index 3ba48aa7a1..8a1326d7df 100644 --- a/tests/perm_index.cpp +++ b/tests/perm_index.cpp @@ -49,8 +49,7 @@ const std::array PermIndexFixture::start = { const std::array PermIndexFixture::finish = { {3ul, 5ul, 7ul, 11ul}}; -BOOST_FIXTURE_TEST_SUITE(perm_index_suite, PermIndexFixture, - TA_UT_LABEL_SERIAL) +BOOST_FIXTURE_TEST_SUITE(perm_index_suite, PermIndexFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK_NO_THROW(PermIndex x;); @@ -61,7 +60,7 @@ BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK(!x.data()); // Check that an exception is thrown when using a default constructed object - BOOST_CHECK_THROW(x(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(x(0), TiledArray::Exception); } BOOST_AUTO_TEST_CASE(constructor) { diff --git a/tests/range.cpp b/tests/range.cpp index a71a0629d0..a5ac8898f9 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -344,8 +344,8 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_CHECK_EQUAL(r2.volume(), 48); } #else // TA_SIGNED_1INDEX_TYPE - BOOST_REQUIRE_THROW(Range r2({{-1, 1}, {-2, 2}, {0, 6}}), - TiledArray::Exception); + BOOST_REQUIRE_TA_ASSERT(Range r2({{-1, 1}, {-2, 2}, {0, 6}}), + 
TiledArray::Exception); #endif // TA_SIGNED_1INDEX_TYPE // Copy Constructor diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 77ada97028..0112f0dac6 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -49,36 +49,36 @@ BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK(!x.validate(tr.tiles_range())); BOOST_CHECK_EQUAL(x.init_threshold(), SparseShape::threshold()); - BOOST_CHECK_THROW(x.nnz(), Exception); + BOOST_CHECK_TA_ASSERT(x.nnz(), Exception); - BOOST_CHECK_THROW(x[0], Exception); + BOOST_CHECK_TA_ASSERT(x[0], Exception); - BOOST_CHECK_THROW(x.perm(perm), Exception); + BOOST_CHECK_TA_ASSERT(x.perm(perm), Exception); - BOOST_CHECK_THROW(x.scale(2.0), Exception); - BOOST_CHECK_THROW(x.scale(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.scale(2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.scale(2.0, perm), Exception); - BOOST_CHECK_THROW(x.add(y), Exception); - BOOST_CHECK_THROW(x.add(y, 2.0), Exception); - BOOST_CHECK_THROW(x.add(y, perm), Exception); - BOOST_CHECK_THROW(x.add(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.add(2.0), Exception); - BOOST_CHECK_THROW(x.add(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.add(2.0, perm), Exception); - BOOST_CHECK_THROW(x.subt(y), Exception); - BOOST_CHECK_THROW(x.subt(y, 2.0), Exception); - BOOST_CHECK_THROW(x.subt(y, perm), Exception); - BOOST_CHECK_THROW(x.subt(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.subt(2.0), Exception); - BOOST_CHECK_THROW(x.subt(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(2.0), 
Exception); + BOOST_CHECK_TA_ASSERT(x.subt(2.0, perm), Exception); - BOOST_CHECK_THROW(x.mult(y), Exception); - BOOST_CHECK_THROW(x.mult(y, 2.0), Exception); - BOOST_CHECK_THROW(x.mult(y, perm), Exception); - BOOST_CHECK_THROW(x.mult(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.gemm(y, 2.0, gemm_helper), Exception); - BOOST_CHECK_THROW(x.gemm(y, 2.0, gemm_helper, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.gemm(y, 2.0, gemm_helper), Exception); + BOOST_CHECK_TA_ASSERT(x.gemm(y, 2.0, gemm_helper, perm), Exception); } BOOST_AUTO_TEST_CASE(non_comm_constructor) { @@ -350,8 +350,8 @@ BOOST_AUTO_TEST_CASE(block) { #endif } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -447,8 +447,8 @@ BOOST_AUTO_TEST_CASE(block_scale) { } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -546,8 +546,8 @@ BOOST_AUTO_TEST_CASE(block_perm) { } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -649,8 +649,8 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } diff --git 
a/tests/ta_test.cpp b/tests/ta_test.cpp index 8d81e66849..7e5f2184bf 100644 --- a/tests/ta_test.cpp +++ b/tests/ta_test.cpp @@ -28,9 +28,6 @@ #endif #include -#if (TA_ASSERT_POLICY != TA_ASSERT_THROW) -#error "TiledArray unit tests require TA_ASSERT_POLICY=TA_ASSERT_THROW" -#endif GlobalFixture::GlobalFixture() { if (world == nullptr) { diff --git a/tests/tile_op_contract_reduce.cpp b/tests/tile_op_contract_reduce.cpp index 5c30a5b491..b50397097d 100644 --- a/tests/tile_op_contract_reduce.cpp +++ b/tests/tile_op_contract_reduce.cpp @@ -101,7 +101,7 @@ BOOST_AUTO_TEST_CASE(permute_empty) { TiledArray::math::blas::Op::NoTrans, TiledArray::math::blas::Op::NoTrans, 1, 2u, 2u, 2u); TensorI t, result; - BOOST_REQUIRE_THROW(result = op(t), TiledArray::Exception); + BOOST_REQUIRE_TA_ASSERT(result = op(t), TiledArray::Exception); } // TODO: Test non-empty permutation diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index d7379e2fbb..043e4b96ac 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r.tiles_range().second, 0ul); BOOST_CHECK_EQUAL(r.elements_range().first, 0ul); BOOST_CHECK_EQUAL(r.elements_range().second, 0ul); - BOOST_CHECK_THROW(r.tile(0), Exception); + BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); } // check construction with a iterators and the range info. 
@@ -120,8 +120,8 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r.elements_range().second, 28); } #else // TA_SIGNED_1INDEX_TYPE - BOOST_CHECK_THROW(TiledRange1 r({-1, 0, 2, 5, 10, 17, 28}), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r({-1, 0, 2, 5, 10, 17, 28}), + TiledArray::Exception); #endif // TA_SIGNED_1INDEX_TYPE // check copy constructor @@ -156,14 +156,14 @@ BOOST_AUTO_TEST_CASE(constructor) { #ifndef NDEBUG { std::vector boundaries; - BOOST_CHECK_THROW(TiledRange1 r(boundaries.begin(), boundaries.end()), - Exception); - BOOST_CHECK_THROW(TiledRange1 r(a.begin(), a.begin()), Exception); - BOOST_CHECK_THROW(TiledRange1 r(a.begin(), a.begin() + 1), Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), + Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin()), Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin() + 1), Exception); boundaries.push_back(2); boundaries.push_back(0); - BOOST_CHECK_THROW(TiledRange1 r(boundaries.begin(), boundaries.end()), - Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), + Exception); } #endif } diff --git a/tests/tot_dist_array_part1.cpp b/tests/tot_dist_array_part1.cpp index e71392ef8c..d95bb050a2 100644 --- a/tests/tot_dist_array_part1.cpp +++ b/tests/tot_dist_array_part1.cpp @@ -331,7 +331,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(begin, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.begin(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.begin(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -344,7 +344,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_begin, TestParam, test_params) { { const tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.begin(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.begin(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -356,7 +356,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_begin, 
TestParam, test_params) { BOOST_AUTO_TEST_CASE_TEMPLATE(end, TestParam, test_params) { { tensor_type t; - if (m_world.nproc() == 1) BOOST_CHECK_THROW(t.end(), TiledArray::Exception); + if (m_world.nproc() == 1) + BOOST_CHECK_TA_ASSERT(t.end(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -368,7 +369,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(end, TestParam, test_params) { BOOST_AUTO_TEST_CASE_TEMPLATE(const_end, TestParam, test_params) { { const tensor_type t; - if (m_world.nproc() == 1) BOOST_CHECK_THROW(t.end(), TiledArray::Exception); + if (m_world.nproc() == 1) + BOOST_CHECK_TA_ASSERT(t.end(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -391,7 +393,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(find, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.find(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.find(0), TiledArray::Exception); } for (auto tr_t : run_all()) { diff --git a/tests/tot_dist_array_part2.cpp b/tests/tot_dist_array_part2.cpp index b916812884..ffd1883198 100644 --- a/tests/tot_dist_array_part2.cpp +++ b/tests/tot_dist_array_part2.cpp @@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill_local, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.fill_local(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(t.fill_local(inner_type{}), except_t); } } @@ -56,7 +56,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill_local, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(already_set.fill_local(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(already_set.fill_local(inner_type{}), except_t); }*/ // Test we can actually fill tiles @@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.fill(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(t.fill(inner_type{}), except_t); } } @@ -106,7 +106,7 @@ 
BOOST_AUTO_TEST_CASE_TEMPLATE(fill, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(already_set.fill(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(already_set.fill(inner_type{}), except_t); }*/ // Test we can actually fill tiles @@ -145,7 +145,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_tiles, TestParam, test_params) { tensor_type t; if (m_world.nproc() == 1) { auto l = [](const Range&) { return tile_type{}; }; - BOOST_CHECK_THROW(t.init_tiles(l), except_t); + BOOST_CHECK_TA_ASSERT(t.init_tiles(l), except_t); } } @@ -172,7 +172,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_tiles, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(corr.init_tiles(l), except_t); + BOOST_CHECK_TA_ASSERT(corr.init_tiles(l), except_t); }*/ // Test we can actually fill tiles @@ -200,7 +200,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_elements, TestParam, test_params) { tensor_type t; auto l = [](const index_type&) { return inner_type{}; }; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.init_elements(l), except_t); + BOOST_CHECK_TA_ASSERT(t.init_elements(l), except_t); } } @@ -226,7 +226,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_elements, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(corr.init_elements(l), except_t); + BOOST_CHECK_TA_ASSERT(corr.init_elements(l), except_t); }*/ // Test we can actually fill tiles @@ -245,7 +245,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(trange, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.trange(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.trange(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -259,7 +259,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tiles_range, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.tiles_range(), TiledArray::Exception); + 
BOOST_CHECK_TA_ASSERT(t.tiles_range(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -274,7 +274,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(elements_range, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.elements_range(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.elements_range(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -289,7 +289,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(size, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.size(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.size(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -303,7 +303,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(world, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.world(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.world(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -317,7 +317,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(pmap, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.pmap(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.pmap(), TiledArray::Exception); } } @@ -325,7 +325,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(shape, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.shape(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.shape(), TiledArray::Exception); } using shape_type = typename tensor_type::shape_type; for (auto tr_t : run_all()) { @@ -351,9 +351,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(call_operator, TestParam, test_params) { if (m_world.nproc() == 1) { using except_t = TiledArray::Exception; // Throws if no semicolon - BOOST_CHECK_THROW(t(outer_idx), except_t); + BOOST_CHECK_TA_ASSERT(t(outer_idx), except_t); // Throws if wrong outer rank - BOOST_CHECK_THROW(t("i,j,k,l,m;" + inner_idx), except_t); + BOOST_CHECK_TA_ASSERT(t("i,j,k,l,m;" + inner_idx), except_t); } auto vars = outer_idx + ";" + inner_idx; @@ -374,9 +374,9 @@ 
BOOST_AUTO_TEST_CASE_TEMPLATE(const_call_operator, TestParam, test_params) { if (m_world.nproc() == 1) { using except_t = TiledArray::Exception; // Throws if no semicolon - BOOST_CHECK_THROW(t(outer_idx), except_t); + BOOST_CHECK_TA_ASSERT(t(outer_idx), except_t); // Throws if wrong outer rank - BOOST_CHECK_THROW(t("i,j,k,l,m;" + inner_idx), except_t); + BOOST_CHECK_TA_ASSERT(t("i,j,k,l,m;" + inner_idx), except_t); } auto vars = outer_idx + ";" + inner_idx; @@ -399,7 +399,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_dense, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.is_dense(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.is_dense(), TiledArray::Exception); } using shape_type = typename tensor_type::shape_type; @@ -415,7 +415,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.owner(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.owner(0), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -426,11 +426,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner, TestParam, test_params) { const auto& upbound = tr.tiles_range().upbound(); // Test throws if index is out of bounds - BOOST_CHECK_THROW(corr.owner(upbound), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.owner(upbound), TiledArray::Exception); // Throws if index has wrong rank std::vector bad_idx(upbound.size() + 1, 0); - BOOST_CHECK_THROW(corr.owner(bad_idx), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.owner(bad_idx), TiledArray::Exception); } for (auto idx : corr.tiles_range()) { @@ -445,7 +445,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner_init_list, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.owner({0}), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.owner({0}), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -459,13 +459,13 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner_init_list, TestParam, test_params) { // Test throws 
if index is out of bounds if (rank == 1) - BOOST_CHECK_THROW(corr.owner({upbound[0]}), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner({upbound[0]}), except_t); else if (rank == 2) - BOOST_CHECK_THROW(corr.owner({upbound[0], upbound[1]}), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner({upbound[0], upbound[1]}), except_t); // Throws if index has wrong rank std::initializer_list il2{0, 0, 0, 0, 0, 0}; - BOOST_CHECK_THROW(corr.owner(il2), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner(il2), except_t); } for (auto idx : corr.tiles_range()) { @@ -484,7 +484,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.is_local(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.is_local(0), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -495,7 +495,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local, TestParam, test_params) { const auto& upbound = tr.tiles_range().upbound(); // Test throws if index is out of bounds - BOOST_CHECK_THROW(corr.is_local(upbound), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.is_local(upbound), TiledArray::Exception); // Throws if index has wrong rank std::vector bad_idx(upbound.size() + 1, 0); From a9a6b58958c444b8b1900b345bae0993716d5c7d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 12:41:58 -0500 Subject: [PATCH 161/592] [WIP]: Make binary_egine less restrictive on left and right arg types. 
--- src/TiledArray/einsum/tiledarray.h | 21 ++++++++++++--------- src/TiledArray/expressions/binary_engine.h | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 52dab7477e..09640d31f6 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -309,7 +309,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, Einsum::Index c = std::get<0>(cs); struct { - std::string a, b, c; + std::string b, c; } inner; if constexpr (std::tuple_size::value == 2) { inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); @@ -319,16 +319,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // these are "Hadamard" (fused) indices auto h = a & b & c; - auto e = (a ^ b); // contracted indices auto i = (a & b) - h; + // contraction not allowed in tensor x tensor-of-tensor + TA_ASSERT(!i); - // cannot be hadamard reduction type operation for this overload - TA_ASSERT(e); - - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + // indices exclusively in 'a' or exclusively in 'b' + auto e = (a ^ b); // maps Index to TiledRange1 // (asserts same index maps to the same TR1 in A, and B) @@ -364,6 +361,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; + arrayTermB.expr += inner.b; + C.expr += inner.c; + struct { RangeProduct tiles; std::vector> batch; @@ -453,7 +453,10 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } // todo - // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + + // + A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); A.ei = ArrayT(); diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 4758ab0069..93192e2b5e 
100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -146,11 +146,10 @@ class BinaryEngine : public ExprEngine { TiledArray::detail::is_tensor_of_tensor_v; constexpr bool right_tile_is_tot = TiledArray::detail::is_tensor_of_tensor_v; - static_assert(!(left_tile_is_tot ^ right_tile_is_tot), - "ContEngine can only handle tensors of same nested-ness " - "(both plain or both ToT)"); constexpr bool args_are_plain_tensors = !left_tile_is_tot && !right_tile_is_tot; + constexpr bool args_are_mixed_tensors = + left_tile_is_tot ^ right_tile_is_tot; if (args_are_plain_tensors && (left_outer_permtype_ == PermutationType::matrix_transpose || left_outer_permtype_ == PermutationType::identity)) { @@ -175,6 +174,20 @@ class BinaryEngine : public ExprEngine { right_inner_permtype_ == PermutationType::identity))) { right_.permute_tiles(false); } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity))) { + left_.permute_tiles(false); + } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity))) { + right_.permute_tiles(false); + } } public: From e4eb2c9409385639a6c1fff5fae19b02ceb2ce8e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 20 Nov 2023 14:06:14 -0500 Subject: [PATCH 162/592] moar ToT * T progress --- src/TiledArray/expressions/cont_engine.h | 299 ++++++++++++++--------- src/TiledArray/expressions/mult_engine.h | 4 +- src/TiledArray/expressions/product.h | 3 + src/TiledArray/tile_op/scal.h | 2 + tests/einsum.cpp | 8 +- 5 files changed, 194 insertions(+), 122 deletions(-) diff --git 
a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 35c2f34199..9a1cb9f5f9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -107,15 +107,26 @@ class ContEngine : public BinaryEngine { protected: op_type op_; ///< Tile operation - using tile_element_type = typename value_type::value_type; - std::function - inner_tile_nonreturn_op_; ///< Tile element operation (only non-null for - ///< nested tensor expressions) - std::function - inner_tile_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns - ///< the result + + // tile types of the result and (after evaluation) left and right arguments + using result_tile_type = value_type; + using left_tile_type = typename EngineTrait::eval_type; + using right_tile_type = typename EngineTrait::eval_type; + + // tile element types of the result and (after evaluation) left and right + // arguments + using result_tile_element_type = typename result_tile_type::value_type; + using left_tile_element_type = typename left_tile_type::value_type; + using right_tile_element_type = typename right_tile_type::value_type; + + std::function + element_nonreturn_op_; ///< Tile element operation (only non-null for + ///< nested tensor expressions) + std::function + element_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns + ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ -239,8 +250,8 @@ class ContEngine : public BinaryEngine { // precondition checks // 1. 
if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - TA_ASSERT(inner_tile_nonreturn_op_); - TA_ASSERT(inner_tile_return_op_); + TA_ASSERT(element_nonreturn_op_); + TA_ASSERT(element_return_op_); } // Initialize children @@ -271,7 +282,7 @@ class ContEngine : public BinaryEngine { op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), (permute_tiles_ ? perm_ : BipartitePermutation{}), - this->inner_tile_nonreturn_op_); + this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer(perm_)); shape_ = ContEngine_::make_shape(outer(perm_)); @@ -284,7 +295,7 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->inner_tile_nonreturn_op_); + BipartitePermutation{}, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -457,120 +468,172 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { - if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - using inner_tile_type = typename value_type::value_type; + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || inner_prod == TensorProduct::Hadamard); if (inner_prod == TensorProduct::Contraction) { - using inner_tile_type = typename value_type::value_type; - using contract_inner_tile_type = - TiledArray::detail::ContractReduce; - // factor_ is absorbed into inner_tile_nonreturn_op_ - auto contrreduce_op = - (inner_target_indices != 
inner(this->indices_)) - ? contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) - : contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_)); - this->inner_tile_nonreturn_op_ = [contrreduce_op]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - contrreduce_op(result, left, right); - }; + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // factor_ is absorbed into inner_tile_nonreturn_op_ + auto contrreduce_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_), + (this->permute_tiles_ ? inner(this->perm_) + : Permutation{})) + : op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_)); + this->element_nonreturn_op_ = + [contrreduce_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + }; + } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { - // inner tile op depends on the outer op ... e.g. 
if outer op - // is contract then inner must implement (ternary) multiply-add; - // if the outer is hadamard then the inner is binary multiply - const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { - using base_op_type = - TiledArray::detail::Mult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ - ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type()); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } - }; - } else { - using base_op_type = - TiledArray::detail::ScalMult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? 
inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + // inner tile op depends on the outer op ... e.g. if outer op + // is contract then inner must implement (ternary) multiply-add; + // if the outer is hadamard then the inner is binary multiply + const auto outer_prod = this->product_type(); + if (this->factor_ == 1) { + using base_op_type = + TiledArray::detail::Mult; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(), this->permute_tiles_ + ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type()); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } else { + using base_op_type = TiledArray::detail::ScalMult< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type, false, false>; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } // ToT x ToT + } else if (inner_prod == TensorProduct::General) { + TA_ASSERT(!tot_x_tot); + constexpr bool tot_x_t = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + constexpr bool t_x_tot = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + if constexpr (tot_x_t || t_x_tot) { + using arg_tile_element_type = + std::conditional_t; + using scalar_type = + std::conditional_t; + + auto scal_op = [do_perm = this->permute_tiles_, + perm = this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}]( + const left_tile_element_type& left, + const right_tile_element_type& right) + -> result_tile_element_type { + using TiledArray::scale; + if constexpr (tot_x_t) { + if (do_perm) + return scale(left, right, perm); + else + return scale(left, right); + } else if constexpr (tot_x_t) { + if (do_perm) + return scale(right, left, perm); + else + return scale(right, left); + } else + abort(); // unreachable }; + this->element_nonreturn_op_ = + [scal_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + result = scal_op(left, right); + }; } } else abort(); // unsupported TensorProduct type - TA_ASSERT(inner_tile_nonreturn_op_); - this->inner_tile_return_op_ = - [inner_tile_nonreturn_op = this->inner_tile_nonreturn_op_]( - const inner_tile_type& left, const inner_tile_type& right) { - inner_tile_type result; - inner_tile_nonreturn_op(result, left, right); - return result; - }; + TA_ASSERT(element_nonreturn_op_); + this->element_return_op_ = [inner_tile_nonreturn_op = + this->element_nonreturn_op_]( + const left_tile_element_type& left, + const right_tile_element_type& right) { + result_tile_element_type result; + inner_tile_nonreturn_op(result, left, right); + return result; + }; } } diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index a53133d4b0..91924efeb2 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -406,7 +406,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_)); + return op_type(op_base_type(this->element_return_op_)); } else abort(); } else { // plain tensors @@ -431,7 +431,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type(), perm); } else if (inner_prod 
== TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_), perm); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index d364764964..381b1f485c 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -57,6 +57,9 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Hadamard; else result = TensorProduct::Contraction; + } else if ((left_indices && !right_indices) || + (!left_indices && right_indices)) { // used for ToT*T or T*ToT + result = TensorProduct::General; } return result; } diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h index 54d5337ed4..a89770c5a7 100644 --- a/src/TiledArray/tile_op/scal.h +++ b/src/TiledArray/tile_op/scal.h @@ -128,6 +128,8 @@ class Scal { return Scal_::template eval(arg); } + void set_factor(const scalar_type factor) { factor_ = factor; } + }; // class Scal } // namespace detail diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 45c4d3e399..3033936381 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -764,8 +764,12 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work - tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // will try to make this work FIRST since this is used by the einsum code + // below + tot_type out; + out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); + // will try to make this work NEXT + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From b80d1c44c94963ce1b08d516aab5b873cbb3b8ec Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 163/592] [skip_ci] add permutation 
optimizer for general case: supports inner operation between tot * t. --- src/TiledArray/expressions/permopt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..dc029b73a1 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -527,6 +527,18 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; +/// +/// +/// +class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { + public: + GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; + GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = + default; + virtual ~GeneralPermutationOptimizer() = default; + using GEMMPermutationOptimizer::GEMMPermutationOptimizer; +}; + inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -540,6 +552,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } @@ -559,6 +574,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } From c199457ec5729ccb20e403ff7b1a08e5ac5617f0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 164/592] add permutation optimizer for scaling --- src/CMakeLists.txt | 13 +-- src/TiledArray/expressions/permopt.cpp | 32 +++++++ src/TiledArray/expressions/permopt.h | 112 
+++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 6 deletions(-) create mode 100644 src/TiledArray/expressions/permopt.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 55227c2093..6e6c708891 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,7 +100,6 @@ TiledArray/dist_eval/contraction_eval.h TiledArray/dist_eval/dist_eval.h TiledArray/dist_eval/unary_eval.h TiledArray/einsum/index.h -TiledArray/einsum/index.cpp TiledArray/einsum/range.h TiledArray/einsum/string.h TiledArray/expressions/add_engine.h @@ -195,13 +194,10 @@ TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h TiledArray/util/logger.h -TiledArray/util/ptr_registry.cpp TiledArray/util/ptr_registry.h -TiledArray/util/random.cpp TiledArray/util/random.h TiledArray/util/singleton.h TiledArray/util/threads.h -TiledArray/util/threads.cpp TiledArray/util/thread_specific.h TiledArray/util/time.h TiledArray/util/vector.h @@ -243,10 +239,15 @@ TiledArray/tensor_impl.cpp TiledArray/array_impl.cpp TiledArray/dist_array.cpp TiledArray/version.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp +TiledArray/einsum/index.cpp +TiledArray/expressions/permopt.cpp TiledArray/math/linalg/basic.cpp TiledArray/math/linalg/rank-local.cpp +TiledArray/util/backtrace.cpp +TiledArray/util/bug.cpp +TiledArray/util/ptr_registry.cpp +TiledArray/util/random.cpp +TiledArray/util/threads.cpp ) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( diff --git a/src/TiledArray/expressions/permopt.cpp b/src/TiledArray/expressions/permopt.cpp new file mode 100644 index 0000000000..9b125fdc04 --- /dev/null +++ b/src/TiledArray/expressions/permopt.cpp @@ -0,0 +1,32 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2020 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * permopt.cpp + * Nov 21, 2023 + * + */ + +#include + +namespace TiledArray::expressions { + +IndexList ScalePermutationOptimizer::null_indices_; + +} // namespace TiledArray::expressions diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..998ea78efe 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -51,6 +52,56 @@ inline blas::Op to_cblas_op(PermutationType permtype) { : math::blas::NoTranspose; } +/// Optimizer of permutations for a unary operation +class UnaryOpPermutationOptimizer { + public: + /// construct using initial indices for the argument + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& argument_indices) + : argument_indices_(argument_indices) {} + + /// construct using initial indices for the argument, + /// and the desired result indices + /// \param result_indices the desired result index list + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& result_indices, + const IndexList& argument_indices) + : result_indices_(result_indices), 
argument_indices_(argument_indices) { + TA_ASSERT(argument_indices_.is_permutation(argument_indices_)); + target_result_indices_ = argument_indices_; + } + + UnaryOpPermutationOptimizer() = delete; + UnaryOpPermutationOptimizer(const UnaryOpPermutationOptimizer&) = default; + UnaryOpPermutationOptimizer& operator=(const UnaryOpPermutationOptimizer&) = + default; + virtual ~UnaryOpPermutationOptimizer() = default; + + /// \return the desired result indices + const IndexList& result_indices() const { + TA_ASSERT(result_indices_); + return result_indices_; + } + /// \return initial argument indices + const IndexList& argument_indices() const { return argument_indices_; } + + /// \return the proposed argument index list + const IndexList& target_argument_indices() const { + return target_result_indices_; + } + /// \return the proposed result index list (not necessarily same as that + /// returned by result_indices()) + const IndexList& target_result_indices() const { + return target_result_indices_; + } + /// \return the type of permutation bringing the initial left index list to + /// the target left index list + PermutationType argument_permtype() const { return PermutationType::general; } + + private: + IndexList result_indices_, argument_indices_, target_result_indices_; +}; + /// Abstract optimizer of permutations for a binary operation class BinaryOpPermutationOptimizer { public: @@ -479,6 +530,61 @@ class HadamardPermutationOptimizer : public BinaryOpPermutationOptimizer { IndexList target_result_indices_; }; +// clang-format off +/// Implements BinaryOpPermutationOptimizer interface for a scale operation viewed as a binary tensor product, i.e. 
+/// a tensor product between an order-0 tensor and an arbitrary tensor +// clang-format on +class ScalePermutationOptimizer : public BinaryOpPermutationOptimizer { + public: + ScalePermutationOptimizer(const ScalePermutationOptimizer&) = default; + ScalePermutationOptimizer& operator=(const ScalePermutationOptimizer&) = + default; + ~ScalePermutationOptimizer() = default; + + ScalePermutationOptimizer(const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(left_indices, right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices), + target_result_indices_(left_argument_is_scalar_ ? right_indices + : left_indices) {} + + ScalePermutationOptimizer(const IndexList& result_indices, + const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(result_indices, left_indices, + right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices) { + const auto& arg_indices = + left_argument_is_scalar_ ? right_indices : left_indices; + TA_ASSERT(arg_indices.is_permutation(result_indices)); + target_result_indices_ = arg_indices; + } + + const IndexList& target_left_indices() const override final { + return !left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_right_indices() const override final { + return left_argument_is_scalar_ ? 
target_result_indices_ : null_indices_; + } + const IndexList& target_result_indices() const override final { + return target_result_indices_; + } + PermutationType left_permtype() const override final { + return PermutationType::general; + } + PermutationType right_permtype() const override final { + return PermutationType::general; + } + TensorProduct op_type() const override final { return TensorProduct::Scale; } + + private: + bool left_argument_is_scalar_; + IndexList target_result_indices_; + static IndexList null_indices_; +}; + class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { public: NullBinaryOpPermutationOptimizer(const NullBinaryOpPermutationOptimizer&) = @@ -540,6 +646,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared(left_indices, + right_indices); default: abort(); } @@ -559,6 +668,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared( + target_indices, left_indices, right_indices); default: abort(); } From bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Nov 2023 16:33:46 -0500 Subject: [PATCH 165/592] expression-level support for ToT x T (and vice versa) implemented, need to test --- src/TiledArray/expressions/cont_engine.h | 19 ++++----- src/TiledArray/expressions/product.h | 5 ++- tests/einsum.cpp | 49 +++++++++++++++++++++--- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 9a1cb9f5f9..5ec69c7d0d 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -158,9 +158,10 @@ 
class ContEngine : public BinaryEngine { TensorProduct inner_product_type() const { TA_ASSERT(inner_product_type_ != TensorProduct::Invalid); // init_indices() must initialize this - /// only Hadamard and contraction are supported now + /// only Hadamard, contraction, and scale are supported now TA_ASSERT(inner_product_type_ == TensorProduct::Hadamard || - inner_product_type_ == TensorProduct::Contraction); + inner_product_type_ == TensorProduct::Contraction || + inner_product_type_ == TensorProduct::Scale); return inner_product_type_; } @@ -473,7 +474,8 @@ class ContEngine : public BinaryEngine { result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || - inner_prod == TensorProduct::Hadamard); + inner_prod == TensorProduct::Hadamard || + inner_prod == TensorProduct::Scale); if (inner_prod == TensorProduct::Contraction) { TA_ASSERT(tot_x_tot); if constexpr (tot_x_tot) { @@ -577,8 +579,8 @@ class ContEngine : public BinaryEngine { } }; } - } // ToT x ToT - } else if (inner_prod == TensorProduct::General) { + } // ToT x T or T x ToT + } else if (inner_prod == TensorProduct::Scale) { TA_ASSERT(!tot_x_tot); constexpr bool tot_x_t = TiledArray::detail::is_tensor_of_tensor_v { std::conditional_t; - auto scal_op = [do_perm = this->permute_tiles_, - perm = this->permute_tiles_ ? inner(this->perm_) + auto scal_op = [perm = this->permute_tiles_ ? 
inner(this->perm_) : Permutation{}]( const left_tile_element_type& left, const right_tile_element_type& right) -> result_tile_element_type { using TiledArray::scale; if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(left, right, perm); else return scale(left, right); } else if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(right, left, perm); else return scale(right, left); diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index 381b1f485c..7111b7831b 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -39,6 +39,9 @@ enum class TensorProduct { Contraction, /// free, fused, and contracted indices General, + /// no indices on one, free indices on the other; only used for inner index + /// products in mixed nested products (ToT x T) + Scale, /// invalid Invalid = -1 }; @@ -59,7 +62,7 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Contraction; } else if ((left_indices && !right_indices) || (!left_indices && right_indices)) { // used for ToT*T or T*ToT - result = TensorProduct::General; + result = TensorProduct::Scale; } return result; } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3033936381..ea5529e5b8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -718,6 +718,49 @@ BOOST_AUTO_TEST_SUITE_END() // einsum_tot BOOST_AUTO_TEST_SUITE(einsum_tot_t) +BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + 
Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); + + // TODO check result against ref_result +} + BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; @@ -764,11 +807,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work FIRST since this is used by the einsum code - // below - tot_type out; - out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); - // will try to make this work NEXT + // will try to make this work // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } From 72e1bcb66e4675e86d067390103f868f0d028033 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 14:54:30 -0500 Subject: [PATCH 166/592] [ci skip] implement 'i,j;m,n * j,k -> i,j,k;m,n' reference evaluation manually. 
--- tests/einsum.cpp | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ea5529e5b8..800d51d3e0 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -793,10 +793,41 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; - tot_type ref_result(world, ref_result_trange); // TODO compute ref_result + // i,j;m,n * j,k => i,j,k;m,n + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), + rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + for (auto const& tile : ref_result) { + tot_type::value_type result_tile{tile.make_range()}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{j, k})); + + res_el = lhs_el.scale(rhs_el); + } + + ref_result.set(tile.index(), result_tile); + } + + std::cout << ref_result << std::endl; ///////////////////////////////////////////////////////// // ToT * T From c6940539f68dfa7eec5b3ba5922d2eb8c77070e9 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 17:34:55 -0500 Subject: [PATCH 167/592] [ci skip] more manual tot * t reference evaluation --- tests/einsum.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp 
b/tests/einsum.cpp index 800d51d3e0..6501d91a10 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -751,14 +751,58 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { rhs.fill_random(); TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; + rhs_trange.dim(0), lhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - // TODO compute ref_result + + // + // i,l,k,j;n,m = i,j;m,n * k,l + // + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto l = res_ix[1]; + auto k = res_ix[2]; + auto j = res_ix[3]; + + using Ix2 = std::array; + using Ix4 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, l})); + + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{1, 0}); // permute [0,1] -> [1,0] + } + return result_tile; + }; + + using std::begin; + using std::endl; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // TODO check result against ref_result + // todo: fix it + // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + // BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -799,8 +843,11 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { 
rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - for (auto const& tile : ref_result) { - tot_type::value_type result_tile{tile.make_range()}; + // + // why cannot lhs and rhs be captured by ref? + // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; @@ -823,11 +870,16 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { res_el = lhs_el.scale(rhs_el); } + return result_tile; + }; - ref_result.set(tile.index(), result_tile); - } + using std::begin; + using std::endl; - std::cout << ref_result << std::endl; + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } ///////////////////////////////////////////////////////// // ToT * T From 29b5dba22c87dd12d4265506e52593b9b026c997 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:04:59 -0500 Subject: [PATCH 168/592] Add equality comparison for SparseShape. --- src/TiledArray/sparse_shape.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index bf51487922..271857a72c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -1742,6 +1742,17 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } +template +constexpr inline bool operator==(const SparseShape& a, + const SparseShape& b) { + return true; +} +template +constexpr inline bool operator!=(const SparseShape& a, + const SparseShape& b) { + return !(a == b); +} + #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From f2945dad86058ee08f7e68acafddf391eb0d186c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:05:40 -0500 Subject: [PATCH 169/592] Validate outer-product type tot * t evaluation using expression layer. 
--- tests/einsum.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6501d91a10..aad4a00c0a 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,9 +800,8 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // todo: fix it - // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - // BOOST_CHECK(are_equal); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From be06fbe6380daeed181ace0815c778c170f8f36d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 11:42:05 -0500 Subject: [PATCH 170/592] [unit] einsum_tot_t pulls remote tiles using strick blocking (dowork=false) also fixed a few typos --- tests/einsum.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index aad4a00c0a..db2731a2e1 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -771,10 +771,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -790,7 +790,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); @@ -856,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using Ix3 = 
std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -873,7 +873,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); From 3cd64dbbda97a9071d36d67826a63d5b88d6f5c2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 12:04:54 -0500 Subject: [PATCH 171/592] [unit] einsum_tot_t must test ToT*T AND T*ToT (the latter is currently broken due to missing Tensor functionality for binary Scalar*Tensor ops) --- tests/einsum.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index db2731a2e1..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -802,6 +802,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); + + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -887,10 +894,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. 
outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From f246756bd707d319c33f2d536f698904fe9be0dd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 27 Nov 2023 23:16:39 -0500 Subject: [PATCH 172/592] Avoid code-duplication by generalizing the existing einsum function. --- src/TiledArray/einsum/range.h | 3 +- src/TiledArray/einsum/tiledarray.h | 316 ++++++----------------------- tests/einsum.cpp | 12 +- 3 files changed, 72 insertions(+), 259 deletions(-) diff --git a/src/TiledArray/einsum/range.h b/src/TiledArray/einsum/range.h index 32eb669588..79b409e64d 100644 --- a/src/TiledArray/einsum/range.h +++ b/src/TiledArray/einsum/range.h @@ -14,7 +14,8 @@ using small_vector = TiledArray::container::svector; struct Range { using value_type = int64_t; using iterator = boost::counting_iterator; - template + template , bool> = true> explicit Range(Pair &&pair) : Range(pair.first, pair.second) {} Range(value_type begin, value_type end) : begin_(begin), end_(end) {} auto begin() const { return iterator(begin_); } diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 09640d31f6..1a3840f99f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -64,13 +64,38 @@ struct ArrayTerm { } }; -template -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; + +template +constexpr bool AreArrayT = IsArrayT && IsArrayT; + +template +constexpr bool AreArrayToT = IsArrayToT && IsArrayToT; + 
+template +constexpr bool AreArraySame = + AreArrayT || AreArrayToT; + +} // namespace + +template +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { - using Array = std::remove_cv_t; - using Tensor = typename Array::value_type; - using Shape = typename Array::shape_type; + using ArrayA = std::remove_cv_t; + using ArrayB = std::remove_cv_t; + using ArrayC = std::conditional_t< + AreArraySame, ArrayA, + std::conditional_t, ArrayA, ArrayB>>; + // using Array = ArrayC; + using ResultTensor = typename ArrayC::value_type; + using ResultShape = typename ArrayC::shape_type; auto a = std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); @@ -91,7 +116,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // no Hadamard indices => standard contraction (or even outer product) // same a, b, and c => pure Hadamard if (!h || (!(a ^ b) && !(b ^ c))) { - Array C; + ArrayC C; C(std::string(c) + inner.c) = A * B; return C; } @@ -108,17 +133,22 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ::Einsum::index::permutation; using TiledArray::Permutation; - ArrayTerm AB[2] = {{A.array(), a}, {B.array(), b}}; + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; - for (auto &term : AB) { + auto update_perm_and_indices = [&e = std::as_const(e), &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { auto ei = (e + i & term.idx); if (term.idx != h + ei) { term.permutation = permutation(term.idx, h + ei); } term.expr = ei; - } + }; - ArrayTerm C = {Array(world, TiledRange(range_map[c])), c}; + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); + + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; for (auto idx : e) { C.tiles *= Range(range_map[idx].tiles_range()); } @@ -127,8 +157,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; - AB[0].expr += inner.a; - AB[1].expr += inner.b; + 
std::get<0>(AB).expr += inner.a; + std::get<1>(AB).expr += inner.b; + C.expr += inner.c; struct { @@ -163,7 +194,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - Tensor tile(TiledArray::Range{batch}, typename Tensor::value_type(0)); + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type(0)); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); @@ -193,16 +225,20 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // generalized contraction - for (auto &term : AB) { + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { auto ei = (e + i & term.idx); term.ei_tiled_range = TiledRange(range_map[ei]); for (auto idx : ei) { term.tiles *= Range(range_map[idx].tiles_range()); } - } + }; + + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); std::vector> worlds; - std::vector> local_tiles; + std::vector> local_tiles; // iterates over tiles of hadamard indices for (Index h : H.tiles) { @@ -216,7 +252,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - for (auto &term : AB) { + + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { term.local_tiles.clear(); const Permutation &P = term.permutation; @@ -232,235 +269,18 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, term.local_tiles.push_back({ei, tile}); } bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( + term.ei = TiledArray::make_array( *owners, term.ei_tiled_range, term.local_tiles.begin(), term.local_tiles.end(), replicated); - } - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = 
Array(); - B.ei = Array(); - // why omitting this fence leads to deadlock? - owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); - } - // mark for lazy deletion - C.ei = Array(); - } - - if constexpr (!Shape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; - for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); - } - Shape shape(world, tile_norms, tiled_range); - C.array = Array(world, TiledRange(range_map[c]), shape); - } - - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } - - for (auto &w : worlds) { - w->gop.fence(); - } - - return C.array; -} - -namespace { -template -constexpr bool IsArrayT = detail::is_tensor_v; - -template -constexpr bool IsArrayToT = - detail::is_tensor_of_tensor_v; -} // namespace - -template < - typename ArrayT_, typename ArrayToT_, typename... 
Indices, - typename = std::enable_if_t && IsArrayToT>> -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, - std::tuple, Indices...> cs, - World &world) { - using ArrayT = std::remove_cv_t; - using ArrayToT = std::remove_cv_t; - using Shape = typename ArrayToT::shape_type; - using T = typename ArrayT::value_type; - using ToT = typename ArrayToT::value_type; - - auto a = std::get<0>(Einsum::idx(A)); - auto b = std::get<0>(Einsum::idx(B)); - Einsum::Index c = std::get<0>(cs); - - struct { - std::string b, c; - } inner; - if constexpr (std::tuple_size::value == 2) { - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); - inner.c = ";" + (std::string)std::get<1>(cs); - } + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); - // these are "Hadamard" (fused) indices - auto h = a & b & c; - - // contracted indices - auto i = (a & b) - h; - // contraction not allowed in tensor x tensor-of-tensor - TA_ASSERT(!i); - - // indices exclusively in 'a' or exclusively in 'b' - auto e = (a ^ b); - - // maps Index to TiledRange1 - // (asserts same index maps to the same TR1 in A, and B) - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - - using ::Einsum::index::permutation; - using TiledArray::Permutation; - - auto arrayTermA = ArrayTerm{A.array(), a}; - auto arrayTermB = ArrayTerm{B.array(), b}; - - { - auto ei = (e + i & arrayTermA.idx); - if (arrayTermA.idx != h + ei) - arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); - arrayTermA.expr = ei; - } - - { - auto ei = (e + i & arrayTermB.idx); - if (arrayTermB.idx != h + ei) - arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); - arrayTermB.expr = ei; - } - - ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; - - arrayTermB.expr += inner.b; - C.expr += 
inner.c; - - struct { - RangeProduct tiles; - std::vector> batch; - } H; - - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); - } - } - - using Index = Einsum::Index; - - // generalized contraction - { - auto ei = (e + i & arrayTermA.idx); - arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); - } - - { - auto ei = (e + i & arrayTermB.idx); - arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); - } - - std::vector> worlds; - std::vector> local_tiles; - - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &A = arrayTermA; - auto &B = arrayTermB; - - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - - { - arrayTermA.local_tiles.clear(); - const Permutation &P = arrayTermA.permutation; - - for (Index ei : arrayTermA.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermA.array.is_local(idx)) continue; - if (arrayTermA.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermA.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermA.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermA.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermA.array.pmap()->is_replicated(); - arrayTermA.ei = TiledArray::make_array( - *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), - arrayTermA.local_tiles.end(), replicated); - } - - { - arrayTermB.local_tiles.clear(); - const Permutation &P = 
arrayTermB.permutation; - - for (Index ei : arrayTermB.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermB.array.is_local(idx)) continue; - if (arrayTermB.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermB.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermB.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermB.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermB.array.pmap()->is_replicated(); - arrayTermB.ei = TiledArray::make_array( - *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), - arrayTermB.local_tiles.end(), replicated); - } - - // todo C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - - // - A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); - A.ei = ArrayT(); - B.ei = ArrayToT(); + A.ei = ArrayA(); + B.ei = ArrayB(); // why omitting this fence leads to deadlock? owners->gop.fence(); for (Index e : C.tiles) { @@ -478,17 +298,17 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, local_tiles.push_back({c, tile}); } // mark for lazy deletion - C.ei = ArrayToT(); + C.ei = ArrayC(); } - if constexpr (!Shape::is_dense()) { + if constexpr (!ResultShape::is_dense()) { TiledRange tiled_range = TiledRange(range_map[c]); std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { tile_norms.push_back({index, tile.norm()}); } - Shape shape(world, tile_norms, tiled_range); - C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } for (auto &[index, tile] : local_tiles) { @@ -503,14 +323,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } -template && IsArrayToT>> -auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, - std::tuple, Indices...> cs, - World &world) { - return einsum(A, B, cs, world); -} - /// 
Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..8eea2884f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); - { // reverse the order - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_CHECK(are_equal); - } +// { // reverse the order +// tot_type result; +// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); +// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); +// BOOST_CHECK(are_equal); +// } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From e5ec53161ccf22ffb40ddc40a9d2c1b3b29cb7c8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 Nov 2023 10:28:47 -0500 Subject: [PATCH 173/592] In einsum, handle inner index labels when tot times t, or, t times tot arguments are passed. 
--- src/TiledArray/einsum/tiledarray.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 1a3840f99f..eb317e0aef 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -93,7 +93,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ArrayC = std::conditional_t< AreArraySame, ArrayA, std::conditional_t, ArrayA, ArrayB>>; - // using Array = ArrayC; using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; @@ -105,8 +104,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::string a, b, c; } inner; if constexpr (std::tuple_size::value == 2) { - inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) + inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + + if constexpr (IsArrayToT) + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + + static_assert(IsArrayToT || IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); } From 8341bbb8cc5b902136cc87e374f19b56ccd2cddb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:00:36 -0500 Subject: [PATCH 174/592] amend https://github.com/ValeevGroup/tiledarray/commit/bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 5ec69c7d0d..21aceae14c 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -609,7 +609,7 @@ class ContEngine : public BinaryEngine { return scale(left, right, perm); else return scale(left, right); - } else if constexpr (tot_x_t) { + } else if constexpr (t_x_tot) { if (perm) return scale(right, left, perm); else From 
56b49a03464294eb629b38e63060e93b98695142 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:02:22 -0500 Subject: [PATCH 175/592] relax type requirements on tensor_init to support mixed (ToT alongside T) invocations, this allows T * ToT expr to compile and unit test to succeed --- src/TiledArray/tensor/kernels.h | 7 ++++--- tests/einsum.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 87db8c1cc6..97f7dc1e5b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -541,9 +541,10 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors -template ::value>::type* = nullptr> +template < + typename Op, typename TR, typename T1, typename... Ts, + typename std::enable_if::value && + !is_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, const Permutation& perm, TR& result, const T1& tensor1, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensor1, tensors...)); diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8eea2884f9..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); -// { // reverse the order -// tot_type result; -// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); -// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); -// BOOST_CHECK(are_equal); -// } + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From b75b1fcac72a9f82c95529972e2a20cd6ab2ed56 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 30 Nov 2023 14:06:19 -0500 Subject: [PATCH 176/592] relax Tensor(left,right,binaryelemeop,permutation) ctor constraints --- src/TiledArray/tensor/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 3c10ba4077..f3076c4514 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -492,7 +492,7 @@ class Tensor { /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, - typename std::enable_if::value && + typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& left, const T2& right, Op&& op, const Perm& perm) : Tensor(outer(perm) * left.range(), 1, default_construct{false}) { From f8d41002c106e8cb54fa79ae02e8b1ca06216c7e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 7 Dec 2023 18:38:25 -0500 Subject: [PATCH 177/592] Support for pure hadamard product between a tot and a t: 
'i,j;m,n * i,j -> i,j;m,n' --- src/TiledArray/expressions/binary_engine.h | 6 +- src/TiledArray/expressions/mult_engine.h | 6 ++ tests/einsum.cpp | 92 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 93192e2b5e..411a1c7c13 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -204,8 +204,10 @@ class BinaryEngine : public ExprEngine { /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size()); - TA_ASSERT(right_.indices().size() == target_indices.size()); + TA_ASSERT(left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT(right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 91924efeb2..9713e0b0df 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -407,6 +407,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); + } else if (inner_prod == TensorProduct::Scale) { + TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type()); } else abort(); } else { // plain tensors @@ -432,6 +435,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type(), perm); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); + } else if (inner_prod == TensorProduct::Scale) { + 
TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..9ea4dd39d3 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -900,6 +900,98 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } +BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_4_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_4_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_5_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_5_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, 
lhs_elem_4_1}, + {lhs_elem_5_0, lhs_elem_5_1}}; + TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + // + // i,j;m,n = j,i;n,m * i,j + // + TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + using Ix2 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); + auto rhs_el = + rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute + ); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); + + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); +} + BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices From 726ebb893e6ad21cfcef92c70ce4600b42b6d9d3 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:56:34 -0500 Subject: [PATCH 178/592] SparseShape inequality comparison added. 
--- src/TiledArray/sparse_shape.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 271857a72c..b589dc73cf 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -797,6 +797,13 @@ class SparseShape { return equal; } + /// Bitwise comparison + /// \param other a SparseShape object + /// \return true if this object and @c other object are bitwise NOT identical + inline bool operator!=(const SparseShape& other) const { + return !(*this == other); + } + private: /// Create a copy of a sub-block of the shape @@ -1742,17 +1749,6 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } -template -constexpr inline bool operator==(const SparseShape& a, - const SparseShape& b) { - return true; -} -template -constexpr inline bool operator!=(const SparseShape& a, - const SparseShape& b) { - return !(a == b); -} - #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From 7fd52d54b02136857eb429da3bb2685f1ee4c77e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:57:16 -0500 Subject: [PATCH 179/592] Disable shape comparison in ToTArrayFixture. --- tests/tot_array_fixture.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 9d46fadcc7..1619a794c8 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -237,6 +237,7 @@ struct ToTArrayFixture { * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons + * TODO: shape comparisons */ template @@ -254,7 +255,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if (lhs.shape() != rhs.shape()) return false; + // if (lhs.shape() != rhs.shape()) return false; // Same pmap? 
// if(*lhs.pmap() != *rhs.pmap()) return false; From cdc9db23455dbccef01b7f906a0c7b3fafe11806 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:58:25 -0500 Subject: [PATCH 180/592] Default construction of result tensor tile in `einsum` made more generic. --- src/TiledArray/einsum/tiledarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index eb317e0aef..48648407cb 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -199,7 +199,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, batch *= H.batch[i].at(h[i]); } ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type(0)); + typename ResultTensor::value_type{}); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); From d2fb429f93504a1996bca7b7355b818f27eefb00 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:00:17 -0500 Subject: [PATCH 181/592] Restore (optional) shape comparison on ToTArrayFixture::are_equal function. 
--- tests/einsum.cpp | 6 +++--- tests/tot_array_fixture.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 9ea4dd39d3..a1c26d1782 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,13 +800,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -988,7 +988,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 1619a794c8..21a9c956c6 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -231,16 +231,15 @@ struct ToTArrayFixture { * - Same type * - Either both are initialized or both are not initialized * - Same MPI context - * - Same shape + * - Same shape (unless the template parameter ShapeCmp is set false) * - Same distribution * - Same tiling * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons - * TODO: shape comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { // Same type @@ -255,7 +254,8 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? 
- // if (lhs.shape() != rhs.shape()) return false; + if constexpr (ShapeCmp) + if (lhs.shape() != rhs.shape()) return false; // Same pmap? // if(*lhs.pmap() != *rhs.pmap()) return false; From 42fb41bd9e1bcd01d7f1171aae9a68dcb033d72b Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:03:38 -0500 Subject: [PATCH 182/592] Relax restricitons on this->product_type() values while calling make_tile_op(). --- src/TiledArray/expressions/mult_engine.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 9713e0b0df..20093b2cec 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -408,7 +408,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type()); } else abort(); @@ -436,7 +435,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type(this->element_return_op_), perm); } else abort(); From 7b7dbb8f8af59af85e0bfc38f3d734e9b2ef2fc7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 11 Dec 2023 07:35:16 -0500 Subject: [PATCH 183/592] Typo. 
--- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a1c26d1782..ebd9784bfd 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -1269,7 +1269,7 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_abi_cdi_cdab) { "abi,cdi->cdab"); } -BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_ai_abcd) { +BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_bai_abcd) { einsum_tiledarray_check<3, 3, 4>(random(3, 12, 13), random(14, 15, 3), "icd,bai->abcd"); From 02a7db7ab1dc2545b98794d700e3b9854517f564 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 15 Dec 2023 09:28:57 -0500 Subject: [PATCH 184/592] [skip ci] einsum unit test for ij;mn * kj;mn -> ijk;mn --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ebd9784bfd..eb2ffe1869 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -580,6 +580,40 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { + using dist_array_t = DistArray>, DensePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + + auto random_tot = [](TA::Range const& rng) { + TA::Range inner_rng{7,14}; + TA::Tensor t{inner_rng}; + TA::Tensor> result{rng}; + for (auto& e: result) e = t; + return result; + }; + + auto random_tot_darr = [&random_tot](World& world, + TiledRange const& tr) { + dist_array_t result(world, tr); + for (auto it = result.begin(); it != result.end(); ++it) { + auto tile = + TA::get_default_world().taskq.add(random_tot, it.make_range()); + *it = tile; + } + return result; + }; + + TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + auto lhs = random_tot_darr(world, lhs_trange); + + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + auto rhs = random_tot_darr(world, rhs_trange); + dist_array_t result; + BOOST_REQUIRE_NO_THROW( + result = einsum(lhs("i,j;m,n"), 
rhs("k,j;m,n"), "i,j,k;m,n")); +} + BOOST_AUTO_TEST_CASE(xxx) { using dist_array_t = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; @@ -1328,6 +1362,13 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_hji_jih_hj) { "hji,jih->hj"); } +BOOST_AUTO_TEST_CASE(einsum_tiledarray_ik_jk_ijk) { + einsum_tiledarray_check<2, 2, 3>(random(7, 5), + random(14, 5), "ik,jk->ijk"); + einsum_tiledarray_check<2, 2, 3>(sparse_zero(7, 5), sparse_zero(14, 5), + "ik,jk->ijk"); +} + BOOST_AUTO_TEST_CASE(einsum_tiledarray_replicated) { einsum_tiledarray_check<3, 3, 3>(replicated(random(7, 14, 3)), random(7, 15, 3), From f0be0c97d193b5c4df3653f4dfe4179695bb57e6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 15 Dec 2023 10:45:59 -0500 Subject: [PATCH 185/592] Tensor::gemm involving custom elem_op supports batching --- src/TiledArray/tensor/tensor.h | 75 ++++++++++++++++++++++++---------- tests/einsum.cpp | 4 +- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index f3076c4514..c901dc0f4b 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -292,10 +292,12 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. 
/// \param range The range of the tensor - explicit Tensor(const range_type& range) - : Tensor(range, 1, default_construct{true}) {} + /// \param batch_size The batch size (default is 1) + explicit Tensor(const range_type& range, size_type batch_size = 1) + : Tensor(range, batch_size, default_construct{true}) {} - /// Construct a tensor with a fill value + /// Construct a tensor of tensor values, setting all elements to the same + /// value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements @@ -312,12 +314,14 @@ class Tensor { new (data + i) value_type(cloner(value)); } - /// Construct a tensor with a fill value + /// Construct a tensor of scalars, setting all elements to the same value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements - template >::type* = nullptr> + template && + !detail::is_tensor::value>::type* = + nullptr> Tensor(const range_type& range, const Value& value) : Tensor(range, 1, default_construct{false}) { detail::tensor_init([value]() -> Value { return value; }, *this); @@ -358,7 +362,7 @@ class Tensor { math::uninitialized_copy_vector(range.volume(), u, this->data()); } - Tensor(const Range& range, std::initializer_list il) + explicit Tensor(const Range& range, std::initializer_list il) : Tensor(range, il.begin()) {} /// Construct a copy of a tensor interface object @@ -1004,6 +1008,22 @@ class Tensor { /// \return A mutable pointer to the tensor data pointer data() { return this->data_.get(); } + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + const_pointer batch_data(size_t batch_idx) const { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor 
data of the batch \p batch_idx + pointer batch_data(size_t batch_idx) { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data @@ -2194,6 +2214,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + TA_ASSERT(left.batch_size() == right.batch_size()); + const auto batch_sz = left.batch_size(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2207,7 +2229,8 @@ class Tensor { if (this->empty()) { // initialize, if empty *this = Tensor(gemm_helper.make_result_range(left.range(), - right.range())); + right.range()), + batch_sz); } else { // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2230,6 +2253,9 @@ class Tensor { TA_ASSERT(ignore_tile_position() || gemm_helper.right_result_congruent( right.range().upbound_data(), this->range_.upbound_data())); + + // check that batch size of this matches that of left and right + TA_ASSERT(this->batch_size() == batch_sz); } // Compute gemm dimensions @@ -2243,20 +2269,25 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? N : K); - for (integer m = 0; m != M; ++m) { - for (integer n = 0; n != N; ++n) { - auto c_offset = m * N + n; - for (integer k = 0; k != K; ++k) { - auto a_offset = - gemm_helper.left_op() == TiledArray::math::blas::NoTranspose - ? m * lda + k - : k * lda + m; - auto b_offset = - gemm_helper.right_op() == TiledArray::math::blas::NoTranspose - ? 
k * ldb + n - : n * ldb + k; - elem_muladd_op(*(this->data() + c_offset), *(left.data() + a_offset), - *(right.data() + b_offset)); + for (integer b = 0; b != batch_size(); ++b) { + auto this_data = this->batch_data(b); + auto left_data = left.batch_data(b); + auto right_data = right.batch_data(b); + for (integer m = 0; m != M; ++m) { + for (integer n = 0; n != N; ++n) { + auto c_offset = m * N + n; + for (integer k = 0; k != K; ++k) { + auto a_offset = + gemm_helper.left_op() == TiledArray::math::blas::NoTranspose + ? m * lda + k + : k * lda + m; + auto b_offset = + gemm_helper.right_op() == TiledArray::math::blas::NoTranspose + ? k * ldb + n + : n * ldb + k; + elem_muladd_op(*(this_data + c_offset), *(left_data + a_offset), + *(right_data + b_offset)); + } } } } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb2ffe1869..eb976b31f5 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -604,10 +604,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { return result; }; - TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2, 5}}; auto lhs = random_tot_darr(world, lhs_trange); - TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); dist_array_t result; BOOST_REQUIRE_NO_THROW( From 6e1868639fc1811ea2f60b65b4e85618a9b3e102 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 13:10:40 -0500 Subject: [PATCH 186/592] Make single-valued initializer lists explicit in ambiguous cases. 
--- tests/initializer_list.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 4d051f957d..3f5ad27b80 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -471,7 +471,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(vector, T, scalar_type_list) { auto array = array_from_il>(world, tr, il); using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 2.0}), - tile_type(tr.make_tile_range(1), {3.0})}; + tile_type(tr.make_tile_range(1), std::initializer_list{3.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; tile_type tile = array.find(i); @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(matrix, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; @@ -503,11 +503,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0}), tile_type(tr.make_tile_range(4), {10.0, 13.0}), tile_type(tr.make_tile_range(5), {11.0, 12.0, 14.0, 15.0}), - tile_type(tr.make_tile_range(6), {16.0}), + tile_type(tr.make_tile_range(6), std::initializer_list{16.0}), tile_type(tr.make_tile_range(7), {17.0, 18.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; From 2520fe54218419f41b64a5f7bc6f9288e31b1207 Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:34:25 -0500 Subject: [PATCH 187/592] Use .data() method to access elements by ordinal in tensor_reduce function. --- src/TiledArray/tensor/kernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 97f7dc1e5b..f1ec6d99c5 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -787,8 +787,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.at_ordinal(ord), - tensors.at_ordinal(ord)...); + tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } From eacc22bf803941407bbd9716a51a1cd2baa9fc80 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:36:08 -0500 Subject: [PATCH 188/592] Implement Tot x T (and reverse) generalized contraction. 
--- src/TiledArray/einsum/tiledarray.h | 84 +++++++++++++++--------------- tests/einsum.cpp | 14 +++-- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 48648407cb..2bd548df5c 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,50 +181,51 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) { - TA_ASSERT(e); - } else if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); - } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); + if constexpr (AreArraySame) { + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) 
continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } + } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; } // generalized contraction @@ -468,7 +469,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { using ECT = expressions::TsrExpr; using ECU = expressions::TsrExpr; - return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); + using ResultExprT = std::conditional_t, T, U>; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb976b31f5..3e7b502da9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -845,7 +845,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { } } -BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; 
using matrix_il = TiledArray::detail::matrix_il>; @@ -877,7 +877,6 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - // TODO compute ref_result // i,j;m,n * j,k => i,j,k;m,n TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), rhs_trange.dim(1)}; @@ -928,10 +927,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("i,j,k;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); + tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + { + result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); + are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + } } BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { From 54362997ea05c26128fa7c68d667492b9a4173fd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Dec 2023 16:16:07 -0500 Subject: [PATCH 189/592] bump pybind11 version to VG/v2.11 --- python/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 690b35979d..168bfa2984 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.12) FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/ValeevGroup/pybind11.git - GIT_TAG 80d452484c5409444b0ec19383faa84bb7a4d351 # v2.4.3 + GIT_TAG ValeevGroup/v2.11 ) FetchContent_MakeAvailable(pybind11) @@ -39,11 +39,11 @@ if (BUILD_TESTING) # check for presence of prerequisite modules foreach(_mod pytest numpy) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import ${_mod}" + 
execute_process(COMMAND ${Python_EXECUTABLE} -c "import ${_mod}" OUTPUT_QUIET ERROR_QUIET RESULTS_VARIABLE check_for_${_mod}) if (check_for_${_mod}) - message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${PYTHON_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") + message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${Python_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") endif(check_for_${_mod}) endforeach(_mod) @@ -51,7 +51,7 @@ if (BUILD_TESTING) add_test( NAME tiledarray/unit/python/run # need to use pytest to find tiledarray module properly - COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v + COMMAND ${Python_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) set_tests_properties(tiledarray/unit/python/run From 8729c9611f1501c9dc39ef28d9d5422d5329614f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Dec 2023 16:16:07 -0500 Subject: [PATCH 190/592] bump pybind11 version to VG/v2.11 --- python/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 690b35979d..168bfa2984 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.12) FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/ValeevGroup/pybind11.git - GIT_TAG 80d452484c5409444b0ec19383faa84bb7a4d351 # v2.4.3 + GIT_TAG ValeevGroup/v2.11 ) FetchContent_MakeAvailable(pybind11) @@ -39,11 +39,11 @@ if (BUILD_TESTING) # check for presence of prerequisite modules foreach(_mod pytest numpy) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import ${_mod}" + execute_process(COMMAND ${Python_EXECUTABLE} -c "import ${_mod}" OUTPUT_QUIET ERROR_QUIET RESULTS_VARIABLE check_for_${_mod}) if (check_for_${_mod}) - message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via 
\"${PYTHON_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") + message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${Python_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") endif(check_for_${_mod}) endforeach(_mod) @@ -51,7 +51,7 @@ if (BUILD_TESTING) add_test( NAME tiledarray/unit/python/run # need to use pytest to find tiledarray module properly - COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v + COMMAND ${Python_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) set_tests_properties(tiledarray/unit/python/run From d33511dac8e8baaaa28e295bffbd2503ef830c9d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Dec 2023 16:33:12 -0500 Subject: [PATCH 191/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/511 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 1181e2d570..8624da6e01 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 03c82cf2780d9e96298cc9140ac128c73eacd3b1 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag c0c4ea543439c740e3ee848fdd055c633a47f6c5 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index b5c6309e6f..eff687a3fe 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 03c82cf2780d9e96298cc9140ac128c73eacd3b1) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 4f7d30b0a738621037b96bb5b820029835753667) +set(TA_TRACKED_MADNESS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 03c82cf2780d9e96298cc9140ac128c73eacd3b1) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 9b234948d99c27f1cbc982a02180e40d2dacc96f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 20 Dec 2023 21:39:38 -0500 Subject: [PATCH 192/592] find python before building pybind11 to prevent the use of FindPython{Interp,Libs} --- python/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 168bfa2984..99e29e2a83 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,11 @@ cmake_minimum_required(VERSION 3.12) +project(python-tiledarray) + +if (NOT TARGET Python::Module) + find_package(Python COMPONENTS Interpreter Development REQUIRED) +endif() + FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/ValeevGroup/pybind11.git @@ -7,8 +13,6 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(pybind11) -project(python-tiledarray) - set(CMAKE_CXX_STANDARD 17) add_compile_options(-Wall) From f7e206d3a3fb70dde483e9003900b45fca28de87 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 8 Nov 2023 10:09:27 -0500 Subject: [PATCH 193/592] [unit] enabled tot x t test, does not compile @bimalgaudel will fix --- src/TiledArray/einsum/tiledarray.h | 6 +++--- tests/einsum.cpp | 
2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c248956066..7d4aca0425 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -422,9 +422,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { - static_assert(std::is_same::value); - using E = expressions::TsrExpr; - return Einsum::einsum(E(A), E(B), Einsum::idx(cs), world); + using ECT = expressions::TsrExpr; + using ECU = expressions::TsrExpr; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ee06cf099f..45c4d3e399 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -765,7 +765,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From e62a6757c1df6863a703d8163736495b30a7dc11 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 13 Nov 2023 12:28:02 -0500 Subject: [PATCH 194/592] [WIP] T x ToT overload of einsum: first attempt. 
--- src/TiledArray/einsum/tiledarray.h | 225 +++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7d4aca0425..52dab7477e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -283,6 +283,231 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; +} // namespace + +template < + typename ArrayT_, typename ArrayToT_, typename... Indices, + typename = std::enable_if_t && IsArrayToT>> +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, + std::tuple, Indices...> cs, + World &world) { + using ArrayT = std::remove_cv_t; + using ArrayToT = std::remove_cv_t; + using Shape = typename ArrayToT::shape_type; + using T = typename ArrayT::value_type; + using ToT = typename ArrayToT::value_type; + + auto a = std::get<0>(Einsum::idx(A)); + auto b = std::get<0>(Einsum::idx(B)); + Einsum::Index c = std::get<0>(cs); + + struct { + std::string a, b, c; + } inner; + if constexpr (std::tuple_size::value == 2) { + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.c = ";" + (std::string)std::get<1>(cs); + } + + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + auto e = (a ^ b); + // contracted indices + auto i = (a & b) - h; + + // cannot be hadamard reduction type operation for this overload + TA_ASSERT(e); + + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + + // maps Index to TiledRange1 + // (asserts same index maps to the same TR1 in A, and B) + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + + using ::Einsum::index::permutation; + using TiledArray::Permutation; + + auto arrayTermA = 
ArrayTerm{A.array(), a}; + auto arrayTermB = ArrayTerm{B.array(), b}; + + { + auto ei = (e + i & arrayTermA.idx); + if (arrayTermA.idx != h + ei) + arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); + arrayTermA.expr = ei; + } + + { + auto ei = (e + i & arrayTermB.idx); + if (arrayTermB.idx != h + ei) + arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); + arrayTermB.expr = ei; + } + + ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { + C.tiles *= Range(range_map[idx].tiles_range()); + } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; + + struct { + RangeProduct tiles; + std::vector> batch; + } H; + + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); + } + } + + using Index = Einsum::Index; + + // generalized contraction + { + auto ei = (e + i & arrayTermA.idx); + arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); + } + + { + auto ei = (e + i & arrayTermB.idx); + arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); + } + + std::vector> worlds; + std::vector> local_tiles; + + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &A = arrayTermA; + auto &B = arrayTermB; + + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + + { + arrayTermA.local_tiles.clear(); + const Permutation &P = arrayTermA.permutation; + + for (Index ei : arrayTermA.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermA.array.is_local(idx)) continue; 
+ if (arrayTermA.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermA.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermA.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermA.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermA.array.pmap()->is_replicated(); + arrayTermA.ei = TiledArray::make_array( + *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), + arrayTermA.local_tiles.end(), replicated); + } + + { + arrayTermB.local_tiles.clear(); + const Permutation &P = arrayTermB.permutation; + + for (Index ei : arrayTermB.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!arrayTermB.array.is_local(idx)) continue; + if (arrayTermB.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = arrayTermB.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = arrayTermB.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + arrayTermB.local_tiles.push_back({ei, tile}); + } + bool replicated = arrayTermB.array.pmap()->is_replicated(); + arrayTermB.ei = TiledArray::make_array( + *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), + arrayTermB.local_tiles.end(), replicated); + } + + // todo + // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayT(); + B.ei = ArrayToT(); + // why omitting this fence leads to deadlock? 
+ owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; + // TODO no need for immediate evaluation + auto tile = C.ei.find_local(e).get(); + assert(tile.batch_size() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); + if (P) tile = tile.permute(P); + local_tiles.push_back({c, tile}); + } + // mark for lazy deletion + C.ei = ArrayToT(); + } + + if constexpr (!Shape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + Shape shape(world, tile_norms, tiled_range); + C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + } + + for (auto &[index, tile] : local_tiles) { + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); + } + + for (auto &w : worlds) { + w->gop.fence(); + } + + return C.array; +} + +template && IsArrayToT>> +auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, + std::tuple, Indices...> cs, + World &world) { + return einsum(A, B, cs, world); +} + /// Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. 
From dce1bdc40203e78e7c3252ae30cc38eeff8528aa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 14 Nov 2023 14:34:05 -0500 Subject: [PATCH 195/592] tiny step towards supporting T*ToT in expr --- src/TiledArray/tensor/type_traits.h | 7 ++++--- src/TiledArray/tile_op/contract_reduce.h | 23 +++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index eed84c6026..fd197c8cdf 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -114,7 +114,7 @@ struct is_nested_tensor { /// @c is_nested_tensor_v is an alias for @c /// is_nested_tensor::value template -constexpr const bool is_nested_tensor_v = is_nested_tensor::value; +inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -150,7 +150,7 @@ struct is_tensor { /// @tparam Ts a parameter pack /// @c is_tensor_v is an alias for @c is_tensor::value template -constexpr const bool is_tensor_v = is_tensor::value; +inline constexpr const bool is_tensor_v = is_tensor::value; //////////////////////////////////////////////////////////////////////////////// @@ -172,7 +172,8 @@ struct is_tensor_of_tensor { /// @c is_tensor_of_tensor_v is an alias for @c /// is_tensor_of_tensor::value template -constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value; +inline constexpr const bool is_tensor_of_tensor_v = + is_tensor_of_tensor::value; //////////////////////////////////////////////////////////////////////////////// diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 48b7936d26..d9d87d59c8 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -64,17 +64,20 @@ class ContractReduceBase { using elem_muladd_op_type = void(result_value_type&, const left_value_type&, const 
right_value_type&); - static_assert( - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v, - "ContractReduce can only handle plain tensors or nested tensors " - "(tensors-of-tensors); mixed contractions are not supported"); static constexpr bool plain_tensors = - !(TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v); + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v; + static constexpr bool nested_tensors = + TiledArray::detail::is_nested_tensor_v; + static constexpr bool mixed_tensors = !plain_tensors && !nested_tensors; + static_assert(!mixed_tensors || + (mixed_tensors && + TiledArray::detail::is_nested_tensor_v), + "ContractReduce applied to 1 plain tensor and 1 nested tensor " + "must produce a nested tensor " + "(tensors-of-tensors)"); private: struct Impl { From 8230b165159b20a0600f3d195fb3db1474f5e268 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 12:41:58 -0500 Subject: [PATCH 196/592] [WIP]: Make binary_egine less restrictive on left and right arg types. 
--- src/TiledArray/einsum/tiledarray.h | 21 ++++++++++++--------- src/TiledArray/expressions/binary_engine.h | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 52dab7477e..09640d31f6 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -309,7 +309,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, Einsum::Index c = std::get<0>(cs); struct { - std::string a, b, c; + std::string b, c; } inner; if constexpr (std::tuple_size::value == 2) { inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); @@ -319,16 +319,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // these are "Hadamard" (fused) indices auto h = a & b & c; - auto e = (a ^ b); // contracted indices auto i = (a & b) - h; + // contraction not allowed in tensor x tensor-of-tensor + TA_ASSERT(!i); - // cannot be hadamard reduction type operation for this overload - TA_ASSERT(e); - - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - TA_ASSERT(!h || (!(a ^ b) && !(b ^ c))); + // indices exclusively in 'a' or exclusively in 'b' + auto e = (a ^ b); // maps Index to TiledRange1 // (asserts same index maps to the same TR1 in A, and B) @@ -364,6 +361,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; + arrayTermB.expr += inner.b; + C.expr += inner.c; + struct { RangeProduct tiles; std::vector> batch; @@ -453,7 +453,10 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } // todo - // C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + + // + A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); A.ei = ArrayT(); diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 4758ab0069..93192e2b5e 
100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -146,11 +146,10 @@ class BinaryEngine : public ExprEngine { TiledArray::detail::is_tensor_of_tensor_v; constexpr bool right_tile_is_tot = TiledArray::detail::is_tensor_of_tensor_v; - static_assert(!(left_tile_is_tot ^ right_tile_is_tot), - "ContEngine can only handle tensors of same nested-ness " - "(both plain or both ToT)"); constexpr bool args_are_plain_tensors = !left_tile_is_tot && !right_tile_is_tot; + constexpr bool args_are_mixed_tensors = + left_tile_is_tot ^ right_tile_is_tot; if (args_are_plain_tensors && (left_outer_permtype_ == PermutationType::matrix_transpose || left_outer_permtype_ == PermutationType::identity)) { @@ -175,6 +174,20 @@ class BinaryEngine : public ExprEngine { right_inner_permtype_ == PermutationType::identity))) { right_.permute_tiles(false); } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity))) { + left_.permute_tiles(false); + } + if (args_are_mixed_tensors && + ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) || + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity))) { + right_.permute_tiles(false); + } } public: From a129754727a63b8fe7a2840b323fc726f32b0399 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 20 Nov 2023 14:06:14 -0500 Subject: [PATCH 197/592] moar ToT * T progress --- src/TiledArray/expressions/cont_engine.h | 299 ++++++++++++++--------- src/TiledArray/expressions/mult_engine.h | 4 +- src/TiledArray/expressions/product.h | 3 + src/TiledArray/tile_op/scal.h | 2 + tests/einsum.cpp | 8 +- 5 files changed, 194 insertions(+), 122 deletions(-) diff --git 
a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 35c2f34199..9a1cb9f5f9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -107,15 +107,26 @@ class ContEngine : public BinaryEngine { protected: op_type op_; ///< Tile operation - using tile_element_type = typename value_type::value_type; - std::function - inner_tile_nonreturn_op_; ///< Tile element operation (only non-null for - ///< nested tensor expressions) - std::function - inner_tile_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns - ///< the result + + // tile types of the result and (after evaluation) left and right arguments + using result_tile_type = value_type; + using left_tile_type = typename EngineTrait::eval_type; + using right_tile_type = typename EngineTrait::eval_type; + + // tile element types of the result and (after evaluation) left and right + // arguments + using result_tile_element_type = typename result_tile_type::value_type; + using left_tile_element_type = typename left_tile_type::value_type; + using right_tile_element_type = typename right_tile_type::value_type; + + std::function + element_nonreturn_op_; ///< Tile element operation (only non-null for + ///< nested tensor expressions) + std::function + element_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns + ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ -239,8 +250,8 @@ class ContEngine : public BinaryEngine { // precondition checks // 1. 
if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - TA_ASSERT(inner_tile_nonreturn_op_); - TA_ASSERT(inner_tile_return_op_); + TA_ASSERT(element_nonreturn_op_); + TA_ASSERT(element_return_op_); } // Initialize children @@ -271,7 +282,7 @@ class ContEngine : public BinaryEngine { op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), (permute_tiles_ ? perm_ : BipartitePermutation{}), - this->inner_tile_nonreturn_op_); + this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer(perm_)); shape_ = ContEngine_::make_shape(outer(perm_)); @@ -284,7 +295,7 @@ class ContEngine : public BinaryEngine { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->inner_tile_nonreturn_op_); + BipartitePermutation{}, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -457,120 +468,172 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { - if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - using inner_tile_type = typename value_type::value_type; + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || inner_prod == TensorProduct::Hadamard); if (inner_prod == TensorProduct::Contraction) { - using inner_tile_type = typename value_type::value_type; - using contract_inner_tile_type = - TiledArray::detail::ContractReduce; - // factor_ is absorbed into inner_tile_nonreturn_op_ - auto contrreduce_op = - (inner_target_indices != 
inner(this->indices_)) - ? contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) - : contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_)); - this->inner_tile_nonreturn_op_ = [contrreduce_op]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - contrreduce_op(result, left, right); - }; + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // factor_ is absorbed into inner_tile_nonreturn_op_ + auto contrreduce_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_), + (this->permute_tiles_ ? inner(this->perm_) + : Permutation{})) + : op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_)); + this->element_nonreturn_op_ = + [contrreduce_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + }; + } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { - // inner tile op depends on the outer op ... e.g. 
if outer op - // is contract then inner must implement (ternary) multiply-add; - // if the outer is hadamard then the inner is binary multiply - const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { - using base_op_type = - TiledArray::detail::Mult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ - ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type()); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } - }; - } else { - using base_op_type = - TiledArray::detail::ScalMult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? 
inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + // inner tile op depends on the outer op ... e.g. if outer op + // is contract then inner must implement (ternary) multiply-add; + // if the outer is hadamard then the inner is binary multiply + const auto outer_prod = this->product_type(); + if (this->factor_ == 1) { + using base_op_type = + TiledArray::detail::Mult; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(), this->permute_tiles_ + ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type()); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } else { + using base_op_type = TiledArray::detail::ScalMult< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type, false, false>; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } // ToT x ToT + } else if (inner_prod == TensorProduct::General) { + TA_ASSERT(!tot_x_tot); + constexpr bool tot_x_t = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + constexpr bool t_x_tot = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + if constexpr (tot_x_t || t_x_tot) { + using arg_tile_element_type = + std::conditional_t; + using scalar_type = + std::conditional_t; + + auto scal_op = [do_perm = this->permute_tiles_, + perm = this->permute_tiles_ ? 
inner(this->perm_) + : Permutation{}]( + const left_tile_element_type& left, + const right_tile_element_type& right) + -> result_tile_element_type { + using TiledArray::scale; + if constexpr (tot_x_t) { + if (do_perm) + return scale(left, right, perm); + else + return scale(left, right); + } else if constexpr (tot_x_t) { + if (do_perm) + return scale(right, left, perm); + else + return scale(right, left); + } else + abort(); // unreachable }; + this->element_nonreturn_op_ = + [scal_op](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + result = scal_op(left, right); + }; } } else abort(); // unsupported TensorProduct type - TA_ASSERT(inner_tile_nonreturn_op_); - this->inner_tile_return_op_ = - [inner_tile_nonreturn_op = this->inner_tile_nonreturn_op_]( - const inner_tile_type& left, const inner_tile_type& right) { - inner_tile_type result; - inner_tile_nonreturn_op(result, left, right); - return result; - }; + TA_ASSERT(element_nonreturn_op_); + this->element_return_op_ = [inner_tile_nonreturn_op = + this->element_nonreturn_op_]( + const left_tile_element_type& left, + const right_tile_element_type& right) { + result_tile_element_type result; + inner_tile_nonreturn_op(result, left, right); + return result; + }; } } diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index a53133d4b0..91924efeb2 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -406,7 +406,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_)); + return op_type(op_base_type(this->element_return_op_)); } else abort(); } else { // plain tensors @@ -431,7 +431,7 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type(), perm); } else if (inner_prod 
== TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_), perm); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index d364764964..381b1f485c 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -57,6 +57,9 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Hadamard; else result = TensorProduct::Contraction; + } else if ((left_indices && !right_indices) || + (!left_indices && right_indices)) { // used for ToT*T or T*ToT + result = TensorProduct::General; } return result; } diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h index 54d5337ed4..a89770c5a7 100644 --- a/src/TiledArray/tile_op/scal.h +++ b/src/TiledArray/tile_op/scal.h @@ -128,6 +128,8 @@ class Scal { return Scal_::template eval(arg); } + void set_factor(const scalar_type factor) { factor_ = factor; } + }; // class Scal } // namespace detail diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 45c4d3e399..3033936381 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -764,8 +764,12 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work - tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // will try to make this work FIRST since this is used by the einsum code + // below + tot_type out; + out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); + // will try to make this work NEXT + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From bf959a241633501810dd0f04e5910983dc394c84 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 198/592] [skip_ci] add permutation 
optimizer for general case: supports inner operation between tot * t. --- src/TiledArray/expressions/permopt.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..dc029b73a1 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -527,6 +527,18 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; +/// +/// +/// +class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { + public: + GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; + GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = + default; + virtual ~GeneralPermutationOptimizer() = default; + using GEMMPermutationOptimizer::GEMMPermutationOptimizer; +}; + inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -540,6 +552,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } @@ -559,6 +574,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::General: + return std::make_shared( + left_indices, right_indices, prefer_to_permute_left); default: abort(); } From 8dd614ec8c2a946191c4ddf5811ea61ebb8bf7b8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 Nov 2023 22:55:22 -0500 Subject: [PATCH 199/592] add permutation optimizer for scaling --- src/CMakeLists.txt | 13 +-- src/TiledArray/expressions/permopt.cpp | 32 ++++++ src/TiledArray/expressions/permopt.h | 130 
+++++++++++++++++++++---- 3 files changed, 151 insertions(+), 24 deletions(-) create mode 100644 src/TiledArray/expressions/permopt.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 55227c2093..6e6c708891 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,7 +100,6 @@ TiledArray/dist_eval/contraction_eval.h TiledArray/dist_eval/dist_eval.h TiledArray/dist_eval/unary_eval.h TiledArray/einsum/index.h -TiledArray/einsum/index.cpp TiledArray/einsum/range.h TiledArray/einsum/string.h TiledArray/expressions/add_engine.h @@ -195,13 +194,10 @@ TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h TiledArray/util/logger.h -TiledArray/util/ptr_registry.cpp TiledArray/util/ptr_registry.h -TiledArray/util/random.cpp TiledArray/util/random.h TiledArray/util/singleton.h TiledArray/util/threads.h -TiledArray/util/threads.cpp TiledArray/util/thread_specific.h TiledArray/util/time.h TiledArray/util/vector.h @@ -243,10 +239,15 @@ TiledArray/tensor_impl.cpp TiledArray/array_impl.cpp TiledArray/dist_array.cpp TiledArray/version.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp +TiledArray/einsum/index.cpp +TiledArray/expressions/permopt.cpp TiledArray/math/linalg/basic.cpp TiledArray/math/linalg/rank-local.cpp +TiledArray/util/backtrace.cpp +TiledArray/util/bug.cpp +TiledArray/util/ptr_registry.cpp +TiledArray/util/random.cpp +TiledArray/util/threads.cpp ) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( diff --git a/src/TiledArray/expressions/permopt.cpp b/src/TiledArray/expressions/permopt.cpp new file mode 100644 index 0000000000..9b125fdc04 --- /dev/null +++ b/src/TiledArray/expressions/permopt.cpp @@ -0,0 +1,32 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2020 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * permopt.cpp + * Nov 21, 2023 + * + */ + +#include + +namespace TiledArray::expressions { + +IndexList ScalePermutationOptimizer::null_indices_; + +} // namespace TiledArray::expressions diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index dc029b73a1..998ea78efe 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -51,6 +52,56 @@ inline blas::Op to_cblas_op(PermutationType permtype) { : math::blas::NoTranspose; } +/// Optimizer of permutations for a unary operation +class UnaryOpPermutationOptimizer { + public: + /// construct using initial indices for the argument + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& argument_indices) + : argument_indices_(argument_indices) {} + + /// construct using initial indices for the argument, + /// and the desired result indices + /// \param result_indices the desired result index list + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& result_indices, + const IndexList& argument_indices) + : result_indices_(result_indices), 
argument_indices_(argument_indices) { + TA_ASSERT(argument_indices_.is_permutation(argument_indices_)); + target_result_indices_ = argument_indices_; + } + + UnaryOpPermutationOptimizer() = delete; + UnaryOpPermutationOptimizer(const UnaryOpPermutationOptimizer&) = default; + UnaryOpPermutationOptimizer& operator=(const UnaryOpPermutationOptimizer&) = + default; + virtual ~UnaryOpPermutationOptimizer() = default; + + /// \return the desired result indices + const IndexList& result_indices() const { + TA_ASSERT(result_indices_); + return result_indices_; + } + /// \return initial argument indices + const IndexList& argument_indices() const { return argument_indices_; } + + /// \return the proposed argument index list + const IndexList& target_argument_indices() const { + return target_result_indices_; + } + /// \return the proposed result index list (not necessarily same as that + /// returned by result_indices()) + const IndexList& target_result_indices() const { + return target_result_indices_; + } + /// \return the type of permutation bringing the initial left index list to + /// the target left index list + PermutationType argument_permtype() const { return PermutationType::general; } + + private: + IndexList result_indices_, argument_indices_, target_result_indices_; +}; + /// Abstract optimizer of permutations for a binary operation class BinaryOpPermutationOptimizer { public: @@ -479,6 +530,61 @@ class HadamardPermutationOptimizer : public BinaryOpPermutationOptimizer { IndexList target_result_indices_; }; +// clang-format off +/// Implements BinaryOpPermutationOptimizer interface for a scale operation viewed as a binary tensor product, i.e. 
+/// a tensor product between an order-0 tensor and an arbitrary tensor +// clang-format on +class ScalePermutationOptimizer : public BinaryOpPermutationOptimizer { + public: + ScalePermutationOptimizer(const ScalePermutationOptimizer&) = default; + ScalePermutationOptimizer& operator=(const ScalePermutationOptimizer&) = + default; + ~ScalePermutationOptimizer() = default; + + ScalePermutationOptimizer(const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(left_indices, right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices), + target_result_indices_(left_argument_is_scalar_ ? right_indices + : left_indices) {} + + ScalePermutationOptimizer(const IndexList& result_indices, + const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(result_indices, left_indices, + right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices) { + const auto& arg_indices = + left_argument_is_scalar_ ? right_indices : left_indices; + TA_ASSERT(arg_indices.is_permutation(result_indices)); + target_result_indices_ = arg_indices; + } + + const IndexList& target_left_indices() const override final { + return !left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_right_indices() const override final { + return left_argument_is_scalar_ ? 
target_result_indices_ : null_indices_; + } + const IndexList& target_result_indices() const override final { + return target_result_indices_; + } + PermutationType left_permtype() const override final { + return PermutationType::general; + } + PermutationType right_permtype() const override final { + return PermutationType::general; + } + TensorProduct op_type() const override final { return TensorProduct::Scale; } + + private: + bool left_argument_is_scalar_; + IndexList target_result_indices_; + static IndexList null_indices_; +}; + class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { public: NullBinaryOpPermutationOptimizer(const NullBinaryOpPermutationOptimizer&) = @@ -527,18 +633,6 @@ class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { } }; -/// -/// -/// -class GeneralPermutationOptimizer : public GEMMPermutationOptimizer { - public: - GeneralPermutationOptimizer(const GeneralPermutationOptimizer&) = default; - GeneralPermutationOptimizer& operator=(const GeneralPermutationOptimizer&) = - default; - virtual ~GeneralPermutationOptimizer() = default; - using GEMMPermutationOptimizer::GEMMPermutationOptimizer; -}; - inline std::shared_ptr make_permutation_optimizer( TensorProduct product_type, const IndexList& left_indices, const IndexList& right_indices, bool prefer_to_permute_left) { @@ -552,9 +646,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); - case TensorProduct::General: - return std::make_shared( - left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared(left_indices, + right_indices); default: abort(); } @@ -574,9 +668,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); - case TensorProduct::General: - return 
std::make_shared( - left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared( + target_indices, left_indices, right_indices); default: abort(); } From 43d61f02fec226a2c26744b210d8f93970299f24 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 21 Nov 2023 16:33:46 -0500 Subject: [PATCH 200/592] expression-level support for ToT x T (and vice versa) implemented, need to test --- src/TiledArray/expressions/cont_engine.h | 19 ++++----- src/TiledArray/expressions/product.h | 5 ++- tests/einsum.cpp | 49 +++++++++++++++++++++--- 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 9a1cb9f5f9..5ec69c7d0d 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -158,9 +158,10 @@ class ContEngine : public BinaryEngine { TensorProduct inner_product_type() const { TA_ASSERT(inner_product_type_ != TensorProduct::Invalid); // init_indices() must initialize this - /// only Hadamard and contraction are supported now + /// only Hadamard, contraction, and scale are supported now TA_ASSERT(inner_product_type_ == TensorProduct::Hadamard || - inner_product_type_ == TensorProduct::Contraction); + inner_product_type_ == TensorProduct::Contraction || + inner_product_type_ == TensorProduct::Scale); return inner_product_type_; } @@ -473,7 +474,8 @@ class ContEngine : public BinaryEngine { result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || - inner_prod == TensorProduct::Hadamard); + inner_prod == TensorProduct::Hadamard || + inner_prod == TensorProduct::Scale); if (inner_prod == TensorProduct::Contraction) { TA_ASSERT(tot_x_tot); if constexpr (tot_x_tot) { @@ -577,8 +579,8 @@ class ContEngine : public BinaryEngine { } }; } - } // ToT x ToT - } else if (inner_prod == 
TensorProduct::General) { + } // ToT x T or T x ToT + } else if (inner_prod == TensorProduct::Scale) { TA_ASSERT(!tot_x_tot); constexpr bool tot_x_t = TiledArray::detail::is_tensor_of_tensor_v { std::conditional_t; - auto scal_op = [do_perm = this->permute_tiles_, - perm = this->permute_tiles_ ? inner(this->perm_) + auto scal_op = [perm = this->permute_tiles_ ? inner(this->perm_) : Permutation{}]( const left_tile_element_type& left, const right_tile_element_type& right) -> result_tile_element_type { using TiledArray::scale; if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(left, right, perm); else return scale(left, right); } else if constexpr (tot_x_t) { - if (do_perm) + if (perm) return scale(right, left, perm); else return scale(right, left); diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index 381b1f485c..7111b7831b 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -39,6 +39,9 @@ enum class TensorProduct { Contraction, /// free, fused, and contracted indices General, + /// no indices on one, free indices on the other; only used for inner index + /// products in mixed nested products (ToT x T) + Scale, /// invalid Invalid = -1 }; @@ -59,7 +62,7 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Contraction; } else if ((left_indices && !right_indices) || (!left_indices && right_indices)) { // used for ToT*T or T*ToT - result = TensorProduct::General; + result = TensorProduct::Scale; } return result; } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3033936381..ea5529e5b8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -718,6 +718,49 @@ BOOST_AUTO_TEST_SUITE_END() // einsum_tot BOOST_AUTO_TEST_SUITE(einsum_tot_t) +BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = 
TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0)}; + tot_type ref_result(world, ref_result_trange); + // TODO compute ref_result + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); + + // TODO check result against ref_result +} + BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; @@ -764,11 +807,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type result; // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); - // will try to make this work FIRST since this is used by the einsum code - // below - tot_type out; - out("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l"); - // will try to make this work 
NEXT + // will try to make this work // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); } From 74e5e78a4897430e73e9e9af0133a3fca8188cd7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 14:54:30 -0500 Subject: [PATCH 201/592] [ci skip] implement 'i,j;m,n * j,k -> i,j,k;m,n' reference evaluation manually. --- tests/einsum.cpp | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ea5529e5b8..800d51d3e0 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -793,10 +793,41 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; - tot_type ref_result(world, ref_result_trange); // TODO compute ref_result + // i,j;m,n * j,k => i,j,k;m,n + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), + rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + for (auto const& tile : ref_result) { + tot_type::value_type result_tile{tile.make_range()}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{j, k})); + + res_el = lhs_el.scale(rhs_el); + } + + ref_result.set(tile.index(), result_tile); + } + + std::cout << ref_result << std::endl; ///////////////////////////////////////////////////////// // ToT * T From 
86f287768baacf5fcbda63795622487a08d0b54a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 17:34:55 -0500 Subject: [PATCH 202/592] [ci skip] more manual tot * t reference evaluation --- tests/einsum.cpp | 68 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 800d51d3e0..6501d91a10 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -751,14 +751,58 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { rhs.fill_random(); TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), - rhs_trange.dim(0)}; + rhs_trange.dim(0), lhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - // TODO compute ref_result + + // + // i,l,k,j;n,m = i,j;m,n * k,l + // + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto l = res_ix[1]; + auto k = res_ix[2]; + auto j = res_ix[3]; + + using Ix2 = std::array; + using Ix4 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, l})); + + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{1, 0}); // permute [0,1] -> [1,0] + } + return result_tile; + }; + + using std::begin; + using std::endl; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } tot_type result; 
BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // TODO check result against ref_result + // todo: fix it + // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + // BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -799,8 +843,11 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); - for (auto const& tile : ref_result) { - tot_type::value_type result_tile{tile.make_range()}; + // + // why cannot lhs and rhs be captured by ref? + // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; @@ -823,11 +870,16 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { res_el = lhs_el.scale(rhs_el); } + return result_tile; + }; - ref_result.set(tile.index(), result_tile); - } + using std::begin; + using std::endl; - std::cout << ref_result << std::endl; + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } ///////////////////////////////////////////////////////// // ToT * T From e40d882ada11464bec3b25b6999cacc9767d229a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:04:59 -0500 Subject: [PATCH 203/592] Add equality comparison for SparseShape. 
--- src/TiledArray/sparse_shape.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index bf51487922..271857a72c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -1742,6 +1742,17 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } +template +constexpr inline bool operator==(const SparseShape& a, + const SparseShape& b) { + return true; +} +template +constexpr inline bool operator!=(const SparseShape& a, + const SparseShape& b) { + return !(a == b); +} + #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From f9e4f0db11f1a9f07b85f0b5250935b3aa507d62 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 Nov 2023 22:05:40 -0500 Subject: [PATCH 204/592] Validate outer-product type tot * t evaluation using expression layer. --- tests/einsum.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6501d91a10..aad4a00c0a 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,9 +800,8 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - // todo: fix it - // const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - // BOOST_CHECK(are_equal); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From 42a1dc708397325ea768d7543a448a4050ddae71 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 11:42:05 -0500 Subject: [PATCH 205/592] [unit] einsum_tot_t pulls remote tiles using strick blocking (dowork=false) also fixed a few typos --- tests/einsum.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index aad4a00c0a..db2731a2e1 100644 --- a/tests/einsum.cpp +++ 
b/tests/einsum.cpp @@ -771,10 +771,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -790,7 +790,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); @@ -856,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -873,7 +873,7 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { }; using std::begin; - using std::endl; + using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); From 076f488905ca69150140bb97b4377f9690cd8a58 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 27 Nov 2023 12:04:54 -0500 Subject: [PATCH 206/592] [unit] einsum_tot_t must test ToT*T AND T*ToT (the latter is currently broken due to missing Tensor functionality for binary Scalar*Tensor ops) --- tests/einsum.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff 
--git a/tests/einsum.cpp b/tests/einsum.cpp index db2731a2e1..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -802,6 +802,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); + + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { @@ -887,10 +894,10 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("i,k,j;m,n") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t From 7b2a90b490bff387f0a52f7d335e98bc7440f968 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 27 Nov 2023 23:16:39 -0500 Subject: [PATCH 207/592] Avoid code-duplication by generalizing the existing einsum function. 
--- src/TiledArray/einsum/range.h | 3 +- src/TiledArray/einsum/tiledarray.h | 316 ++++++----------------------- tests/einsum.cpp | 12 +- 3 files changed, 72 insertions(+), 259 deletions(-) diff --git a/src/TiledArray/einsum/range.h b/src/TiledArray/einsum/range.h index 32eb669588..79b409e64d 100644 --- a/src/TiledArray/einsum/range.h +++ b/src/TiledArray/einsum/range.h @@ -14,7 +14,8 @@ using small_vector = TiledArray::container::svector; struct Range { using value_type = int64_t; using iterator = boost::counting_iterator; - template + template , bool> = true> explicit Range(Pair &&pair) : Range(pair.first, pair.second) {} Range(value_type begin, value_type end) : begin_(begin), end_(end) {} auto begin() const { return iterator(begin_); } diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 09640d31f6..1a3840f99f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -64,13 +64,38 @@ struct ArrayTerm { } }; -template -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; + +template +constexpr bool AreArrayT = IsArrayT && IsArrayT; + +template +constexpr bool AreArrayToT = IsArrayToT && IsArrayToT; + +template +constexpr bool AreArraySame = + AreArrayT || AreArrayToT; + +} // namespace + +template +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { - using Array = std::remove_cv_t; - using Tensor = typename Array::value_type; - using Shape = typename Array::shape_type; + using ArrayA = std::remove_cv_t; + using ArrayB = std::remove_cv_t; + using ArrayC = std::conditional_t< + AreArraySame, ArrayA, + std::conditional_t, ArrayA, ArrayB>>; + // using Array = ArrayC; + using ResultTensor = typename ArrayC::value_type; + using ResultShape = typename ArrayC::shape_type; auto a = 
std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); @@ -91,7 +116,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // no Hadamard indices => standard contraction (or even outer product) // same a, b, and c => pure Hadamard if (!h || (!(a ^ b) && !(b ^ c))) { - Array C; + ArrayC C; C(std::string(c) + inner.c) = A * B; return C; } @@ -108,17 +133,22 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ::Einsum::index::permutation; using TiledArray::Permutation; - ArrayTerm AB[2] = {{A.array(), a}, {B.array(), b}}; + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; - for (auto &term : AB) { + auto update_perm_and_indices = [&e = std::as_const(e), &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { auto ei = (e + i & term.idx); if (term.idx != h + ei) { term.permutation = permutation(term.idx, h + ei); } term.expr = ei; - } + }; - ArrayTerm C = {Array(world, TiledRange(range_map[c])), c}; + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); + + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; for (auto idx : e) { C.tiles *= Range(range_map[idx].tiles_range()); } @@ -127,8 +157,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } C.expr = e; - AB[0].expr += inner.a; - AB[1].expr += inner.b; + std::get<0>(AB).expr += inner.a; + std::get<1>(AB).expr += inner.b; + C.expr += inner.c; struct { @@ -163,7 +194,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - Tensor tile(TiledArray::Range{batch}, typename Tensor::value_type(0)); + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type(0)); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); @@ -193,16 +225,20 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // generalized contraction - 
for (auto &term : AB) { + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { auto ei = (e + i & term.idx); term.ei_tiled_range = TiledRange(range_map[ei]); for (auto idx : ei) { term.tiles *= Range(range_map[idx].tiles_range()); } - } + }; + + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); std::vector> worlds; - std::vector> local_tiles; + std::vector> local_tiles; // iterates over tiles of hadamard indices for (Index h : H.tiles) { @@ -216,7 +252,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t i = 0; i < h.size(); ++i) { batch *= H.batch[i].at(h[i]); } - for (auto &term : AB) { + + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { term.local_tiles.clear(); const Permutation &P = term.permutation; @@ -232,235 +269,18 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, term.local_tiles.push_back({ei, tile}); } bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( + term.ei = TiledArray::make_array( *owners, term.ei_tiled_range, term.local_tiles.begin(), term.local_tiles.end(), replicated); - } - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = Array(); - B.ei = Array(); - // why omitting this fence leads to deadlock? 
- owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); - } - // mark for lazy deletion - C.ei = Array(); - } - - if constexpr (!Shape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; - for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); - } - Shape shape(world, tile_norms, tiled_range); - C.array = Array(world, TiledRange(range_map[c]), shape); - } - - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } - - for (auto &w : worlds) { - w->gop.fence(); - } - - return C.array; -} - -namespace { -template -constexpr bool IsArrayT = detail::is_tensor_v; - -template -constexpr bool IsArrayToT = - detail::is_tensor_of_tensor_v; -} // namespace - -template < - typename ArrayT_, typename ArrayToT_, typename... 
Indices, - typename = std::enable_if_t && IsArrayToT>> -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, - std::tuple, Indices...> cs, - World &world) { - using ArrayT = std::remove_cv_t; - using ArrayToT = std::remove_cv_t; - using Shape = typename ArrayToT::shape_type; - using T = typename ArrayT::value_type; - using ToT = typename ArrayToT::value_type; - - auto a = std::get<0>(Einsum::idx(A)); - auto b = std::get<0>(Einsum::idx(B)); - Einsum::Index c = std::get<0>(cs); - - struct { - std::string b, c; - } inner; - if constexpr (std::tuple_size::value == 2) { - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); - inner.c = ";" + (std::string)std::get<1>(cs); - } + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); - // these are "Hadamard" (fused) indices - auto h = a & b & c; - - // contracted indices - auto i = (a & b) - h; - // contraction not allowed in tensor x tensor-of-tensor - TA_ASSERT(!i); - - // indices exclusively in 'a' or exclusively in 'b' - auto e = (a ^ b); - - // maps Index to TiledRange1 - // (asserts same index maps to the same TR1 in A, and B) - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - - using ::Einsum::index::permutation; - using TiledArray::Permutation; - - auto arrayTermA = ArrayTerm{A.array(), a}; - auto arrayTermB = ArrayTerm{B.array(), b}; - - { - auto ei = (e + i & arrayTermA.idx); - if (arrayTermA.idx != h + ei) - arrayTermA.permutation = permutation(arrayTermA.idx, h + ei); - arrayTermA.expr = ei; - } - - { - auto ei = (e + i & arrayTermB.idx); - if (arrayTermB.idx != h + ei) - arrayTermB.permutation = permutation(arrayTermB.idx, h + ei); - arrayTermB.expr = ei; - } - - ArrayTerm C = {ArrayToT(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; - - arrayTermB.expr += inner.b; - C.expr += 
inner.c; - - struct { - RangeProduct tiles; - std::vector> batch; - } H; - - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); - } - } - - using Index = Einsum::Index; - - // generalized contraction - { - auto ei = (e + i & arrayTermA.idx); - arrayTermA.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermA.tiles *= Range(range_map[idx].tiles_range()); - } - - { - auto ei = (e + i & arrayTermB.idx); - arrayTermB.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) arrayTermB.tiles *= Range(range_map[idx].tiles_range()); - } - - std::vector> worlds; - std::vector> local_tiles; - - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &A = arrayTermA; - auto &B = arrayTermB; - - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - - { - arrayTermA.local_tiles.clear(); - const Permutation &P = arrayTermA.permutation; - - for (Index ei : arrayTermA.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermA.array.is_local(idx)) continue; - if (arrayTermA.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermA.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermA.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermA.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermA.array.pmap()->is_replicated(); - arrayTermA.ei = TiledArray::make_array( - *owners, arrayTermA.ei_tiled_range, arrayTermA.local_tiles.begin(), - arrayTermA.local_tiles.end(), replicated); - } - - { - arrayTermB.local_tiles.clear(); - const Permutation &P = 
arrayTermB.permutation; - - for (Index ei : arrayTermB.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!arrayTermB.array.is_local(idx)) continue; - if (arrayTermB.array.is_zero(idx)) continue; - // TODO no need for immediate evaluation - auto tile = arrayTermB.array.find_local(idx).get(); - if (P) tile = tile.permute(P); - auto shape = arrayTermB.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - arrayTermB.local_tiles.push_back({ei, tile}); - } - bool replicated = arrayTermB.array.pmap()->is_replicated(); - arrayTermB.ei = TiledArray::make_array( - *owners, arrayTermB.ei_tiled_range, arrayTermB.local_tiles.begin(), - arrayTermB.local_tiles.end(), replicated); - } - - // todo C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - - // - A.ei.defer_deleter_to_next_fence(); B.ei.defer_deleter_to_next_fence(); - A.ei = ArrayT(); - B.ei = ArrayToT(); + A.ei = ArrayA(); + B.ei = ArrayB(); // why omitting this fence leads to deadlock? owners->gop.fence(); for (Index e : C.tiles) { @@ -478,17 +298,17 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, local_tiles.push_back({c, tile}); } // mark for lazy deletion - C.ei = ArrayToT(); + C.ei = ArrayC(); } - if constexpr (!Shape::is_dense()) { + if constexpr (!ResultShape::is_dense()) { TiledRange tiled_range = TiledRange(range_map[c]); std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { tile_norms.push_back({index, tile.norm()}); } - Shape shape(world, tile_norms, tiled_range); - C.array = ArrayToT(world, TiledRange(range_map[c]), shape); + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } for (auto &[index, tile] : local_tiles) { @@ -503,14 +323,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C.array; } -template && IsArrayToT>> -auto einsum(expressions::TsrExpr B, expressions::TsrExpr A, - std::tuple, Indices...> cs, - World &world) { - return einsum(A, B, cs, world); -} - /// 
Computes ternary tensor product whose result /// is a scalar (a ternary dot product). Optimized for the case where /// the arguments have common (Hadamard) indices. diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..8eea2884f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); - { // reverse the order - tot_type result; - BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_CHECK(are_equal); - } +// { // reverse the order +// tot_type result; +// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); +// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); +// BOOST_CHECK(are_equal); +// } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From c8f9542866a08ccfae45e6bbf4dd42d65c1641b8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 Nov 2023 10:28:47 -0500 Subject: [PATCH 208/592] In einsum, handle inner index labels when tot times t, or, t times tot arguments are passed. 
--- src/TiledArray/einsum/tiledarray.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 1a3840f99f..eb317e0aef 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -93,7 +93,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ArrayC = std::conditional_t< AreArraySame, ArrayA, std::conditional_t, ArrayA, ArrayB>>; - // using Array = ArrayC; using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; @@ -105,8 +104,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::string a, b, c; } inner; if constexpr (std::tuple_size::value == 2) { - inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) + inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + + if constexpr (IsArrayToT) + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + + static_assert(IsArrayToT || IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); } From f04a94358e4bbc8e0121363b563b6550a412569d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:00:36 -0500 Subject: [PATCH 209/592] amend https://github.com/ValeevGroup/tiledarray/commit/bff7d2888cd69e5ef4b9bb4ed86e775e6528c4db --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 5ec69c7d0d..21aceae14c 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -609,7 +609,7 @@ class ContEngine : public BinaryEngine { return scale(left, right, perm); else return scale(left, right); - } else if constexpr (tot_x_t) { + } else if constexpr (t_x_tot) { if (perm) return scale(right, left, perm); else From 
178393b84e229a967b2120838db3907ad4531f4c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 29 Nov 2023 17:02:22 -0500 Subject: [PATCH 210/592] relax type requirements on tensor_init to support mixed (ToT alongside T) invocations, this allows T * ToT expr to compile and unit test to succeed --- src/TiledArray/tensor/kernels.h | 7 ++++--- tests/einsum.cpp | 12 ++++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 87db8c1cc6..97f7dc1e5b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -541,9 +541,10 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors -template ::value>::type* = nullptr> +template < + typename Op, typename TR, typename T1, typename... Ts, + typename std::enable_if::value && + !is_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, const Permutation& perm, TR& result, const T1& tensor1, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensor1, tensors...)); diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8eea2884f9..37889a73f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -803,12 +803,12 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); -// { // reverse the order -// tot_type result; -// BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); -// const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); -// BOOST_CHECK(are_equal); -// } + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } } BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { From 3eb8280d9cd7c84b31c1050e369ed27c6ed27ac7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 30 Nov 2023 14:06:19 -0500 Subject: [PATCH 211/592] relax Tensor(left,right,binaryelemeop,permutation) ctor constraints --- src/TiledArray/tensor/tensor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 3c10ba4077..f3076c4514 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -492,7 +492,7 @@ class Tensor { /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, - typename std::enable_if::value && + typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& left, const T2& right, Op&& op, const Perm& perm) : Tensor(outer(perm) * left.range(), 1, default_construct{false}) { From 0f4e8183e13ce92a78219866f70afd7bda0a2bb7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 7 Dec 2023 18:38:25 -0500 Subject: [PATCH 212/592] Support for pure hadamard product between a tot and a t: 
'i,j;m,n * i,j -> i,j;m,n' --- src/TiledArray/expressions/binary_engine.h | 6 +- src/TiledArray/expressions/mult_engine.h | 6 ++ tests/einsum.cpp | 92 ++++++++++++++++++++++ 3 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 93192e2b5e..411a1c7c13 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -204,8 +204,10 @@ class BinaryEngine : public ExprEngine { /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size()); - TA_ASSERT(right_.indices().size() == target_indices.size()); + TA_ASSERT(left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT(right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 91924efeb2..9713e0b0df 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -407,6 +407,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); + } else if (inner_prod == TensorProduct::Scale) { + TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type()); } else abort(); } else { // plain tensors @@ -432,6 +435,9 @@ class MultEngine : public ContEngine> { return op_type(op_base_type(), perm); } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); + } else if (inner_prod == TensorProduct::Scale) { + 
TA_ASSERT(this->product_type() == TensorProduct::Hadamard); + return op_type(op_base_type(this->element_return_op_), perm); } else abort(); } else { // plain tensor diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 37889a73f9..9ea4dd39d3 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -900,6 +900,98 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); } +BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_4_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_4_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + Tensor lhs_elem_5_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_5_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, 
lhs_elem_4_1}, + {lhs_elem_5_0, lhs_elem_5_1}}; + TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + // + // i,j;m,n = j,i;n,m * i,j + // + TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + using Ix2 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); + auto rhs_el = + rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute + ); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); + + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); +} + BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices From ba2b9a3b90a8d80340427139bb0a9dc04e76f827 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:56:34 -0500 Subject: [PATCH 213/592] SparseShape inequality comparison added. 
--- src/TiledArray/sparse_shape.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 271857a72c..b589dc73cf 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -797,6 +797,13 @@ class SparseShape { return equal; } + /// Bitwise comparison + /// \param other a SparseShape object + /// \return true if this object and @c other object are bitwise NOT identical + inline bool operator!=(const SparseShape& other) const { + return !(*this == other); + } + private: /// Create a copy of a sub-block of the shape @@ -1742,17 +1749,6 @@ bool is_replicated(World& world, const SparseShape& shape) { return result; } -template -constexpr inline bool operator==(const SparseShape& a, - const SparseShape& b) { - return true; -} -template -constexpr inline bool operator!=(const SparseShape& a, - const SparseShape& b) { - return !(a == b); -} - #ifndef TILEDARRAY_HEADER_ONLY extern template class SparseShape; From be8e07a5667c02bbc9b1b516f9763db89038187d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:57:16 -0500 Subject: [PATCH 214/592] Disable shape comparison in ToTArrayFixture. --- tests/tot_array_fixture.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 9d46fadcc7..1619a794c8 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -237,6 +237,7 @@ struct ToTArrayFixture { * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons + * TODO: shape comparisons */ template @@ -254,7 +255,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if (lhs.shape() != rhs.shape()) return false; + // if (lhs.shape() != rhs.shape()) return false; // Same pmap? 
// if(*lhs.pmap() != *rhs.pmap()) return false; From e96df681b3f20328808b129ef16776c89e62dbe5 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 8 Dec 2023 07:58:25 -0500 Subject: [PATCH 215/592] Default construction of result tensor tile in `einsum` made more generic. --- src/TiledArray/einsum/tiledarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index eb317e0aef..48648407cb 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -199,7 +199,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, batch *= H.batch[i].at(h[i]); } ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type(0)); + typename ResultTensor::value_type{}); for (Index i : tiles) { // skip this unless both input tiles exist const auto pahi_inv = apply_inverse(pa, h + i); From 5b7c3dd5ed7f43d03ece64f93da8e28a7b5011a0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:00:17 -0500 Subject: [PATCH 216/592] Restore (optional) shape comparison on ToTArrayFixture::are_equal function. 
--- tests/einsum.cpp | 6 +++--- tests/tot_array_fixture.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 9ea4dd39d3..a1c26d1782 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -800,13 +800,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -988,7 +988,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 1619a794c8..21a9c956c6 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -231,16 +231,15 @@ struct ToTArrayFixture { * - Same type * - Either both are initialized or both are not initialized * - Same MPI context - * - Same shape + * - Same shape (unless the template parameter ShapeCmp is set false) * - Same distribution * - Same tiling * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons - * TODO: shape comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { // Same type @@ -255,7 +254,8 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? 
- // if (lhs.shape() != rhs.shape()) return false; + if constexpr (ShapeCmp) + if (lhs.shape() != rhs.shape()) return false; // Same pmap? // if(*lhs.pmap() != *rhs.pmap()) return false; From df240014a838cf2e43c408f82dff91fd00ac75a0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 10 Dec 2023 12:03:38 -0500 Subject: [PATCH 217/592] Relax restricitons on this->product_type() values while calling make_tile_op(). --- src/TiledArray/expressions/mult_engine.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 9713e0b0df..20093b2cec 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -408,7 +408,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_)); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type()); } else abort(); @@ -436,7 +435,6 @@ class MultEngine : public ContEngine> { } else if (inner_prod == TensorProduct::Contraction) { return op_type(op_base_type(this->element_return_op_), perm); } else if (inner_prod == TensorProduct::Scale) { - TA_ASSERT(this->product_type() == TensorProduct::Hadamard); return op_type(op_base_type(this->element_return_op_), perm); } else abort(); From cbf06b1c8c20aa38bb0d1c65487f75de06f02a23 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 11 Dec 2023 07:35:16 -0500 Subject: [PATCH 218/592] Typo. 
--- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a1c26d1782..ebd9784bfd 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -1269,7 +1269,7 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_abi_cdi_cdab) { "abi,cdi->cdab"); } -BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_ai_abcd) { +BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_bai_abcd) { einsum_tiledarray_check<3, 3, 4>(random(3, 12, 13), random(14, 15, 3), "icd,bai->abcd"); From c86b7d027560320f52179d8f402ceb460d61fc06 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 15 Dec 2023 09:28:57 -0500 Subject: [PATCH 219/592] [skip ci] einsum unit test for ij;mn * kj;mn -> ijk;mn --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ebd9784bfd..eb2ffe1869 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -580,6 +580,40 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { + using dist_array_t = DistArray>, DensePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + + auto random_tot = [](TA::Range const& rng) { + TA::Range inner_rng{7,14}; + TA::Tensor t{inner_rng}; + TA::Tensor> result{rng}; + for (auto& e: result) e = t; + return result; + }; + + auto random_tot_darr = [&random_tot](World& world, + TiledRange const& tr) { + dist_array_t result(world, tr); + for (auto it = result.begin(); it != result.end(); ++it) { + auto tile = + TA::get_default_world().taskq.add(random_tot, it.make_range()); + *it = tile; + } + return result; + }; + + TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + auto lhs = random_tot_darr(world, lhs_trange); + + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + auto rhs = random_tot_darr(world, rhs_trange); + dist_array_t result; + BOOST_REQUIRE_NO_THROW( + result = einsum(lhs("i,j;m,n"), 
rhs("k,j;m,n"), "i,j,k;m,n")); +} + BOOST_AUTO_TEST_CASE(xxx) { using dist_array_t = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; @@ -1328,6 +1362,13 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_hji_jih_hj) { "hji,jih->hj"); } +BOOST_AUTO_TEST_CASE(einsum_tiledarray_ik_jk_ijk) { + einsum_tiledarray_check<2, 2, 3>(random(7, 5), + random(14, 5), "ik,jk->ijk"); + einsum_tiledarray_check<2, 2, 3>(sparse_zero(7, 5), sparse_zero(14, 5), + "ik,jk->ijk"); +} + BOOST_AUTO_TEST_CASE(einsum_tiledarray_replicated) { einsum_tiledarray_check<3, 3, 3>(replicated(random(7, 14, 3)), random(7, 15, 3), From c72f3f4f0915e921498beeb66f562be32fca805f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 15 Dec 2023 10:45:59 -0500 Subject: [PATCH 220/592] Tensor::gemm involving custom elem_op supports batching --- src/TiledArray/tensor/tensor.h | 75 ++++++++++++++++++++++++---------- tests/einsum.cpp | 4 +- 2 files changed, 55 insertions(+), 24 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index f3076c4514..c901dc0f4b 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -292,10 +292,12 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. 
/// \param range The range of the tensor - explicit Tensor(const range_type& range) - : Tensor(range, 1, default_construct{true}) {} + /// \param batch_size The batch size (default is 1) + explicit Tensor(const range_type& range, size_type batch_size = 1) + : Tensor(range, batch_size, default_construct{true}) {} - /// Construct a tensor with a fill value + /// Construct a tensor of tensor values, setting all elements to the same + /// value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements @@ -312,12 +314,14 @@ class Tensor { new (data + i) value_type(cloner(value)); } - /// Construct a tensor with a fill value + /// Construct a tensor of scalars, setting all elements to the same value /// \param range An array with the size of of each dimension /// \param value The value of the tensor elements - template >::type* = nullptr> + template && + !detail::is_tensor::value>::type* = + nullptr> Tensor(const range_type& range, const Value& value) : Tensor(range, 1, default_construct{false}) { detail::tensor_init([value]() -> Value { return value; }, *this); @@ -358,7 +362,7 @@ class Tensor { math::uninitialized_copy_vector(range.volume(), u, this->data()); } - Tensor(const Range& range, std::initializer_list il) + explicit Tensor(const Range& range, std::initializer_list il) : Tensor(range, il.begin()) {} /// Construct a copy of a tensor interface object @@ -1004,6 +1008,22 @@ class Tensor { /// \return A mutable pointer to the tensor data pointer data() { return this->data_.get(); } + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + const_pointer batch_data(size_t batch_idx) const { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->batch_size()` + /// @return A const pointer to the tensor 
data of the batch \p batch_idx + pointer batch_data(size_t batch_idx) { + TA_ASSERT(batch_idx < this->batch_size()); + return data() + batch_idx * size(); + } + /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data @@ -2194,6 +2214,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + TA_ASSERT(left.batch_size() == right.batch_size()); + const auto batch_sz = left.batch_size(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2207,7 +2229,8 @@ class Tensor { if (this->empty()) { // initialize, if empty *this = Tensor(gemm_helper.make_result_range(left.range(), - right.range())); + right.range()), + batch_sz); } else { // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2230,6 +2253,9 @@ class Tensor { TA_ASSERT(ignore_tile_position() || gemm_helper.right_result_congruent( right.range().upbound_data(), this->range_.upbound_data())); + + // check that batch size of this matches that of left and right + TA_ASSERT(this->batch_size() == batch_sz); } // Compute gemm dimensions @@ -2243,20 +2269,25 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? N : K); - for (integer m = 0; m != M; ++m) { - for (integer n = 0; n != N; ++n) { - auto c_offset = m * N + n; - for (integer k = 0; k != K; ++k) { - auto a_offset = - gemm_helper.left_op() == TiledArray::math::blas::NoTranspose - ? m * lda + k - : k * lda + m; - auto b_offset = - gemm_helper.right_op() == TiledArray::math::blas::NoTranspose - ? 
k * ldb + n - : n * ldb + k; - elem_muladd_op(*(this->data() + c_offset), *(left.data() + a_offset), - *(right.data() + b_offset)); + for (integer b = 0; b != batch_size(); ++b) { + auto this_data = this->batch_data(b); + auto left_data = left.batch_data(b); + auto right_data = right.batch_data(b); + for (integer m = 0; m != M; ++m) { + for (integer n = 0; n != N; ++n) { + auto c_offset = m * N + n; + for (integer k = 0; k != K; ++k) { + auto a_offset = + gemm_helper.left_op() == TiledArray::math::blas::NoTranspose + ? m * lda + k + : k * lda + m; + auto b_offset = + gemm_helper.right_op() == TiledArray::math::blas::NoTranspose + ? k * ldb + n + : n * ldb + k; + elem_muladd_op(*(this_data + c_offset), *(left_data + a_offset), + *(right_data + b_offset)); + } } } } diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb2ffe1869..eb976b31f5 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -604,10 +604,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { return result; }; - TiledRange lhs_trange{{0, 2, 4}, {0, 5}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2, 5}}; auto lhs = random_tot_darr(world, lhs_trange); - TiledRange rhs_trange{{0, 2, 4, 6}, {0, 5}}; + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); dist_array_t result; BOOST_REQUIRE_NO_THROW( From 657a12887c119bd63366d509595cd486ec5cb081 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 13:10:40 -0500 Subject: [PATCH 221/592] Make single-valued initializer lists explicit in ambiguous cases. 
--- tests/initializer_list.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 4d051f957d..3f5ad27b80 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -471,7 +471,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(vector, T, scalar_type_list) { auto array = array_from_il>(world, tr, il); using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 2.0}), - tile_type(tr.make_tile_range(1), {3.0})}; + tile_type(tr.make_tile_range(1), std::initializer_list{3.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; tile_type tile = array.find(i); @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(matrix, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; @@ -503,11 +503,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0}), tile_type(tr.make_tile_range(4), {10.0, 13.0}), tile_type(tr.make_tile_range(5), {11.0, 12.0, 14.0, 15.0}), - tile_type(tr.make_tile_range(6), {16.0}), + tile_type(tr.make_tile_range(6), std::initializer_list{16.0}), tile_type(tr.make_tile_range(7), {17.0, 18.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; From a08026c0a5d84343fbbf88118cc935de6e0c45c4 Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:34:25 -0500 Subject: [PATCH 222/592] Use .data() method to access elements by ordinal in tensor_reduce function. --- src/TiledArray/tensor/kernels.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 97f7dc1e5b..f1ec6d99c5 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -787,8 +787,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.at_ordinal(ord), - tensors.at_ordinal(ord)...); + tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } From a5b253b5429bc6dbcafc2ee177c259f71502117f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 17 Dec 2023 16:36:08 -0500 Subject: [PATCH 223/592] Implement Tot x T (and reverse) generalized contraction. 
--- src/TiledArray/einsum/tiledarray.h | 84 +++++++++++++++--------------- tests/einsum.cpp | 14 +++-- 2 files changed, 53 insertions(+), 45 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 48648407cb..2bd548df5c 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,50 +181,51 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) { - TA_ASSERT(e); - } else if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); - } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); + if constexpr (AreArraySame) { + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) 
continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } + } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; } // generalized contraction @@ -468,7 +469,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { using ECT = expressions::TsrExpr; using ECU = expressions::TsrExpr; - return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); + using ResultExprT = std::conditional_t, T, U>; + return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); } template diff --git a/tests/einsum.cpp b/tests/einsum.cpp index eb976b31f5..3e7b502da9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -845,7 +845,7 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { } } -BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using t_type = DistArray, SparsePolicy>; using tot_type = DistArray>, SparsePolicy>; 
using matrix_il = TiledArray::detail::matrix_il>; @@ -877,7 +877,6 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { t_type rhs(world, rhs_trange); rhs.fill_random(); - // TODO compute ref_result // i,j;m,n * j,k => i,j,k;m,n TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), rhs_trange.dim(1)}; @@ -928,10 +927,17 @@ BOOST_AUTO_TEST_CASE(ikj_mn_eq_ij_mn_times_jk) { // - general product w.r.t. outer indices // - involves ToT * T // tot_type result; - // BOOST_REQUIRE_NO_THROW(result("k,i,j;n,m") = lhs("i,j;m,n") * rhs("j,k")); + // BOOST_REQUIRE_NO_THROW(result("i,j,k;m,n") = lhs("i,j;m,n") * rhs("j,k")); // will try to make this work - // tot_type out = einsum(lhs("i,j;m,n"), rhs("j,k"), "k,i,j;n,m"); + tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + { + result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); + are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); + } } BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { From f001847d09461a37d5686c34a1155f50b1a1fb63 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Dec 2023 15:05:19 -0500 Subject: [PATCH 224/592] einsum tot x tot 'i,j;m,n * j,k;m,n -> i,jk;m,n' unit-test compares results --- tests/einsum.cpp | 51 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3e7b502da9..3e66e4b05b 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -581,13 +581,16 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { } BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { - using dist_array_t = DistArray>, DensePolicy>; + using tot_type = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; auto& world = TiledArray::get_default_world(); auto random_tot = [](TA::Range const& rng) { TA::Range inner_rng{7,14}; TA::Tensor t{inner_rng}; 
+ std::generate(t.begin(),t.end(),[]()->double{ + return TA::detail::MakeRandom::generate_value(); + }); TA::Tensor> result{rng}; for (auto& e: result) e = t; return result; @@ -595,7 +598,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto random_tot_darr = [&random_tot](World& world, TiledRange const& tr) { - dist_array_t result(world, tr); + tot_type result(world, tr); for (auto it = result.begin(); it != result.end(); ++it) { auto tile = TA::get_default_world().taskq.add(random_tot, it.make_range()); @@ -609,9 +612,51 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; auto rhs = random_tot_darr(world, rhs_trange); - dist_array_t result; + tot_type result; BOOST_REQUIRE_NO_THROW( result = einsum(lhs("i,j;m,n"), rhs("k,j;m,n"), "i,j,k;m,n")); + + // i,j,k;m,n = i,j;m,n * k,j;m,n + TiledRange ref_result_trange{lhs.trange().dim(0), lhs.trange().dim(1), + rhs.trange().dim(0)}; + tot_type ref_result(world, ref_result_trange); + + // + // why cannot lhs and rhs be captured by ref? 
+ // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix: result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + using Ix2 = std::array; + using Ix3 = std::array; + + auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, j}); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = + result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); + auto const& lhs_el = + lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, j})); + res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { From f4bba8e9fd6bc879dd2e92ca342827249701bbfc Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Dec 2023 15:19:35 -0500 Subject: [PATCH 225/592] Make shape comparison flags more explicit. 
--- tests/einsum.cpp | 12 ++++++------ tests/tot_array_fixture.h | 10 ++++++++-- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3e66e4b05b..e518626e97 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -655,7 +655,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); *it = tile; } - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } @@ -879,13 +879,13 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -976,11 +976,11 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { // will try to make this work tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); { result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); - are_equal = ToTArrayFixture::are_equal(result, ref_result); + are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } } @@ -1073,7 +1073,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool 
are_equal = ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 21a9c956c6..c01399dbba 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -88,6 +88,12 @@ using input_archive_type = madness::archive::BinaryFstreamInputArchive; // Type of an output archive using output_archive_type = madness::archive::BinaryFstreamOutputArchive; +enum class ShapeComp { + True, + False +}; + + /* * * When generating arrays containing tensors of tensors (ToT) we adopt simple @@ -238,7 +244,7 @@ struct ToTArrayFixture { * * TODO: pmap comparisons */ - template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { @@ -254,7 +260,7 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if constexpr (ShapeCmp) + if constexpr (ShapeCompFlag == ShapeComp::True) if (lhs.shape() != rhs.shape()) return false; // Same pmap? From 0c30bb349dcbb1fd9489d07fb146e3de7d7fb413 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 07:53:44 -0500 Subject: [PATCH 226/592] use version-controlled clang-format.sh from https://github.com/ValeevGroup/DevOps/blob/master/tools/clang-format/clang-format.sh --- .pre-commit-config.yaml | 4 +- bin/admin/clang-format.sh | 94 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 2 deletions(-) create mode 100755 bin/admin/clang-format.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23f1509ca1..fd5c27bf6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,5 +38,5 @@ repos: name: Format C/C++ code using clang-format. 
language: system files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ - entry: clang-format -i - args: [--style=file] + entry: bin/admin/clang-format.sh + args: [--style=file -i] diff --git a/bin/admin/clang-format.sh b/bin/admin/clang-format.sh new file mode 100755 index 0000000000..3531dcc1b3 --- /dev/null +++ b/bin/admin/clang-format.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# these are the versions of clang-format that are supported required +# should be ordered from oldest to newest to make sure the newest is picked +supported_clang_format_versions="16 17" +preferred_clang_format_version="" # prefer most recent supported clang-format version +for v in $supported_clang_format_versions; do + preferred_clang_format_version=$v +done + +# append common locations of clang-format to PATH +unameOut="$(uname -s)" +case "${unameOut}" in + Darwin*) + extra_path="" + # this prefers more recent versions + for v in $supported_clang_format_versions; do + extra_path=/opt/homebrew/opt/llvm@$v/bin:/opt/homebrew/opt/clang-format@$v/bin:$extra_path + done + # prepend paths + export PATH=$extra_path:$PATH:/opt/homebrew/bin + ;; +esac + +path_to_clang_format=`which clang-format` +have_supported_clang_format_version=0 +if [[ "X$path_to_clang_format" != "X" ]]; then + + # check clang-format version + clang_format_version=`clang-format --version | sed 's/.* version //' | awk -F'[.]' '{print $1}'` + + #echo "supported_clang_format_versions=\"$supported_clang_format_versions\" clang_format_version=$clang_format_version" + + # if found clang-format, but wrong version, check if docker is available + for v in $supported_clang_format_versions; do + if [[ $clang_format_version -eq $v ]]; then + have_supported_clang_format_version=1 + break + fi + done +fi + +if [[ $have_supported_clang_format_version -eq 0 ]]; then + echo "WARNING: found clang-format with unsupported version $clang_format_version (supported versions: $supported_clang_format_versions)" + + # look for docker + path_to_docker=`which docker` + if [[ 
"X$path_to_docker" = "X" ]]; then + echo "ERROR: docker is not found either, PATH=$PATH, install one of supported clang-format versions (any of these: $supported_clang_format_versions) or install docker" + exit 1 + fi + + # if docker up? + docker info >/dev/null 2>&1 + if [[ $? -ne 0 ]]; then + echo "ERROR: docker is found but not running, start it" + exit 1 + fi + + # use docker to run clang-format + mount_path=$(readlink -f "$HOME") + + # convert file names in the arguments to relative paths + args="" + for i in "$@"; do + # skip options + if [[ "$i" == -* ]]; then + args="$args $i" + continue + fi + abs_file_path=$(readlink -f "$i") + if [[ "X$abs_file_path" = "X" ]]; then + echo "ERROR: given file $i is not found" + exit 1 + fi + + dir=$(dirname $abs_file_path) + file_path_relative_to_project_root=$(basename $abs_file_path) + while [[ "$dir" != "$mount_path" && "$dir" != "/" ]]; do + file_path_relative_to_project_root="$(basename $dir)/$file_path_relative_to_project_root" + dir=$(dirname $dir) + #echo "dir=$dir file_path_relative_to_project_root=$file_path_relative_to_project_root" + done + if [[ "$dir" == "/" ]]; then + echo "ERROR: given file $i (absolute path $abs_file_path) is not under \$HOME=$mount_path, cannot use docker-based clang-format in this case" + exit 1 + fi + args="$args /hostHOME/$file_path_relative_to_project_root" + done + docker run --platform linux/x86_64 -v $mount_path:/hostHOME xianpengshen/clang-tools:$preferred_clang_format_version clang-format $args +else + #echo "found $path_to_clang_format with required version $clang_format_version" + clang-format $* +fi From ba0be00b5e7ea9fc6b31a7789be81bd4a4cae959 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 07:56:35 -0500 Subject: [PATCH 227/592] [ut] einsum_tot/ijk_mn_eq_ij_mn_times_kj_mn : how NOT to compute ref_result --- tests/einsum.cpp | 78 ++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git 
a/tests/einsum.cpp b/tests/einsum.cpp index e518626e97..22a6ddc326 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -586,18 +586,17 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto& world = TiledArray::get_default_world(); auto random_tot = [](TA::Range const& rng) { - TA::Range inner_rng{7,14}; + TA::Range inner_rng{7, 14}; TA::Tensor t{inner_rng}; - std::generate(t.begin(),t.end(),[]()->double{ + std::generate(t.begin(), t.end(), []() -> double { return TA::detail::MakeRandom::generate_value(); }); TA::Tensor> result{rng}; - for (auto& e: result) e = t; + for (auto& e : result) e = t; return result; }; - auto random_tot_darr = [&random_tot](World& world, - TiledRange const& tr) { + auto random_tot_darr = [&random_tot](World& world, TiledRange const& tr) { tot_type result(world, tr); for (auto it = result.begin(); it != result.end(); ++it) { auto tile = @@ -621,12 +620,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { rhs.trange().dim(0)}; tot_type ref_result(world, ref_result_trange); - // - // why cannot lhs and rhs be captured by ref? 
- // - auto make_tile = [lhs, rhs](TA::Range const& rng) { + auto make_tile = [&lhs, &rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; - for (auto&& res_ix: result_tile.range()) { + for (auto&& res_ix : result_tile.range()) { auto i = res_ix[0]; auto j = res_ix[1]; auto k = res_ix[2]; @@ -643,7 +639,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto const& lhs_el = lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, j})); - res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n + res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n } return result_tile; }; @@ -651,12 +647,28 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using std::begin; using std::end; - for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + const auto have_spare_threads = madness::ThreadPool::size() > 0; + if (have_spare_threads) { + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + // using tasks does not work because: + // - make_tile pulls possibly remote data + // - but it also blocks thread on a remote tile futures, whose + // fulfillment requires available threads in the pool + // + // *it = world.taskq.add(make_tile, it.make_range()); + + // this technically will only work if the number of free threads in the + // pool is > 0 (i.e. 
main is not part of the pool or pool has 2 threads) + // + // OK, fine, @bosilca, blocking in tasks is BAD + *it = make_tile(it.make_range()); + } + } + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); - BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { @@ -879,13 +891,15 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); { // reverse the order tot_type result; BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } } @@ -976,11 +990,13 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { // will try to make this work tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n"); - bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); { result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n"); - are_equal = ToTArrayFixture::are_equal(result, ref_result); + are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_REQUIRE(are_equal); } } @@ -1014,12 +1030,9 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); Tensor lhs_elem_5_1( Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); - matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, - {lhs_elem_1_0, lhs_elem_1_1}, - {lhs_elem_2_0, lhs_elem_2_1}, - {lhs_elem_3_0, lhs_elem_3_1}, - {lhs_elem_4_0, lhs_elem_4_1}, - {lhs_elem_5_0, lhs_elem_5_1}}; + 
matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, {lhs_elem_3_0, lhs_elem_3_1}, + {lhs_elem_4_0, lhs_elem_4_1}, {lhs_elem_5_0, lhs_elem_5_1}}; TiledRange lhs_trange{{0, 2, 6}, {0, 2}}; tot_type lhs(world, lhs_trange, lhs_il); @@ -1046,17 +1059,15 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false ); + auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); auto const& lhs_el = lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); - auto rhs_el = - rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); - res_el = tot_type::element_type( - lhs_el.scale(rhs_el), // scale - TiledArray::Permutation{0, 1} // permute + auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + res_el = tot_type::element_type(lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute ); } return result_tile; @@ -1073,7 +1084,8 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { tot_type result; BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); - const bool are_equal = ToTArrayFixture::are_equal(result, ref_result); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); BOOST_CHECK(are_equal); } From 987040b68c06c69c10cd11728f493dfa55cedf0f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 23 Dec 2023 08:05:34 -0500 Subject: [PATCH 228/592] [ut] einsum_tot/ijk_mn_eq_ij_mn_times_kj_mn : how to compute ref_result --- tests/einsum.cpp | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 22a6ddc326..12692dc515 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -620,6 +620,10 @@ 
BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { rhs.trange().dim(0)}; tot_type ref_result(world, ref_result_trange); + // to be able to pull remote tiles make them local AND ready + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); auto make_tile = [&lhs, &rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; for (auto&& res_ix : result_tile.range()) { @@ -630,9 +634,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, j}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -647,28 +651,14 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using std::begin; using std::end; - const auto have_spare_threads = madness::ThreadPool::size() > 0; - if (have_spare_threads) { - for (auto it = begin(ref_result); it != end(ref_result); ++it) { - if (ref_result.is_local(it.index())) { - // using tasks does not work because: - // - make_tile pulls possibly remote data - // - but it also blocks thread on a remote tile futures, whose - // fulfillment requires available threads in the pool - // - // *it = world.taskq.add(make_tile, it.make_range()); - - // this technically will only work if the number of free threads in the - // pool is > 0 (i.e. 
main is not part of the pool or pool has 2 threads) - // - // OK, fine, @bosilca, blocking in tasks is BAD - *it = make_tile(it.make_range()); - } + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + *it = world.taskq.add(make_tile, it.make_range()); } - bool are_equal = - ToTArrayFixture::are_equal(result, ref_result); - BOOST_REQUIRE(are_equal); } + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); } BOOST_AUTO_TEST_CASE(xxx) { From 2392f2018d005c89ba804a2db78c891e24b7eb8c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 09:35:14 -0500 Subject: [PATCH 229/592] [ut] ref result manual computation pattern from previous commit applied to more cases. --- tests/einsum.cpp | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 12692dc515..57a31a48e8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -839,6 +839,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { // i,l,k,j;n,m = i,j;m,n * k,l // + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + // why cannot lhs and rhs be captured by ref? 
auto make_tile = [lhs, rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; @@ -852,10 +856,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using Ix4 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); @@ -874,8 +878,10 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } tot_type result; @@ -931,6 +937,9 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); + lhs.make_replicated(); + rhs.make_replicated(); + // // why cannot lhs and rhs be captured by ref? 
// @@ -945,10 +954,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork = */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); @@ -965,8 +974,10 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } ///////////////////////////////////////////////////////// @@ -1036,6 +1047,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)}; tot_type ref_result(world, ref_result_trange); + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + // why cannot lhs and rhs be captured by ref? 
auto make_tile = [lhs, rhs](TA::Range const& rng) { tot_type::value_type result_tile{rng}; @@ -1046,10 +1061,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { using Ix2 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); - auto lhs_tile = lhs.find(lhs_tile_ix).get(/* dowork */ false); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork */ false); auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); - auto rhs_tile = rhs.find(rhs_tile_ix).get(/* dowork */ false); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork */ false); auto& res_el = result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); @@ -1067,8 +1082,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { using std::end; for (auto it = begin(ref_result); it != end(ref_result); ++it) { - auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); - *it = tile; + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } } tot_type result; From 8b365a91ad6834071491f1525c9b426e66f02b81 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 10:35:24 -0500 Subject: [PATCH 230/592] [ut] typo --- tests/einsum.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 57a31a48e8..49e6812cac 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -939,6 +939,7 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { lhs.make_replicated(); rhs.make_replicated(); + world.gop.fence(); // // why cannot lhs and rhs be captured by ref? From 6c7a9f498b12101da345b519d496a4f9c33f89fd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 23 Dec 2023 12:33:18 -0500 Subject: [PATCH 231/592] [ci skip] add .batched_size() method to Tensor that returns size() multiplied by batch_size(). 
--- src/TiledArray/tensor/tensor.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index c901dc0f4b..e6c98b0cf0 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -672,6 +672,10 @@ class Tensor { /// \return The number of elements in the tensor ordinal_type size() const { return (this->range().volume()); } + /// \return The number of elements in the tensor by summing up the sizes of + /// the batches. + ordinal_type batched_size() const { return size() * batch_size(); } + /// Tensor data size (in bytes) accessor /// \return The number of bytes occupied by this tensor's data @@ -1064,10 +1068,10 @@ class Tensor { bool empty = this->empty(); auto range = this->range_; auto batch_size = this->batch_size_; - ar& empty; + ar & empty; if (!empty) { - ar& range; - ar& batch_size; + ar & range; + ar & batch_size; if constexpr (madness::is_input_archive_v) { *this = Tensor(std::move(range), batch_size, default_construct{true}); } From 60327021442f33bfff3e4e8d60ab7adce4c337a5 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 24 Dec 2023 12:31:11 -0500 Subject: [PATCH 232/592] Tensor reduce works on batch_size() * volume() many elements. 
--- src/TiledArray/tensor/kernels.h | 41 ++++++++++++++++++++++------- src/TiledArray/tensor/type_traits.h | 17 ++++++++++++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index f1ec6d99c5..c2f7c0897d 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -714,7 +714,12 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto init = std::forward(identity); math::reduce_op(std::forward(reduce_op), @@ -782,13 +787,17 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { - auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], - tensors.data()[ord]...); + for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { + auto temp = tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } @@ -825,7 +834,12 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); auto result 
= identity; if constexpr (detail::has_member_function_data_anyreturn_v && @@ -840,6 +854,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, join_op(result, temp); } } else { // if 1+ tensor lacks data() must iterate over individual elements + // TA_ASSERT(tensor1.batch_size() == 1); // todo: asser the same for the + // remaining tensors auto& t1_rng = tensor1.range(); using signed_idx_t = Range::index_difference_type; auto t1_lobound = signed_idx_t(t1_rng.lobound()); @@ -884,8 +900,15 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); + // TA_ASSERT(tensor1.batch_size() == 1); // todo: assert the same for the + // remaining tensors - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_batch_size_v) + return tensor1.batched_size(); + else + return tensor1.size(); + }(); Scalar result = identity; @@ -897,7 +920,7 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar& MADNESS_RESTRICT result, typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { + for (std::remove_cv_t i = 0ul; i < stride; ++i) { Scalar temp = tensor_reduce(reduce_op, join_op, identity, tensor1_data[i], tensors_data[i]...); join_op(result, temp); diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index fd197c8cdf..10fdb70204 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -400,6 +400,23 @@ struct ordinal_traits>> { std::decay_t().range())>>::type; }; +template +class has_batch_size { + /// true case + template + static auto __test(U* p) -> decltype(p->batch_size(), std::true_type()); + /// false case + template + static std::false_type __test(...); + + public: + static constexpr const bool value = + std::is_same(0))>::value; +}; + +template +constexpr inline bool has_batch_size_v = has_batch_size::value; + } // namespace detail } // namespace TiledArray From 985ed8ade8b5fb8b4f6f345df0c3a87248e4364a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 24 Dec 2023 16:44:28 -0500 Subject: [PATCH 233/592] [cmake] for cmake v3.28 set policy `CMP0146` to `OLD` to satisfy BLT that still uses FindCUDA + use CUDA_TOOLKIT_ROOT_DIR to assist FindCUDA in finding CUDA toolkit resolves #438 --- .gitlab-ci.yml | 1 + external/umpire.cmake | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fd9c49aefa..264b42f0bb 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,7 @@ variables: TA_UT_CTEST_TIMEOUT=3000 ${TA_PYTHON} ${ENABLE_CUDA} + CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda ${BLA_VENDOR} ${BLA_THREADS} ${ENABLE_SCALAPACK} diff --git a/external/umpire.cmake b/external/umpire.cmake index e61fe832e5..aa98f27b1e 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -109,6 +109,15 @@ else() if (DEFINED CMAKE_CUDA_ARCHITECTURES) list(APPEND UMPIRE_CMAKE_ARGS "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") endif(DEFINED 
CMAKE_CUDA_ARCHITECTURES) + # BLT will need FindCUDA until https://github.com/LLNL/blt/pull/585 is merged + # with CMake 3.28.1 needs to set CMP0146 to OLD + if (POLICY CMP0146) + list(APPEND UMPIRE_CMAKE_ARGS -DCMAKE_POLICY_DEFAULT_CMP0146=OLD) + endif() + # as of CMake 3.28+ FindCUDA seems to require CUDA_TOOLKIT_ROOT_DIR to be defined + if (DEFINED CUDA_TOOLKIT_ROOT_DIR) + list(APPEND UMPIRE_CMAKE_ARGS "-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") + endif() endif(ENABLE_CUDA) if (ENABLE_HIP) list(APPEND UMPIRE_CMAKE_ARGS From 959c84fe3f99b59b6e8cc3173ccea4a46557ea0f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 25 Dec 2023 15:32:44 -0500 Subject: [PATCH 234/592] Rename TA::Tensor member function 'batched_size' to 'total_size'. --- src/TiledArray/tensor/kernels.h | 16 ++++++++-------- src/TiledArray/tensor/tensor.h | 2 +- src/TiledArray/tensor/type_traits.h | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index c2f7c0897d..d87007205b 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -715,8 +715,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -788,8 +788,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -835,8 +835,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); 
const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); @@ -904,8 +904,8 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, // remaining tensors const auto volume = [&tensor1]() { - if constexpr (detail::has_batch_size_v) - return tensor1.batched_size(); + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); else return tensor1.size(); }(); diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index e6c98b0cf0..15f2dcdd3e 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -674,7 +674,7 @@ class Tensor { /// \return The number of elements in the tensor by summing up the sizes of /// the batches. - ordinal_type batched_size() const { return size() * batch_size(); } + ordinal_type total_size() const { return size() * batch_size(); } /// Tensor data size (in bytes) accessor diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 10fdb70204..89f8da70a2 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -401,10 +401,10 @@ struct ordinal_traits>> { }; template -class has_batch_size { +class has_total_size { /// true case template - static auto __test(U* p) -> decltype(p->batch_size(), std::true_type()); + static auto __test(U* p) -> decltype(p->total_size(), std::true_type()); /// false case template static std::false_type __test(...); @@ -415,7 +415,7 @@ class has_batch_size { }; template -constexpr inline bool has_batch_size_v = has_batch_size::value; +constexpr inline bool has_total_size_v = has_total_size::value; } // namespace detail From f0cd2a9b1b5166e8c856c768b8c602990be3480c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 25 Dec 2023 19:43:07 -0500 Subject: [PATCH 235/592] [cmake] disable clang-format use by umpire/blt --- 
external/umpire.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/external/umpire.cmake b/external/umpire.cmake index aa98f27b1e..efa0a0da36 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -87,6 +87,7 @@ else() -DENABLE_EXAMPLES=OFF -DENABLE_LOGGING=OFF -DENABLE_ASSERTS=${enable_umpire_asserts} + -DENABLE_CLANGFORMAT=OFF ) # caveat: on recent Ubuntu default libstdc++ provides filesystem, but if using older gcc (gcc-8) must link against From 0d4d2b6dc60adeabdeab08c3cc80efd5553f5bea Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 25 Dec 2023 20:46:37 -0500 Subject: [PATCH 236/592] Rename TA::Tensor and TA::Tile member function 'batch_size' to more revealing 'nbatch'. --- src/TiledArray/einsum/tiledarray.h | 2 +- src/TiledArray/tensor.h | 4 +- src/TiledArray/tensor/kernels.h | 4 +- src/TiledArray/tensor/tensor.h | 184 +++++++++++----------- src/TiledArray/tile.h | 10 +- src/TiledArray/tile_op/binary_reduction.h | 4 +- 6 files changed, 103 insertions(+), 105 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 2bd548df5c..18a3871f0b 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -293,7 +293,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, if (C.ei.is_zero(e)) continue; // TODO no need for immediate evaluation auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); + assert(tile.nbatch() == batch); const Permutation &P = C.permutation; auto c = apply(P, h + e); auto shape = C.array.trange().tile(c); diff --git a/src/TiledArray/tensor.h b/src/TiledArray/tensor.h index edb7ba2e47..20ecab9e0e 100644 --- a/src/TiledArray/tensor.h +++ b/src/TiledArray/tensor.h @@ -63,8 +63,8 @@ inline std::ostream& operator<<(std::ostream& os, const T& t) { os << t.range() << " { "; const auto n = t.range().volume(); std::size_t offset = 0ul; - const auto more_than_1_batch = t.batch_size() > 1; - for (auto b = 0ul; b != 
t.batch_size(); ++b) { + const auto more_than_1_batch = t.nbatch() > 1; + for (auto b = 0ul; b != t.nbatch(); ++b) { if (more_than_1_batch) { os << "[batch " << b << "]{ "; } diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index d87007205b..682cb1b209 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -854,7 +854,7 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, join_op(result, temp); } } else { // if 1+ tensor lacks data() must iterate over individual elements - // TA_ASSERT(tensor1.batch_size() == 1); // todo: asser the same for the + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the // remaining tensors auto& t1_rng = tensor1.range(); using signed_idx_t = Range::index_difference_type; @@ -900,7 +900,7 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - // TA_ASSERT(tensor1.batch_size() == 1); // todo: assert the same for the + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the // remaining tensors const auto volume = [&tensor1]() { diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 15f2dcdd3e..1b5beff19d 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -139,9 +139,9 @@ class Tensor { private: using default_construct = bool; - Tensor(const range_type& range, size_t batch_size, bool default_construct) - : range_(range), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(const range_type& range, size_t nbatch, bool default_construct) + : range_(range), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); if (default_construct) { @@ -177,9 +177,9 @@ class Tensor { #endif } - Tensor(range_type&& range, size_t batch_size, bool default_construct) - : 
range_(std::move(range)), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(range_type&& range, size_t nbatch, bool default_construct) + : range_(std::move(range)), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); if (default_construct) { @@ -232,7 +232,7 @@ class Tensor { range_type range_; ///< Range /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison - size_t batch_size_ = 1; + size_t nbatch_ = 1; std::shared_ptr data_; ///< Shared pointer to the data public: @@ -246,9 +246,7 @@ class Tensor { /// \post `*this` is a shallow copy of \p other , /// i.e. `*this == other && this->data()==other.data()` Tensor(const Tensor& other) - : range_(other.range_), - batch_size_(other.batch_size_), - data_(other.data_) { + : range_(other.range_), nbatch_(other.nbatch_), data_(other.data_) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -266,7 +264,7 @@ class Tensor { /// \post `other.empty()` Tensor(Tensor&& other) : range_(std::move(other.range_)), - batch_size_(std::move(other.batch_size_)), + nbatch_(std::move(other.nbatch_)), data_(std::move(other.data_)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -292,9 +290,9 @@ class Tensor { /// Construct a tensor with a range equal to \c range. The data is /// uninitialized. 
/// \param range The range of the tensor - /// \param batch_size The batch size (default is 1) - explicit Tensor(const range_type& range, size_type batch_size = 1) - : Tensor(range, batch_size, default_construct{true}) {} + /// \param nbatch The number of batches (default is 1) + explicit Tensor(const range_type& range, size_type nbatch = 1) + : Tensor(range, nbatch, default_construct{true}) {} /// Construct a tensor of tensor values, setting all elements to the same /// value @@ -519,15 +517,15 @@ class Tensor { /// Construct a tensor with a range equal to \c range using existing data /// \param range The range of the tensor - /// \param batch_size The batch size + /// \param nbatch The number of batches /// \param data shared pointer to the data - Tensor(const range_type& range, size_t batch_size, + Tensor(const range_type& range, size_t nbatch, std::shared_ptr data) - : range_(range), batch_size_(batch_size), data_(std::move(data)) { + : range_(range), nbatch_(nbatch), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( - this, make_string("TA::Tensor(range, batch_size, data)::data_.get()=", + this, make_string("TA::Tensor(range, nbatch, data)::data_.get()=", data_.get())); } #endif @@ -537,7 +535,7 @@ class Tensor { /// assuming unit batch size \param range The range of the tensor \param data /// shared pointer to the data Tensor(const range_type& range, std::shared_ptr data) - : range_(range), batch_size_(1), data_(std::move(data)) { + : range_(range), nbatch_(1), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -550,14 +548,14 @@ class Tensor { /// The batch size accessor /// @return the size of tensor batch represented by `*this` - size_t batch_size() const { return this->batch_size_; } + size_t nbatch() const { return this->nbatch_; } /// @param[in] idx the batch index - /// @pre `idx < this->batch_size()` - /// @return 
(plain, i.e. batch_size=1) Tensor representing element \p idx of + /// @pre `idx < this->nbatch()` + /// @return (plain, i.e. nbatch=1) Tensor representing element \p idx of /// the batch Tensor batch(size_t idx) const { - TA_ASSERT(idx < this->batch_size()); + TA_ASSERT(idx < this->nbatch()); std::shared_ptr data(this->data_, this->data_.get() + idx * this->size()); return Tensor(this->range(), 1, data); @@ -566,13 +564,13 @@ class Tensor { /// Returns Tensor representing the data using another range and batch size /// @param[in] range the Range of the result - /// @param[in] batch_size the batch size of the result + /// @param[in] nbatch the number of batches of the result /// @return Tensor object representing `this->data()` using @p range and @p - /// batch_size - auto reshape(const range_type& range, size_t batch_size = 1) const { - TA_ASSERT(this->range().volume() * this->batch_size() == - range.volume() * batch_size); - return Tensor(range, batch_size, this->data_); + /// nbatch + auto reshape(const range_type& range, size_t nbatch = 1) const { + TA_ASSERT(this->range().volume() * this->nbatch() == + range.volume() * nbatch); + return Tensor(range, nbatch, this->data_); } /// @return a deep copy of `*this` @@ -617,7 +615,7 @@ class Tensor { } #endif range_ = other.range_; - batch_size_ = other.batch_size_; + nbatch_ = other.nbatch_; data_ = other.data_; #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -650,7 +648,7 @@ class Tensor { } #endif range_ = std::move(other.range_); - batch_size_ = std::move(other.batch_size_); + nbatch_ = std::move(other.nbatch_); data_ = std::move(other.data_); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -674,14 +672,14 @@ class Tensor { /// \return The number of elements in the tensor by summing up the sizes of /// the batches. 
- ordinal_type total_size() const { return size() * batch_size(); } + ordinal_type total_size() const { return size() * nbatch(); } /// Tensor data size (in bytes) accessor /// \return The number of bytes occupied by this tensor's data /// \warning this only returns valid value if this is a tensor of scalars std::size_t nbytes() const { - return this->range().volume() * this->batch_size_ * sizeof(T); + return this->range().volume() * this->nbatch_ * sizeof(T); } /// Const element accessor @@ -690,7 +688,7 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference operator[](const Ordinal ord) const { @@ -700,7 +698,7 @@ class Tensor { TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -711,7 +709,7 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference operator[](const Ordinal ord) { @@ -721,7 +719,7 @@ class Tensor { TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -732,12 +730,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference at_ordinal(const Ordinal ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -748,12 +746,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference at_ordinal(const Ordinal ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -764,12 +762,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -781,12 +779,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -798,12 +796,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -815,12 +813,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -832,12 +830,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Ordinal& ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) // thus assume at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && @@ -853,12 +851,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Ordinal& ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) // thus assume at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && @@ -874,12 +872,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -891,12 +889,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -908,12 +906,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -925,12 +923,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . 
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -943,14 +941,14 @@ class Tensor { /// \param[in] i an index \return Const reference to the element at position /// \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -965,14 +963,14 @@ class Tensor { /// \param[in] i an index \return Reference to the element at position \c i /// . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... 
i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -1013,18 +1011,18 @@ class Tensor { pointer data() { return this->data_.get(); } /// @param[in] batch_idx the batch index - /// @pre `batch_idx < this->batch_size()` + /// @pre `batch_idx < this->nbatch()` /// @return A const pointer to the tensor data of the batch \p batch_idx const_pointer batch_data(size_t batch_idx) const { - TA_ASSERT(batch_idx < this->batch_size()); + TA_ASSERT(batch_idx < this->nbatch()); return data() + batch_idx * size(); } /// @param[in] batch_idx the batch index - /// @pre `batch_idx < this->batch_size()` + /// @pre `batch_idx < this->nbatch()` /// @return A const pointer to the tensor data of the batch \p batch_idx pointer batch_data(size_t batch_idx) { - TA_ASSERT(batch_idx < this->batch_size()); + TA_ASSERT(batch_idx < this->nbatch()); return data() + batch_idx * size(); } @@ -1049,9 +1047,9 @@ class Tensor { /// (`this->empty()` is equivalent to `*this == Tensor{}`), /// but is not identical /// to a default-constructed Tensor (e.g., `this->empty()` does not - /// imply `this->batch_size() == Tensor{}.batch_size()`) + /// imply `this->nbatch() == Tensor{}.nbatch()`) bool empty() const { - // empty data_ implies default values for range_ (but NOT batch_size_) + // empty data_ implies default values for range_ (but NOT nbatch_) TA_ASSERT( (this->data_.use_count() == 0 && !this->range_) || (this->data_.use_count() != 0 && this->range_)); // range is empty @@ -1067,16 +1065,16 @@ class Tensor { void serialize(Archive& ar) { bool empty = this->empty(); auto range = this->range_; - auto batch_size = this->batch_size_; + auto nbatch = this->nbatch_; ar & empty; if (!empty) { ar & range; - ar & batch_size; + ar & nbatch; if constexpr (madness::is_input_archive_v) { - *this = Tensor(std::move(range), batch_size, 
default_construct{true}); + *this = Tensor(std::move(range), nbatch, default_construct{true}); } ar& madness::archive::wrap(this->data_.get(), - this->range_.volume() * batch_size); + this->range_.volume() * nbatch); } else { if constexpr (madness::is_input_archive_v) { *this = Tensor{}; @@ -1105,7 +1103,7 @@ class Tensor { #endif std::swap(data_, other.data_); std::swap(range_, other.range_); - std::swap(batch_size_, other.batch_size_); + std::swap(nbatch_, other.nbatch_); #ifdef TA_TENSOR_MEM_TRACE if (other_to_be_traced) { ptr_registry()->insert( @@ -2123,11 +2121,11 @@ class Tensor { if (this->empty()) { *this = Tensor(gemm_helper.make_result_range(A.range_, B.range()), - A.batch_size(), default_construct{true}); + A.nbatch(), default_construct{true}); beta = 0; } - TA_ASSERT(this->batch_size() == A.batch_size()); - TA_ASSERT(this->batch_size() == B.batch_size()); + TA_ASSERT(this->nbatch() == A.nbatch()); + TA_ASSERT(this->nbatch() == B.nbatch()); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -2138,11 +2136,11 @@ class Tensor { std::unique_ptr data_copy; size_t tile_volume; if (twostep) { - tile_volume = range().volume() * batch_size(); + tile_volume = range().volume() * nbatch(); data_copy = std::make_unique(tile_volume); std::copy(data_.get(), data_.get() + tile_volume, data_copy.get()); } - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); TiledArray::gemm(alpha, A.batch(i), B.batch(i), twostep ? 
numeric_type(0) : numeric_type(1), Ci, @@ -2183,7 +2181,7 @@ class Tensor { TiledArray::TileOpsLogger::get_instance().gemm_printer( *logger.log, tformed_left_range, A.data(), tformed_right_range, B.data(), tformed_right_range, - this->data(), this->batch_size()); + this->data(), this->nbatch()); } } } @@ -2196,7 +2194,7 @@ class Tensor { } } #else // TA_ENABLE_TILE_OPS_LOGGING - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); TiledArray::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, gemm_helper); } @@ -2218,8 +2216,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); - TA_ASSERT(left.batch_size() == right.batch_size()); - const auto batch_sz = left.batch_size(); + TA_ASSERT(left.nbatch() == right.nbatch()); + const auto batch_sz = left.nbatch(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2259,7 +2257,7 @@ class Tensor { right.range().upbound_data(), this->range_.upbound_data())); // check that batch size of this matches that of left and right - TA_ASSERT(this->batch_size() == batch_sz); + TA_ASSERT(this->nbatch() == batch_sz); } // Compute gemm dimensions @@ -2273,7 +2271,7 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
N : K); - for (integer b = 0; b != batch_size(); ++b) { + for (integer b = 0; b != nbatch(); ++b) { auto this_data = this->batch_data(b); auto left_data = left.batch_data(b); auto right_data = right.batch_data(b); @@ -2599,9 +2597,9 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, TA_ASSERT(!B.empty()); TA_ASSERT(B.range().rank() == gemm_helper.right_rank()); - TA_ASSERT(A.batch_size() == 1); - TA_ASSERT(B.batch_size() == 1); - TA_ASSERT(C.batch_size() == 1); + TA_ASSERT(A.nbatch() == 1); + TA_ASSERT(B.nbatch() == 1); + TA_ASSERT(C.nbatch() == 1); // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2699,7 +2697,7 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, TiledArray::TileOpsLogger::get_instance().gemm_printer( *logger.log, tformed_left_range, A.data(), tformed_right_range, B.data(), tformed_right_range, C.data(), - C.batch_size()); + C.nbatch()); } } } @@ -2725,8 +2723,8 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, /// \param[in] a a Tensor object /// \param[in] b another Tensor object /// \return true if ranges and data of \p a and \p b are equal -/// \internal this does not compare batch_size so any -/// 2 empty tensors are equal even if their batch_size +/// \internal this does not compare nbatch so any +/// 2 empty tensors are equal even if their nbatch /// differ template bool operator==(const Tensor& a, const Tensor& b) { diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index b8242fbf19..1091362287 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -589,7 +589,7 @@ class Tile { void serialize(Archive& ar) const { // Serialize data for empty tile check bool empty = !static_cast(pimpl_); - ar& empty; + ar & empty; if (!empty) { // Serialize tile data ar&* pimpl_; @@ -602,12 +602,12 @@ class Tile { void serialize(Archive& ar) { // Check for empty tile bool empty = false; - ar& empty; + ar & empty; if (!empty) { // Deserialize tile data tensor_type 
tensor; - ar& tensor; + ar & tensor; // construct a new pimpl pimpl_ = std::make_shared(std::move(tensor)); @@ -617,10 +617,10 @@ class Tile { } } - constexpr static std::size_t batch_size() { return 1; } + constexpr static std::size_t nbatch() { return 1; } const auto& batch(std::size_t idx) const { - TA_ASSERT(idx < this->batch_size()); + TA_ASSERT(idx < this->nbatch()); return *this; } diff --git a/src/TiledArray/tile_op/binary_reduction.h b/src/TiledArray/tile_op/binary_reduction.h index d65d133f32..4bbac16bcf 100644 --- a/src/TiledArray/tile_op/binary_reduction.h +++ b/src/TiledArray/tile_op/binary_reduction.h @@ -63,8 +63,8 @@ class DotReduction { void operator()(result_type& result, const first_argument_type& left, const second_argument_type& right) const { using TiledArray::dot; - TA_ASSERT(left.batch_size() == right.batch_size()); - size_t nb = left.batch_size(); + TA_ASSERT(left.nbatch() == right.nbatch()); + size_t nb = left.nbatch(); for (size_t i = 0; i < nb; ++i) { result += dot(left.batch(i), right.batch(i)); } From efb852e9efa864d965fd29dff5d7bb5100694da1 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 2 Jan 2024 14:43:38 -0500 Subject: [PATCH 237/592] Generic scalar_type instead of a cpp literal value --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 21aceae14c..2a658dc886 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -511,7 +511,7 @@ class ContEngine : public BinaryEngine { // is contract then inner must implement (ternary) multiply-add; // if the outer is hadamard then the inner is binary multiply const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { + if (this->factor_ == scalar_type{1}) { using base_op_type = TiledArray::detail::Mult Date: Wed, 3 Jan 2024 10:29:58 -0500 Subject: [PATCH 238/592] bump MADNESS tag 
to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/512 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/dist_eval/dist_eval.h | 2 +- tests/dist_op_communicator.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 8624da6e01..cbdbc817a2 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag c0c4ea543439c740e3ee848fdd055c633a47f6c5 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0cb3920715c9a659bbb8158f9a31db1bd97d4614 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index eff687a3fe..1780dbbfb1 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 03c82cf2780d9e96298cc9140ac128c73eacd3b1) +set(TA_TRACKED_MADNESS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index 2fd6329de5..c6d0442174 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -110,7 +110,7 @@ class DistEvalImpl : public TensorImpl, const std::shared_ptr& pmap, const Permutation& perm) : TensorImpl_(world, trange, shape, pmap), - id_(world.unique_obj_id()), + id_(world.make_unique_obj_id()), source_to_target_(), target_to_source_(), task_count_(-1), diff --git a/tests/dist_op_communicator.cpp b/tests/dist_op_communicator.cpp index 4eac7a135c..28922e8d6c 100644 --- a/tests/dist_op_communicator.cpp +++ b/tests/dist_op_communicator.cpp @@ -30,9 +30,9 @@ struct DistOpFixture { DistOpFixture() : group_list(), world_group_list(), - group_did(GlobalFixture::world->unique_obj_id(), + group_did(GlobalFixture::world->make_unique_obj_id(), GlobalFixture::world->rank() % 2), - world_did(GlobalFixture::world->unique_obj_id(), + world_did(GlobalFixture::world->make_unique_obj_id(), GlobalFixture::world->size()) { for (ProcessID p = GlobalFixture::world->rank() % 2; p < GlobalFixture::world->size(); p += 2) From 74759c77fedd7876616253f76bbb922023e60802 Mon Sep 17 
00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:35:48 -0500 Subject: [PATCH 239/592] introduced TensorImpl::local_nnz --- src/TiledArray/tensor_impl.h | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor_impl.h b/src/TiledArray/tensor_impl.h index 6811fc6cb2..7ead791fd2 100644 --- a/src/TiledArray/tensor_impl.h +++ b/src/TiledArray/tensor_impl.h @@ -53,6 +53,8 @@ class TensorImpl : private NO_DEFAULTS { const trange_type trange_; ///< Tiled range type std::shared_ptr shape_; ///< Tensor shape std::shared_ptr pmap_; ///< Process map for tiles + mutable std::atomic> + local_nnz_; ///< Number of nonzero tiles assigned to this rank (memoized) public: /// Constructor @@ -74,6 +76,7 @@ class TensorImpl : private NO_DEFAULTS { trange_(trange), shape_(std::make_shared(shape)), pmap_(pmap) { + local_nnz_ = -1; // ensure that shapes are identical on every rank if (replicate_shape && !shape.is_dense()) world.gop.broadcast_serializable(*shape_, 0); @@ -115,8 +118,8 @@ class TensorImpl : private NO_DEFAULTS { /// Tensor tile volume accessor - /// \return The number of tiles in the tensor - /// \throw nothing + /// \return The number of tiles in the tensor, equivalent to + /// `this->trange().tiles_range().volume()` \throw nothing ordinal_type size() const { return trange_.tiles_range().volume(); } /// Max count of local tiles @@ -131,6 +134,27 @@ class TensorImpl : private NO_DEFAULTS { return static_cast(pmap_->local_size()); } + /// Count of nonzero local tiles + + /// This function is primarily available for debugging purposes. 
+ /// \return The count of nonzero local tiles; for dense array this will be + /// equal to the value produced by local_size(), for a sparse array this will + /// be less than the value produced by local_size() + ordinal_type local_nnz() const { + if (local_nnz_ == -1) { + if (is_dense()) + local_nnz_ = local_size(); + else { + ordinal_type count = 0; + for (auto&& idx : trange_.tiles_range()) { + if (is_local(idx) && !is_zero(idx)) ++count; + } + local_nnz_ = count; + } + } + return local_nnz_; + } + /// Query a tile owner /// \tparam Index The sized integral range type From f3716f836e6289a89d499df27ed38b24a00d9467 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:52:39 -0500 Subject: [PATCH 240/592] annotate virtual DistEval class members with override --- src/TiledArray/dist_eval/array_eval.h | 10 +++------- src/TiledArray/dist_eval/binary_eval.h | 6 +++--- src/TiledArray/dist_eval/contraction_eval.h | 6 +++--- src/TiledArray/dist_eval/dist_eval.h | 2 +- src/TiledArray/dist_eval/unary_eval.h | 6 +++--- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index 3bb34742cf..bb1ac49ae4 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -250,7 +250,7 @@ class ArrayEvalImpl /// Virtual destructor virtual ~ArrayEvalImpl() {} - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { // Get the array index that corresponds to the target index auto array_index = DistEvalImpl_::perm_index_to_source(i); @@ -266,11 +266,7 @@ class ArrayEvalImpl return eval_tile(tile, consumable_tile); } - /// Discard a tile that is not needed - - /// This function handles the cleanup for tiles that are not needed in - /// subsequent computation. 
- virtual void discard_tile(ordinal_type) const { + void discard_tile(ordinal_type i) const override { const_cast(this)->notify(); } @@ -305,7 +301,6 @@ class ArrayEvalImpl /// This function will evaluate the children of this distributed evaluator /// and evaluate the tiles for this distributed evaluator. /// \return The number of tiles that will be set by this process - virtual int internal_eval() { // Counter for the number of tasks submitted by this object int task_count = 0; @@ -325,6 +320,7 @@ class ArrayEvalImpl } return task_count; + int internal_eval() override { } }; // class ArrayEvalImpl diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index fa33d74d9c..e343c087b3 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -100,7 +100,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); @@ -118,7 +118,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Task function for evaluating tiles @@ -160,7 +160,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { // Evaluate child tensors left_.eval(); right_.eval(); diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 18aac80c57..8ff0d80091 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -1560,7 +1560,7 @@ class Summa /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); @@ -1584,7 +1584,7 @@ class Summa /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Adjust iteration depth based on memory constraints @@ -1647,7 +1647,7 @@ class Summa /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index c6d0442174..7585b7e4bf 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -176,7 +176,7 @@ class DistEvalImpl : public TensorImpl, } /// Tile set notification - virtual void notify() { set_counter_++; } + void notify() override { set_counter_++; } /// Wait for all tiles to be assigned void wait() const { diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index 191d247aef..d687fcb4af 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -85,7 +85,7 @@ class UnaryEvalImpl /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); const auto source = arg_.owner(DistEvalImpl_::perm_index_to_source(i)); @@ -98,7 +98,7 @@ class UnaryEvalImpl /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Input tile argument type @@ -144,7 +144,7 @@ class UnaryEvalImpl /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { // Convert pimpl to this object type so it can be used in tasks std::shared_ptr self = std::enable_shared_from_this::shared_from_this(); From ee1b36765cc07c6afa0c88e45f7804a89208ffe5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 10:59:42 -0500 Subject: [PATCH 241/592] if MADNESS configured with ENABLE_WORLDOBJECT_FUTURE_TRACE trace futures associated with DistributedStorage bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/514 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/distributed_storage.h | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index cbdbc817a2..c3b7b0659f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0cb3920715c9a659bbb8158f9a31db1bd97d4614 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag cf3c98053453329f35b775c8b9f561301f6a997e . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 1780dbbfb1..9499354eba 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG c0c4ea543439c740e3ee848fdd055c633a47f6c5) +set(TA_TRACKED_MADNESS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 27c2885dcd..47c52ead2a 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -234,6 +234,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. const_accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -249,6 +256,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. 
accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -308,6 +322,14 @@ class DistributedStorage : public madness::WorldObject > { // Set the future existing_f.set(f); } +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + else { + auto& f_nonconst_ref = + const_castsecond)>&>( + acc->second); + this->trace(f_nonconst_ref); + } +#endif } else { if (f.probe()) { set_remote(i, f); From 886ec199cfae45d19eab876e0ab45c4504b9ba09 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 11:01:31 -0500 Subject: [PATCH 242/592] binary_wrapper.h: hush warnings due to implicitly capture of `this` --- src/TiledArray/tile_op/binary_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index b66be2986d..dac995f94b 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -294,10 +294,10 @@ class BinaryWrapper { if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); - auto op_left = [=](eval_t& _left, eval_t& _right) { + auto op_left = [=, this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); }; - auto op_right = [=](eval_t& _left, eval_t& _right) { + auto op_right = [=, this](eval_t& _left, eval_t& _right) { return op_.consume_right(_left, _right); }; // Override consumable From c3a36dc247200212cb6a3de4949b986f9f283fed Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Jan 2024 11:02:55 -0500 Subject: [PATCH 243/592] reimplement ArrayEvalImpl::internal_eval() using TensorImpl::local_nnz() --- src/TiledArray/dist_eval/array_eval.h | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h 
b/src/TiledArray/dist_eval/array_eval.h index bb1ac49ae4..10ad0543e0 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -301,27 +301,7 @@ class ArrayEvalImpl /// This function will evaluate the children of this distributed evaluator /// and evaluate the tiles for this distributed evaluator. /// \return The number of tiles that will be set by this process - // Counter for the number of tasks submitted by this object - int task_count = 0; - - // Get a count of the number of local tiles. - if (TensorImpl_::shape().is_dense()) { - task_count = TensorImpl_::pmap()->local_size(); - } else { - // Create iterator to tiles that are local for this evaluator. - typename array_type::pmap_interface::const_iterator it = - TensorImpl_::pmap()->begin(); - const typename array_type::pmap_interface::const_iterator end = - TensorImpl_::pmap()->end(); - - for (; it != end; ++it) { - if (!TensorImpl_::is_zero(*it)) ++task_count; - } - } - - return task_count; - int internal_eval() override { - } + int internal_eval() override { return TensorImpl_::local_nnz(); } }; // class ArrayEvalImpl From a9c7e62d6a58695c5e4c48c7799c591d8dd1d032 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 3 Jan 2024 17:06:51 -0500 Subject: [PATCH 244/592] Bug fix. 
--- src/TiledArray/einsum/tiledarray.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 18a3871f0b..1851973709 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -181,7 +181,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) TA_ASSERT(e); if constexpr (AreArraySame) { if (!e) { // hadamard reduction auto &[A, B] = AB; From c16ecc14542a110dcda615d8a2bcffaecbde909f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 4 Jan 2024 08:36:05 -0500 Subject: [PATCH 245/592] Remove [=] capture when not needed. [=, this] is C++20 extension. A warning is issued by clang-17 at least. --- src/TiledArray/tile_op/binary_wrapper.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index dac995f94b..4c02b84318 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -294,10 +294,10 @@ class BinaryWrapper { if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); - auto op_left = [=, this](eval_t& _left, eval_t& _right) { + auto op_left = [this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); }; - auto op_right = [=, this](eval_t& _left, eval_t& _right) { + auto op_right = [this](eval_t& _left, eval_t& _right) { return op_.consume_right(_left, _right); }; // Override consumable From 35a474fad60920dff1580d93fb3eb32640478157 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 4 Jan 2024 11:16:40 -0500 Subject: [PATCH 246/592] Create proper target when installed Umpire is provided Currently, the TiledArray_UMPIRE target assumes that Umpire is build as part of TA, which fails if umpire was provided via UMPIRE_INSTALL_DIR. 
Signed-off-by: Joseph Schuchart --- external/umpire.cmake | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/external/umpire.cmake b/external/umpire.cmake index aa98f27b1e..4e9a005341 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -14,6 +14,21 @@ if(_UMPIRE_INSTALL_DIR) # find_package(umpire REQUIRED) message(STATUS "Umpire found at ${_UMPIRE_INSTALL_DIR}") + add_library(TiledArray_UMPIRE INTERFACE) + + set_target_properties( + TiledArray_UMPIRE + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES + "${_UMPIRE_INSTALL_DIR}/include" + INTERFACE_LINK_LIBRARIES + "umpire" + INTERFACE_LINK_DIRECTORIES + "${_UMPIRE_INSTALL_DIR}/lib/" + ) + + install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) + elseif(TA_EXPERT) message("** Umpire was not found") @@ -190,23 +205,22 @@ else() set(_UMPIRE_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) -endif(_UMPIRE_INSTALL_DIR) - -# manually add Umpire library -add_library(TiledArray_UMPIRE INTERFACE) + add_library(TiledArray_UMPIRE INTERFACE) -set_target_properties( - TiledArray_UMPIRE - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES - "$;$;$;$;$;$" - INTERFACE_LINK_LIBRARIES - "$;$" - ) + set_target_properties( + TiledArray_UMPIRE + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES + "$;$;$;$;$;$" + INTERFACE_LINK_LIBRARIES + "$;$" + ) install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) +endif(_UMPIRE_INSTALL_DIR) + #TODO test Umpire endif(NOT TARGET TiledArray_UMPIRE) From bc1b712d1315ef7ae352776ef3b4309701d38bff Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:35:22 -0500 Subject: [PATCH 247/592] introduced TA_TRACE_GLOBAL_COMM_STATS CMake option that enables tracing stats of communication within global objects (DistEval's + DistributedStorage) --- CMakeLists.txt | 4 ++++ src/TiledArray/config.h.in | 3 +++ 2 files changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9a47fbd989..7f98e3fbf2 100644 --- 
a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,10 @@ if(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) set(TA_TILE_OPS_LOG_LEVEL 1) endif(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) +option(TA_TRACE_GLOBAL_COMM_STATS "Enable tracing of communication stats of global objects (DistEval's and DIstributedStorage) TiledArray" OFF) +add_feature_info(TASK_TRACE_DEBUG TA_TRACE_GLOBAL_COMM_STATS "Debug communication stats of global objects (DistEval's and DIstributedStorage) TiledArray") +set(TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ${TA_TRACE_GLOBAL_COMM_STATS}) + option(TA_RANGEV3 "Enable Range-V3 library" OFF) add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 1c38298623..79f9f0932a 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -174,6 +174,9 @@ #cmakedefine TA_ENABLE_TILE_OPS_LOGGING 1 #define TA_TILE_OPS_LOG_LEVEL 0@TA_TILE_OPS_LOG_LEVEL@ +/* Enables collection of communication statistics for global objects (DistEval and DistributedStorage) */ +#cmakedefine TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE 1 + /* ----------- pragma helpers ---------------*/ #define TILEDARRAY_PRAGMA(x) _Pragma(#x) /* same as TILEDARRAY_PRAGMA(x), but expands x */ From 56e0e2efb82570cfc24b5745874fd6c30b4ef1a3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:42:25 -0500 Subject: [PATCH 248/592] if configured with TA_TRACE_GLOBAL_COMM_STATS will collect stats of DistEval comms --- src/TiledArray/dist_eval/array_eval.h | 160 ++++++++++++++++++-- src/TiledArray/dist_eval/binary_eval.h | 68 ++++++++- src/TiledArray/dist_eval/contraction_eval.h | 44 +++++- src/TiledArray/dist_eval/dist_eval.h | 94 ++++++++++-- src/TiledArray/dist_eval/unary_eval.h | 33 +++- 5 files changed, 366 insertions(+), 33 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index 
10ad0543e0..6dade3dc2b 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -198,6 +198,26 @@ class ArrayEvalImpl std::shared_ptr op_; ///< The tile operation BlockRange block_range_; ///< Sub-block range +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // tracing artifacts + using pending_counter_t = std::atomic[]; // 1 counter per rank + mutable std::shared_ptr + ntiles_pending_; // number of pending tiles from each rank + mutable std::shared_ptr + ntasks_pending_; // number of pending tasks using data from each rank + + struct AtomicCounterDecreaser : public madness::CallbackInterface { + std::shared_ptr> counter; + + AtomicCounterDecreaser(std::shared_ptr> counter) + : counter(std::move(counter)) {} + void notify() override { + --(*counter); + delete this; + } + }; +#endif + public: /// Construct with full array range @@ -217,7 +237,28 @@ class ArrayEvalImpl : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), array_(array), op_(std::make_shared(op)), - block_range_() {} + block_range_() +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ntiles_pending_(new std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#if 0 + std::stringstream ss; + ss << "ArrayEvalImpl: id=" << this->id(); + if (array_) ss << " array.id()=" << array_.id(); + ss << "\n"; + std::cout << ss.str(); +#endif + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Constructor with sub-block range @@ -245,10 +286,42 @@ class ArrayEvalImpl : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), array_(array), op_(std::make_shared(op)), - block_range_(array.trange().tiles_range(), lower_bound, upper_bound) {} + block_range_(array.trange().tiles_range(), lower_bound, upper_bound) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ntiles_pending_(new 
std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Virtual destructor - virtual ~ArrayEvalImpl() {} + virtual ~ArrayEvalImpl() { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + if (std::find_if(ntiles_pending_.get(), + ntiles_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntiles_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tiles at destruction! (id=", this->id(), ")"); + abort(); + } + if (std::find_if(ntasks_pending_.get(), + ntasks_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntasks_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tasks at destruction! (id=", this->id(), ")"); + abort(); + } +#endif + } Future get_tile(ordinal_type i) const override { // Get the array index that corresponds to the target index @@ -258,15 +331,49 @@ class ArrayEvalImpl // index to the correct location. if (block_range_.rank()) array_index = block_range_.ordinal(array_index); - // Get the tile from array_, which may be located on a remote node. 
- Future tile = array_.find(array_index); + const bool arg_tile_is_remote = !array_.is_local(array_index); + const ProcessID arg_tile_owner = array_.owner(array_index); - const bool consumable_tile = !array_.is_local(array_index); - - return eval_tile(tile, consumable_tile); + Future result; + bool task_created = false; + if (arg_tile_is_remote) { + TA_ASSERT(arg_tile_owner != this->world().rank()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntiles_pending_[arg_tile_owner]++; +#endif + auto arg_tile = array_.find(array_index); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + arg_tile.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntiles_pending_, ntiles_pending_.get() + arg_tile_owner))); +#endif + std::tie(result, task_created) = + eval_tile(arg_tile, /* consumable_tile = */ true +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } else { + TA_ASSERT(arg_tile_owner == this->world().rank()); + std::tie(result, task_created) = eval_tile(array_.find_local(array_index), + /* consumable_tile = */ false +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(ntiles_pending_[this->world().rank()] == 0); + // even if data is local we may have created a task to evaluate it + // TA_ASSERT(ntasks_pending_[this->world().rank()] == 0); +#endif + return result; } void discard_tile(ordinal_type i) const override { + TA_ASSERT(this->is_local(i)); const_cast(this)->notify(); } @@ -277,23 +384,36 @@ class ArrayEvalImpl } /// Evaluate a single LazyArrayTile - madness::Future eval_tile( + /// @return A pair of the future to the tile and a boolean indicating whether + /// a task was created to produce the tile + [[nodiscard]] std::pair, bool> eval_tile( const madness::Future& tile, - const bool consumable_tile) const { + const bool consumable_tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + const ProcessID 
tile_owner +#endif + ) const { // Insert the tile into this evaluator for subsequent processing if (tile.probe()) { // Skip the task since the tile is ready Future result; result.set(make_tile(tile, consumable_tile)); const_cast(this)->notify(); - return result; + return {result, false}; } else { // Spawn a task to set the tile when the input tile is not ready. Future result = TensorImpl_::world().taskq.add( shared_from_this(), &ArrayEvalImpl_::make_tile, tile, consumable_tile, madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntasks_pending_[tile_owner]++; + result.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntasks_pending_, ntasks_pending_.get() + tile_owner))); +#endif result.register_callback(const_cast(this)); - return result; + return {result, true}; } } /// Evaluate the tiles of this tensor @@ -303,6 +423,22 @@ class ArrayEvalImpl /// \return The number of tiles that will be set by this process int internal_eval() override { return TensorImpl_::local_nnz(); } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + std::string status() const override { + std::stringstream ss; + ss << "ArrayEvalImpl: array.id()=" << array_.id(); + ss << " ntiles_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntiles_pending_[rank]; + } + ss << "] ntasks_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntasks_pending_[rank]; + } + ss << "]\n"; + return ss.str(); + } +#endif }; // class ArrayEvalImpl } // namespace detail diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index e343c087b3..62bbdb64ce 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -68,6 +68,16 @@ class BinaryEvalImpl : public DistEvalImpl, right_type right_; ///< Right argument op_type op_; ///< binary element operator +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // artifacts 
of tracing + mutable ordinal_type left_ntiles_used_; // # of tiles used from left_ + mutable ordinal_type right_ntiles_used_; // # of tiles used from right_ + mutable ordinal_type + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable ordinal_type + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif + public: /// Construct a binary evaluator @@ -88,7 +98,15 @@ class BinaryEvalImpl : public DistEvalImpl, : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), left_(left), right_(right), - op_(op) { + op_(op) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { TA_ASSERT(left.trange() == right.trange()); } @@ -105,9 +123,9 @@ class BinaryEvalImpl : public DistEvalImpl, TA_ASSERT(!TensorImpl_::is_zero(i)); const auto source_index = DistEvalImpl_::perm_index_to_source(i); - const ProcessID source = - left_.owner(source_index); // Left and right - // should have the same owner + const ProcessID source = left_.owner(source_index); + // Left and right should have the same owner + TA_ASSERT(source == right_.owner(source_index)); const madness::DistributedID key(DistEvalImpl_::id(), i); return TensorImpl_::world().gop.template recv(source, key); @@ -195,6 +213,12 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(source_index), right_.get(source_index)); + TA_ASSERT(left_.is_local(source_index)); + TA_ASSERT(right_.is_local(source_index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif ++task_count; } @@ -213,32 +237,64 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, ZeroTensor(), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_used_++; +#endif } else if (right_.is_zero(index)) { TensorImpl_::world().taskq.add( self, 
&BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), ZeroTensor()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; +#endif } else { + TA_ASSERT(!left_.is_zero(index) && !right_.is_zero(index)); TensorImpl_::world().taskq.add( self, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif } ++task_count; } else { // Cleanup unused tiles - if (!left_.is_zero(index)) left_.discard(index); - if (!right_.is_zero(index)) right_.discard(index); + if (!left_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_discarded_++; +#endif + left_.discard(index); + } + if (!right_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_discarded_++; +#endif + right_.discard(index); + } } } } // Wait for child tensors to be evaluated, and process tasks while waiting. +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + left_ntiles_discarded_); + TA_ASSERT(right_.local_nnz() == + right_ntiles_used_ + right_ntiles_discarded_); +#endif left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // for some evaluators like SUMMA real task counts are not available even + // after wait() TA_ASSERT(left_.task_count() >= left_ntiles_used_ + + // left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= + // right_ntiles_used_ + right_ntiles_discarded_); +#endif return task_count; } diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 8ff0d80091..2da66628fc 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -118,6 +118,7 @@ class Summa typedef std::pair col_datum; ///< Datum element type for a left-hand argument column + // various tracing/debugging artifacts static constexpr 
const bool trace_tasks = #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE true @@ -125,6 +126,16 @@ class Summa false #endif ; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + mutable std::atomic + left_ntiles_used_; // # of tiles used from left_ + mutable std::atomic + right_ntiles_used_; // # of tiles used from right_ + mutable std::atomic + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable std::atomic + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif protected: // Import base class functions @@ -705,11 +716,17 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index); auto tile = get_tile(left_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, row_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_discarded_; +#endif left_.discard(index); } } @@ -748,12 +765,18 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index + left_.size()); auto tile = get_tile(right_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, col_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_discarded_; +#endif right_.discard(index); } } @@ -1550,7 +1573,16 @@ class Summa left_stride_(k), left_stride_local_(proc_grid.proc_rows() * k), right_stride_(1ul), - right_stride_local_(proc_grid.proc_cols()) {} + right_stride_local_(proc_grid.proc_cols()) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { + } virtual ~Summa() {} @@ -1728,6 +1760,16 @@ class Summa // Wait for child tensors to be evaluated, and process tasks while waiting. 
left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // values of left_ntiles_used_ etc. are not available until all broadcasts + // have been completed ... +// TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.local_nnz() == +// right_ntiles_used_ + right_ntiles_discarded_); +// TA_ASSERT(left_.task_count() >= left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= +// right_ntiles_used_ + right_ntiles_discarded_); +#endif #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: finished wait children rank=%i\n", diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index 7585b7e4bf..9e0157cb8b 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -123,6 +123,28 @@ class DistEvalImpl : public TensorImpl, source_to_target_ = PermIndex(source_range, perm); target_to_source_ = PermIndex(trange.tiles_range(), inv_perm); } + +#if 0 + { + // print out expected number of tiles on each rank + std::vector ntiles_per_rank(world.size(), 0); + for (auto& i : trange.tiles_range()) { + if (!TensorImpl_::is_zero(i)) { + ntiles_per_rank[TensorImpl_::owner(i)]++; + } + } + std::stringstream ss; + ss << "DistEvalImpl: id=" << id_; + if (perm) + ss << " perm=" << perm; + ss << " ntiles=["; + for (auto& i : ntiles_per_rank) { + ss << i << " "; + } + ss << "]"; + std::cout << ss.str() << std::endl; + } +#endif } virtual ~DistEvalImpl() {} @@ -142,7 +164,8 @@ class DistEvalImpl : public TensorImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. 
- /// \param i The index of the tile + /// \param i The index of the local tile to discard + /// \pre `this->is_local(i)` virtual void discard_tile(ordinal_type i) const = 0; /// Set tensor value @@ -234,13 +257,36 @@ class DistEvalImpl : public TensorImpl, TA_ASSERT(task_count_ >= 0); } + /// \return The number of tasks spawned on this rank (after invoking eval() + /// this should be equal to local_nnz() for simple evaluators like + /// unary/binary, or greater than that for more complex evaluators like SUMMA + ordinal_type task_count() const { + if (task_count_ == -1) + return 0; + else + return task_count_; + } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + /// reports evaluator status + + /// intended for debugging purposes + /// @return string containing log of the current status of evaluator (empty + /// string, unless overridden in the specialization) + [[nodiscard]] virtual std::string status() const { return {}; } +#endif }; // class DistEvalImpl -/// Tensor expression object +/// Tensor expression evaluator wrapper -/// This object holds a tensor expression. It is used to store various type -/// of tensor expressions that depend on the pimpl used to construct the -/// expression. +/// This object holds a tensor expression evaluator (DistEvalImpl). +/// +/// \note Tensor expression evaluators (DistEval and DistEvalImpl) +/// are similar to DistArray in that they has tensorial structure +/// (TensorImpl), with shape and policy, but their semantics that +/// differs from DistArray (e.g., data is not stored +/// persistently). 
+/// /// \tparam Tile The output tile type /// \tparam Policy The tensor policy class template @@ -333,7 +379,7 @@ class DistEval { return pimpl_->pmap(); } - /// Query the density of the tensor + /// Query if the tensor is dense /// \return \c true if the tensor is dense, otherwise false bool is_dense() const { return pimpl_->is_dense(); } @@ -348,7 +394,7 @@ class DistEval { /// \return The tiled range of the tensor const trange_type& trange() const { return pimpl_->trange(); } - /// Tile move + /// Tile accessor /// Tile is removed after it is set. /// \param i The tile index @@ -359,8 +405,12 @@ class DistEval { /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. - /// \param i The index of the tile - virtual void discard(ordinal_type i) const { pimpl_->discard_tile(i); } + /// \param i The index of a local tile to discard + /// \pre `this->is_local(i)` + virtual void discard(ordinal_type i) const { + TA_ASSERT(this->is_local(i)); + pimpl_->discard_tile(i); + } /// World object accessor @@ -372,9 +422,35 @@ class DistEval { /// \return The unique id for this object madness::uniqueidT id() const { return pimpl_->id(); } + /// \return Number of nonzero tiles on this rank + /// \sa TensorImpl::local_nnz() + ordinal_type local_nnz() const { return pimpl_->local_nnz(); } + + /// \return The number of tasks spawned on this rank (after invoking eval() + /// this should be same as the value returned by local_nnz(), if everything is + /// well) + ordinal_type task_count() const { return pimpl_->task_count(); } + /// Wait for all local tiles to be evaluated void wait() const { pimpl_->wait(); } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + /// reports evaluator status + + /// intended for debugging purposes + /// @return string containing log of the current status of evaluator (empty + /// string, unless overridden in the specialization) + std::string status() const { + std::ostringstream oss; + oss << "DistEval 
status: id=" << id() + << " impl_type_name=" << typeid(*(pimpl_.get())).name() + << " "; + oss << pimpl_->status(); + oss << "\n"; + return oss.str(); + } +#endif + }; // class DistEval } // namespace detail diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h index d687fcb4af..66ab742ada 100644 --- a/src/TiledArray/dist_eval/unary_eval.h +++ b/src/TiledArray/dist_eval/unary_eval.h @@ -74,7 +74,13 @@ class UnaryEvalImpl const Perm& perm, const op_type& op) : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), arg_(arg), - op_(op) {} + op_(op) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_ntiles_used_(0) +#endif + { + } /// Virtual destructor virtual ~UnaryEvalImpl() {} @@ -152,10 +158,12 @@ class UnaryEvalImpl // Evaluate argument arg_.eval(); - // Counter for the number of tasks submitted by this object + // Counter for the number of tasks that will use local tiles of arg_ ordinal_type task_count = 0ul; - // Make sure all local tiles are present. + // now create tasks that will produce result tiles and push them to the + // destination N.B. 
data is pushed, rather than pulled, to be able to manage + // the lifetime of the argument const typename pmap_interface::const_iterator end = arg_.pmap()->end(); typename pmap_interface::const_iterator it = arg_.pmap()->begin(); for (; it != end; ++it) { @@ -165,8 +173,10 @@ class UnaryEvalImpl if (!arg_.is_zero(index)) { // Get target tile index const auto target_index = DistEvalImpl_::perm_index_to_target(index); + TA_ASSERT(!this->is_zero(target_index)); // Schedule tile evaluation task + TA_ASSERT(arg_.is_local(index)); #ifdef TILEDARRAY_HAS_DEVICE TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::template eval_tile<>, @@ -175,12 +185,18 @@ class UnaryEvalImpl TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::eval_tile, target_index, arg_.get(index)); #endif - +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + arg_ntiles_used_++; +#endif ++task_count; } } // Wait for local tiles of argument to be evaluated +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(arg_.local_nnz() == arg_ntiles_used_); + TA_ASSERT(arg_.task_count() >= arg_ntiles_used_); +#endif // arg_.wait(); return task_count; @@ -188,7 +204,14 @@ class UnaryEvalImpl arg_type arg_; ///< Argument op_type op_; ///< The unary tile operation -}; // class UnaryEvalImpl + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // artifacts of tracing/debugging + mutable ordinal_type arg_ntiles_used_; // # of tiles used from arg_ ; N.B. no + // tiles are discarded! 
+#endif + +}; // class UnaryEvalImpl } // namespace detail } // namespace TiledArray From 78e8ad3d7df467b9a283ff7c7bd2dfa8608e7d77 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:43:47 -0500 Subject: [PATCH 249/592] DistributedStorage::get() can use (2 types of) caching if requested by user if configured with TA_TRACE_GLOBAL_COMM_STATS will collect stats of DistributedStorage comms --- src/TiledArray/array_impl.h | 13 +- src/TiledArray/distributed_storage.h | 224 ++++++++++++++++++++++++--- src/TiledArray/expressions/expr.h | 4 + 3 files changed, 222 insertions(+), 19 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index beb8ba3e09..e5ad9d5db9 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -636,7 +636,18 @@ class ArrayImpl : public TensorImpl { /// DistributedStorage /// @return const reference to the atomic counter of live DelayedSet requests - const madness::AtomicInt& num_live_ds() const { return data_.num_live_ds(); } + const std::atomic& num_live_ds() const { + return data_.num_live_ds(); + } + + /// Reports the number of live DelayedForward requests for this object's + /// DistributedStorage + + /// @return const reference to the atomic counter of live DelayedForward + /// requests + const std::atomic& num_live_df() const { + return data_.num_live_df(); + } }; // class ArrayImpl diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 47c52ead2a..60eb715c34 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -23,6 +23,17 @@ #include namespace TiledArray { + +/// Describes how to get remote data +enum class RemoteDataGetPolicy { + /// no caching = each get will trigger data fetch + nocache, + /// aggregate gets until data arrives, subsequent gets will trigger new gets + aggregate, + /// get once, read forever + cache +}; + namespace detail { /// Distributed storage container. 
@@ -41,7 +52,7 @@ namespace detail { /// thread. DO NOT construct world objects within tasks where the order of /// execution is nondeterministic. template -class DistributedStorage : public madness::WorldObject > { +class DistributedStorage : public madness::WorldObject> { public: typedef DistributedStorage DistributedStorage_; ///< This object type typedef madness::WorldObject @@ -64,8 +75,22 @@ class DistributedStorage : public madness::WorldObject > { ///< stored by this container std::shared_ptr pmap_; ///< The process map that defines the element distribution - mutable container_type data_; ///< The local data container - madness::AtomicInt num_live_ds_; ///< Number of live DelayedSet objects + mutable container_type data_; ///< The local data container + + // tracing/defensive driving artifacts + mutable std::atomic + num_live_ds_; ///< Number of live DelayedSet objects + mutable std::atomic + num_live_df_; ///< Number of live DelayedForward objects +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + mutable std::vector> + ngets_served_per_rank_; ///< Counts # of gets served to remote ranks + mutable std::vector> + ngets_sent_per_rank_; ///< Counts # of gets sent to remote ranks + mutable std::vector> + ngets_received_per_rank_; ///< Counts # of gets received from remote + ///< ranks +#endif // not allowed DistributedStorage(const DistributedStorage_&); @@ -120,6 +145,124 @@ class DistributedStorage : public madness::WorldObject > { }; // struct DelayedSet friend struct DelayedSet; + /// Tile cache works just like madness::detail::DistCache (and in fact is + /// based on it) in that it implements a local cache for asynchronous data + /// pulls. Unlike madness::detail::DistCache: + /// - this is unidirectional, i.e. there is no need to manually push data into + /// the cache (a task sending data + /// will be posted). 
+ /// - depending on get policy data will either stay in the cache forever or + /// will be discarded upon arrival; + /// subsequent gets will need to fetch the data again (may make this + /// user-controllable in the future) + mutable container_type remote_data_cache_; + + /// Get the cache value accosted with \c key + + /// This will get the value associated with \c key to \c value. If + /// the cache element does not exist, a task requesting the data will be sent + /// to the owner, a future referring to the result will be inserted in the + /// cache so that the subsequent gets will receive the same data. After data + /// arrival the future will be removed from the cache, thus subsequent gets + /// will need to fetch the data again. \param[in] key The target key \return A + /// future that holds/will hold the cache value + future get_cached(const key_type& key, bool keep_in_cache = false) const { + // Retrieve the cached future + typename container_type::const_accessor acc; + if (remote_data_cache_.insert( + acc, key)) { // no future in cache yet, create a task + static_assert(std::is_signed_v); + const ProcessID rank = this->get_world().rank(); + ProcessID rank_w_persistence = keep_in_cache ? 
rank : -(rank + 1); + WorldObject_::task(owner(key), &DistributedStorage_::get_cached_handler, + key, rank_w_persistence, + madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_sent_per_rank_.at(owner(key))++; +#endif + } + return acc->second; + } + + /// used to forward data that were unassigned at the time of request arrival + struct DelayedForward : public madness::CallbackInterface { + public: + DelayedForward(const DistributedStorage_& ds, key_type key, + ProcessID destination_rank, bool keep_in_cache) + : ds(ds), + key(key), + destination_rank(destination_rank), + keep_in_cache(keep_in_cache) {} + + void notify() override { + auto& data_fut = ds.get_local(key); + TA_ASSERT( + data_fut.probe()); // must be ready, otherwise why is this invoked? + if (keep_in_cache) { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + delete this; + } + + private: + const DistributedStorage_& ds; + key_type key; + ProcessID destination_rank; + bool keep_in_cache; + }; + + void get_cached_handler(const size_type key, + ProcessID destination_rank_w_persistence) const { + const bool keep_in_cache = destination_rank_w_persistence >= 0; + const ProcessID destination_rank = + destination_rank_w_persistence < 0 + ? 
(-destination_rank_w_persistence - 1) + : destination_rank_w_persistence; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_served_per_rank_.at(destination_rank)++; +#endif + auto& data_fut = get_local(key); + if (data_fut.probe()) { + if (keep_in_cache) { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + } else { // data not ready yet, defer send to a callback (maybe task??) + const_cast(data_fut).register_callback( + new DelayedForward(*this, key, destination_rank, keep_in_cache)); + } + } + + template + void set_cached_handler(const size_type key, const value_type& datum) const { + // assign the future first, then remove from the cache + typename container_type::accessor acc; + [[maybe_unused]] const bool inserted = remote_data_cache_.insert(acc, key); + // future must be in cache + TA_ASSERT(!inserted); + // assign it + acc->second.set(datum); + // remove it from the cache + if constexpr (!KeepInCache) remote_data_cache_.erase(acc); + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_received_per_rank_.at(this->owner(key))++; +#endif + } + public: /// Makes an initialized, empty container with default data distribution (no /// communication) @@ -136,23 +279,47 @@ class DistributedStorage : public madness::WorldObject > { : WorldObject_(world), max_size_(max_size), pmap_(pmap), - data_((max_size / world.size()) + 11) { + data_((max_size / world.size()) + 11) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ngets_served_per_rank_(world.size()), + ngets_sent_per_rank_(world.size()), + ngets_received_per_rank_(world.size()) +#endif + { // Check that the process map is appropriate for this storage object TA_ASSERT(pmap_); TA_ASSERT(pmap_->size() == max_size); TA_ASSERT(pmap_->rank() == 
pmap_interface::size_type(world.rank())); TA_ASSERT(pmap_->procs() == pmap_interface::size_type(world.size())); num_live_ds_ = 0; + num_live_df_ = 0; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ngets_served_per_rank_[rank] = 0; + ngets_sent_per_rank_[rank] = 0; + ngets_received_per_rank_[rank] = 0; + } +#endif WorldObject_::process_pending(); } virtual ~DistributedStorage() { if (num_live_ds_ != 0) { - madness::print_error( - "DistributedStorage (object id=", this->id(), - ") destroyed while " - "outstanding tasks exist. Add a fence() to extend the lifetime of " - "this object."); + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending tasks that set its data exist. Add a " + "fence() to extend the lifetime of " + "this object."); + abort(); + } + if (num_live_df_ != 0) { + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending callbacks that forward its data to other " + "ranks exist. This may indicate a bug in your " + "program or you may need to extend the lifetime of " + "this object."); abort(); } } @@ -207,18 +374,21 @@ class DistributedStorage : public madness::WorldObject > { /// \return A future to element \c i /// \throw TiledArray::Exception If \c i is greater than or equal to \c /// max_size() . - future get(size_type i) const { + future get(size_type i, + RemoteDataGetPolicy policy = RemoteDataGetPolicy::nocache) const { TA_ASSERT(i < max_size_); if (is_local(i)) { return get_local(i); } else { - // Send a request to the owner of i for the element. - future result; - WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, - result.remote_ref(get_world()), - madness::TaskAttributes::hipri()); - - return result; + if (policy == RemoteDataGetPolicy::nocache) { + // Send a request to the owner of i for the element. 
+ future result; + WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, + result.remote_ref(get_world()), + madness::TaskAttributes::hipri()); + return result; + } else + return get_cached(i, policy == RemoteDataGetPolicy::cache); } } @@ -343,7 +513,25 @@ class DistributedStorage : public madness::WorldObject > { /// Reports the number of live DelayedSet requests /// @return const reference to the atomic counter of live DelayedSet requests - const madness::AtomicInt& num_live_ds() const { return num_live_ds_; } + const std::atomic& num_live_ds() const { return num_live_ds_; } + + /// Reports the number of live DelayedForward requests + + /// @return const reference to the atomic counter of live DelayedForward + /// requests + const std::atomic& num_live_df() const { return num_live_df_; } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + const std::vector>& ngets_served_per_rank() const { + return ngets_served_per_rank_; + } + const std::vector>& ngets_sent_per_rank() const { + return ngets_sent_per_rank_; + } + const std::vector>& ngets_received_per_rank() const { + return ngets_received_per_rank_; + } +#endif }; // class DistributedStorage } // namespace detail diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 72ad9a42cd..f77d13dbad 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -420,6 +420,10 @@ class Expr { dist_eval.wait(); // Swap the new array with the result array object. 
result.swap(tsr.array()); + +#if 0 + std::cout << "array.id()=" << tsr.array().id() << " evaluated using dist_eval.id=" << dist_eval.id() << std::endl; +#endif } /// Evaluate this object and assign it to \c tsr From 989fd8e6549aaa2bb4e6017f991110c31567ba58 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 7 Jan 2024 16:46:25 -0500 Subject: [PATCH 250/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/516 which fixes hangs in applications with large number of tasks --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c3b7b0659f..c48f0c19b6 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag cf3c98053453329f35b775c8b9f561301f6a997e . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag b1f1c39c497b86ab3ef4e560a686de63eb555cc4 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 9499354eba..5255df9780 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 0cb3920715c9a659bbb8158f9a31db1bd97d4614) +set(TA_TRACKED_MADNESS_TAG b1f1c39c497b86ab3ef4e560a686de63eb555cc4) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 2e4572af6dae9c2ed92a3ace8807925f9acf99a3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 8 Jan 2024 01:51:08 -0500 Subject: [PATCH 251/592] patch Umpire to address https://github.com/LLNL/Umpire/issues/616 --- external/umpire.cmake | 2 ++ external/umpire.finalize_io.patch | 47 +++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 external/umpire.finalize_io.patch diff --git a/external/umpire.cmake b/external/umpire.cmake index 581839223a..c7a02d65bf 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -170,6 +170,8 @@ else() DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} GIT_REPOSITORY ${UMPIRE_URL} GIT_TAG ${UMPIRE_TAG} + #--Patch step----------------- + PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/external/umpire.finalize_io.patch #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: diff --git a/external/umpire.finalize_io.patch b/external/umpire.finalize_io.patch new file mode 100644 index 0000000000..fa78727d7f --- /dev/null +++ b/external/umpire.finalize_io.patch @@ -0,0 +1,47 @@ +diff --git a/src/umpire/util/io.cpp b/src/umpire/util/io.cpp +index 806fb9e3..551c5e82 100644 +--- a/src/umpire/util/io.cpp 
++++ b/src/umpire/util/io.cpp +@@ -52,10 +52,23 @@ std::ostream& error() + + namespace util { + ++namespace detail { ++OutputBuffer& s_log_buffer_accessor() ++{ ++ static OutputBuffer buffer; ++ return buffer; ++} ++OutputBuffer& s_error_buffer_accessor() ++{ ++ static OutputBuffer buffer; ++ return buffer; ++} ++} ++ + void initialize_io(const bool enable_log) + { +- static util::OutputBuffer s_log_buffer; +- static util::OutputBuffer s_error_buffer; ++ OutputBuffer& s_log_buffer = detail::s_log_buffer_accessor(); ++ OutputBuffer& s_error_buffer = detail::s_error_buffer_accessor(); + + s_log_buffer.setConsoleStream(nullptr); + s_error_buffer.setConsoleStream(&std::cerr); +@@ -121,6 +134,16 @@ void initialize_io(const bool enable_log) + MPI::logMpiInfo(); + } + ++void finalize_io() ++{ ++ detail::s_log_buffer_accessor().sync(); ++ detail::s_log_buffer_accessor().setConsoleStream(nullptr); ++ detail::s_log_buffer_accessor().setFileStream(nullptr); ++ detail::s_error_buffer_accessor().sync(); ++ detail::s_error_buffer_accessor().setConsoleStream(nullptr); ++ detail::s_error_buffer_accessor().setFileStream(nullptr); ++} ++ + void flush_files() + { + log().flush(); From c96d357f1f6d138bae7085362ff82637b73a24ed Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 8 Jan 2024 17:35:15 -0500 Subject: [PATCH 252/592] fixup https://github.com/ValeevGroup/tiledarray/commit/56e0e2efb82570cfc24b5745874fd6c30b4ef1a3 --- src/TiledArray/dist_eval/array_eval.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index 6dade3dc2b..a4cbdc47b1 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -337,7 +337,7 @@ class ArrayEvalImpl Future result; bool task_created = false; if (arg_tile_is_remote) { - TA_ASSERT(arg_tile_owner != this->world().rank()); + TA_ASSERT(arg_tile_owner != array_.world().rank()); #ifdef 
TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ntiles_pending_[arg_tile_owner]++; #endif @@ -355,7 +355,7 @@ class ArrayEvalImpl #endif ); } else { - TA_ASSERT(arg_tile_owner == this->world().rank()); + TA_ASSERT(arg_tile_owner == array_.world().rank()); std::tie(result, task_created) = eval_tile(array_.find_local(array_index), /* consumable_tile = */ false #ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE From 65ea1ef476480f7027b8a378ac9857cadfdc2eae Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 10 Jan 2024 15:53:53 -0500 Subject: [PATCH 253/592] try ignoring send order for remote task submission ... resets MAD tag to an experimental branch https://github.com/m-a-d-n-e-s-s/madness/tree/evaleev/feature/taskattribute-unordered --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/distributed_storage.h | 11 ++++++----- src/TiledArray/replicator.h | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index c48f0c19b6..93a666585b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag b1f1c39c497b86ab3ef4e560a686de63eb555cc4 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag f7fd3c52f96fb04b98ba59c186f0ac9a09995acd . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index 5255df9780..a38c1e57a6 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG b1f1c39c497b86ab3ef4e560a686de63eb555cc4) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG cf3c98053453329f35b775c8b9f561301f6a997e) +set(TA_TRACKED_MADNESS_TAG f7fd3c52f96fb04b98ba59c186f0ac9a09995acd) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG b1f1c39c497b86ab3ef4e560a686de63eb555cc4) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 60eb715c34..52f923ab4b 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -121,7 +121,8 @@ class DistributedStorage : public madness::WorldObject> { set_remote(const size_type i, Value&& value) { WorldObject_::task( owner(i), &DistributedStorage_::set_handler&>, - i, std::forward(value), madness::TaskAttributes::hipri()); + i, std::forward(value), + madness::TaskAttributes::hipri_unordered()); } struct DelayedSet : public madness::CallbackInterface { @@ -176,7 +177,7 @@ class DistributedStorage : public madness::WorldObject> { ProcessID rank_w_persistence = keep_in_cache ? 
rank : -(rank + 1); WorldObject_::task(owner(key), &DistributedStorage_::get_cached_handler, key, rank_w_persistence, - madness::TaskAttributes::hipri()); + madness::TaskAttributes::hipri_unordered()); #ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ngets_sent_per_rank_.at(owner(key))++; #endif @@ -233,12 +234,12 @@ class DistributedStorage : public madness::WorldObject> { WorldObject_::task( destination_rank, &DistributedStorage_::template set_cached_handler, key, - data_fut, madness::TaskAttributes::hipri()); + data_fut, madness::TaskAttributes::hipri_unordered()); } else { WorldObject_::task( destination_rank, &DistributedStorage_::template set_cached_handler, key, - data_fut, madness::TaskAttributes::hipri()); + data_fut, madness::TaskAttributes::hipri_unordered()); } } else { // data not ready yet, defer send to a callback (maybe task??) const_cast(data_fut).register_callback( @@ -385,7 +386,7 @@ class DistributedStorage : public madness::WorldObject> { future result; WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, result.remote_ref(get_world()), - madness::TaskAttributes::hipri()); + madness::TaskAttributes::hipri_unordered()); return result; } else return get_cached(i, policy == RemoteDataGetPolicy::cache); diff --git a/src/TiledArray/replicator.h b/src/TiledArray/replicator.h index 52ae446af1..1dd7b03ad1 100644 --- a/src/TiledArray/replicator.h +++ b/src/TiledArray/replicator.h @@ -130,7 +130,7 @@ class Replicator : public madness::WorldObject >, if (dest != world_.rank()) { wobj_type::task(dest, &Replicator_::send_handler, indices_, data_, - madness::TaskAttributes::hipri()); + madness::TaskAttributes::hipri_unordered()); } else do_callbacks(); // Replication is done } From b5dd13fbce31e1b54f17699d72a3c099c4c771ab Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 10 Jan 2024 16:55:04 -0500 Subject: [PATCH 254/592] fixed a race in TiledRange{1,}::element_to_tile --- src/TiledArray/tiled_range1.h | 33 
+++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 0f6d18130b..45fef1bfb8 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -56,8 +57,7 @@ class TiledRange1 { /// assert(tr.elements_range() == (TiledRange1::range_type{0,0})); /// assert(tr.begin() == tr.end()); /// \endcode - TiledRange1() - : range_(0, 0), elements_range_(0, 0), tiles_ranges_(), elem2tile_() {} + TiledRange1() : range_(0, 0), elements_range_(0, 0) {} /// Constructs a range with the tile boundaries ("hashmarks") provided by /// the range [ \p first , \p last ). @@ -66,8 +66,7 @@ class TiledRange1 { template ::value>::type* = nullptr> - explicit TiledRange1(RandIter first, RandIter last) - : range_(), elements_range_(), tiles_ranges_(), elem2tile_() { + explicit TiledRange1(RandIter first, RandIter last) { init_tiles_(first, last, 0); } @@ -227,7 +226,7 @@ class TiledRange1 { /// across ALL TiledRange1 instances. const index1_type& element_to_tile(const index1_type& i) const { TA_ASSERT(includes(elements_range_, i)); - if (elem2tile_.empty()) { + if (!elem2tile_) { init_elem2tile_(); } return elem2tile_[i - elements_range_.first]; @@ -290,14 +289,14 @@ class TiledRange1 { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& range_& elements_range_& tiles_ranges_& elem2tile_; + ar & range_ & elements_range_ & tiles_ranges_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& range_& elements_range_& tiles_ranges_& elem2tile_; + ar & range_ & elements_range_ & tiles_ranges_; } private: @@ -345,19 +344,29 @@ class TiledRange1 { void init_elem2tile_() const { using TiledArray::extent; // check for 0 size range. 
- if (extent(elements_range_) == 0) return; + const auto n = extent(elements_range_); + if (n == 0) return; static std::mutex mtx; { std::lock_guard lock(mtx); - if (elem2tile_.empty()) { + if (!elem2tile_) { // initialize elem2tile map - elem2tile_.resize(extent(elements_range_)); + auto e2t = +#if __cplusplus >= 202002L + std::make_shared(n); +#else + std::shared_ptr( + new index1_type[n], [](index1_type* ptr) { delete[] ptr; }); +#endif const auto end = extent(range_); for (index1_type t = 0; t < end; ++t) for (index1_type e = tiles_ranges_[t].first; e < tiles_ranges_[t].second; ++e) - elem2tile_[e - elements_range_.first] = t + range_.first; + e2t[e - elements_range_.first] = t + range_.first; + auto e2t_const = std::const_pointer_cast(e2t); + // commit the changes + std::swap(elem2tile_, e2t_const); } } } @@ -369,7 +378,7 @@ class TiledRange1 { range_type elements_range_; ///< the range of element indices std::vector tiles_ranges_; ///< ranges of each tile (NO GAPS between tiles) - mutable std::vector + mutable std::shared_ptr elem2tile_; ///< maps element index to tile index (memoized data). 
}; // class TiledRange1 From b6ba9dc60ad82d7018060f0a716ce19f50366f7a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 10 Jan 2024 16:51:38 -0500 Subject: [PATCH 255/592] introduced TiledRange::element_to_tile(initializer_list) --- src/TiledArray/tiled_range.h | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index 8c0714aa7d..27e559da1c 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -277,6 +277,18 @@ class TiledRange { return result; } + /// Convert an element index to a tile index + + /// \tparam Integer An integral type + /// \param index The element index to convert + /// \return The tile index that corresponds to the given element index + template >> + typename range_type::index element_to_tile( + const std::initializer_list& index) const { + return this->element_to_tile>(index); + } + /// The rank accessor /// \return the rank (=number of dimensions) of this object @@ -316,14 +328,14 @@ class TiledRange { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& range_& elements_range_& ranges_; + ar & range_ & elements_range_ & ranges_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& range_& elements_range_& ranges_; + ar & range_ & elements_range_ & ranges_; } private: From a27eeb0bb19a7f4d99b41ef5a28cb01e56c4cc96 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 10 Jan 2024 15:56:46 -0500 Subject: [PATCH 256/592] cleanup einsum_tot/ijk_mn_eq_ij_mn_times_kj_mn .. 
no need for such verbose element access @bimalgaudel --- src/TiledArray/tiled_range1.h | 1 + tests/einsum.cpp | 12 +++++------- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 45fef1bfb8..b341baec95 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -27,6 +27,7 @@ #include #include #include + #include #include #include diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 49e6812cac..6155b2cb98 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -633,16 +633,14 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { using Ix2 = std::array; using Ix3 = std::array; - auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); - auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, j}); + auto rhs_tile_ix = rhs.trange().element_to_tile({k, j}); auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); - auto& res_el = - result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); - auto const& lhs_el = - lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); - auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, j})); + auto& res_el = result_tile({i, j, k}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({k, j}); res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n } return result_tile; From 248f85ce93e35d7e5326f35daceb4c262987c167 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 10 Jan 2024 21:59:55 -0500 Subject: [PATCH 257/592] trying again to ignore remote task asubmission order (revert of https://github.com/ValeevGroup/tiledarray/commit/65ea1ef476480f7027b8a378ac9857cadfdc2eae + pin MADNESS to https://github.com/m-a-d-n-e-s-s/madness/tree/evaleev/fix/worldobject-task-always-out-of-order) also, thanks to 
https://github.com/m-a-d-n-e-s-s/madness/commit/39de6cb4d262b3df1f67b3c04a37a935564ca657 ta_abort will now call std::abort instead of MPI_Abort if a nondefault SIBABRT has been installed (e.g., if running under a debugger) --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- src/TiledArray/distributed_storage.h | 11 +++++------ src/TiledArray/replicator.h | 2 +- src/TiledArray/tiledarray.cpp | 13 ++++++++++++- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 93a666585b..2eade7e8fb 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag f7fd3c52f96fb04b98ba59c186f0ac9a09995acd . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 39de6cb4d262b3df1f67b3c04a37a935564ca657 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. 
diff --git a/external/versions.cmake b/external/versions.cmake index a38c1e57a6..6c1bf8429b 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG f7fd3c52f96fb04b98ba59c186f0ac9a09995acd) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG b1f1c39c497b86ab3ef4e560a686de63eb555cc4) +set(TA_TRACKED_MADNESS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 51c2728d664c096d0ea39d3b9cbf2895d8d99439) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h index 52f923ab4b..60eb715c34 100644 --- a/src/TiledArray/distributed_storage.h +++ b/src/TiledArray/distributed_storage.h @@ -121,8 +121,7 @@ class DistributedStorage : public madness::WorldObject> { set_remote(const size_type i, Value&& value) { WorldObject_::task( owner(i), &DistributedStorage_::set_handler&>, - i, std::forward(value), - madness::TaskAttributes::hipri_unordered()); + i, std::forward(value), madness::TaskAttributes::hipri()); } struct DelayedSet : public madness::CallbackInterface { @@ -177,7 +176,7 @@ class DistributedStorage : public madness::WorldObject> { ProcessID rank_w_persistence = keep_in_cache ? 
rank : -(rank + 1); WorldObject_::task(owner(key), &DistributedStorage_::get_cached_handler, key, rank_w_persistence, - madness::TaskAttributes::hipri_unordered()); + madness::TaskAttributes::hipri()); #ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ngets_sent_per_rank_.at(owner(key))++; #endif @@ -234,12 +233,12 @@ class DistributedStorage : public madness::WorldObject> { WorldObject_::task( destination_rank, &DistributedStorage_::template set_cached_handler, key, - data_fut, madness::TaskAttributes::hipri_unordered()); + data_fut, madness::TaskAttributes::hipri()); } else { WorldObject_::task( destination_rank, &DistributedStorage_::template set_cached_handler, key, - data_fut, madness::TaskAttributes::hipri_unordered()); + data_fut, madness::TaskAttributes::hipri()); } } else { // data not ready yet, defer send to a callback (maybe task??) const_cast(data_fut).register_callback( @@ -386,7 +385,7 @@ class DistributedStorage : public madness::WorldObject> { future result; WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, result.remote_ref(get_world()), - madness::TaskAttributes::hipri_unordered()); + madness::TaskAttributes::hipri()); return result; } else return get_cached(i, policy == RemoteDataGetPolicy::cache); diff --git a/src/TiledArray/replicator.h b/src/TiledArray/replicator.h index 1dd7b03ad1..52ae446af1 100644 --- a/src/TiledArray/replicator.h +++ b/src/TiledArray/replicator.h @@ -130,7 +130,7 @@ class Replicator : public madness::WorldObject >, if (dest != world_.rank()) { wobj_type::task(dest, &Replicator_::send_handler, indices_, data_, - madness::TaskAttributes::hipri_unordered()); + madness::TaskAttributes::hipri()); } else do_callbacks(); // Replication is done } diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 7d58434979..38bf61e86e 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -17,6 +17,7 @@ #endif #include +#include #include namespace TiledArray { @@ -187,7 
+188,17 @@ TiledArray::detail::Finalizer::~Finalizer() noexcept { TiledArray::detail::Finalizer TiledArray::scoped_finalizer() { return {}; } -void TiledArray::ta_abort() { SafeMPI::COMM_WORLD.Abort(); } +void TiledArray::ta_abort() { + // if have a custom signal handler for SIGABRT (i.e. we are running under a + // debugger) then call abort() + struct sigaction sa; + auto rc = sigaction(SIGABRT, NULL, &sa); + if (rc == 0 && sa.sa_handler != SIG_DFL) { + abort(); + } else { + SafeMPI::COMM_WORLD.Abort(); + } +} void TiledArray::ta_abort(const std::string& m) { std::cerr << m << std::endl; From a172b44c2b3f2d724e3d5451bfcb11a26ef8088e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 17 Jan 2024 00:16:08 -0500 Subject: [PATCH 258/592] use VG CMake kit FindOrFetchBoost.cmake that installs modularized Boost if missing --- CMakeLists.txt | 5 +- INSTALL.md | 2 +- bin/admin/dependency-versions-update-hook.py | 21 +----- cmake/modules/FindOrFetchBoost.cmake | 79 -------------------- cmake/tiledarray-config.cmake.in | 10 ++- external/boost.cmake | 36 +++++++++ external/versions.cmake | 14 +--- src/CMakeLists.txt | 2 +- 8 files changed, 52 insertions(+), 117 deletions(-) delete mode 100644 cmake/modules/FindOrFetchBoost.cmake create mode 100644 external/boost.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f98e3fbf2..a97e6561f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -328,10 +328,9 @@ if (ENABLE_WFN91_LINALG_DISCOVERY_KIT) include(FetchWfn91LinAlgModules) include(FindLinalg) endif(ENABLE_WFN91_LINALG_DISCOVERY_KIT) -# BTAS does a better job of building and checking Boost since it uses Boost::serialization -# it also memorized the location of its config for use from install tree +# Boost is to be discovered by the top cmake project, and every (sub)project needs to make sure it has all of its targets +include(external/boost.cmake) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBTAS.cmake) 
-include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake) if(ENABLE_SCALAPACK) include(external/scalapackpp.cmake) endif() diff --git a/INSTALL.md b/INSTALL.md index 2eade7e8fb..6186df6937 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag bf0c376d5cdd6f668174b2a4c67b19634d1c0da7 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 85eea7796651de1bcb4781b0081a352b32bf91d5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 39de6cb4d262b3df1f67b3c04a37a935564ca657 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. 
diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py index 739049f834..f7f652c1bd 100755 --- a/bin/admin/dependency-versions-update-hook.py +++ b/bin/admin/dependency-versions-update-hook.py @@ -59,23 +59,7 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' tokens = line.split() if len(tokens) < 3: continue - if tokens[1].find('TRACKED_BOOST') != -1: - if tokens[1].find('PREVIOUS') != -1: - boost_old_version = tokens[2] - else: - boost_new_version = tokens[2] - elif tokens[1].find('INSTALL_BOOST') != -1: - if tokens[1].find('VERSION') != -1: - if tokens[1].find('PREVIOUS') != -1: - boost_old_install_version = tokens[2] - else: - boost_new_install_version = tokens[2] - else: # URL_HASH - if tokens[1].find('PREVIOUS') != -1: - boost_old_install_url_hash = tokens[2] - else: - boost_new_install_url_hash = tokens[2] - elif tokens[1].find('TRACKED_EIGEN') != -1: + if tokens[1].find('TRACKED_EIGEN') != -1: if tokens[1].find('PREVIOUS') != -1: eigen_old_version = tokens[2] else: @@ -134,9 +118,6 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' any_files_changed = False -# Boost version in INSTALL.md -any_files_changed |= replace_dep_id(topsrc, 'md', 'Boost', boost_old_version, boost_new_version, 'boost.org/), version ', ' or higher') - # Eigen version in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'Eigen', eigen_old_version, eigen_new_version, 'eigen.tuxfamily.org), version ', ' or higher') # Eigen install version in eigen.cmake diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake deleted file mode 100644 index 6ddb2a3b8d..0000000000 --- a/cmake/modules/FindOrFetchBoost.cmake +++ /dev/null @@ -1,79 +0,0 @@ -# Limit scope of the search if BOOST_ROOT or BOOST_INCLUDEDIR is provided. 
-if (BOOST_ROOT OR BOOST_INCLUDEDIR) - set(Boost_NO_SYSTEM_PATHS TRUE) -endif() - -# try find_package -if (NOT TARGET Boost::boost) - - # detect which Boost targets I already have - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt}) - set(ta_imported_boost_${tgt} 0) - else() - set(ta_imported_boost_${tgt} 1) - endif() - endforeach() - - include(FindPackageRegimport) - find_package_regimport(Boost ${TA_TRACKED_BOOST_VERSION} QUIET) - if (TARGET Boost::boost) - message(STATUS "Found Boost ${Boost_VERSION}: ${Boost_INCLUDE_DIRS}") - endif(TARGET Boost::boost) - - # Boost::* targets by default are not GLOBAL, so to allow users of TA to safely use them we need to make them global - # more discussion here: https://gitlab.kitware.com/cmake/cmake/-/issues/17256 - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt} AND ta_imported_boost_${tgt}) - get_target_property(_boost_tgt_${tgt}_is_imported_global Boost::${tgt} IMPORTED_GLOBAL) - if (NOT _boost_tgt_${tgt}_is_imported_global) - set_target_properties(Boost::${tgt} PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - unset(_boost_tgt_${tgt}_is_imported_global) - endif() - endforeach() - -endif (NOT TARGET Boost::boost) - -# if not found, build via FetchContent -if (NOT TARGET Boost::boost) - include (FetchContent) - cmake_minimum_required (VERSION 3.14.0) # for FetchContent_MakeAvailable - - FetchContent_Declare( - CMAKEBOOST - GIT_REPOSITORY https://github.com/Orphis/boost-cmake - ) - FetchContent_MakeAvailable(CMAKEBOOST) - FetchContent_GetProperties(CMAKEBOOST - SOURCE_DIR CMAKEBOOST_SOURCE_DIR - BINARY_DIR CMAKEBOOST_BINARY_DIR - ) - - # current boost-cmake/master does not install boost correctly, so warn that installed TiledArray will not be usable - # boost-cmake/install_rules https://github.com/Orphis/boost-cmake/pull/45 is supposed to fix it but is inactive - message(WARNING "Building Boost from source makes TiledArray unusable from the install location! 
Install Boost using package manager or manually and reconfigure/reinstall TiledArray to fix this") - if (NOT TARGET Boost::headers) - add_library(Boost::headers ALIAS Boost::boost) - endif() - foreach(_lib serialization regex locale locale_deps thread chrono) # these are non-header-only components used by MPQC - if (TARGET Boost_${_lib}) - install(TARGETS Boost_${_lib} EXPORT btas COMPONENT boost-libs) - if (NOT TARGET Boost::${_lib}) - add_library(Boost::${_lib} ALIAS Boost_${_lib}) - endif() - endif() - endforeach() -# export(EXPORT tiledarray -# FILE "${PROJECT_BINARY_DIR}/boost-targets.cmake") -# install(EXPORT tiledarray -# FILE "boost-targets.cmake" -# DESTINATION "${TILEDARRAY_INSTALL_CMAKEDIR}" -# COMPONENT boost-libs) - -endif(NOT TARGET Boost::boost) - -# postcond check -if (NOT TARGET Boost::boost) - message(FATAL_ERROR "FindOrFetchBoost could not make Boost::boost target available") -endif(NOT TARGET Boost::boost) diff --git a/cmake/tiledarray-config.cmake.in b/cmake/tiledarray-config.cmake.in index 845f5225c3..3d1484013b 100644 --- a/cmake/tiledarray-config.cmake.in +++ b/cmake/tiledarray-config.cmake.in @@ -12,11 +12,17 @@ set(TILEDARRAY_EXT_VERSION "@TILEDARRAY_EXT_VERSION@") @PACKAGE_INIT@ +include(CMakeFindDependencyMacro) + # Include library IMPORT targets + +@Boost_CONFIG_FILE_CONTENTS@ + if (NOT TARGET BTAS::BTAS) get_filename_component(BTAS_DIR "@BTAS_CONFIG@" DIRECTORY) - find_package(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}") + find_dependency(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}") endif() + if(NOT TARGET MADworld) # if madness installed separately, use the madness install discovered when configuring TA set(MADNESS_CONFIG_DIR "@MADNESS_CONFIG_DIR@") @@ -38,7 +44,7 @@ set(TILEDARRAY_HAS_CUDA "@CUDA_FOUND@") if(TILEDARRAY_HAS_CUDA) cmake_minimum_required(VERSION 3.17) if (NOT TARGET CUDA::cublas) - find_package(CUDAToolkit REQUIRED COMPONENTS cublas nvToolsExt) + find_dependency(CUDAToolkit REQUIRED COMPONENTS cublas 
nvToolsExt) endif(NOT TARGET CUDA::cublas) set(CMAKE_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") # workaround from https://gitlab.kitware.com/cmake/cmake/issues/18614#note_485631 diff --git a/external/boost.cmake b/external/boost.cmake new file mode 100644 index 0000000000..a050f47d32 --- /dev/null +++ b/external/boost.cmake @@ -0,0 +1,36 @@ +# Boost can be discovered by every (sub)package but only the top package can build it ... +# if we are the top package need to include the list of Boost components to be built +if("${CMAKE_PROJECT_NAME}" STREQUAL "${PROJECT_NAME}") + set(required_components + headers # TA, BTAS + algorithm # TA + container # TA, BTAS + iterator # TA, BTAS + random # TA, BTAS + tuple # TA + ) + if (DEFINED Boost_REQUIRED_COMPONENTS) + list(APPEND Boost_REQUIRED_COMPONENTS + ${required_components}) + list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS) + else() + set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to discovered or built") + endif() + set(optional_components + serialization # BTAS + ) + if (DEFINED Boost_OPTIONAL_COMPONENTS) + list(APPEND Boost_OPTIONAL_COMPONENTS + ${optional_components} + ) + list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS) + else() + set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to discovered or built") + endif() +endif() + +if (NOT DEFINED Boost_FETCH_IF_MISSING) + set(Boost_FETCH_IF_MISSING 1) +endif() + +include(${vg_cmake_kit_SOURCE_DIR}/modules/FindOrFetchBoost.cmake) diff --git a/external/versions.cmake b/external/versions.cmake index 6c1bf8429b..211efd44d5 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,15 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG d6746098e63deab4032309c4455bb084a17ff51a) - -# Boost explicitly downgraded to 1.59 from 1.68 
-set(TA_TRACKED_BOOST_VERSION 1.59) -set(TA_TRACKED_BOOST_PREVIOUS_VERSION 1.68) -set(TA_INSTALL_BOOST_VERSION 1.70.0) -set(TA_INSTALL_BOOST_PREVIOUS_VERSION 1.70.0) -set(TA_INSTALL_BOOST_URL_HASH 882b48708d211a5f48e60b0124cf5863c1534cd544ecd0664bb534a4b5d506e9) -set(TA_INSTALL_BOOST_PREVIOUS_URL_HASH 882b48708d211a5f48e60b0124cf5863c1534cd544ecd0664bb534a4b5d506e9) +set(TA_TRACKED_VGCMAKEKIT_TAG 8713beb71ff6b7d4b1c758e9e1c7d814bd97b0af) # N.B. may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -24,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 51c2728d664c096d0ea39d3b9cbf2895d8d99439) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG bf0c376d5cdd6f668174b2a4c67b19634d1c0da7) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 3c91f086090390930bba62c6512c4e74a5520e76) +set(TA_TRACKED_BTAS_TAG 85eea7796651de1bcb4781b0081a352b32bf91d5) +set(TA_TRACKED_BTAS_PREVIOUS_TAG bf0c376d5cdd6f668174b2a4c67b19634d1c0da7) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6e6c708891..9bb82bf537 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -329,7 +329,7 @@ add_library(tiledarray ${TILEDARRAY_SOURCE_FILES} ${TILEDARRAY_HEADER_FILES}) target_link_libraries(${targetname} PUBLIC ${TILEDARRAY_PRIVATE_LINK_LIBRARIES}) target_link_libraries(${targetname} PUBLIC MADworld) - target_link_libraries(${targetname} PUBLIC Boost::boost) + target_link_libraries(${targetname} PUBLIC Boost::headers) # build all external deps before building tiledarray add_dependencies(${targetname} External-tiledarray) From 98c6c3b2921589552a716acc2336aa2983e8b378 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 17 Jan 2024 11:59:07 -0500 Subject: [PATCH 259/592] xcode 14 does not support std::make_shared even with C++20 --- 
src/TiledArray/tiled_range1.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index b341baec95..f1dc2369a8 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -354,12 +354,12 @@ class TiledRange1 { if (!elem2tile_) { // initialize elem2tile map auto e2t = -#if __cplusplus >= 202002L - std::make_shared(n); -#else + // #if __cplusplus >= 202002L ... still broken in Xcode 14 + // std::make_shared(n); + // #else std::shared_ptr( new index1_type[n], [](index1_type* ptr) { delete[] ptr; }); -#endif + // #endif const auto end = extent(range_); for (index1_type t = 0; t < end; ++t) for (index1_type e = tiles_ranges_[t].first; From 4f8c1d328084e15fe247b7fc7d171612e76fc27e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 Jan 2024 14:57:25 -0500 Subject: [PATCH 260/592] bump MAD tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/520 which allows to reduce the maximum size of messages sent by GOP bcast/reduce --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 6186df6937..591e1d182c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag 85eea7796651de1bcb4781b0081a352b32bf91d5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 39de6cb4d262b3df1f67b3c04a37a935564ca657 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 87715d98a244bff5cbff0bd2c644a8a00d882989 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. 
If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index 211efd44d5..709b86fa9c 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 51c2728d664c096d0ea39d3b9cbf2895d8d99439) +set(TA_TRACKED_MADNESS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 1bd55170e3fc1e75f6ecdeef169d5043707b7789 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 24 Jan 2024 00:17:51 -0500 Subject: [PATCH 261/592] bump tags for VG CMake kit and BTAS to pull in https://github.com/ValeevGroup/BTAS/pull/171 and support more robust Boost discovery (e.g. when it's been partially loaded before TA) --- INSTALL.md | 2 +- external/boost.cmake | 54 ++++++++++++++++++++--------------------- external/versions.cmake | 6 ++--- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 591e1d182c..e6e5d02580 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 85eea7796651de1bcb4781b0081a352b32bf91d5 . 
If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag b2c18c797122b149eb088ae494dbd12f2f5ff81b . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 87715d98a244bff5cbff0bd2c644a8a00d882989 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/boost.cmake b/external/boost.cmake index a050f47d32..e058ba0247 100644 --- a/external/boost.cmake +++ b/external/boost.cmake @@ -1,32 +1,30 @@ -# Boost can be discovered by every (sub)package but only the top package can build it ... -# if we are the top package need to include the list of Boost components to be built -if("${CMAKE_PROJECT_NAME}" STREQUAL "${PROJECT_NAME}") - set(required_components - headers # TA, BTAS - algorithm # TA - container # TA, BTAS - iterator # TA, BTAS - random # TA, BTAS - tuple # TA - ) - if (DEFINED Boost_REQUIRED_COMPONENTS) - list(APPEND Boost_REQUIRED_COMPONENTS - ${required_components}) - list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS) - else() - set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to discovered or built") - endif() - set(optional_components - serialization # BTAS +# Boost can be discovered by every (sub)package but only the top package can *build* it ... 
+# in either case must declare the components used by BTAS +set(required_components + headers # TA, BTAS + algorithm # TA + container # TA, BTAS + iterator # TA, BTAS + random # TA, BTAS + tuple # TA +) +if (DEFINED Boost_REQUIRED_COMPONENTS) + list(APPEND Boost_REQUIRED_COMPONENTS + ${required_components}) + list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS) +else() + set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to discovered or built") +endif() +set(optional_components + serialization # BTAS +) +if (DEFINED Boost_OPTIONAL_COMPONENTS) + list(APPEND Boost_OPTIONAL_COMPONENTS + ${optional_components} ) - if (DEFINED Boost_OPTIONAL_COMPONENTS) - list(APPEND Boost_OPTIONAL_COMPONENTS - ${optional_components} - ) - list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS) - else() - set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to discovered or built") - endif() + list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS) +else() + set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to discovered or built") endif() if (NOT DEFINED Boost_FETCH_IF_MISSING) diff --git a/external/versions.cmake b/external/versions.cmake index 709b86fa9c..ba7f0d9c66 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG 8713beb71ff6b7d4b1c758e9e1c7d814bd97b0af) +set(TA_TRACKED_VGCMAKEKIT_TAG 45e7d0d8d7f994a88c5af5fc082332db7bd0d6b3) # N.B. 
may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 85eea7796651de1bcb4781b0081a352b32bf91d5) -set(TA_TRACKED_BTAS_PREVIOUS_TAG bf0c376d5cdd6f668174b2a4c67b19634d1c0da7) +set(TA_TRACKED_BTAS_TAG b2c18c797122b149eb088ae494dbd12f2f5ff81b) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 85eea7796651de1bcb4781b0081a352b32bf91d5) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 2061fa930c5ad33b953a8ebed5947dd1086285ae Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 Jan 2024 00:46:35 -0500 Subject: [PATCH 262/592] if building Boost from source, ta_test depends on Boost::unit_test_framework target whose building is a saga ... bump VG CMake kit to support building such Boost components (for which component != target) from source --- INSTALL.md | 2 +- external/boost.cmake | 5 +++++ external/versions.cmake | 6 +++--- tests/CMakeLists.txt | 4 ++++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index e6e5d02580..e675270f1f 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag b2c18c797122b149eb088ae494dbd12f2f5ff81b . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b . 
If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 87715d98a244bff5cbff0bd2c644a8a00d882989 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/boost.cmake b/external/boost.cmake index e058ba0247..d4e06326dc 100644 --- a/external/boost.cmake +++ b/external/boost.cmake @@ -8,6 +8,11 @@ set(required_components random # TA, BTAS tuple # TA ) +if (BUILD_TESTING) + list(APPEND required_components + test # TA + ) +endif() if (DEFINED Boost_REQUIRED_COMPONENTS) list(APPEND Boost_REQUIRED_COMPONENTS ${required_components}) diff --git a/external/versions.cmake b/external/versions.cmake index ba7f0d9c66..c243a80c23 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG 45e7d0d8d7f994a88c5af5fc082332db7bd0d6b3) +set(TA_TRACKED_VGCMAKEKIT_TAG 38f99f3da4810c97ed1d54f863120ae85120fa8f) # N.B. 
may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG b2c18c797122b149eb088ae494dbd12f2f5ff81b) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 85eea7796651de1bcb4781b0081a352b32bf91d5) +set(TA_TRACKED_BTAS_TAG c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b) +set(TA_TRACKED_BTAS_PREVIOUS_TAG b2c18c797122b149eb088ae494dbd12f2f5ff81b) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e4f5bfe213..3bcf8de967 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -131,6 +131,10 @@ if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT CUDA_FOUND AND FALSE else() add_ta_executable(${executable} "${ta_test_src_files}" "tiledarray") endif() +# if Boost is modularized, need to explicitly state that we need Boost::test module +if (Boost_IS_MODULARIZED) + target_link_libraries(${executable} PRIVATE Boost::unit_test_framework) +endif() # Add include directories and compiler flags for ta_test target_include_directories(${executable} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} From 94db5c970f3e907e2d965c946c841a760e0c3ec5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 Jan 2024 09:01:30 -0500 Subject: [PATCH 263/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/521 also --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index e675270f1f..510b6cf84c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -42,7 +42,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Range: header-only, *only used for unit testing* - [BTAS](http://github.com/ValeevGroup/BTAS), tag c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 87715d98a244bff5cbff0bd2c644a8a00d882989 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index c243a80c23..0c29363594 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 39de6cb4d262b3df1f67b3c04a37a935564ca657) +set(TA_TRACKED_MADNESS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 954d861f553e938c3cfc4892fce9234bf4bf7193 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 25 Jan 2024 16:30:12 -0500 Subject: [PATCH 264/592] bump CMake VG Kit and BTAS tags to pull in https://github.com/ValeevGroup/kit-cmake/commit/e0d04e91a84b7e71d9b87682c46c518e9966bd78 --- INSTALL.md | 2 +- external/versions.cmake | 6 +++--- 2 files changed, 4 insertions(+), 4 
deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 510b6cf84c..445fb9af9a 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag b7b2ea7513b087e35c6f1b26184a3904ac1e6b14 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 0c29363594..456a32af33 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG 38f99f3da4810c97ed1d54f863120ae85120fa8f) +set(TA_TRACKED_VGCMAKEKIT_TAG e0d04e91a84b7e71d9b87682c46c518e9966bd78) # N.B. 
may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b) -set(TA_TRACKED_BTAS_PREVIOUS_TAG b2c18c797122b149eb088ae494dbd12f2f5ff81b) +set(TA_TRACKED_BTAS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) +set(TA_TRACKED_BTAS_PREVIOUS_TAG c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 68392f9567c55452efced497dd908e153f72b979 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 26 Jan 2024 00:15:21 -0500 Subject: [PATCH 265/592] cleanup [skip ci] --- external/boost.cmake | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/external/boost.cmake b/external/boost.cmake index d4e06326dc..c89b2e3667 100644 --- a/external/boost.cmake +++ b/external/boost.cmake @@ -1,16 +1,16 @@ # Boost can be discovered by every (sub)package but only the top package can *build* it ... 
-# in either case must declare the components used by BTAS +# in either case must declare the components used by TA set(required_components - headers # TA, BTAS - algorithm # TA - container # TA, BTAS - iterator # TA, BTAS - random # TA, BTAS - tuple # TA + headers + algorithm + container + iterator + random + tuple ) if (BUILD_TESTING) list(APPEND required_components - test # TA + test ) endif() if (DEFINED Boost_REQUIRED_COMPONENTS) From 493c109379a1b64ddd5ef59f7e33b95633b68d73 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 31 Jan 2024 17:11:16 -0500 Subject: [PATCH 266/592] [cmake] bump VG CMake kit to improve robustness + BTAS tag to pull in https://github.com/ValeevGroup/BTAS/pull/173 --- INSTALL.md | 2 +- external/versions.cmake | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 445fb9af9a..6e7c6fc746 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,7 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag b7b2ea7513b087e35c6f1b26184a3904ac1e6b14 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. 
diff --git a/external/versions.cmake b/external/versions.cmake index 456a32af33..e04b066573 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG e0d04e91a84b7e71d9b87682c46c518e9966bd78) +set(TA_TRACKED_VGCMAKEKIT_TAG d5c0a6f9ff6dc97cbb5132912733e1eb1cf73f1e) # N.B. may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) -set(TA_TRACKED_BTAS_PREVIOUS_TAG c0db35c8217f5fcc7e2b97b0ab48421f2a60ae4b) +set(TA_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) +set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 56d822fbb7734f8020ef5d46ca50f5ffbc3a1c9d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 4 Feb 2024 12:09:42 -0500 Subject: [PATCH 267/592] fix Tensor(range, elemop) ctor to use placement-new instead of (move) assignment ... 
resolves #445 --- src/TiledArray/tensor/tensor.h | 31 ++++++++++++++++++++----------- tests/tensor_of_tensor.cpp | 9 +++++++++ 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 1b5beff19d..9c36b071cc 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -144,9 +144,12 @@ class Tensor { size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); - if (default_construct) { - std::uninitialized_default_construct_n(ptr, size); - // std::uninitialized_value_construct_n(ptr, size); + // default construct elements of data only if can have any effect ... + if constexpr (!std::is_trivially_default_constructible_v) { + // .. and requested + if (default_construct) { + std::uninitialized_default_construct_n(ptr, size); + } } auto deleter = [ #ifdef TA_TENSOR_MEM_TRACE @@ -182,9 +185,12 @@ class Tensor { size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); - if (default_construct) { - std::uninitialized_default_construct_n(ptr, size); - // std::uninitialized_value_construct_n(ptr, size); + // default construct elements of data only if can have any effect ... + if constexpr (!std::is_trivially_default_constructible_v) { + // .. and requested + if (default_construct) { + std::uninitialized_default_construct_n(ptr, size); + } } auto deleter = [ #ifdef TA_TENSOR_MEM_TRACE @@ -288,7 +294,8 @@ class Tensor { } /// Construct a tensor with a range equal to \c range. The data is - /// uninitialized. + /// default-initialized (which, for `T` with trivial default constructor, + /// means data is uninitialized). 
/// \param range The range of the tensor /// \param nbatch The number of batches (default is 1) explicit Tensor(const range_type& range, size_type nbatch = 1) @@ -336,9 +343,10 @@ class Tensor { value_type, ElementIndexOp, const Range::index_type&>>> Tensor(const range_type& range, const ElementIndexOp& element_idx_op) : Tensor(range, 1, default_construct{false}) { - auto* data_ptr = data_.get(); + pointer MADNESS_RESTRICT const data = this->data(); for (auto&& element_idx : range) { - data_ptr[range.ordinal(element_idx)] = element_idx_op(element_idx); + const auto ord = range.ordinal(element_idx); + new (data + ord) value_type(element_idx_op(element_idx)); } } @@ -350,8 +358,9 @@ class Tensor { Tensor(const range_type& range, InIter it) : Tensor(range, 1, default_construct{false}) { auto n = range.volume(); - pointer MADNESS_RESTRICT const data = this->data(); - for (size_type i = 0ul; i < n; ++i, ++it) data[i] = *it; + pointer MADNESS_RESTRICT data = this->data(); + for (size_type i = 0ul; i < n; ++i, ++it, ++data) + new (data) value_type(*it); } template diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index f6fae22be5..f0aa8be3e8 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -200,6 +200,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(default_constructor, ITensor, itensor_types) { BOOST_AUTO_TEST_CASE_TEMPLATE(unary_constructor, ITensor, itensor_types) { const auto& a = ToT(0); + + // apply element-wise op with default initializer + // this is a reproducer for + // https://github.com/ValeevGroup/tiledarray/issues/445 + { + BOOST_CHECK_NO_THROW( + Tensor t(a.range(), [](auto&& l) { return ITensor(); })); + } + // apply element-wise op BOOST_CHECK_NO_THROW(Tensor t(a, [](const int l) { return l * 2; })); Tensor t(a, [](const int l) { return l * 2; }); From 79b70f47cb11e3e1b1d4b3c7838750d6a812cbd2 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 8 Jan 2024 08:37:58 -0500 Subject: [PATCH 268/592] Small refactor and 
addition of inner tensor Hadamard, external, and internal indices in `einsum` function. --- src/TiledArray/einsum/tiledarray.h | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 1851973709..bee8e4ec10 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -102,6 +102,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, struct { std::string a, b, c; + // Hadamard, external, internal indices for inner tensor + Einsum::Index h, e, i; } inner; if constexpr (std::tuple_size::value == 2) { if constexpr (IsArrayToT) @@ -112,23 +114,35 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, static_assert(IsArrayToT || IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); + + Einsum::Index a_idx, b_idx, c_idx; + if constexpr (IsArrayToT) a_idx = std::get<1>(Einsum::idx(A)); + if constexpr (IsArrayToT) b_idx = std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT || IsArrayToT) + c_idx = std::get<1>(cs); + + inner.h = a_idx & b_idx & c_idx; + inner.e = (a_idx ^ b_idx); + inner.i = (a_idx & b_idx) - inner.h; } // these are "Hadamard" (fused) indices auto h = a & b & c; + // external indices + auto e = (a ^ b); + + // contracted indices + auto i = (a & b) - h; + // no Hadamard indices => standard contraction (or even outer product) // same a, b, and c => pure Hadamard - if (!h || (!(a ^ b) && !(b ^ c))) { + if (!h || (h && !(i || e))) { ArrayC C; C(std::string(c) + inner.c) = A * B; return C; } - auto e = (a ^ b); - // contracted indices - auto i = (a & b) - h; - TA_ASSERT(e || h); auto range_map = From 630547607caf769941207001b5771984e0cdf435 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 8 Jan 2024 08:39:30 -0500 Subject: [PATCH 269/592] Random tensor and tensor-of-tensor generation functions added for unit testing. 
--- tests/tot_array_fixture.h | 55 ++++++++++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index c01399dbba..7345401518 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -88,11 +88,52 @@ using input_archive_type = madness::archive::BinaryFstreamInputArchive; // Type of an output archive using output_archive_type = madness::archive::BinaryFstreamOutputArchive; -enum class ShapeComp { - True, - False -}; - +enum class ShapeComp { True, False }; + +template +auto random_tensor(TA::Range const& rng) { + TA::Tensor result{rng}; + std::generate(result.begin(), result.end(), + TA::detail::MakeRandom::generate_value); + return result; +} + +// note: all the inner tensors (elements of the outer tensor) +// have the same @c inner_rng +template +auto random_tensor_of_tensor(TA::Range const& outer_rng, + TA::Range const& inner_rng) { + TA::Tensor> result{outer_rng}; + + std::generate(result.begin(), result.end(), + [inner_rng]() { return random_tensor(inner_rng); }); + + return result; +} + +template +auto make_random_array(TA::TiledRange const& trange) { + using ArrayT = TA::DistArray, Policy>; + + auto make_tile = [](TA::Tensor& tile, TA::Range const& rng) { + tile = random_tensor(rng); + if constexpr (std::is_same_v) return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), trange, make_tile); +} + +template +auto make_random_array(TA::TiledRange const& trange, TA::Range const& inner) { + using ArrayT = TA::DistArray>, Policy>; + + auto make_tile = [inner](TA::Tensor>& tile, + TA::Range const& rng) { + tile = random_tensor_of_tensor(rng, inner); + if constexpr (std::is_same_v) return tile.norm(); + }; + return TA::make_array(TA::get_default_world(), trange, make_tile); +} /* * @@ -244,8 +285,8 @@ struct ToTArrayFixture { * * TODO: pmap comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& 
rhs) { // Same type From 25c2cb19d4edf630017944aa752783fae3527563 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 10 Jan 2024 21:41:04 -0500 Subject: [PATCH 270/592] [wip][skip ci] tensor_contract function for TA::Tensor. --- src/TiledArray/tensor/kernels.h | 271 ++++++++++++++++++++++++++++ src/TiledArray/tensor/tensor.h | 191 +------------------- src/TiledArray/tensor/type_traits.h | 101 +++++++++++ 3 files changed, 374 insertions(+), 189 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 682cb1b209..af951755a3 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -26,6 +26,8 @@ #ifndef TILEDARRAY_TENSOR_KENERLS_H__INCLUDED #define TILEDARRAY_TENSOR_KENERLS_H__INCLUDED +#include +#include #include #include #include @@ -37,6 +39,192 @@ class Tensor; namespace detail { +// ------------------------------------------------------------------------- +// Tensor GEMM + +/// Contract two tensors + +/// GEMM is limited to matrix like contractions. For example, the following +/// contractions are supported: +/// \code +/// C[a,b] = A[a,i,j] * B[i,j,b] +/// C[a,b] = A[a,i,j] * B[b,i,j] +/// C[a,b] = A[i,j,a] * B[i,j,b] +/// C[a,b] = A[i,j,a] * B[b,i,j] +/// +/// C[a,b,c,d] = A[a,b,i,j] * B[i,j,c,d] +/// C[a,b,c,d] = A[a,b,i,j] * B[c,d,i,j] +/// C[a,b,c,d] = A[i,j,a,b] * B[i,j,c,d] +/// C[a,b,c,d] = A[i,j,a,b] * B[c,d,i,j] +/// \endcode +/// Notice that in the above contractions, the inner and outer indices of +/// the arguments for exactly two contiguous groups in each tensor and that +/// each group is in the same order in all tensors. That is, the indices of +/// the tensors must fit the one of the following patterns: +/// \code +/// C[M...,N...] = A[M...,K...] * B[K...,N...] +/// C[M...,N...] = A[M...,K...] * B[N...,K...] +/// C[M...,N...] = A[K...,M...] * B[K...,N...] +/// C[M...,N...] = A[K...,M...] * B[N...,K...] 
+/// \endcode +/// This allows use of optimized BLAS functions to evaluate tensor +/// contractions. Tensor contractions that do not fit this pattern require +/// one or more tensor permutation so that the tensors fit the required +/// pattern. +/// \tparam U The left-hand tensor element type +/// \tparam AU The left-hand tensor allocator type +/// \tparam V The right-hand tensor element type +/// \tparam AV The right-hand tensor allocator type +/// \tparam W The type of the scaling factor +/// \param left The left-hand tensor that will be contracted +/// \param right The right-hand tensor that will be contracted +/// \param factor The contraction result will be scaling by this value, then +/// accumulated into \c this \param gemm_helper The *GEMM operation meta data +/// \return A reference to \c this +/// \note if this is uninitialized, i.e., if \c this->empty()==true will +/// this is equivalent to +/// \code +/// return (*this = left.gemm(right, factor, gemm_helper)); +/// \endcode +template +void gemm(Alpha alpha, const Tensor& A, const Tensor& B, + Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) { + static_assert(!detail::is_tensor_of_tensor_v, Tensor, + Tensor>, + "TA::Tensor::gemm without custom element op is " + "only applicable to " + "plain tensors"); + { + // Check that tensor C is not empty and has the correct rank + TA_ASSERT(!C.empty()); + TA_ASSERT(C.range().rank() == gemm_helper.result_rank()); + + // Check that the arguments are not empty and have the correct ranks + TA_ASSERT(!A.empty()); + TA_ASSERT(A.range().rank() == gemm_helper.left_rank()); + TA_ASSERT(!B.empty()); + TA_ASSERT(B.range().rank() == gemm_helper.right_rank()); + + TA_ASSERT(A.nbatch() == 1); + TA_ASSERT(B.nbatch() == 1); + TA_ASSERT(C.nbatch() == 1); + + // Check that the outer dimensions of left match the corresponding + // dimensions in result + TA_ASSERT(gemm_helper.left_result_congruent(A.range().extent_data(), + C.range().extent_data())); + 
TA_ASSERT(ignore_tile_position() || + gemm_helper.left_result_congruent(A.range().lobound_data(), + C.range().lobound_data())); + TA_ASSERT(ignore_tile_position() || + gemm_helper.left_result_congruent(A.range().upbound_data(), + C.range().upbound_data())); + + // Check that the outer dimensions of right match the corresponding + // dimensions in result + TA_ASSERT(gemm_helper.right_result_congruent(B.range().extent_data(), + C.range().extent_data())); + TA_ASSERT(ignore_tile_position() || + gemm_helper.right_result_congruent(B.range().lobound_data(), + C.range().lobound_data())); + TA_ASSERT(ignore_tile_position() || + gemm_helper.right_result_congruent(B.range().upbound_data(), + C.range().upbound_data())); + + // Check that the inner dimensions of left and right match + TA_ASSERT(gemm_helper.left_right_congruent(A.range().extent_data(), + B.range().extent_data())); + TA_ASSERT(ignore_tile_position() || + gemm_helper.left_right_congruent(A.range().lobound_data(), + B.range().lobound_data())); + TA_ASSERT(ignore_tile_position() || + gemm_helper.left_right_congruent(A.range().upbound_data(), + B.range().upbound_data())); + + // Compute gemm dimensions + using integer = TiledArray::math::blas::integer; + integer m, n, k; + gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); + + // Get the leading dimension for left and right matrices. + const integer lda = + (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); + const integer ldb = + (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
n : k); + + // may need to split gemm into multiply + accumulate for tracing purposes +#ifdef TA_ENABLE_TILE_OPS_LOGGING + { + using numeric_type = typename Tensor::numeric_type; + using T = numeric_type; + const bool twostep = + TiledArray::TileOpsLogger::get_instance().gemm && + TiledArray::TileOpsLogger::get_instance().gemm_print_contributions; + std::unique_ptr data_copy; + size_t tile_volume; + if (twostep) { + tile_volume = C.range().volume(); + data_copy = std::make_unique(tile_volume); + std::copy(C.data(), C.data() + tile_volume, data_copy.get()); + } + non_distributed::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, + k, alpha, A.data(), lda, B.data(), ldb, + twostep ? numeric_type(0) : beta, C.data(), n); + + if (TiledArray::TileOpsLogger::get_instance_ptr() != nullptr && + TiledArray::TileOpsLogger::get_instance().gemm) { + auto& logger = TiledArray::TileOpsLogger::get_instance(); + auto apply = [](auto& fnptr, const Range& arg) { + return fnptr ? fnptr(arg) : arg; + }; + auto tformed_left_range = + apply(logger.gemm_left_range_transform, A.range()); + auto tformed_right_range = + apply(logger.gemm_right_range_transform, B.range()); + auto tformed_result_range = + apply(logger.gemm_result_range_transform, C.range()); + if ((!logger.gemm_result_range_filter || + logger.gemm_result_range_filter(tformed_result_range)) && + (!logger.gemm_left_range_filter || + logger.gemm_left_range_filter(tformed_left_range)) && + (!logger.gemm_right_range_filter || + logger.gemm_right_range_filter(tformed_right_range))) { + logger << "TA::Tensor::gemm+: left=" << tformed_left_range + << " right=" << tformed_right_range + << " result=" << tformed_result_range << std::endl; + if (TiledArray::TileOpsLogger::get_instance() + .gemm_print_contributions) { + if (!TiledArray::TileOpsLogger::get_instance() + .gemm_printer) { // default printer + // must use custom printer if result's range transformed + if (!logger.gemm_result_range_transform) + logger << C << std::endl; 
+ else + logger << make_map(C.data(), tformed_result_range) << std::endl; + } else { + TiledArray::TileOpsLogger::get_instance().gemm_printer( + *logger.log, tformed_left_range, A.data(), + tformed_right_range, B.data(), tformed_right_range, C.data(), + C.nbatch()); + } + } + } + } + + if (twostep) { + for (size_t v = 0; v != tile_volume; ++v) { + C.data()[v] += data_copy[v]; + } + } + } +#else // TA_ENABLE_TILE_OPS_LOGGING + math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, + alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); +#endif // TA_ENABLE_TILE_OPS_LOGGING + } +} + /// customization point transform functionality to tensor class T, useful for /// nonintrusive extension of T to be usable as element type T in Tensor template @@ -953,6 +1141,89 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, return result; } +/// +/// todo: constraint ResultTensorAllocator type so that non-sensical Allocators +/// are prohibited +/// +template && + is_annotation_v>> +auto tensor_contract(TensorA const& A, Annot const& aA, TensorB const& B, + Annot const& aB, Annot const& aC) { + using Result = result_tensor_t, TensorA, TensorB, + ResultTensorAllocator>; + + using Indices = ::Einsum::index::Index; + using Permutation = ::Einsum::index::Permutation; + using ::Einsum::index::permutation; + + // Check that the ranks of the tensors match that of the annotation. 
+ TA_ASSERT(A.range().rank() == aA.size()); + TA_ASSERT(B.range().rank() == aB.size()); + + struct { + Indices // + A, // indices of A + B, // indices of B + C, // indices of C (target indices) + h, // Hadamard indices (aA intersection aB intersection aC) + e, // external indices (aA symmetric difference aB) + i; // internal indices ((aA intersection aB) set difference aC) + } const indices{aA, + aB, + aC, + (indices.A & indices.B & indices.C), + (indices.A ^ indices.B), + ((indices.A & indices.B) - indices.h)}; + + TA_ASSERT(!indices.h && "Hadamard indices not supported"); + TA_ASSERT(indices.e && "Dot product not supported"); + + struct { + Indices A, B, C; + } const blas_layout{(indices.A - indices.B) + indices.i, + indices.i + (indices.B - indices.A), indices.e}; + + struct { + Permutation A, B, C; + } const perm{permutation(indices.A, blas_layout.A), + permutation(indices.B, blas_layout.B), + permutation(indices.C, blas_layout.C)}; + + struct { + bool A, B, C; + } const do_perm{indices.A != blas_layout.A, indices.B != blas_layout.B, + indices.C != blas_layout.C}; + + auto permedA = [&]() -> TensorA { + return do_perm.A ? A.permute(perm.A) : std::cref(A); + }; + + auto permedB = [&]() -> TensorB { + return do_perm.B ? B.permute(perm.B) : std::cref(B); + }; + + math::GemmHelper gemm_helper{blas::Op::NoTrans, blas::Op::NoTrans, + static_cast(indices.e.size()), + static_cast(indices.A.size()), + static_cast(indices.B.size())}; + + // initialize result with correct rank + Result result; + { + container::vector rng(indices.e.size(), 0); + result = Result{TA::Range(rng)}; + } + + using Numeric = typename Result::numeric_type; + + // call gemm + gemm(Numeric{1}, permedA(), permedB(), Numeric{0}, result, gemm_helper); + + return do_perm.C ? 
result.permute(perm.C.inv()) : result; +} + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 9c36b071cc..5ecee7fc02 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -36,11 +36,6 @@ namespace TiledArray { -template -void gemm(Alpha alpha, const Tensor& A, const Tensor& B, - Beta beta, Tensor& C, const math::GemmHelper& gemm_helper); - namespace detail { /// Signals that we can take the trace of a Tensor (for numeric \c T) @@ -2205,7 +2200,8 @@ class Tensor { #else // TA_ENABLE_TILE_OPS_LOGGING for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); - TiledArray::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, gemm_helper); + TiledArray::detail::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, + gemm_helper); } #endif // TA_ENABLE_TILE_OPS_LOGGING @@ -2542,189 +2538,6 @@ Tensor operator*(const Permutation& p, const Tensor& t) { return t.permute(p); } -/// Contract two tensors and accumulate the scaled result to this tensor - -/// GEMM is limited to matrix like contractions. For example, the following -/// contractions are supported: -/// \code -/// C[a,b] = A[a,i,j] * B[i,j,b] -/// C[a,b] = A[a,i,j] * B[b,i,j] -/// C[a,b] = A[i,j,a] * B[i,j,b] -/// C[a,b] = A[i,j,a] * B[b,i,j] -/// -/// C[a,b,c,d] = A[a,b,i,j] * B[i,j,c,d] -/// C[a,b,c,d] = A[a,b,i,j] * B[c,d,i,j] -/// C[a,b,c,d] = A[i,j,a,b] * B[i,j,c,d] -/// C[a,b,c,d] = A[i,j,a,b] * B[c,d,i,j] -/// \endcode -/// Notice that in the above contractions, the inner and outer indices of -/// the arguments for exactly two contiguous groups in each tensor and that -/// each group is in the same order in all tensors. That is, the indices of -/// the tensors must fit the one of the following patterns: -/// \code -/// C[M...,N...] = A[M...,K...] * B[K...,N...] -/// C[M...,N...] = A[M...,K...] * B[N...,K...] -/// C[M...,N...] = A[K...,M...] * B[K...,N...] -/// C[M...,N...] = A[K...,M...] 
* B[N...,K...] -/// \endcode -/// This allows use of optimized BLAS functions to evaluate tensor -/// contractions. Tensor contractions that do not fit this pattern require -/// one or more tensor permutation so that the tensors fit the required -/// pattern. -/// \tparam U The left-hand tensor element type -/// \tparam AU The left-hand tensor allocator type -/// \tparam V The right-hand tensor element type -/// \tparam AV The right-hand tensor allocator type -/// \tparam W The type of the scaling factor -/// \param left The left-hand tensor that will be contracted -/// \param right The right-hand tensor that will be contracted -/// \param factor The contraction result will be scaling by this value, then -/// accumulated into \c this \param gemm_helper The *GEMM operation meta data -/// \return A reference to \c this -/// \note if this is uninitialized, i.e., if \c this->empty()==true will -/// this is equivalent to -/// \code -/// return (*this = left.gemm(right, factor, gemm_helper)); -/// \endcode -template -void gemm(Alpha alpha, const Tensor& A, const Tensor& B, - Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) { - static_assert(!detail::is_tensor_of_tensor_v, Tensor, - Tensor>, - "TA::Tensor::gemm without custom element op is " - "only applicable to " - "plain tensors"); - { - // Check that tensor C is not empty and has the correct rank - TA_ASSERT(!C.empty()); - TA_ASSERT(C.range().rank() == gemm_helper.result_rank()); - - // Check that the arguments are not empty and have the correct ranks - TA_ASSERT(!A.empty()); - TA_ASSERT(A.range().rank() == gemm_helper.left_rank()); - TA_ASSERT(!B.empty()); - TA_ASSERT(B.range().rank() == gemm_helper.right_rank()); - - TA_ASSERT(A.nbatch() == 1); - TA_ASSERT(B.nbatch() == 1); - TA_ASSERT(C.nbatch() == 1); - - // Check that the outer dimensions of left match the corresponding - // dimensions in result - TA_ASSERT(gemm_helper.left_result_congruent(A.range().extent_data(), - C.range().extent_data())); - 
TA_ASSERT(ignore_tile_position() || - gemm_helper.left_result_congruent(A.range().lobound_data(), - C.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_result_congruent(A.range().upbound_data(), - C.range().upbound_data())); - - // Check that the outer dimensions of right match the corresponding - // dimensions in result - TA_ASSERT(gemm_helper.right_result_congruent(B.range().extent_data(), - C.range().extent_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent(B.range().lobound_data(), - C.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent(B.range().upbound_data(), - C.range().upbound_data())); - - // Check that the inner dimensions of left and right match - TA_ASSERT(gemm_helper.left_right_congruent(A.range().extent_data(), - B.range().extent_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_right_congruent(A.range().lobound_data(), - B.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_right_congruent(A.range().upbound_data(), - B.range().upbound_data())); - - // Compute gemm dimensions - using integer = TiledArray::math::blas::integer; - integer m, n, k; - gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); - - // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
n : k); - - // may need to split gemm into multiply + accumulate for tracing purposes -#ifdef TA_ENABLE_TILE_OPS_LOGGING - { - using numeric_type = typename Tensor::numeric_type; - using T = numeric_type; - const bool twostep = - TiledArray::TileOpsLogger::get_instance().gemm && - TiledArray::TileOpsLogger::get_instance().gemm_print_contributions; - std::unique_ptr data_copy; - size_t tile_volume; - if (twostep) { - tile_volume = C.range().volume(); - data_copy = std::make_unique(tile_volume); - std::copy(C.data(), C.data() + tile_volume, data_copy.get()); - } - non_distributed::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, alpha, A.data(), lda, B.data(), ldb, - twostep ? numeric_type(0) : beta, C.data(), n); - - if (TiledArray::TileOpsLogger::get_instance_ptr() != nullptr && - TiledArray::TileOpsLogger::get_instance().gemm) { - auto& logger = TiledArray::TileOpsLogger::get_instance(); - auto apply = [](auto& fnptr, const Range& arg) { - return fnptr ? fnptr(arg) : arg; - }; - auto tformed_left_range = - apply(logger.gemm_left_range_transform, A.range()); - auto tformed_right_range = - apply(logger.gemm_right_range_transform, B.range()); - auto tformed_result_range = - apply(logger.gemm_result_range_transform, C.range()); - if ((!logger.gemm_result_range_filter || - logger.gemm_result_range_filter(tformed_result_range)) && - (!logger.gemm_left_range_filter || - logger.gemm_left_range_filter(tformed_left_range)) && - (!logger.gemm_right_range_filter || - logger.gemm_right_range_filter(tformed_right_range))) { - logger << "TA::Tensor::gemm+: left=" << tformed_left_range - << " right=" << tformed_right_range - << " result=" << tformed_result_range << std::endl; - if (TiledArray::TileOpsLogger::get_instance() - .gemm_print_contributions) { - if (!TiledArray::TileOpsLogger::get_instance() - .gemm_printer) { // default printer - // must use custom printer if result's range transformed - if (!logger.gemm_result_range_transform) - logger << C << std::endl; 
- else - logger << make_map(C.data(), tformed_result_range) << std::endl; - } else { - TiledArray::TileOpsLogger::get_instance().gemm_printer( - *logger.log, tformed_left_range, A.data(), - tformed_right_range, B.data(), tformed_right_range, C.data(), - C.nbatch()); - } - } - } - } - - if (twostep) { - for (size_t v = 0; v != tile_volume; ++v) { - C.data()[v] += data_copy[v]; - } - } - } -#else // TA_ENABLE_TILE_OPS_LOGGING - math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, - alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); -#endif // TA_ENABLE_TILE_OPS_LOGGING - } -} - // template // const typename Tensor::range_type Tensor::empty_range_; diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 89f8da70a2..2e23359950 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -30,6 +30,7 @@ #include #include +#include #include namespace Eigen { @@ -373,6 +374,106 @@ static constexpr const auto is_bipartite_permutable_v = is_free_function_permute_anyreturn_v< const T&, const TiledArray::BipartitePermutation&>; +// +template +constexpr bool is_random_access_container_v{}; + +/// +/// - The container concept is weakly tested -- any type that has +/// @c iterator typedef gets picked up. +/// +/// - The iterator category must be std::random_access_iterator_tag -- +/// random-access-ness is strongly tested. +/// +/// Following lines compile, for example: +/// +/// @c static_assert(is_random_access_container>); +/// @c static_assert(!is_random_access_container>); +/// +template +constexpr bool is_random_access_container_v< + T, std::void_t, + std::enable_if_t::iterator_category, + std::random_access_iterator_tag>>>{true}; + +// +template +constexpr bool is_annotation_v{}; + +/// +/// An annotation type (T) is a type that satisfies the following constraints: +/// - is_random_access_container_v is true. +/// - The value type of the container T are strictly ordered. 
Note that T is a +/// container from the first constraint. +/// +template +constexpr bool is_annotation_v< + T, std::void_t, + std::enable_if_t && + is_strictly_ordered_v> + + >{true}; + +namespace { + +template +using binop_result_t = std::invoke_result_t; + +template +constexpr bool is_binop_v{}; + +template +constexpr bool + is_binop_v>>{true}; + +template >> +struct result_tensor_helper { + private: + using TensorA_ = std::remove_reference_t; + using TensorB_ = std::remove_reference_t; + using value_type_A = typename TensorA_::value_type; + using value_type_B = typename TensorB_::value_type; + using allocator_type_A = typename TensorA_::allocator_type; + using allocator_type_B = typename TensorB_::allocator_type; + + public: + using numeric_type = binop_result_t; + using allocator_type = + std::conditional_t && + std::is_same_v, + allocator_type_A, Allocator>; + using result_type = + std::conditional_t, + TA::Tensor, + TA::Tensor>; +}; + +} // namespace + +/// +/// The typedef is a complete TA::Tensor type where +/// - NumericT is determined by Op: +/// - effectively, it is: +/// std::invoke_result_t +/// +/// - AllocatorT is +/// - the default TA::Tensor allocator if @tparam Allocator is void +/// - TensorA::allocator_type if TensorA and TensorB have the same allocator +/// type +/// - the @tparam Allocator otherwise +/// todo: constraint what @tparam Allocator +/// +/// +template >> +using result_tensor_t = + typename result_tensor_helper::result_type; + } // namespace detail /// Specifies how coordinates are mapped to ordinal values From 0b2b06b0be8949abd62dd51ac6f92fe9c129b087 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 11 Jan 2024 12:56:35 -0500 Subject: [PATCH 271/592] Add non-const begin() and end() methods to Einsum::index::Index. 
--- src/TiledArray/einsum/index.h | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/src/TiledArray/einsum/index.h b/src/TiledArray/einsum/index.h index 58c378704b..56283f239a 100644 --- a/src/TiledArray/einsum/index.h +++ b/src/TiledArray/einsum/index.h @@ -3,10 +3,10 @@ #include "TiledArray/expressions/fwd.h" +#include #include #include #include -#include #include #include @@ -45,18 +45,14 @@ class Index { Index(const char (&s)[N]) : Index(std::string(s)) {} template - explicit Index(const char* &s) : Index(std::string(s)) {} + explicit Index(const char *&s) : Index(std::string(s)) {} template explicit Index(const std::string &s) { - static_assert( - std::is_same_v || - std::is_same_v - ); - if constexpr (std::is_same_v) { + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { data_ = index::tokenize(s); - } - else { + } else { using std::begin; using std::end; data_.assign(begin(s), end(s)); @@ -78,8 +74,11 @@ class Index { size_t size() const { return data_.size(); } - auto begin() const { return data_.begin(); } - auto end() const { return data_.end(); } + auto begin() const { return data_.cbegin(); } + auto end() const { return data_.cend(); } + + auto begin() { return data_.begin(); } + auto end() { return data_.end(); } auto find(const T &v) const { return std::find(this->begin(), this->end(), v); @@ -209,11 +208,8 @@ auto permute(const Permutation &p, const Index &s, if (!p) return s; using R = typename Index::container_type; R r(p.size()); - TiledArray::detail::permute_n( - p.size(), - p.begin(), s.begin(), r.begin(), - std::bool_constant{} - ); + TiledArray::detail::permute_n(p.size(), p.begin(), s.begin(), r.begin(), + std::bool_constant{}); return Index{r}; } @@ -306,8 +302,8 @@ IndexMap operator|(const IndexMap &a, const IndexMap &b) { } // namespace Einsum::index namespace Einsum { - using index::Index; - using index::IndexMap; -} // namespace TiledArray::Einsum +using 
index::Index; +using index::IndexMap; +} // namespace Einsum #endif /* TILEDARRAY_EINSUM_INDEX_H__INCLUDED */ From 232bf82d06f5b3807417289f34f5214f725a7093 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 11 Jan 2024 13:02:20 -0500 Subject: [PATCH 272/592] Include sstream header. --- src/TiledArray/einsum/string.h | 64 +++++++++++++++++----------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/src/TiledArray/einsum/string.h b/src/TiledArray/einsum/string.h index 7647aed63b..d2dc6048ab 100644 --- a/src/TiledArray/einsum/string.h +++ b/src/TiledArray/einsum/string.h @@ -1,50 +1,50 @@ #ifndef TILEDARRAY_EINSUM_STRING_H #define TILEDARRAY_EINSUM_STRING_H +#include #include #include -#include +#include #include #include namespace Einsum::string { namespace { - // Split delimiter must match completely - template - std::pair split2(const std::string& s, const std::string &d) { - auto pos = s.find(d); - if (pos == s.npos) return { T(s), U("") }; - return { T(s.substr(0,pos)), U(s.substr(pos+d.size())) }; - } +// Split delimiter must match completely +template +std::pair split2(const std::string& s, const std::string& d) { + auto pos = s.find(d); + if (pos == s.npos) return {T(s), U("")}; + return {T(s.substr(0, pos)), U(s.substr(pos + d.size()))}; +} - // Split delimiter must match completely - std::vector split(const std::string& s, char d) { - std::vector res; - return boost::split(res, s, [&d](char c) { return c == d; } /*boost::is_any_of(d)*/); - } +// Split delimiter must match completely +std::vector split(const std::string& s, char d) { + std::vector res; + return boost::split(res, s, + [&d](char c) { return c == d; } /*boost::is_any_of(d)*/); +} - std::string trim(const std::string& s) { - return boost::trim_copy(s); - } +std::string trim(const std::string& s) { return boost::trim_copy(s); } - template - std::string str(const T& obj) { - std::stringstream ss; - ss << obj; - return ss.str(); - } +template +std::string str(const T& obj) { 
+ std::stringstream ss; + ss << obj; + return ss.str(); +} - template - std::string join(const T &s, const U& j = U("")) { - std::vector strings; - for (auto e : s) { - strings.push_back(str(e)); - } - return boost::join(strings, j); +template +std::string join(const T& s, const U& j = U("")) { + std::vector strings; + for (auto e : s) { + strings.push_back(str(e)); } - -} + return boost::join(strings, j); } -#endif //TILEDARRAY_EINSUM_STRING_H +} // namespace +} // namespace Einsum::string + +#endif // TILEDARRAY_EINSUM_STRING_H From ef65d3610936c68f07a632b75e5e7f2d12236d51 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 12 Jan 2024 11:07:20 -0500 Subject: [PATCH 273/592] Complete implementation of tensor_contract free function. Also adds function to compare contraction results with btas in the test fixture for ToT. --- src/TiledArray/tensor/kernels.h | 38 +++++--- tests/tot_array_fixture.h | 167 ++++++++++++++++++++++++++------ 2 files changed, 165 insertions(+), 40 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index af951755a3..5dc32db65d 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -1182,8 +1182,8 @@ auto tensor_contract(TensorA const& A, Annot const& aA, TensorB const& B, struct { Indices A, B, C; - } const blas_layout{(indices.A - indices.B) + indices.i, - indices.i + (indices.B - indices.A), indices.e}; + } const blas_layout{(indices.A - indices.B) | indices.i, + indices.i | (indices.B - indices.A), indices.e}; struct { Permutation A, B, C; @@ -1196,30 +1196,42 @@ auto tensor_contract(TensorA const& A, Annot const& aA, TensorB const& B, } const do_perm{indices.A != blas_layout.A, indices.B != blas_layout.B, indices.C != blas_layout.C}; - auto permedA = [&]() -> TensorA { - return do_perm.A ? A.permute(perm.A) : std::cref(A); - }; - - auto permedB = [&]() -> TensorB { - return do_perm.B ? 
B.permute(perm.B) : std::cref(B); - }; - math::GemmHelper gemm_helper{blas::Op::NoTrans, blas::Op::NoTrans, static_cast(indices.e.size()), static_cast(indices.A.size()), static_cast(indices.B.size())}; - // initialize result with correct rank + // initialize result with the correct extents Result result; { - container::vector rng(indices.e.size(), 0); + using Index = typename Indices::value_type; + using Extent = std::remove_cv_t< + typename decltype(std::declval().extent())::value_type>; + using ExtentMap = ::Einsum::index::IndexMap; + + // Map tensor indices to their extents. + // Note that whether the contracting indices have matching extents is + // implicitly checked here by the pipe(|) operator on ExtentMap. + + ExtentMap extent = (ExtentMap{indices.A, A.range().extent()} | + ExtentMap{indices.B, B.range().extent()}); + + container::vector rng; + rng.reserve(indices.e.size()); + for (auto&& ix : indices.e) { + // assuming ix _exists_ in extent + rng.emplace_back(extent[ix]); + } result = Result{TA::Range(rng)}; } using Numeric = typename Result::numeric_type; // call gemm - gemm(Numeric{1}, permedA(), permedB(), Numeric{0}, result, gemm_helper); + gemm(Numeric{1}, // + do_perm.A ? A.permute(perm.A) : A, // + do_perm.B ? B.permute(perm.B) : B, // + Numeric{0}, result, gemm_helper); return do_perm.C ? 
result.permute(perm.C.inv()) : result; } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 7345401518..87749afec0 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -22,7 +22,9 @@ #include "tiledarray.h" #include "unit_test_config.h" #ifdef TILEDARRAY_HAS_BTAS +#include #include +#include #endif /* Notes: @@ -90,51 +92,162 @@ using output_archive_type = madness::archive::BinaryFstreamOutputArchive; enum class ShapeComp { True, False }; -template +template , bool> = true> auto random_tensor(TA::Range const& rng) { - TA::Tensor result{rng}; - std::generate(result.begin(), result.end(), + TensorT result{rng}; + using NumericT = typename TensorT::numeric_type; + std::generate(/*std::execution::par, */ + result.begin(), result.end(), TA::detail::MakeRandom::generate_value); return result; } +// // note: all the inner tensors (elements of the outer tensor) // have the same @c inner_rng -template -auto random_tensor_of_tensor(TA::Range const& outer_rng, - TA::Range const& inner_rng) { - TA::Tensor> result{outer_rng}; - - std::generate(result.begin(), result.end(), - [inner_rng]() { return random_tensor(inner_rng); }); +// +template < + typename TensorT, + std::enable_if_t, bool> = true> +auto random_tensor(TA::Range const& outer_rng, TA::Range const& inner_rng) { + using InnerTensorT = typename TensorT::value_type; + TensorT result{outer_rng}; + + std::generate(/*std::execution::par,*/ + result.begin(), result.end(), [inner_rng]() { + return random_tensor(inner_rng); + }); return result; } -template -auto make_random_array(TA::TiledRange const& trange) { - using ArrayT = TA::DistArray, Policy>; - - auto make_tile = [](TA::Tensor& tile, TA::Range const& rng) { - tile = random_tensor(rng); - if constexpr (std::is_same_v) return tile.norm(); +/// +/// \tparam Array The type of DistArray to be generated. Cannot be cv-qualified +/// or reference type. 
+/// \tparam Args TA::Range type for inner tensor if the tile type of the result +/// is a tensor-of-tensor. +/// \param trange The TiledRange of the result DistArray. +/// \param args Either exactly one TA::Range type when the tile type of Array is +/// tensor-of-tensor or nothing. +/// \return Returns a DistArray of type Array whose elements are randomly +/// generated. +/// @note: +/// - Although DistArrays with Sparse policy can be generated all of their +/// tiles are initialized with random values -- technically the returned value +/// is dense. +/// - In case of arrays with tensor-of-tensor tiles, all the inner tensors have +/// the same rank and the same extent of corresponding modes. +/// +template < + typename Array, typename... Args, + typename = + std::void_t, + std::enable_if_t, + bool> = true> +auto random_array(TA::TiledRange const& trange, Args const&... args) { + static_assert( + (sizeof...(Args) == 0 && + TA::detail::is_tensor_v) || + (sizeof...(Args) == 1) && + (TA::detail::is_tensor_of_tensor_v)); + + if constexpr (sizeof...(Args) == 1) + static_assert(std::is_convertible_v); + + using TensorT = typename Array::value_type; + using PolicyT = typename Array::policy_type; + + auto make_tile_meta = [](auto&&... args) { + return [=](TensorT& tile, TA::Range const& rng) { + tile = random_tensor(rng, args...); + if constexpr (std::is_same_v) + return tile.norm(); + }; }; - return TA::make_array(TA::get_default_world(), trange, make_tile); + return TA::make_array(TA::get_default_world(), trange, + make_tile_meta(args...)); +} + +/// +/// Succinctly call TA::detail::tensor_contract +/// +/// \tparam T TA::Tensor type. +/// \param einsum_annot Example annot: 'ik,kj->ij', when @c A is annotated by +/// 'i' and 'k' for its two modes, and @c B is annotated by 'k' and 'j' for the +/// same. The result tensor is rank-2 as well and its modes are annotated by 'i' +/// and 'j'. +/// \return Tensor contraction result. 
+/// +template , bool> = true> +auto tensor_contract(std::string const& einsum_annot, T const& A, T const& B) { + using ::Einsum::string::split2; + auto [ab, aC] = split2(einsum_annot, "->"); + auto [aA, aB] = split2(ab, ","); + + return TA::detail::tensor_contract(A, aA, B, aB, aC); } -template -auto make_random_array(TA::TiledRange const& trange, TA::Range const& inner) { - using ArrayT = TA::DistArray>, Policy>; +#ifdef TILEDARRAY_HAS_BTAS - auto make_tile = [inner](TA::Tensor>& tile, - TA::Range const& rng) { - tile = random_tensor_of_tensor(rng, inner); - if constexpr (std::is_same_v) return tile.norm(); - }; - return TA::make_array(TA::get_default_world(), trange, make_tile); +template >> +auto tensor_to_btas_tensor(T const& ta_tensor) { + using value_type = typename T::value_type; + using range_type = typename T::range_type; + + btas::Tensor result{ta_tensor.range()}; + TA::tensor_to_btas_subtensor(ta_tensor, result); + return result; +} + +template >> +auto btas_tensor_to_tensor( + btas::Tensor const& btas_tensor) { + TA::Tensor result{TA::Range(btas_tensor.range())}; + TA::btas_subtensor_to_tensor(btas_tensor, result); + return result; } +/// +/// @c einsum_annot pattern example: 'ik,kj->ij'. See tensor_contract function. +/// +template , bool> = true> +auto tensor_contract_btas(std::string const& einsum_annot, T const& A, + T const& B) { + using ::Einsum::string::split2; + auto [ab, aC] = split2(einsum_annot, "->"); + auto [aA, aB] = split2(ab, ","); + + using NumericT = typename T::numeric_type; + + struct { + btas::Tensor A, B, C; + } btas_tensor{tensor_to_btas_tensor(A), tensor_to_btas_tensor(B), {}}; + + btas::contract(NumericT{1}, btas_tensor.A, aA, btas_tensor.B, aB, NumericT{0}, + btas_tensor.C, aC); + + return btas_tensor_to_tensor(btas_tensor.C); +} + +/// +/// \tparam T TA::Tensor type +/// \param einsum_annot see tensor_contract_mult +/// \return True when TA::detail::tensor_contract and btas::contract result the +/// result. 
Performs bitwise comparison. +/// +template >> +auto tensor_contract_equal(std::string const& einsum_annot, T const& A, + T const& B) { + T result_ta = tensor_contract(einsum_annot, A, B); + T result_btas = tensor_contract_btas(einsum_annot, A, B); + return result_ta == result_btas; +} + +#endif + /* * * When generating arrays containing tensors of tensors (ToT) we adopt simple From ee5a6af7e9bd5eadfd83dc811955c749ce51a4c1 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 12 Jan 2024 14:14:50 -0500 Subject: [PATCH 274/592] More useful is_array_v type trait. --- src/TiledArray/type_traits.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index 428ad63716..1bddff446d 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -1258,8 +1258,9 @@ struct is_array : public std::false_type {}; template struct is_array> : public std::true_type {}; -template -static constexpr bool is_array_v = is_array::value; +template +constexpr bool is_array_v = + (is_array>::value && ...); template using trange_t = typename T::trange_type; From b30ba474e2c4eed0860782e019571d4361387475 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 13 Jan 2024 11:39:00 -0500 Subject: [PATCH 275/592] Less verbose element access. 
--- tests/einsum.cpp | 46 +++++++++++++++------------------------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6155b2cb98..79d4e70e06 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -630,8 +630,6 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { auto i = res_ix[0]; auto j = res_ix[1]; auto k = res_ix[2]; - using Ix2 = std::array; - using Ix3 = std::array; auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); @@ -850,20 +848,15 @@ BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { auto k = res_ix[2]; auto j = res_ix[3]; - using Ix2 = std::array; - using Ix4 = std::array; - - auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); - auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{k, l}); + auto rhs_tile_ix = rhs.trange().element_to_tile({k, l}); auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); - auto& res_el = - result_tile.at_ordinal(result_tile.range().ordinal(Ix4{i, l, k, j})); - auto const& lhs_el = - lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); - auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{k, l})); + auto& res_el = result_tile({i, l, k, j}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({k, l}); res_el = tot_type::element_type( lhs_el.scale(rhs_el), // scale @@ -949,20 +942,15 @@ BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { auto j = res_ix[1]; auto k = res_ix[2]; - using Ix2 = std::array; - using Ix3 = std::array; - - auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{i, j}); + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); - auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2{j, k}); + auto rhs_tile_ix = 
rhs.trange().element_to_tile({j, k}); auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); - auto& res_el = - result_tile.at_ordinal(result_tile.range().ordinal(Ix3{i, j, k})); - auto const& lhs_el = - lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{i, j})); - auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{j, k})); + auto& res_el = result_tile({i, j, k}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({j, k}); res_el = lhs_el.scale(rhs_el); } @@ -1057,19 +1045,15 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) { auto i = res_ix[0]; auto j = res_ix[1]; - using Ix2 = std::array; - - auto lhs_tile_ix = lhs.trange().element_to_tile(Ix2{j, i}); + auto lhs_tile_ix = lhs.trange().element_to_tile({j, i}); auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork */ false); - auto rhs_tile_ix = rhs.trange().element_to_tile(Ix2({i, j})); + auto rhs_tile_ix = rhs.trange().element_to_tile({i, j}); auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork */ false); - auto& res_el = - result_tile.at_ordinal(result_tile.range().ordinal(Ix2{i, j})); - auto const& lhs_el = - lhs_tile.at_ordinal(lhs_tile.range().ordinal(Ix2{j, i})); - auto rhs_el = rhs_tile.at_ordinal(rhs_tile.range().ordinal(Ix2{i, j})); + auto& res_el = result_tile({i, j}); + auto const& lhs_el = lhs_tile({j, i}); + auto rhs_el = rhs_tile({i, j}); res_el = tot_type::element_type(lhs_el.scale(rhs_el), // scale TiledArray::Permutation{0, 1} // permute ); From 81621a6bf2b581f15909926c7ce36e70d7f72d1e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:50:17 -0500 Subject: [PATCH 276/592] Add 'iterator' typedef to einsum::Index class. 
--- src/TiledArray/einsum/index.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/einsum/index.h b/src/TiledArray/einsum/index.h index 56283f239a..080158c57b 100644 --- a/src/TiledArray/einsum/index.h +++ b/src/TiledArray/einsum/index.h @@ -29,6 +29,7 @@ class Index { public: using container_type = small_vector; using value_type = typename container_type::value_type; + using iterator = typename container_type::iterator; Index() = default; Index(const container_type &s) : data_(s) {} From 69a2941e67e59eea7b88500a4c24cc7c01e8f58d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:51:38 -0500 Subject: [PATCH 277/592] .add_to method on TA::Tensor supported when 'this' object is empty. --- src/TiledArray/tensor/tensor.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 5ecee7fc02..6a60ace6d1 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1687,6 +1687,7 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { + if (empty()) *this = Tensor{right.range()}; return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t r) { l += r; }); } From 967eef2394471f4309674bf04842ca1409b4fbd5 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:52:56 -0500 Subject: [PATCH 278/592] Adds 'tensor_hadamard' free function that supports permutation. --- src/TiledArray/tensor/kernels.h | 42 +++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 5dc32db65d..4d123fecd0 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -1236,6 +1236,48 @@ auto tensor_contract(TensorA const& A, Annot const& aA, TensorB const& B, return do_perm.C ? 
result.permute(perm.C.inv()) : result; } +template && + is_annotation_v>> +auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, + Annot const& aB, Annot const& aC) { + using ::Einsum::index::Permutation; + using ::Einsum::index::permutation; + using Indices = ::Einsum::index::Index; + + struct { + Permutation // + AB, // permutes A to B + AC, // permutes A to C + BC; // permutes B to C + } const perm{permutation(Indices{aA}, Indices{aB}), + permutation(Indices{aA}, Indices{aC}), + permutation(Indices{aB}, Indices{aC})}; + + struct { + bool no_perm, perm_to_c, perm_a, perm_b; + } do_this{perm.AB.is_identity() && perm.AC.is_identity() && + perm.BC.is_identity(), // + perm.AB.is_identity(), // + perm.AC.is_identity()}; + + if (do_this.no_perm) { + return A.mult(B); + } else if (do_this.perm_to_c) { + return A.mult(B, perm.AC); + } else if (do_this.perm_a) { + auto pA = A.permute(perm.AC); + return pA.mult(B); + } else if (do_this.perm_b) { + auto pB = B.permute(perm.BC); + return A.mult(pB); + } else { + auto pA = A.permute(perm.AC); + auto pB = B.permute(perm.BC); + return pA.mult(pB); + } +} + } // namespace detail } // namespace TiledArray From 8e9bf3bbc5d432beb447275edf36ee0238e7960e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:53:53 -0500 Subject: [PATCH 279/592] Adds .is_identity method to fast check whether the permutation is an identity operation. --- src/TiledArray/permutation.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index cd527dfeef..a1dcad2ce8 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -329,6 +329,13 @@ class Permutation { return result; } + /// + /// Checks if this permutation is the identity permutation. 
+ /// + [[nodiscard]] bool is_identity() const { + std::is_sorted(p_.begin(), p_.end()); + } + /// Identity permutation factory function /// \return An identity permutation @@ -421,7 +428,7 @@ class Permutation { /// \param[in,out] ar The serialization archive template void serialize(Archive& ar) { - ar& p_; + ar & p_; } }; // class Permutation @@ -795,7 +802,7 @@ class BipartitePermutation { /// \param[in,out] ar The serialization archive template void serialize(Archive& ar) { - ar& base_& second_size_; + ar & base_ & second_size_; if constexpr (madness::is_input_archive_v) { first_ = {}; second_ = {}; From fcdff9e814ec7ec826eb3d896b47918f49c070d3 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:56:10 -0500 Subject: [PATCH 280/592] 'einsum' function supports Hadamard+contraction on the outer indices with Hadamard or contraction on the inner indices. --- src/TiledArray/einsum/tiledarray.h | 44 ++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index bee8e4ec10..b294a05b7d 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -103,7 +103,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, struct { std::string a, b, c; // Hadamard, external, internal indices for inner tensor - Einsum::Index h, e, i; + Einsum::Index A, B, C, h, e, i; } inner; if constexpr (std::tuple_size::value == 2) { if constexpr (IsArrayToT) @@ -116,14 +116,14 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, inner.c = ";" + (std::string)std::get<1>(cs); Einsum::Index a_idx, b_idx, c_idx; - if constexpr (IsArrayToT) a_idx = std::get<1>(Einsum::idx(A)); - if constexpr (IsArrayToT) b_idx = std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) inner.A = std::get<1>(Einsum::idx(A)); + if constexpr (IsArrayToT) inner.B = std::get<1>(Einsum::idx(B)); if constexpr (IsArrayToT || IsArrayToT) - c_idx = 
std::get<1>(cs); + inner.C = std::get<1>(cs); - inner.h = a_idx & b_idx & c_idx; - inner.e = (a_idx ^ b_idx); - inner.i = (a_idx & b_idx) - inner.h; + inner.h = inner.A & inner.B & inner.C; + inner.e = (inner.A ^ inner.B); + inner.i = (inner.A & inner.B) - inner.h; } // these are "Hadamard" (fused) indices @@ -227,8 +227,34 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, ai = ai.reshape(shape, batch); bi = bi.reshape(shape, batch); for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + using Ix = ::Einsum::Index; + if constexpr (AreArrayToT) { + TA_ASSERT(inner.h ^ inner.i && + "Hadamard with contraction not supported between the " + "inner tensors"); + + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + using TensorT = std::remove_reference_t; + + auto mult_op = [&inner](auto const &l, auto const &r) -> TensorT { + return inner.h ? TA::detail::tensor_hadamard(l, inner.A, r, + inner.B, inner.C) + : TA::detail::tensor_contract(l, inner.A, r, + inner.B, inner.C); + }; + + for (auto i = 0; i < vol; ++i) + el.add_to(mult_op(aik.data()[i], bik.data()[i])); + + } else { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } } } auto pc = C.permutation; From 1beda942076d58629b1eea6f0d9f6bcdbc42e960 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 14 Jan 2024 15:59:37 -0500 Subject: [PATCH 281/592] Add tests for Hadamard+contraction on outer indices, and contraction on inner indices. 
--- tests/einsum.cpp | 141 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 79d4e70e06..4ab944e676 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -791,6 +791,147 @@ BOOST_AUTO_TEST_CASE(xxx) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mo_times_ji_on) { + using Array = TA::DistArray>, TA::DensePolicy>; + auto& world = TA::get_default_world(); + + TA::Range const inner_rng{2, 7}; + TA::Range const inner_rng_perm{7, 2}; + TA::TiledRange lhs_trng{{0, 2, 4}, {0, 2}}; + TA::TiledRange rhs_trng{{0, 2}, {0, 2, 4}}; + auto lhs = random_array(lhs_trng, inner_rng); + auto rhs = random_array(rhs_trng, inner_rng_perm); + + // + // manual evaluation: 'ij;mn = ij;mo * ji;on' + // + Array ref{world, lhs_trng}; + { + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + + auto make_tile = [lhs, rhs](TA::Range const& rng) { + typename Array::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile({j, i}); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = result_tile({i, j}); + auto const& lhs_el = lhs_tile({i, j}); + auto const& rhs_el = rhs_tile({j, i}); + using namespace std::string_literals; + res_el = + TA::detail::tensor_contract(lhs_el, "mo"s, rhs_el, "on"s, "mn"s); + } + return result_tile; + }; + using std::begin; + using std::end; + + for (auto it = begin(ref); it != end(ref); ++it) + if (ref.is_local(it.index())) { + auto tile = world.taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + auto out = einsum(lhs("i,j;m,o"), rhs("j,i;o,n"), "i,j;m,n"); + std::cerr << "TODO: ij;mo * ji;on -> ij;mn using expression layer does not " + "produce the same 
result compared to manual evaluation." + << '\n'; + // bool are_equal = ToTArrayFixture::are_equal(ref, out); + // std::cout << out << '\n' << ref << '\n'; + // BOOST_CHECK(are_equal); +} + +BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { + using Array = TA::DistArray>, TA::DensePolicy>; + using Ix = typename TA::Range::index1_type; + using namespace std::string_literals; + auto& world = TA::get_default_world(); + + Ix const K = 2; // the extent of contracted outer mode + + TA::Range const inner_rng{3, 7}; + TA::TiledRange const lhs_trng{ + std::initializer_list>{ + {0, 2, 4}, {0, 2}, {0, 2}}}; + TA::TiledRange const rhs_trng(lhs_trng); + TA::TiledRange const ref_trng{lhs_trng.dim(0), lhs_trng.dim(1)}; + TA::Range const ref_inner_rng{3, 3}; // contract(3x7,3x7) -> (3,3) + auto lhs = random_array(lhs_trng, inner_rng); + auto rhs = random_array(rhs_trng, inner_rng); + + // + // manual evaluation: ij;mn = ijk;mo * ijk;no + // + Array ref{world, ref_trng}; + { + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + + auto make_tile = [lhs, rhs, ref_inner_rng](TA::Range const& rng) { + using InnerT = typename Array::value_type::value_type; + typename Array::value_type result_tile{rng}; + + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + InnerT mn{ref_inner_rng}; + for (Ix k = 0; k < K; ++k) { + auto lhs_tile = + lhs.find_local(lhs.trange().element_to_tile({i, j, k})) + .get(/*dowork = */ false); + auto rhs_tile = + rhs.find_local(rhs.trange().element_to_tile({i, j, k})) + .get(/*doworkd = */ false); + mn.add_to(tensor_contract("mo,no->mn", lhs_tile({i, j, k}), + rhs_tile({i, j, k}))); + } + result_tile({i, j}) = std::move(mn); + } + return result_tile; + }; + using std::begin; + using std::end; + + for (auto it = begin(ref); it != end(ref); ++it) + if (ref.is_local(it.index())) { + auto tile = world.taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + auto out = einsum(lhs("i,j,k;m,o"), 
rhs("i,j,k;n,o"), "i,j;m,n"); + bool are_equal = ToTArrayFixture::are_equal(ref, out); + BOOST_CHECK(are_equal); +} + +#ifdef TILEDARRAY_HAS_BTAS +BOOST_AUTO_TEST_CASE(tensor_contract) { + using TensorT = TA::Tensor; + + TA::Range const rng_A{2, 3, 4}; + TA::Range const rng_B{4, 3, 2}; + auto const A = random_tensor(rng_A); + auto const B = random_tensor(rng_B); + + BOOST_CHECK(tensor_contract_equal("ijk,klm->ijlm", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,klm->milj", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,kjm->im", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,kli->lj", A, B)); +} +#endif + BOOST_AUTO_TEST_SUITE_END() // einsum_tot BOOST_AUTO_TEST_SUITE(einsum_tot_t) From dc450ed97819f18d3f1ed615638b1c1aa245cac8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 15 Jan 2024 06:13:57 -0500 Subject: [PATCH 282/592] Print out manual and einsum result in the failing case. --- tests/einsum.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 4ab944e676..69cf317279 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -913,6 +913,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto out = einsum(lhs("i,j,k;m,o"), rhs("i,j,k;n,o"), "i,j;m,n"); bool are_equal = ToTArrayFixture::are_equal(ref, out); + std::cout << "ref:\n" << ref << '\n' << "out:\n" << out << '\n'; BOOST_CHECK(are_equal); } From 5dc0c108ee3f1e9132fcaed456d455d02e7fc8f4 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 15 Jan 2024 11:57:02 -0500 Subject: [PATCH 283/592] char type is used to construct random tensors. 
--- tests/tot_array_fixture.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 87749afec0..f356888b26 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -99,7 +99,7 @@ auto random_tensor(TA::Range const& rng) { using NumericT = typename TensorT::numeric_type; std::generate(/*std::execution::par, */ result.begin(), result.end(), - TA::detail::MakeRandom::generate_value); + TA::detail::MakeRandom::generate_value); return result; } From 8aad3b58763e51d4c30115ae9e3d6bf68a099a0c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 15 Jan 2024 12:47:13 -0500 Subject: [PATCH 284/592] Zero-intialization made explicit in Tensor::add_to. --- src/TiledArray/tensor/tensor.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 6a60ace6d1..aafc9e3665 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1687,7 +1687,11 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { - if (empty()) *this = Tensor{right.range()}; + if (empty()) { + *this = Tensor{right.range()}; + if constexpr (detail::is_numeric_v) + std::fill(begin(), end(), value_type{0}); + } return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t r) { l += r; }); } From 0a7a77ea68593a47fcd12b6370fa3f6320c4136a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 10:00:19 -0500 Subject: [PATCH 285/592] Typo. --- src/TiledArray/permutation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index a1dcad2ce8..96b0081643 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -333,7 +333,7 @@ class Permutation { /// Checks if this permutation is the identity permutation. 
/// [[nodiscard]] bool is_identity() const { - std::is_sorted(p_.begin(), p_.end()); + return std::is_sorted(p_.begin(), p_.end()); } /// Identity permutation factory function From 23c29262bcb5d04cf5c370bbde3d94a350bfd281 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 11:32:34 -0500 Subject: [PATCH 286/592] `random_tensor` function returns the same TA::Tensor for the same TA::Range. [debug] --- tests/einsum.cpp | 5 ++++- tests/tot_array_fixture.h | 29 ++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 69cf317279..f3eb5cd3f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -913,7 +913,10 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto out = einsum(lhs("i,j,k;m,o"), rhs("i,j,k;n,o"), "i,j;m,n"); bool are_equal = ToTArrayFixture::are_equal(ref, out); - std::cout << "ref:\n" << ref << '\n' << "out:\n" << out << '\n'; + + std::cout << "LHS:\n" << lhs << "\nRHS:\n" << rhs << "\n"; + + // std::cout << "ref:\n" << ref << '\n' << "out:\n" << out << '\n'; BOOST_CHECK(are_equal); } diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index f356888b26..2627e76b47 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -95,11 +95,17 @@ enum class ShapeComp { True, False }; template , bool> = true> auto random_tensor(TA::Range const& rng) { + using Ix1 = typename TA::Range::index1_type; + using Num = typename TensorT::numeric_type; TensorT result{rng}; using NumericT = typename TensorT::numeric_type; - std::generate(/*std::execution::par, */ - result.begin(), result.end(), - TA::detail::MakeRandom::generate_value); + for (auto const& ix : rng) { + result(ix) = + static_cast(std::accumulate(ix.begin(), ix.end(), Ix1{0})); + } + // std::generate(/*std::execution::par, */ + // result.begin(), result.end(), + // TA::detail::MakeRandom::generate_value); return result; } @@ -112,12 +118,21 @@ template < std::enable_if_t, bool> = 
true> auto random_tensor(TA::Range const& outer_rng, TA::Range const& inner_rng) { using InnerTensorT = typename TensorT::value_type; + using Num = typename TensorT::numeric_type; + using Ix1 = typename TA::Range::index1_type; TensorT result{outer_rng}; - std::generate(/*std::execution::par,*/ - result.begin(), result.end(), [inner_rng]() { - return random_tensor(inner_rng); - }); + for (auto const& ix : outer_rng) { + auto inner = random_tensor(inner_rng); + auto plus = std::accumulate(ix.begin(), ix.end(), Ix1{0}); + inner.add_to(static_cast(plus)); + result(ix) = inner; + } + + // std::generate(/*std::execution::par,*/ + // result.begin(), result.end(), [inner_rng]() { + // return random_tensor(inner_rng); + // }); return result; } From 3c11692e2ae50bdf476d9b584d970d3ec604b639 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 12:33:01 -0500 Subject: [PATCH 287/592] fence and print [debug]. --- tests/einsum.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index f3eb5cd3f9..1dcb942230 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -914,9 +914,14 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto out = einsum(lhs("i,j,k;m,o"), rhs("i,j,k;n,o"), "i,j;m,n"); bool are_equal = ToTArrayFixture::are_equal(ref, out); - std::cout << "LHS:\n" << lhs << "\nRHS:\n" << rhs << "\n"; + world.gop.fence(); + std::cout << "ij_mn_eq_ijk_mo_times_ijk_no\n"; + std::cout << "LHS:\n" + << lhs << "\nRHS:\n" + << rhs << "\nOut:\n" + << out << "\nRef:\n" + << ref << std::endl; - // std::cout << "ref:\n" << ref << '\n' << "out:\n" << out << '\n'; BOOST_CHECK(are_equal); } From f0810ed2923be7a74bbef32f3ce28d5a1cc68fb7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 18:50:52 -0500 Subject: [PATCH 288/592] Zero initialize manually computed temp result [debug]. 
--- tests/einsum.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 1dcb942230..4b74a01663 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -887,13 +887,14 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto j = res_ix[1]; InnerT mn{ref_inner_rng}; + std::fill(mn.begin(), mn.end(), typename InnerT::value_type{0}); for (Ix k = 0; k < K; ++k) { auto lhs_tile = lhs.find_local(lhs.trange().element_to_tile({i, j, k})) .get(/*dowork = */ false); auto rhs_tile = rhs.find_local(rhs.trange().element_to_tile({i, j, k})) - .get(/*doworkd = */ false); + .get(/*dowork = */ false); mn.add_to(tensor_contract("mo,no->mn", lhs_tile({i, j, k}), rhs_tile({i, j, k}))); } @@ -914,14 +915,6 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto out = einsum(lhs("i,j,k;m,o"), rhs("i,j,k;n,o"), "i,j;m,n"); bool are_equal = ToTArrayFixture::are_equal(ref, out); - world.gop.fence(); - std::cout << "ij_mn_eq_ijk_mo_times_ijk_no\n"; - std::cout << "LHS:\n" - << lhs << "\nRHS:\n" - << rhs << "\nOut:\n" - << out << "\nRef:\n" - << ref << std::endl; - BOOST_CHECK(are_equal); } From 068557e3329958b0a90e40297863039211a76f1f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 19:36:31 -0500 Subject: [PATCH 289/592] Tensor::add_to is the wrong place to zero-initialize because we cannot be certain whether the Tensor object was default constructed or not. 
--- src/TiledArray/tensor/tensor.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index aafc9e3665..5ecee7fc02 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1687,11 +1687,6 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { - if (empty()) { - *this = Tensor{right.range()}; - if constexpr (detail::is_numeric_v) - std::fill(begin(), end(), value_type{0}); - } return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t r) { l += r; }); } From 3ccdc324ec70381f18a52e05c35c5702082cb3f4 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 19:49:59 -0500 Subject: [PATCH 290/592] Amend previous commit on the basis of second thought :) --- src/TiledArray/tensor/tensor.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 5ecee7fc02..a14d30157e 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1687,6 +1687,9 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { + if (empty()) { + *this = Tensor{right.range(), value_type{}}; + } return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, const value_t r) { l += r; }); } From e1840e03ba9f748d3ae0eb825e65775e3c89d58e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 19:50:46 -0500 Subject: [PATCH 291/592] Restore `random_tensor` function. 
--- tests/tot_array_fixture.h | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 2627e76b47..e28eb1d139 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -95,17 +95,12 @@ enum class ShapeComp { True, False }; template , bool> = true> auto random_tensor(TA::Range const& rng) { - using Ix1 = typename TA::Range::index1_type; - using Num = typename TensorT::numeric_type; - TensorT result{rng}; using NumericT = typename TensorT::numeric_type; - for (auto const& ix : rng) { - result(ix) = - static_cast(std::accumulate(ix.begin(), ix.end(), Ix1{0})); - } - // std::generate(/*std::execution::par, */ - // result.begin(), result.end(), - // TA::detail::MakeRandom::generate_value); + TensorT result{rng}; + + std::generate(/*std::execution::par, */ + result.begin(), result.end(), + TA::detail::MakeRandom::generate_value); return result; } @@ -118,21 +113,12 @@ template < std::enable_if_t, bool> = true> auto random_tensor(TA::Range const& outer_rng, TA::Range const& inner_rng) { using InnerTensorT = typename TensorT::value_type; - using Num = typename TensorT::numeric_type; - using Ix1 = typename TA::Range::index1_type; TensorT result{outer_rng}; - for (auto const& ix : outer_rng) { - auto inner = random_tensor(inner_rng); - auto plus = std::accumulate(ix.begin(), ix.end(), Ix1{0}); - inner.add_to(static_cast(plus)); - result(ix) = inner; - } - - // std::generate(/*std::execution::par,*/ - // result.begin(), result.end(), [inner_rng]() { - // return random_tensor(inner_rng); - // }); + std::generate(/*std::execution::par,*/ + result.begin(), result.end(), [inner_rng]() { + return random_tensor(inner_rng); + }); return result; } From f786413440d9c8c617693649238937eb54303d19 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Jan 2024 19:55:51 -0500 Subject: [PATCH 292/592] Cleanup. 
--- tests/einsum.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 4b74a01663..1760cf82fc 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -886,8 +886,7 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { auto i = res_ix[0]; auto j = res_ix[1]; - InnerT mn{ref_inner_rng}; - std::fill(mn.begin(), mn.end(), typename InnerT::value_type{0}); + InnerT mn; for (Ix k = 0; k < K; ++k) { auto lhs_tile = lhs.find_local(lhs.trange().element_to_tile({i, j, k})) From 89bdab0ee3d3d70197efc4836d6a4fc86464c476 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 18 Jan 2024 16:00:15 -0500 Subject: [PATCH 293/592] Test outer pure-Hadamard with inner tensors contraction. --- tests/einsum.cpp | 63 ++++++++---------------------------------------- 1 file changed, 10 insertions(+), 53 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 1760cf82fc..1d2cbba0a1 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -792,63 +792,20 @@ BOOST_AUTO_TEST_CASE(xxx) { } BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mo_times_ji_on) { - using Array = TA::DistArray>, TA::DensePolicy>; auto& world = TA::get_default_world(); - TA::Range const inner_rng{2, 7}; - TA::Range const inner_rng_perm{7, 2}; - TA::TiledRange lhs_trng{{0, 2, 4}, {0, 2}}; - TA::TiledRange rhs_trng{{0, 2}, {0, 2, 4}}; - auto lhs = random_array(lhs_trng, inner_rng); - auto rhs = random_array(rhs_trng, inner_rng_perm); - - // - // manual evaluation: 'ij;mn = ij;mo * ji;on' - // - Array ref{world, lhs_trng}; - { - lhs.make_replicated(); - rhs.make_replicated(); - world.gop.fence(); - - auto make_tile = [lhs, rhs](TA::Range const& rng) { - typename Array::value_type result_tile{rng}; - for (auto&& res_ix : result_tile.range()) { - auto i = res_ix[0]; - auto j = res_ix[1]; - - auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); - auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); - - auto rhs_tile_ix = 
rhs.trange().element_to_tile({j, i}); - auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); - - auto& res_el = result_tile({i, j}); - auto const& lhs_el = lhs_tile({i, j}); - auto const& rhs_el = rhs_tile({j, i}); - using namespace std::string_literals; - res_el = - TA::detail::tensor_contract(lhs_el, "mo"s, rhs_el, "on"s, "mn"s); - } - return result_tile; - }; - using std::begin; - using std::end; + using Array = TA::DistArray>, TA::DensePolicy>; + using Perm = TA::Permutation; - for (auto it = begin(ref); it != end(ref); ++it) - if (ref.is_local(it.index())) { - auto tile = world.taskq.add(make_tile, it.make_range()); - *it = tile; - } - } + TA::TiledRange lhs_trng{{0, 2, 3}, {0, 2, 4}}; + TA::TiledRange rhs_trng{{0, 2, 4}, {0, 2, 3}}; + TA::Range lhs_inner_rng{1, 1}; + TA::Range rhs_inner_rng{1, 1}; - auto out = einsum(lhs("i,j;m,o"), rhs("j,i;o,n"), "i,j;m,n"); - std::cerr << "TODO: ij;mo * ji;on -> ij;mn using expression layer does not " - "produce the same result compared to manual evaluation." 
- << '\n'; - // bool are_equal = ToTArrayFixture::are_equal(ref, out); - // std::cout << out << '\n' << ref << '\n'; - // BOOST_CHECK(are_equal); + auto lhs = random_array(lhs_trng, lhs_inner_rng); + auto rhs = random_array(rhs_trng, rhs_inner_rng); + Array out; + BOOST_REQUIRE_NO_THROW(out("i,j;m,n") = lhs("i,j;m,o") * rhs("j,i;o,n")); } BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { From 5137481e06671916700e0686a49c6aff4731cb0c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 19 Jan 2024 18:08:53 -0500 Subject: [PATCH 294/592] fixed permutation logic in BinaryEngine::init_indices_ + implemented (suboptimal) forms of Mult that were not implemented for the ToT case --- src/TiledArray/expressions/binary_engine.h | 73 ++++++++++------------ src/TiledArray/expressions/mult_engine.h | 1 - src/TiledArray/tile_op/mult.h | 25 ++++++-- tests/einsum.cpp | 4 +- 4 files changed, 54 insertions(+), 49 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 411a1c7c13..ed247070d9 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -150,43 +150,34 @@ class BinaryEngine : public ExprEngine { !left_tile_is_tot && !right_tile_is_tot; constexpr bool args_are_mixed_tensors = left_tile_is_tot ^ right_tile_is_tot; - if (args_are_plain_tensors && - (left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity)) { - left_.permute_tiles(false); - } - if (!args_are_plain_tensors && - ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (left_inner_permtype_ == PermutationType::matrix_transpose || - left_inner_permtype_ == PermutationType::identity))) { - left_.permute_tiles(false); - } - if (args_are_plain_tensors && - (right_outer_permtype_ == PermutationType::matrix_transpose || - right_outer_permtype_ == PermutationType::identity)) { - 
right_.permute_tiles(false); - } - if (!args_are_plain_tensors && - ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (right_inner_permtype_ == PermutationType::matrix_transpose || - right_inner_permtype_ == PermutationType::identity))) { - right_.permute_tiles(false); - } - if (args_are_mixed_tensors && - ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (left_inner_permtype_ == PermutationType::matrix_transpose || - left_inner_permtype_ == PermutationType::identity))) { - left_.permute_tiles(false); + // permute_tiles() denotes what happens to outer OR inner modes + // if we have contraction happening to BOTH inner and outer modes, no need + // to involve permutation, can fuse it into GEMMs + if (left_tile_is_tot) { + if ((left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) && + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity)) { + left_.permute_tiles(false); + } + } else { // !left_tile_is_tot + if (left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) { + left_.permute_tiles(false); + } } - if (args_are_mixed_tensors && - ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (right_inner_permtype_ == PermutationType::matrix_transpose || - right_inner_permtype_ == PermutationType::identity))) { - right_.permute_tiles(false); + if (right_tile_is_tot) { + if ((right_outer_permtype_ == PermutationType::matrix_transpose || + right_outer_permtype_ == PermutationType::identity) && + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity)) { + right_.permute_tiles(false); + } + } else { // !right_tile_is_tot + if 
(right_outer_permtype_ == PermutationType::matrix_transpose || + right_outer_permtype_ == PermutationType::identity) { + right_.permute_tiles(false); + } } } @@ -204,10 +195,12 @@ class BinaryEngine : public ExprEngine { /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size() || - (left_.indices().second().size() ^ target_indices.second().size())); - TA_ASSERT(right_.indices().size() == target_indices.size() || - (right_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT( + left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT( + right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 20093b2cec..97751c56a2 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -299,7 +299,6 @@ class MultEngine : public ContEngine> { // the tile op; the type of the tile op does not need to match the type of // the operation on the outer indices if (this->product_type() == TensorProduct::Hadamard) { - // assumes inner op is also Hadamard BinaryEngine_::perm_indices(target_indices); } else { auto children_initialized = true; diff --git a/src/TiledArray/tile_op/mult.h b/src/TiledArray/tile_op/mult.h index b9da1d5e24..577ea94115 100644 --- a/src/TiledArray/tile_op/mult.h +++ b/src/TiledArray/tile_op/mult.h @@ -128,17 +128,30 @@ class Mult { template ::type* = nullptr> result_type eval(left_type& first, const right_type& second) const { - TA_ASSERT(!element_op_); - using TiledArray::mult_to; - return mult_to(first, second); + if (!element_op_) { + using TiledArray::mult_to; + return 
mult_to(first, second); + } else { + // TODO figure out why this does not compiles!!! + // using TiledArray::inplace_binary; + // return inplace_binary(first, second, element_op_); + using TiledArray::binary; + return binary(first, second, element_op_); + } } template ::type* = nullptr> result_type eval(const left_type& first, right_type& second) const { - TA_ASSERT(!element_op_); - using TiledArray::mult_to; - return mult_to(second, first); + if (!element_op_) { + using TiledArray::mult_to; + return mult_to(second, first); + } else { // WARNING: element_op_ might be noncommuting, so can't swap first + // and second! for GEMM could optimize, but can't introspect + // element_op_ + using TiledArray::binary; + return binary(first, second, element_op_); + } } template ::type* = nullptr> diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 1d2cbba0a1..ced387802d 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -797,8 +797,8 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mo_times_ji_on) { using Array = TA::DistArray>, TA::DensePolicy>; using Perm = TA::Permutation; - TA::TiledRange lhs_trng{{0, 2, 3}, {0, 2, 4}}; - TA::TiledRange rhs_trng{{0, 2, 4}, {0, 2, 3}}; + TA::TiledRange lhs_trng{{0, 2, 3}, {0, 1}}; + TA::TiledRange rhs_trng{{0, 1}, {0, 2, 3}}; TA::Range lhs_inner_rng{1, 1}; TA::Range rhs_inner_rng{1, 1}; From 013e0553e70cea294a1c9e891633431f1b1fefaf Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 20 Jan 2024 03:42:21 -0500 Subject: [PATCH 295/592] Range(Permutation perm, Range r) returns a copy of r if perm is null --- src/TiledArray/range.h | 10 +++++----- tests/range.cpp | 7 +++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index b4f2a0d48f..25e4852118 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -613,10 +613,10 @@ class Range { /// Permuting copy constructor - /// \param perm The permutation applied to other - /// \param other The range to be permuted and copied 
+ /// \param perm The permutation applied to other; if `!perm` then no + /// permutation is applied \param other The range to be permuted and copied Range(const Permutation& perm, const Range_& other) { - TA_ASSERT(perm.size() == other.rank_); + TA_ASSERT(perm.size() == other.rank_ || !perm); if (other.rank_ > 0ul) { rank_ = other.rank_; @@ -1139,7 +1139,7 @@ class Range { template void serialize(Archive& ar) { - ar& rank_; + ar & rank_; const auto four_x_rank = rank_ << 2; // read via madness::archive::wrap to be able to // - avoid having to serialize datavec_'s size @@ -1151,7 +1151,7 @@ class Range { ar << madness::archive::wrap(datavec_.data(), four_x_rank); } else abort(); // unreachable - ar& offset_& volume_; + ar & offset_ & volume_; } void swap(Range_& other) { diff --git a/tests/range.cpp b/tests/range.cpp index a5ac8898f9..a20f185d44 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -517,6 +517,9 @@ BOOST_AUTO_TEST_CASE(permutation) { BOOST_CHECK_EQUAL_COLLECTIONS(r3.stride_data(), r3.stride_data() + r3.rank(), r2.stride_data(), r2.stride_data() + r2.rank()); BOOST_CHECK_EQUAL(r3, r2); + + // using null Permutation is allowed + BOOST_CHECK_EQUAL(Range(Permutation{}, r1), r1); } BOOST_AUTO_TEST_CASE(include) { @@ -700,13 +703,13 @@ BOOST_AUTO_TEST_CASE(serialization) { 2 * (sizeof(Range) + sizeof(std::size_t) * (4 * GlobalFixture::dim + 1)); unsigned char* buf = new unsigned char[buf_size]; madness::archive::BufferOutputArchive oar(buf, buf_size); - oar& r; + oar & r; std::size_t nbyte = oar.size(); oar.close(); Range rs; madness::archive::BufferInputArchive iar(buf, nbyte); - iar& rs; + iar & rs; iar.close(); delete[] buf; From 20dff5650883081d192580a3110afe353357da05 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 20 Jan 2024 03:45:47 -0500 Subject: [PATCH 296/592] permutation handling in ExprEngines is cleaned up to properly track whether inner/outer permutations are implicit .. 
this should complete support for contractions and hadamard products for ToT in the expression layer (still need einsum for general products) --- src/TiledArray/expressions/binary_engine.h | 54 ++++------ src/TiledArray/expressions/blk_tsr_engine.h | 9 +- src/TiledArray/expressions/cont_engine.h | 52 ++++----- src/TiledArray/expressions/expr_engine.h | 111 ++++++++++++++++---- src/TiledArray/expressions/leaf_engine.h | 3 +- src/TiledArray/expressions/mult_engine.h | 6 ++ src/TiledArray/expressions/permopt.h | 7 +- src/TiledArray/expressions/product.h | 4 +- src/TiledArray/expressions/unary_engine.h | 5 +- 9 files changed, 166 insertions(+), 85 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index ed247070d9..daf69d428e 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -75,9 +75,10 @@ class BinaryEngine : public ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -150,34 +151,25 @@ class BinaryEngine : public ExprEngine { !left_tile_is_tot && !right_tile_is_tot; constexpr bool args_are_mixed_tensors = left_tile_is_tot ^ right_tile_is_tot; - // permute_tiles() denotes what happens to outer OR inner modes - // if we have contraction happening to BOTH inner and outer modes, no need - // to involve permutation, can fuse it into GEMMs - if (left_tile_is_tot) { - if ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) && - (left_inner_permtype_ == PermutationType::matrix_transpose || - left_inner_permtype_ == PermutationType::identity)) { - left_.permute_tiles(false); - } - } else { // !left_tile_is_tot - if 
(left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) { - left_.permute_tiles(false); - } + // implicit_permute_{outer,inner}() denotes whether permutations will be + // fused into consuming operation + if (left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) { + left_.implicit_permute_outer(true); } - if (right_tile_is_tot) { - if ((right_outer_permtype_ == PermutationType::matrix_transpose || - right_outer_permtype_ == PermutationType::identity) && - (right_inner_permtype_ == PermutationType::matrix_transpose || - right_inner_permtype_ == PermutationType::identity)) { - right_.permute_tiles(false); - } - } else { // !right_tile_is_tot - if (right_outer_permtype_ == PermutationType::matrix_transpose || - right_outer_permtype_ == PermutationType::identity) { - right_.permute_tiles(false); - } + if (left_tile_is_tot && + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity)) { + left_.implicit_permute_inner(true); + } + if (right_outer_permtype_ == PermutationType::matrix_transpose || + right_outer_permtype_ == PermutationType::identity) { + right_.implicit_permute_outer(true); + } + if (right_tile_is_tot && + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity)) { + right_.implicit_permute_inner(true); } } @@ -194,7 +186,7 @@ class BinaryEngine : public ExprEngine { /// result of this expression will be permuted to match \c target_indices. 
/// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { - if (permute_tiles_) { + if (!this->implicit_permute()) { TA_ASSERT( left_.indices().size() == target_indices.size() || (left_.indices().second().size() ^ target_indices.second().size())); @@ -204,8 +196,8 @@ class BinaryEngine : public ExprEngine { init_indices_(target_indices); - TA_ASSERT(right_outer_permtype_ == PermutationType::general || - right_inner_permtype_ == PermutationType::general); + TA_ASSERT(left_outer_permtype_ == PermutationType::general && + right_outer_permtype_ == PermutationType::general); if (left_.indices() != left_indices_) left_.perm_indices(left_indices_); if (right_.indices() != right_indices_) diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index 2d16172dbe..5cb9009460 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -147,9 +147,10 @@ class BlkTsrEngineBase : public LeafEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -335,9 +336,10 @@ class BlkTsrEngine // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; using BlkTsrEngineBase_::upper_bound_; + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -478,9 +480,10 @@ class ScalBlkTsrEngine // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; using BlkTsrEngineBase_::upper_bound_; + using 
ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 2a658dc886..265a389a03 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -94,9 +94,10 @@ class ContEngine : public BinaryEngine { using BinaryEngine_::right_indices_; using BinaryEngine_::right_inner_permtype_; using BinaryEngine_::right_outer_permtype_; + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -202,7 +203,7 @@ class ContEngine : public BinaryEngine { void perm_indices(const BipartiteIndexList& target_indices) { // assert that init_indices has been called TA_ASSERT(left_.indices() && right_.indices()); - if (permute_tiles_) { + if (!this->implicit_permute()) { this->template init_indices_(target_indices); // propagate the indices down the tree, if needed @@ -262,31 +263,29 @@ class ContEngine : public BinaryEngine { // Initialize the tile operation in this function because it is used to // evaluate the tiled range and shape. - const math::blas::Op left_op = - (left_outer_permtype_ == PermutationType::matrix_transpose - ? math::blas::Transpose - : math::blas::NoTranspose); - const math::blas::Op right_op = - (right_outer_permtype_ == PermutationType::matrix_transpose - ? 
math::blas::Transpose - : math::blas::NoTranspose); + const auto left_op = to_cblas_op(left_outer_permtype_); + const auto right_op = to_cblas_op(right_outer_permtype_); + // initialize perm_ + this->init_perm(target_indices); + + // initialize op_, trange_, and shape_ which only refer to the outer modes if (outer(target_indices) != outer(indices_)) { + const auto outer_perm = outer(perm_); // Initialize permuted structure - perm_ = ExprEngine_::make_perm(target_indices); if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { op_ = op_type(left_op, right_op, factor_, outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - (permute_tiles_ ? perm_ : BipartitePermutation{})); + (!implicit_permute_outer_ ? outer_perm : Permutation{})); } else { // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - (permute_tiles_ ? perm_ : BipartitePermutation{}), + (!implicit_permute_outer_ ? outer_perm : Permutation{}), this->element_nonreturn_op_); } - trange_ = ContEngine_::make_trange(outer(perm_)); - shape_ = ContEngine_::make_shape(outer(perm_)); + trange_ = ContEngine_::make_trange(outer_perm); + shape_ = ContEngine_::make_shape(outer_perm); } else { // Initialize non-permuted structure if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { @@ -490,8 +489,8 @@ class ContEngine : public BinaryEngine { this->factor_, inner_size(this->indices_), inner_size(this->left_indices_), inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) + (!this->implicit_permute_inner_ ? inner(this->perm_) + : Permutation{})) : op_type(to_cblas_op(this->left_inner_permtype_), to_cblas_op(this->right_inner_permtype_), this->factor_, inner_size(this->indices_), @@ -521,7 +520,7 @@ class ContEngine : public BinaryEngine { // multiple times, e.g. 
when outer op is gemm auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ + ? op_type(base_op_type(), !this->implicit_permute_inner_ ? inner(this->perm_) : Permutation{}) : op_type(base_op_type()); @@ -552,12 +551,12 @@ class ContEngine : public BinaryEngine { using op_type = TiledArray::detail::BinaryWrapper< base_op_type>; // can't consume inputs if they are used // multiple times, e.g. when outer op is gemm - auto mult_op = - (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); + auto mult_op = (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + !this->implicit_permute_inner_ + ? inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); this->element_nonreturn_op_ = [mult_op, outer_prod](result_tile_element_type& result, const left_tile_element_type& left, @@ -598,8 +597,9 @@ class ContEngine : public BinaryEngine { std::conditional_t; - auto scal_op = [perm = this->permute_tiles_ ? inner(this->perm_) - : Permutation{}]( + auto scal_op = [perm = !this->implicit_permute_inner_ + ? 
inner(this->perm_) + : Permutation{}]( const left_tile_element_type& left, const right_tile_element_type& right) -> result_tile_element_type { diff --git a/src/TiledArray/expressions/expr_engine.h b/src/TiledArray/expressions/expr_engine.h index c364a5c1ba..7ff7e16b2d 100644 --- a/src/TiledArray/expressions/expr_engine.h +++ b/src/TiledArray/expressions/expr_engine.h @@ -54,6 +54,8 @@ class ExprEngine : private NO_DEFAULTS { typename EngineTrait::op_type op_type; ///< Tile operation type typedef typename EngineTrait::policy policy; ///< The result policy type + typedef typename EngineTrait::eval_type + eval_type; ///< Evaluation tile type typedef typename EngineTrait::dist_eval_type dist_eval_type; ///< This expression's distributed evaluator type @@ -74,9 +76,12 @@ class ExprEngine : private NO_DEFAULTS { BipartiteIndexList indices_; ///< The index list of this expression; bipartite due to need ///< to support nested tensors (e.g. tensors of tensors) - bool permute_tiles_; ///< Result tile permutation flag (\c true == permute - ///< tile) - /// The permutation that will be applied to the outer tensor of tensors + bool implicit_permute_outer_ = false; ///< If false, result tiles' outer + ///< modes will not need to be permuted + bool implicit_permute_inner_ = false; ///< If false, result tiles' inner + ///< modes will not need to be permuted + /// The permutation that will be applied to the result tensor (or tensor of + /// tensors) BipartitePermutation perm_; trange_type trange_; ///< The tiled range of the result tensor shape_type shape_; ///< The shape of the result tensor @@ -93,7 +98,6 @@ class ExprEngine : private NO_DEFAULTS { ExprEngine(const Expr& expr) : world_(NULL), indices_(), - permute_tiles_(true), perm_(), trange_(), shape_(), @@ -141,7 +145,7 @@ class ExprEngine : private NO_DEFAULTS { /// This function will initialize the permutation, tiled range, and shape /// for the result tensor. 
These members are initialized with the - /// make_perm(), \c make_trange(), and make_shape() functions. + /// \c init_perm(), \c make_trange(), and make_shape() functions. /// Derived classes may customize the structure initialization by /// providing their own implementation of this function or any of the /// above initialization. @@ -149,7 +153,7 @@ class ExprEngine : private NO_DEFAULTS { /// \param target_indices The target index list for the result tensor void init_struct(const BipartiteIndexList& target_indices) { if (target_indices != indices_) { - perm_ = derived().make_perm(target_indices); + if (!perm_) perm_ = make_perm(target_indices); trange_ = derived().make_trange(outer(perm_)); shape_ = derived().make_shape(outer(perm_)); } else { @@ -187,20 +191,41 @@ class ExprEngine : private NO_DEFAULTS { /// providing their own implementation it. BipartitePermutation make_perm( const BipartiteIndexList& target_indices) const { + TA_ASSERT(target_indices != indices_); return target_indices.permutation(indices_); } + void init_perm(const BipartiteIndexList& target_indices) { + if (!perm_ && target_indices != indices_) perm_ = make_perm(target_indices); + } + /// Tile operation factory function /// This function will generate the tile operations by calling /// \c make_tile_op(). The permuting or non-permuting version of the tile - /// operation will be selected based on permute_tiles(). Derived classes - /// may customize this function by providing their own implementation it. + /// operation will be selected based on implicit_permute_outer(). Derived + /// classes may customize this function by providing their own implementation + /// it. 
op_type make_op() const { - if (perm_ && permute_tiles_) - // permutation can only be applied to the tile, not to its element (if - // tile = tensor-of-tensors) - return derived().make_tile_op(perm_); + // figure out which permutations (of outer or inner modes) must be enacted + // explicitly + BipartitePermutation explicit_perm; + if (implicit_permute_outer_) { + if (!implicit_permute_inner_) { + explicit_perm = BipartitePermutation(Permutation{}, inner(perm_)); + } + } else { + if (implicit_permute_inner_) { + explicit_perm = BipartitePermutation(outer(perm_), Permutation{}); + } else { + explicit_perm = perm_; + } + } + const bool explicit_perm_is_nontrivial = + !(explicit_perm.first().is_identity() && + explicit_perm.second().is_identity()); + if (explicit_perm && explicit_perm_is_nontrivial) + return derived().make_tile_op(explicit_perm); else return derived().make_tile_op(); } @@ -243,11 +268,45 @@ class ExprEngine : private NO_DEFAULTS { /// \return A const reference to the process map const std::shared_ptr& pmap() const { return pmap_; } - /// Set the permute tiles flag + /// Set the flag that controls whether tiles' permutation will be implicit + + /// some consuming operations (like GEMM) permutation can perform some + /// permutation types implicitly. 
setting this to true indicates that the + /// result tiles' outer modes do not need to be permuted and permutation will + /// be performed implicitly by the consuming operation \param status The new + /// value for the implicit permute flag (true => will not permute outer modes + /// of result tiles; false => will permute outer modes of result tiles if + /// needed) \note for plain tensors, i.e., tensor-of-scalars, any mode is + /// outer + void implicit_permute_outer(const bool status) { + implicit_permute_outer_ = status; + } + + /// Set the flag that controls whether tiles' permutation will be implicit + + /// some consuming operations (like GEMM) permutation can perform some + /// permutation types implicitly. setting this to true indicates that the + /// result tiles' inner modes do not need to be permuted and permutation will + /// be performed implicitly by the consuming operation \param status The new + /// value for the implicit permute flag (true => will not permute inner modes + /// of result tiles; false => will permute inner modes of result tiles if + /// needed) \note for plain tensors, i.e., tensor-of-scalars, there are no + /// inner modes and this should not be used + void implicit_permute_inner(const bool status) { + TA_ASSERT(TiledArray::detail::is_tensor_of_tensor_v); + implicit_permute_inner_ = status; + } - /// \param status The new status for permute tiles (true == permute result - /// tiles) - void permute_tiles(const bool status) { permute_tiles_ = status; } + /// Reports whether permutation of the result tiles will be implicit, i.e. 
+ /// will be fused into the consuming operation + + /// \return true if will not permute of result tiles; false will indicate that + /// the result tiles will be permuted if needed + bool implicit_permute() const { + constexpr bool is_tot = + TiledArray::detail::is_tensor_of_tensor_v; + return (implicit_permute_outer_ || (is_tot && implicit_permute_inner_)); + } /// Expression print @@ -255,9 +314,23 @@ class ExprEngine : private NO_DEFAULTS { /// \param target_indices The target index list for this expression void print(ExprOStream& os, const BipartiteIndexList& target_indices) const { if (perm_) { - os << "[P " << target_indices << "]" - << (permute_tiles_ ? " " : " [no permute tiles] ") - << derived().make_tag() << indices_ << "\n"; + os << "[P " << target_indices << "]"; + if (implicit_permute_outer_ || implicit_permute_inner_) { + os << " [implicit "; + constexpr bool is_tot = + TiledArray::detail::is_tensor_of_tensor_v; + if constexpr (is_tot) { + if (implicit_permute_outer_ && implicit_permute_inner_) { + os << "outer&inner "; + } else if (implicit_permute_outer_) { + os << "outer "; + } else + os << "inner "; + } + os << "permute ] "; + } else + os << " "; + os << derived().make_tag() << indices_ << "\n"; } else { os << derived().make_tag() << indices_ << "\n"; } diff --git a/src/TiledArray/expressions/leaf_engine.h b/src/TiledArray/expressions/leaf_engine.h index 5e273fb5dc..8804989d6f 100644 --- a/src/TiledArray/expressions/leaf_engine.h +++ b/src/TiledArray/expressions/leaf_engine.h @@ -70,9 +70,10 @@ class LeafEngine : public ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h 
index 97751c56a2..88b9ffb7df 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -334,6 +334,9 @@ class MultEngine : public ContEngine> { /// for the result tensor. /// \param target_indices The target index list for the result tensor void init_struct(const BipartiteIndexList& target_indices) { + this->init_perm(target_indices); + + // for ContEngine_::init_struct need to initialize element op first this->init_inner_tile_op(inner(target_indices)); if (this->product_type() == TensorProduct::Contraction) ContEngine_::init_struct(target_indices); @@ -592,6 +595,9 @@ class ScalMultEngine /// for the result tensor. /// \param target_indices The target index list for the result tensor void init_struct(const BipartiteIndexList& target_indices) { + this->init_perm(target_indices); + + // for ContEngine_::init_struct need to initialize element op first this->init_inner_tile_op(inner(target_indices)); if (this->product_type() == TensorProduct::Contraction) ContEngine_::init_struct(target_indices); diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 998ea78efe..291604faa8 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -45,8 +45,11 @@ namespace expressions { enum class PermutationType { identity = 1, matrix_transpose = 2, general = 3 }; inline blas::Op to_cblas_op(PermutationType permtype) { - TA_ASSERT(permtype == PermutationType::matrix_transpose || - permtype == PermutationType::identity); + // N.B. 3 cases: + // - permtype == identity : no transpose needed + // - permtype == matrix_transpose : transpose needed + // - permtype == general : the argument will be explicitly permuted to be in a + // layout which does not require permutation hence no need for a switch ... return permtype == PermutationType::matrix_transpose ? 
math::blas::Transpose : math::blas::NoTranspose; diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index 7111b7831b..df2867a360 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -73,8 +73,10 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, const IndexList& right_indices, const IndexList& target_indices) { auto result = compute_product_type(left_indices, right_indices); - if (result == TensorProduct::Hadamard) + if (result == TensorProduct::Hadamard) { TA_ASSERT(left_indices.is_permutation(target_indices)); + TA_ASSERT(right_indices.is_permutation(target_indices)); + } return result; } diff --git a/src/TiledArray/expressions/unary_engine.h b/src/TiledArray/expressions/unary_engine.h index 621c4a71b3..631fca8fed 100644 --- a/src/TiledArray/expressions/unary_engine.h +++ b/src/TiledArray/expressions/unary_engine.h @@ -70,9 +70,10 @@ class UnaryEngine : ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -99,7 +100,7 @@ class UnaryEngine : ExprEngine { /// children such that the number of permutations is minimized. 
/// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { - TA_ASSERT(permute_tiles_); + TA_ASSERT(!this->implicit_permute()); indices_ = target_indices; if (arg_.indices() != target_indices) arg_.perm_indices(target_indices); From 838bfdd956450a04158104c29f917ef06b08309e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jan 2024 13:27:34 -0500 Subject: [PATCH 297/592] cleanup --- src/TiledArray/expressions/binary_engine.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index daf69d428e..33318b57a6 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -97,14 +97,14 @@ class BinaryEngine : public ExprEngine { PermutationType right_inner_permtype_ = PermutationType::general; ///< Right-hand permutation type - template + template void init_indices_(const BipartiteIndexList& target_indices = {}) { - static_assert(ProductType == TensorProduct::Contraction || - ProductType == TensorProduct::Hadamard); + static_assert(OuterProductType == TensorProduct::Contraction || + OuterProductType == TensorProduct::Hadamard); // prefer to permute the arg with fewest leaves to try to minimize the // number of possible permutations using permopt_type = - std::conditional_t; From f9c2106d8657a0dc832f1c7f959e2edf6b5da1a4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jan 2024 17:38:04 -0500 Subject: [PATCH 298/592] Tensor(OtherTensor, BipartitePermutation) can handle the case where outer(BipartitePermutation) is null --- src/TiledArray/tensor/tensor.h | 39 ++++++++++------------------------ 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index a14d30157e..5faec43957 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ 
-397,7 +397,7 @@ class Tensor { /// \param perm The permutation that will be applied to the copy /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather /// than copied to make the semantics of this to be consistent - /// between tensors of scalars and tensors of scalars; specifically, + /// between tensors of scalars and tensors of tensors; specifically, /// if `T1` is a tensor of scalars the constructed tensor is /// is independent of \p other, thus should apply clone to inner /// tensor nests to behave similarly for nested tensors @@ -407,8 +407,14 @@ class Tensor { detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - detail::tensor_init(value_converter, outer(perm), - *this, other); + const auto outer_perm = outer(perm); + if (outer_perm) { + detail::tensor_init(value_converter, outer_perm, + *this, other); + } else { + detail::tensor_init(value_converter, *this, + other); + } // If we actually have a ToT the inner permutation was not applied above so // we do that now @@ -419,7 +425,7 @@ class Tensor { // not match Tensor"); if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { - auto inner_perm = inner(perm); + const auto inner_perm = inner(perm); Permute p; for (auto& x : *this) x = p(x, inner_perm); } @@ -1294,30 +1300,7 @@ class Tensor { constexpr bool is_tot = detail::is_tensor_of_tensor_v; [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; - // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); - if constexpr (!is_tot) { - if constexpr (is_bperm) { - TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, outer(perm)); - } else - return Tensor(*this, perm); - } else { - // If we have a ToT we need to apply the permutation in two 
steps. The - // first step is identical to the non-ToT case (permute the outer modes) - // the second step does the inner modes - Tensor rv(*this, outer(perm)); - if constexpr (is_bperm) { - if (inner_size(perm) != 0) { - auto inner_perm = inner(perm); - Permute p; - for (auto& inner_t : rv) inner_t = p(inner_t, inner_perm); - } - } - return rv; - } - abort(); // unreachable + return Tensor(*this, perm); } /// Shift the lower and upper bound of this tensor From e9603b49ea5f4e4a9ce4860451ef3857790445bd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 16:12:37 -0500 Subject: [PATCH 299/592] Cleanup. --- src/TiledArray/tensor/kernels.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 4d123fecd0..379439856f 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -1256,10 +1256,11 @@ auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, struct { bool no_perm, perm_to_c, perm_a, perm_b; - } do_this{perm.AB.is_identity() && perm.AC.is_identity() && - perm.BC.is_identity(), // - perm.AB.is_identity(), // - perm.AC.is_identity()}; + } const do_this{ + perm.AB.is_identity() && perm.AC.is_identity() && perm.BC.is_identity(), + perm.AB.is_identity(), // + perm.BC.is_identity(), // + perm.AC.is_identity()}; if (do_this.no_perm) { return A.mult(B); @@ -1267,14 +1268,16 @@ auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, return A.mult(B, perm.AC); } else if (do_this.perm_a) { auto pA = A.permute(perm.AC); - return pA.mult(B); + pA.mult_to(B); + return pA; } else if (do_this.perm_b) { auto pB = B.permute(perm.BC); - return A.mult(pB); + pB.mult_to(A); + return pB; } else { auto pA = A.permute(perm.AC); - auto pB = B.permute(perm.BC); - return pA.mult(pB); + return pA.mult_to(B.permute(perm.BC)); + return pA; } } From 689b9b3cc3230ca1ad665d87435261cef533a0cd Mon Sep 17 00:00:00 2001 
From: Bimal Gaudel Date: Fri, 26 Jan 2024 16:13:53 -0500 Subject: [PATCH 300/592] Type traits to get nested ranks and max nested-rank among Tensor and DistArray types. --- src/TiledArray/tensor/type_traits.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 2e23359950..2a624ebf76 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -210,6 +210,25 @@ template constexpr const bool tensors_have_equal_nested_rank_v = tensors_have_equal_nested_rank::value; +template +constexpr size_t nested_rank = 0; + +template +constexpr size_t nested_rank> = 1 + nested_rank; + +template +constexpr size_t nested_rank> = nested_rank; + +template +constexpr size_t max_nested_rank = 0; + +template +constexpr size_t max_nested_rank = nested_rank; + +template +constexpr size_t max_nested_rank = + std::max(nested_rank, std::max(nested_rank, max_nested_rank)); + //////////////////////////////////////////////////////////////////////////////// template From 4174d5160982ac0a308777b1b8d818902596f9a2 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 16:15:05 -0500 Subject: [PATCH 301/592] string annotation taken by einsum function can handle tensor-of-tensor annotations. 
--- src/TiledArray/einsum/tiledarray.h | 39 ++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index b294a05b7d..116381ef15 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -531,11 +531,40 @@ using expressions::einsum; template auto einsum(const std::string &expr, const DistArray &A, const DistArray &B, World &world = get_default_world()) { - namespace string = ::Einsum::string; - auto [lhs, rhs] = string::split2(expr, "->"); - auto [a, b] = string::split2(lhs, ","); - return einsum(A(string::join(a, ",")), B(string::join(b, ",")), - string::join(rhs, ","), world); + using ::Einsum::string::join; + using ::Einsum::string::split2; + + struct { + std::string A, B, C; + } annot; + + { + struct { + std::string A, B, C; + } outer; + + struct { + std::string A, B, C; + } inner; + + auto [ab, aC] = split2(expr, "->"); + std::tie(outer.C, inner.C) = split2(aC, ";"); + + auto [aA, aB] = split2(ab, ","); + std::tie(outer.A, inner.A) = split2(aA, ";"); + std::tie(outer.B, inner.B) = split2(aB, ";"); + + auto combine = [](auto const &outer, auto const &inner) { + return inner.empty() ? join(outer, ",") + : (join(outer, ",") + ";" + join(inner, ",")); + }; + + annot.A = combine(outer.A, inner.A); + annot.B = combine(outer.B, inner.B); + annot.C = combine(outer.C, inner.C); + } + + return einsum(A(annot.A), B(annot.B), annot.C, world); } /// Computes ternary tensor product whose result From adba673196d57b42f7e512dd484c16eefa567bf0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 16:19:21 -0500 Subject: [PATCH 302/592] Slow yet correct general tensor product evaluation of arbitrarily nested tensors. 
--- tests/tot_array_fixture.h | 327 +++++++++++++++++++++++++++++++++++++- 1 file changed, 324 insertions(+), 3 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index e28eb1d139..c165a456c2 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -104,6 +104,12 @@ auto random_tensor(TA::Range const& rng) { return result; } +template +auto random_tensor(std::initializer_list const& extents) { + auto lobounds = TA::container::svector(extents.size(), 0); + return random_tensor(TA::Range{lobounds, extents}); +} + // // note: all the inner tensors (elements of the outer tensor) // have the same @c inner_rng @@ -123,6 +129,13 @@ auto random_tensor(TA::Range const& outer_rng, TA::Range const& inner_rng) { return result; } +template +auto random_tensor(TA::Range const& outer_rng, + std::initializer_list const& inner_extents) { + TA::container::svector lobounds(inner_extents.size(), 0); + return random_tensor(outer_rng, TA::Range(lobounds, inner_extents)); +} + /// /// \tparam Array The type of DistArray to be generated. Cannot be cv-qualified /// or reference type. @@ -153,9 +166,6 @@ auto random_array(TA::TiledRange const& trange, Args const&... args) { (sizeof...(Args) == 1) && (TA::detail::is_tensor_of_tensor_v)); - if constexpr (sizeof...(Args) == 1) - static_assert(std::is_convertible_v); - using TensorT = typename Array::value_type; using PolicyT = typename Array::policy_type; @@ -171,6 +181,13 @@ auto random_array(TA::TiledRange const& trange, Args const&... args) { make_tile_meta(args...)); } +template +auto random_array(std::initializer_list> trange, + Args&&... 
args) { + return random_array(TA::TiledRange(trange), + std::forward(args)...); +} + /// /// Succinctly call TA::detail::tensor_contract /// @@ -190,6 +207,310 @@ auto tensor_contract(std::string const& einsum_annot, T const& A, T const& B) { return TA::detail::tensor_contract(A, aA, B, aB, aC); } +using PartialPerm = TA::container::svector>; + +template +PartialPerm partial_perm(::Einsum::index::Index const& from, + ::Einsum::index::Index const& to) { + PartialPerm result; + for (auto i = 0; i < from.size(); ++i) + if (auto found = to.find(from[i]); found != to.end()) + result.emplace_back(i, std::distance(to.begin(), found)); + return result; +} + +template >> +void apply_partial_perm(T& to, T const& from, PartialPerm const& p) { + for (auto [f, t] : p) { + TA_ASSERT(f < from.size() && t < to.size() && "Invalid permutation used"); + to[t] = from[f]; + } +} + +/// +/// Example: To represent A("ik;ac") * B("kj;cb") -> C("ij;ab"), +/// construct with std::string("ij;ac,kj;cb->ij;ab"); +/// outer_indices;inner_indices annotates a single object (DistArray, Tensor +/// etc.) 
A_indices,B_indices annotates first(A) and second(B) object +/// '->' separates argument objects' annotation from the result's annotation +/// +class OuterInnerIndices { + // array[0] annotes A + // array[1] annotes B + // array[2] annotes C + std::array outer_, inner_; + + public: + OuterInnerIndices(std::string const& annot) { + using ::Einsum::string::split2; + + constexpr size_t A = 0; + constexpr size_t B = 1; + constexpr size_t C = 2; + + auto [ab, aC] = split2(annot, "->"); + std::tie(outer_[C], inner_[C]) = split2(aC, ";"); + + auto [aA, aB] = split2(ab, ","); + + std::tie(outer_[A], inner_[A]) = split2(aA, ";"); + std::tie(outer_[B], inner_[B]) = split2(aB, ";"); + } + + template + OuterInnerIndices(const char (&s)[N]) : OuterInnerIndices{std::string(s)} {} + + [[nodiscard]] auto const& outer() const noexcept { return outer_; } + [[nodiscard]] auto const& inner() const noexcept { return inner_; } + + [[nodiscard]] auto const& outerA() const noexcept { return outer_[0]; } + [[nodiscard]] auto const& outerB() const noexcept { return outer_[1]; } + [[nodiscard]] auto const& outerC() const noexcept { return outer_[2]; } + [[nodiscard]] auto const& innerA() const noexcept { return inner_[0]; } + [[nodiscard]] auto const& innerB() const noexcept { return inner_[1]; } + [[nodiscard]] auto const& innerC() const noexcept { return inner_[2]; } +}; + +struct ProductSetup { + TA::expressions::TensorProduct product_type{ + TA::expressions::TensorProduct::Invalid}; + + PartialPerm + // {} index at kth position in C appears at vth position in A + // and so on... 
+ C_to_A, + C_to_B, + I_to_A, // 'I' implies for contracted indices + I_to_B; + size_t // + rank_A, // + rank_B, + rank_C, // + rank_H, + rank_E, // + rank_I; + + ProductSetup() = default; + + template >> + ProductSetup(T const& aA, T const& aB, T const& aC) { + using Indices = ::Einsum::index::Index; + + struct { + // A, B, C tensor indices + // H, E, I Hadamard, external, internal, and target indices + Indices A, B, C, H, E, I; + } const ixs{Indices{aA}, Indices{aB}, + Indices{aC}, (ixs.A & ixs.B & ixs.C), + (ixs.A ^ ixs.B), ((ixs.A & ixs.B) - ixs.H)}; + + rank_A = ixs.A.size(); + rank_B = ixs.B.size(); + rank_C = ixs.C.size(); + rank_H = ixs.H.size(); + rank_E = ixs.E.size(); + rank_I = ixs.I.size(); + + C_to_A = partial_perm(ixs.C, ixs.A); + C_to_B = partial_perm(ixs.C, ixs.B); + I_to_A = partial_perm(ixs.I, ixs.A); + I_to_B = partial_perm(ixs.I, ixs.B); + + using TP = decltype(product_type); + if (!(ixs.E || ixs.H)) + product_type = TP::Invalid; // no target indices + else if (!(ixs.E || ixs.I)) + product_type = TP::Hadamard; + else if (!ixs.H) + product_type = TP::Contraction; + else if (ixs.H && (ixs.E || ixs.I)) + product_type = TP::General; + } + + template >> + ProductSetup(ArrayLike const& arr) + : ProductSetup(std::get<0>(arr), std::get<1>(arr), std::get<2>(arr)) {} + + [[nodiscard]] bool valid() const noexcept { + return (rank_A + rank_B) != 0 && (rank_E + rank_H) != 0; + } +}; + +namespace { +template +inline auto general_product(Tensor const& t, typename Tensor::numeric_type s, + Args&&...) { + return t * s; +} + +template +inline auto general_product(typename Tensor::numeric_type s, Tensor const& t, + Args&&...) { + return s * t; +} +} // namespace + +template // + && sizeof...(Setups) == + TA::detail::max_nested_rank - 1, + bool> = true> +auto general_product(TensorA const& A, TensorB const& B, + ProductSetup const& setup, Setups const&... 
args) { + static_assert(std::is_same_v); + using TensorC = std::conditional_t<(TA::detail::nested_rank > + TA::detail::nested_rank), + TensorA, TensorB>; + TA_ASSERT(setup.valid()); + + constexpr bool is_tot = TA::detail::max_nested_rank > 1; + + // creating the contracted TA::Range + TA::Range const rng_I = [&setup, &A, &B]() { + TA::container::svector rng1_I(setup.rank_I, TA::Range1{}); + for (auto [f, t] : setup.I_to_A) + // I_to_A implies I[f] == A[t] + rng1_I[f] = A.range().dim(t); + + return TA::Range(rng1_I); + }(); + + // creating the target (ie. C's) TA::Range. + TA::Range const rng_C = [&setup, &A, &B]() { + TA::container::svector rng1_C(setup.rank_C, TA::Range1{0, 0}); + for (auto [f, t] : setup.C_to_A) + // C_to_A implies C[f] = A[t] + rng1_C[f] = A.range().dim(t); + + for (auto [f, t] : setup.C_to_B) + // C_to_B implies C[f] = B[t] + rng1_C[f] = B.range().dim(t); + + auto zero_r1 = [](TA::Range1 const& r) { return r == TA::Range1{0, 0}; }; + + TA_ASSERT(std::none_of(rng1_C.begin(), rng1_C.end(), zero_r1)); + + return TA::Range(rng1_C); + }(); + + TensorC C{rng_C}; + + // do the computation + for (auto ix_C : rng_C) { + // finding corresponding indices of A, and B. + TA::Range::index_type ix_A(setup.rank_A, 0), ix_B(setup.rank_B, 0); + apply_partial_perm(ix_A, ix_C, setup.C_to_A); + apply_partial_perm(ix_B, ix_C, setup.C_to_B); + + if (setup.rank_I == 0) + if constexpr (is_tot) + C(ix_C) = general_product(A(ix_A), B(ix_B), args...); + else { + TA_ASSERT(!(ix_A.empty() && ix_B.empty())); + C(ix_C) = ix_A.empty() ? B(ix_B) + : ix_B.empty() ? 
A(ix_B) + : A(ix_A) * B(ix_B); + } + + else { + typename TensorC::value_type temp{}; + for (auto ix_I : rng_I) { + apply_partial_perm(ix_A, ix_I, setup.I_to_A); + apply_partial_perm(ix_B, ix_I, setup.I_to_B); + if constexpr (is_tot) + temp += general_product(A(ix_A), B(ix_B), args...); + else { + TA_ASSERT(!(ix_A.empty() || ix_B.empty())); + temp += A(ix_A) * B(ix_B); + } + } + C(ix_C) = temp; + } + } + + return C; +} + +template +auto general_product(TA::DistArray A, + TA::DistArray B, + ProductSetup const& setup, Setups const&... args) { + using TileC = std::conditional_t<(TA::detail::nested_rank > + TA::detail::nested_rank), + TileA, TileB>; + TA_ASSERT(setup.valid()); + + A.make_replicated(); + B.make_replicated(); + TA::get_default_world().gop.fence(); + + TA::Tensor tensorA{A.trange().tiles_range()}; + for (auto&& ix : tensorA.range()) tensorA(ix) = A.find_local(ix).get(false); + + TA::Tensor tensorB{B.trange().tiles_range()}; + for (auto&& ix : tensorB.range()) tensorB(ix) = B.find_local(ix).get(false); + + auto result_tensor = general_product(tensorA, tensorB, setup, setup, args...); + + TA::TiledRange result_trange; + { + auto const rank = result_tensor.range().rank(); + auto const result_range = result_tensor.range(); + + TA::container::svector> tr1s(rank, {0}); + + TA::container::svector const ix_hi(result_range.upbound()); + for (auto d = 0; d < rank; ++d) { + TA::container::svector ix(result_range.lobound()); + for (auto& i = ix[d]; i < ix_hi[d]; ++i) { + auto const& elem_tensor = result_tensor(ix); + auto& tr1 = tr1s[d]; + tr1.emplace_back(tr1.back() + elem_tensor.range().extent(d)); + } + } + + TA::container::svector tr1s_explicit; + tr1s_explicit.reserve(tr1s.size()); + for (auto const& v : tr1s) tr1s_explicit.emplace_back(v.begin(), v.end()); + + result_trange = TA::TiledRange(tr1s_explicit); + } + + using TileC = typename decltype(result_tensor)::value_type; + TA::DistArray C(TA::get_default_world(), + result_trange); + C.make_replicated(); + for 
(auto it : C) it = result_tensor(it.index()); + return C; +} + +template >> +auto manual_eval(OuterInnerIndices const& oixs, ArrayA A, ArrayB B) { + constexpr auto mnr = TA::detail::max_nested_rank; + static_assert(mnr == 1 || mnr == 2); + + auto const outer_setup = ProductSetup(oixs.outer()); + + TA_ASSERT(outer_setup.valid()); + + if constexpr (mnr == 2) { + auto const inner_setup = ProductSetup(oixs.inner()); + TA_ASSERT(inner_setup.valid()); + return general_product(A, B, outer_setup, inner_setup); + } else { + return general_product(A, B, outer_setup); + } +} + #ifdef TILEDARRAY_HAS_BTAS template >> From 1ca47e87d0e67d78856de14320da19753353a20d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 16:22:17 -0500 Subject: [PATCH 303/592] Compare 'manual' evaluation against `einsum` evaluations. --- tests/einsum.cpp | 129 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index ced387802d..31f0147708 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -25,6 +25,135 @@ #include "TiledArray/expressions/contraction_helpers.h" +BOOST_AUTO_TEST_SUITE(manual) + +template >> +bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { + auto ref = TA::einsum(annot, A, B); + auto out = manual_eval(annot, A, B); + return ToTArrayFixture::are_equal(ref, out); +} + +template +bool check_manual_eval( + std::string const& annot, + std::initializer_list> trangeA, + std::initializer_list> trangeB) { + auto A = random_array(TA::TiledRange(trangeA)); + auto B = random_array(TA::TiledRange(trangeB)); + return check_manual_eval(annot, A, B); +} + +template +bool check_manual_eval( + std::string const& annot, + std::initializer_list> trangeA, + std::initializer_list> trangeB, + std::initializer_list inner_extents) { + if constexpr (TA::detail::is_tensor_of_tensor_v) + return check_manual_eval( + annot, random_array(trangeA, inner_extents), + random_array(trangeB)); + else + 
return check_manual_eval( + annot, random_array(trangeA), + random_array(trangeB, inner_extents)); +} + +template +bool check_manual_eval( + std::string const& annot, + std::initializer_list> trangeA, + std::initializer_list> trangeB, + std::initializer_list inner_extentsA, + std::initializer_list inner_extentsB) { + return check_manual_eval( + annot, random_array(trangeA, inner_extentsA), + random_array(trangeB, inner_extentsB)); +} + +BOOST_AUTO_TEST_CASE(manual_contract) { + using Array = TA::Array; + + BOOST_REQUIRE(check_manual_eval("ij,j->i", + {{0, 2, 4}, {0, 4, 8}}, // A's trange + {{0, 4, 8}} // B's trange + )); + BOOST_REQUIRE(check_manual_eval("ik,jk->ji", + {{0, 2, 4}, {0, 4, 8}}, // A's trange + {{0, 3}, {0, 4, 8}} // B's trange + )); + + BOOST_REQUIRE(check_manual_eval( + "ijkl,jm->lkmi", // + {{0, 2}, {0, 4, 8}, {0, 3}, {0, 7}}, // + {{0, 4, 8}, {0, 5}} // + )); +} + +BOOST_AUTO_TEST_CASE(manual_hadamard) { + using Array = TA::Array; + BOOST_REQUIRE(check_manual_eval("i,i->i", // + {{0, 1}}, // + {{0, 1}} // + )); + BOOST_REQUIRE(check_manual_eval("i,i->i", // + {{0, 2, 4}}, // + {{0, 2, 4}} // + )); + + BOOST_REQUIRE(check_manual_eval("ijk,kij->ikj", // + {{0, 2, 4}, {0, 2, 3}, {0, 5}}, // + {{0, 5}, {0, 2, 4}, {0, 2, 3}} // + )); +} + +BOOST_AUTO_TEST_CASE(manual_general) { + using Array = TA::Array; + BOOST_REQUIRE(check_manual_eval("ijk,kil->ijl", // + {{0, 2}, {0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 2}, {0, 1}} // + )); + + using Array = TA::Array; + using Tensor = typename Array::value_type; + using namespace std::string_literals; + + Tensor A(TA::Range{2, 3}, {1, 2, 3, 4, 5, 6}); + Tensor B(TA::Range{2}, {2, 10}); + Tensor C(TA::Range{2, 3}, {2, 4, 6, 40, 50, 60}); + BOOST_REQUIRE(C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); +} + +BOOST_AUTO_TEST_CASE(manual_nested_ranks) { + using ArrayT = TA::DistArray>; + using ArrayToT = TA::DistArray>>; + + BOOST_REQUIRE(check_manual_eval("ij;mn,ji;nm->ij;mn", // + {{0, 2, 4}, {0, 
3}}, // + {{0, 3}, {0, 2, 4}}, // + {5, 7}, // + {7, 5} // + )); + BOOST_REQUIRE(check_manual_eval("ij;mo,ji;on->ij;mn", // + {{0, 2, 4}, {0, 3}}, // + {{0, 3}, {0, 2, 4}}, // + {3, 7}, // + {7, 4} // + )); + BOOST_REQUIRE(check_manual_eval("ij;mo,ji;o->ij;m", // + {{0, 2, 4}, {0, 3}}, // + {{0, 3}, {0, 2, 4}}, // + {3, 7}, // + {7} // + )); +} + +BOOST_AUTO_TEST_SUITE_END() + using namespace TiledArray; using namespace TiledArray::expressions; From d18b9dac05bd6f9a951ebcc7332d1307e534348d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 17:55:22 -0500 Subject: [PATCH 304/592] Add madness gop fences. --- tests/tot_array_fixture.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index c165a456c2..18dfdb8196 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -448,9 +448,11 @@ auto general_product(TA::DistArray A, TileA, TileB>; TA_ASSERT(setup.valid()); + auto& world = TA::get_default_world(); + A.make_replicated(); B.make_replicated(); - TA::get_default_world().gop.fence(); + world.gop.fence(); TA::Tensor tensorA{A.trange().tiles_range()}; for (auto&& ix : tensorA.range()) tensorA(ix) = A.find_local(ix).get(false); @@ -485,9 +487,9 @@ auto general_product(TA::DistArray A, } using TileC = typename decltype(result_tensor)::value_type; - TA::DistArray C(TA::get_default_world(), - result_trange); + TA::DistArray C(world, result_trange); C.make_replicated(); + world.gop.fence(); for (auto it : C) it = result_tensor(it.index()); return C; } From 64cc43e11f33b02bece62cc9ae4db57a27e00e29 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 26 Jan 2024 18:52:38 -0500 Subject: [PATCH 305/592] [skip ci] typos. 
--- tests/tot_array_fixture.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 18dfdb8196..152a270298 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -300,7 +300,7 @@ struct ProductSetup { struct { // A, B, C tensor indices - // H, E, I Hadamard, external, internal, and target indices + // H, E, I Hadamard, external, and internal indices Indices A, B, C, H, E, I; } const ixs{Indices{aA}, Indices{aB}, Indices{aC}, (ixs.A & ixs.B & ixs.C), @@ -443,9 +443,6 @@ template auto general_product(TA::DistArray A, TA::DistArray B, ProductSetup const& setup, Setups const&... args) { - using TileC = std::conditional_t<(TA::detail::nested_rank > - TA::detail::nested_rank), - TileA, TileB>; TA_ASSERT(setup.valid()); auto& world = TA::get_default_world(); From 8cf4c865e5d1efa778f4168846c894a1f34d184e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 08:13:08 -0500 Subject: [PATCH 306/592] [skip ci] add a debug instance. Of buggy expression layer support involving ToT times, T where the outer op is contraction (no Hadamard indices). --- tests/einsum.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 31f0147708..8cf463f089 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -152,6 +152,17 @@ BOOST_AUTO_TEST_CASE(manual_nested_ranks) { )); } +BOOST_AUTO_TEST_CASE(debug) { + using ArrayT = TA::DistArray>; + using ArrayToT = TA::DistArray>>; + + bool are_equal = check_manual_eval("ik;mn,jk->ij;mn", // + {{0, 2}, {0, 3}}, // + {{0, 2}, {0, 3}}, // + {2, 2}); + BOOST_REQUIRE(are_equal); +} + BOOST_AUTO_TEST_SUITE_END() using namespace TiledArray; From 15294b7a01697371a344fc851a74c09cfc88566f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 08:16:23 -0500 Subject: [PATCH 307/592] [skip ci] relax tile type restrictions on `einsum(std::string const&, ...)` function. 
--- src/TiledArray/einsum/tiledarray.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 116381ef15..2a03f7bc5b 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -528,9 +528,9 @@ namespace TiledArray { using expressions::dot; using expressions::einsum; -template -auto einsum(const std::string &expr, const DistArray &A, - const DistArray &B, World &world = get_default_world()) { +template +auto einsum(const std::string &expr, const DistArray &A, + const DistArray &B, World &world = get_default_world()) { using ::Einsum::string::join; using ::Einsum::string::split2; From 8693e4de43c97ded48c2e1656a748604d85ef392 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 10:14:19 -0500 Subject: [PATCH 308/592] Restrict to pure Hadamard or contraction, between inner tensors. --- src/TiledArray/einsum/tiledarray.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 2a03f7bc5b..3550b6489f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -124,6 +124,8 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, inner.h = inner.A & inner.B & inner.C; inner.e = (inner.A ^ inner.B); inner.i = (inner.A & inner.B) - inner.h; + TA_ASSERT(!(inner.h && (inner.i || inner.e)) && + "General product between inner tensors not supported"); } // these are "Hadamard" (fused) indices @@ -229,10 +231,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, for (size_t k = 0; k < batch; ++k) { using Ix = ::Einsum::Index; if constexpr (AreArrayToT) { - TA_ASSERT(inner.h ^ inner.i && - "Hadamard with contraction not supported between the " - "inner tensors"); - auto aik = ai.batch(k); auto bik = bi.batch(k); auto vol = aik.total_size(); From 4357f0991c5665fe36e4b78af5a70b829097b3aa Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 10:45:00 -0500 Subject: [PATCH 309/592] Tensor times Tensor-of-Tensor can be called with BipartitePermutation in the `Tensor::binary` function. --- src/TiledArray/tensor/tensor.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 5faec43957..7814afba7d 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1401,11 +1401,7 @@ class Tensor { using result_allocator_type = typename std::allocator_traits< Allocator>::template rebind_alloc; using ResultTensor = Tensor; - if constexpr (is_bperm) { - TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return ResultTensor(*this, right, op, outer(perm)); - } else - return ResultTensor(*this, right, op, perm); + return ResultTensor(*this, right, op, perm); } else { // AFAIK the other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. From 002fae71eb12b8f66b2161cdc69888e128397bc9 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 12:08:06 -0500 Subject: [PATCH 310/592] Add more tests for einsum evaluation between tensors of differing nested ranks. 
--- tests/einsum.cpp | 112 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 104 insertions(+), 8 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8cf463f089..f44aa6b7fd 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -128,39 +128,135 @@ BOOST_AUTO_TEST_CASE(manual_general) { BOOST_REQUIRE(C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); } -BOOST_AUTO_TEST_CASE(manual_nested_ranks) { - using ArrayT = TA::DistArray>; +BOOST_AUTO_TEST_CASE(manual_equal_nested_ranks) { using ArrayToT = TA::DistArray>>; + // H;H (Hadamard outer; Hadamard inner) BOOST_REQUIRE(check_manual_eval("ij;mn,ji;nm->ij;mn", // {{0, 2, 4}, {0, 3}}, // {{0, 3}, {0, 2, 4}}, // {5, 7}, // {7, 5} // )); + + // H;C (Hadamard outer; contraction inner) BOOST_REQUIRE(check_manual_eval("ij;mo,ji;on->ij;mn", // {{0, 2, 4}, {0, 3}}, // {{0, 3}, {0, 2, 4}}, // {3, 7}, // {7, 4} // )); + + // H;C BOOST_REQUIRE(check_manual_eval("ij;mo,ji;o->ij;m", // {{0, 2, 4}, {0, 3}}, // {{0, 3}, {0, 2, 4}}, // {3, 7}, // {7} // )); + + // C;C + BOOST_REQUIRE(check_manual_eval("ik;mo,kj;on->ij;mn", // + {{0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 2}}, // + {2, 2}, // + {2, 2})); + // H+C;H + BOOST_REQUIRE(check_manual_eval("ijk;mn,ijk;nm->ij;mn", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {2, 2}, // + {2, 2})); + + // H+C;C + BOOST_REQUIRE(check_manual_eval("ijk;mo,ijk;no->ij;nm", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {3, 2}, // + {3, 2})); + + // H+C;C + BOOST_REQUIRE(check_manual_eval("ijk;m,ijk;n->ij;nm", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {3}, // + {2})); + // H+C;H+C not supported } -BOOST_AUTO_TEST_CASE(debug) { +BOOST_AUTO_TEST_CASE(manual_different_nested_ranks) { using ArrayT = TA::DistArray>; using ArrayToT = TA::DistArray>>; - bool are_equal = check_manual_eval("ik;mn,jk->ij;mn", // - {{0, 2}, {0, 3}}, // - {{0, 2}, {0, 3}}, // - {2, 2}); - BOOST_REQUIRE(are_equal); + // H 
+ BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // + {{0, 2, 4}, {0, 3, 5}}, // + {{0, 3, 5}, {0, 2, 4}}, // + {2, 3}))); + + // H (reversed arguments) + BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // + {{0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 3, 5}}, // + {2, 3}))); + + // H+C (outer product) + BOOST_REQUIRE((check_manual_eval("ij;mn,ik->ijk;mn", // + {{0, 2}, {0, 1}}, // + {{0, 2}, {0, 3}}, // + {2, 3}))); + + // H+C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("ik,ij;mn->ijk;mn", // + {{0, 2}, {0, 3}}, // + {{0, 2}, {0, 1}}, // + {2, 3}))); + + // todo: bug fix in expression layer + + // C (outer product) + BOOST_REQUIRE((check_manual_eval("i;mn,j->ij;nm", // + {{0, 2}}, // + {{0, 3}}, // + {1, 2}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,i;mn->ij;nm", // + {{0, 3}}, // + {{0, 2}}, // + {1, 2}))); + + // C + BOOST_REQUIRE((check_manual_eval("ij;m,j->i;m", // + {{0, 2}, {0, 2}}, // + {{0, 2}}, // + {3}))); + + // C (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,ij;m->i;m", // + {{0, 2}}, // + {{0, 2}, {0, 2}}, // + {3}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,i;m->ij;m", // + {{0, 2}}, // + {{0, 2}}, // + {3}))); + + // H+C + BOOST_REQUIRE( + (check_manual_eval("ik;mn,ijk->ij;nm", // + {{0, 2}, {0, 3}}, // + {{0, 2}, {0, 2}, {0, 3}}, // + {2, 2}))); + + // H+C (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ijk,ik;mn->ij;nm", // + {{0, 2}, {0, 2}, {0, 3}}, // + {{0, 2}, {0, 3}}, // + {2, 2}))); } BOOST_AUTO_TEST_SUITE_END() From 5fa8d641d0a02d30aec22dfb6c1de0cef3684b0a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 12:09:10 -0500 Subject: [PATCH 311/592] [skip ci] Test case name cleanup. 
--- tests/einsum.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index f44aa6b7fd..49155a81f9 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -75,7 +75,7 @@ bool check_manual_eval( random_array(trangeB, inner_extentsB)); } -BOOST_AUTO_TEST_CASE(manual_contract) { +BOOST_AUTO_TEST_CASE(contract) { using Array = TA::Array; BOOST_REQUIRE(check_manual_eval("ij,j->i", @@ -94,7 +94,7 @@ BOOST_AUTO_TEST_CASE(manual_contract) { )); } -BOOST_AUTO_TEST_CASE(manual_hadamard) { +BOOST_AUTO_TEST_CASE(hadamard) { using Array = TA::Array; BOOST_REQUIRE(check_manual_eval("i,i->i", // {{0, 1}}, // @@ -111,7 +111,7 @@ BOOST_AUTO_TEST_CASE(manual_hadamard) { )); } -BOOST_AUTO_TEST_CASE(manual_general) { +BOOST_AUTO_TEST_CASE(general) { using Array = TA::Array; BOOST_REQUIRE(check_manual_eval("ijk,kil->ijl", // {{0, 2}, {0, 3, 5}, {0, 2, 4}}, // @@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(manual_general) { BOOST_REQUIRE(C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); } -BOOST_AUTO_TEST_CASE(manual_equal_nested_ranks) { +BOOST_AUTO_TEST_CASE(equal_nested_ranks) { using ArrayToT = TA::DistArray>>; // H;H (Hadamard outer; Hadamard inner) @@ -184,7 +184,7 @@ BOOST_AUTO_TEST_CASE(manual_equal_nested_ranks) { // H+C;H+C not supported } -BOOST_AUTO_TEST_CASE(manual_different_nested_ranks) { +BOOST_AUTO_TEST_CASE(different_nested_ranks) { using ArrayT = TA::DistArray>; using ArrayToT = TA::DistArray>>; From 5889b9fad9306b1ad5dc060fe669f6313cae7824 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 21:29:51 -0500 Subject: [PATCH 312/592] scaling with permutation supported in manual evaluation --- tests/tot_array_fixture.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 152a270298..b63209daf5 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -277,8 +277,9 @@ 
struct ProductSetup { TA::expressions::TensorProduct::Invalid}; PartialPerm - // {} index at kth position in C appears at vth position in A - // and so on... + // - {} index at kth position in C appears at vth position in A + // and so on... + // - {} is sorted by k C_to_A, C_to_B, I_to_A, // 'I' implies for contracted indices @@ -341,16 +342,27 @@ struct ProductSetup { }; namespace { -template + +auto make_perm(PartialPerm const& pp) { + TA::container::svector p(pp.size()); + for (auto [k, v] : pp) p[k] = v; + return TA::Permutation(p); +} + +template inline auto general_product(Tensor const& t, typename Tensor::numeric_type s, - Args&&...) { - return t * s; + ProductSetup const& setup, Setups const&... args) { + static_assert(sizeof...(args) == 0, + "To-Do: Only scalar times once-nested tensor supported now"); + return t.scale(s, make_perm(setup.C_to_A).inv()); } -template +template inline auto general_product(typename Tensor::numeric_type s, Tensor const& t, - Args&&...) { - return s * t; + ProductSetup const& setup, Setups const&... 
args) { + static_assert(sizeof...(args) == 0, + "To-Do: Only scalar times once-nested tensor supported now"); + return t.scale(s, make_perm(setup.C_to_B).inv()); } } // namespace From 36087b78ae7cfdde8d850582922bc2d4ac2ee664 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 28 Jan 2024 21:30:33 -0500 Subject: [PATCH 313/592] [skip ci] add more tests --- tests/einsum.cpp | 92 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 26 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 49155a81f9..d935ead8ef 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -188,31 +188,77 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { using ArrayT = TA::DistArray>; using ArrayToT = TA::DistArray>>; - // H - BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // - {{0, 2, 4}, {0, 3, 5}}, // - {{0, 3, 5}, {0, 2, 4}}, // - {2, 3}))); - - // H (reversed arguments) - BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // - {{0, 3, 5}, {0, 2, 4}}, // - {{0, 2, 4}, {0, 3, 5}}, // - {2, 3}))); + { + // these tests do not involve permutation of inner tensors + // H + BOOST_REQUIRE( + (check_manual_eval("ij;mn,ji->ji;mn", // + {{0, 2, 5}, {0, 3, 5, 9}}, // + {{0, 3, 5, 9}, {0, 2, 5}}, // + {2, 1}))); + + // H (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ji,ij;mn->ji;mn", // + {{0, 3, 5, 9}, {0, 2, 5}}, // + {{0, 2, 5}, {0, 3, 5, 9}}, // + {2, 4}))); + + // C (outer product) + BOOST_REQUIRE((check_manual_eval("i;mn,j->ij;mn", // + {{0, 5}}, // + {{0, 3, 8}}, // + {3, 2}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,i;mn->ij;mn", // + {{0, 3, 8}}, // + {{0, 5}}, // + {2, 2}))); + } // H+C (outer product) - BOOST_REQUIRE((check_manual_eval("ij;mn,ik->ijk;mn", // - {{0, 2}, {0, 1}}, // - {{0, 2}, {0, 3}}, // - {2, 3}))); + BOOST_REQUIRE((check_manual_eval("ij;mn,ik->ijk;nm", // + {{0, 2, 5}, {0, 3, 7}}, // + {{0, 2, 5}, {0, 4, 7}}, // + {2, 5}))); // H+C (outer product) (reversed 
arguments) - BOOST_REQUIRE((check_manual_eval("ik,ij;mn->ijk;mn", // - {{0, 2}, {0, 3}}, // - {{0, 2}, {0, 1}}, // - {2, 3}))); + BOOST_REQUIRE((check_manual_eval("ik,ij;mn->ijk;nm", // + {{0, 2, 5}, {0, 4, 7}}, // + {{0, 2, 5}, {0, 3, 7}}, // + {2, 5}))); + + // todo: bug fix in expression layer for following tests to pass - // todo: bug fix in expression layer + { + // these tests do not involve permutation of inner tensors + // H+C + BOOST_REQUIRE( + (check_manual_eval("ik;mn,ijk->ij;mn", // + {{0, 2}, {0, 3}}, // + {{0, 2}, {0, 2}, {0, 3}}, // + {2, 2}))); + + // H+C (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ijk,ik;mn->ij;mn", // + {{0, 2}, {0, 2}, {0, 3}}, // + {{0, 2}, {0, 3}}, // + {2, 2}))); + } + + // H + BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // + {{0, 2}, {0, 1}}, // + {{0, 1}, {0, 2}}, // + {2, 2}))); + + // H (reversed arguments) + BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // + {{0, 1}, {0, 2}}, // + {{0, 2}, {0, 1}}, // + {2, 2}))); // C (outer product) BOOST_REQUIRE((check_manual_eval("i;mn,j->ij;nm", // @@ -238,12 +284,6 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {{0, 2}, {0, 2}}, // {3}))); - // C (outer product) (reversed arguments) - BOOST_REQUIRE((check_manual_eval("j,i;m->ij;m", // - {{0, 2}}, // - {{0, 2}}, // - {3}))); - // H+C BOOST_REQUIRE( (check_manual_eval("ik;mn,ijk->ij;nm", // From 5bf8913b104e743b21267a4322235b25e587ff40 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 29 Jan 2024 10:01:48 -0500 Subject: [PATCH 314/592] pure outer product ToT times T seems correct. 
--- tests/einsum.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index d935ead8ef..57aa75b7ba 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -217,6 +217,18 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {2, 2}))); } + // C (outer product) + BOOST_REQUIRE((check_manual_eval("ik;mn,j->ijk;nm", // + {{0, 2, 4}, {0, 4}}, // + {{0, 3, 5}}, // + {3, 2}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("jl,ik;mn->ijkl;nm", // + {{0, 3, 5}, {0, 3}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}))); + // H+C (outer product) BOOST_REQUIRE((check_manual_eval("ij;mn,ik->ijk;nm", // {{0, 2, 5}, {0, 3, 7}}, // @@ -260,18 +272,6 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {{0, 2}, {0, 1}}, // {2, 2}))); - // C (outer product) - BOOST_REQUIRE((check_manual_eval("i;mn,j->ij;nm", // - {{0, 2}}, // - {{0, 3}}, // - {1, 2}))); - - // C (outer product) (reversed arguments) - BOOST_REQUIRE((check_manual_eval("j,i;mn->ij;nm", // - {{0, 3}}, // - {{0, 2}}, // - {1, 2}))); - // C BOOST_REQUIRE((check_manual_eval("ij;m,j->i;m", // {{0, 2}, {0, 2}}, // From e49d2389e1209cfb5a20f8be21d0476e7bb243ea Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 29 Jan 2024 16:47:57 -0500 Subject: [PATCH 315/592] Partly restores ef8d203 --- src/TiledArray/tensor/tensor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 7814afba7d..b28b6006ad 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1401,7 +1401,10 @@ class Tensor { using result_allocator_type = typename std::allocator_traits< Allocator>::template rebind_alloc; using ResultTensor = Tensor; - return ResultTensor(*this, right, op, perm); + if constexpr (is_bperm) { + return ResultTensor(*this, right, op, outer(perm)); + } else + return ResultTensor(*this, right, op, perm); } else { // AFAIK the 
other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. From 52bda8dd4981e98f0decc2b082a40acc0e83e11c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 29 Jan 2024 17:40:21 -0500 Subject: [PATCH 316/592] [skip ci] ToT x T contraction support. --- src/TiledArray/expressions/cont_engine.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 265a389a03..a59bcac16d 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -618,10 +618,21 @@ class ContEngine : public BinaryEngine { abort(); // unreachable }; this->element_nonreturn_op_ = - [scal_op](result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { - result = scal_op(left, right); + [scal_op, outer_prod = (this->product_type())]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Contraction) { + if (empty(result)) + result = scal_op(left, right); + else { + auto result_increment = scal_op(left, right); + add_to(result, result_increment); + } + // result += scal_op(left, right); + } else { + result = scal_op(left, right); + } }; } } else From e27a330f4c9889b3b38f807635c9d70f4beff40a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 08:55:12 -0500 Subject: [PATCH 317/592] fixup semantics of Tensor::binary(right, op, perm) ... 
it seems to have depended on Op having fused inner(perm) into it --- src/TiledArray/tensor/tensor.h | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index b28b6006ad..8e812bdaf5 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1389,19 +1389,21 @@ class Tensor { typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> auto binary(const Right& right, Op&& op, const Perm& perm) const { - constexpr bool is_tot = detail::is_tensor_of_tensor_v; + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + using ResultTensor = Tensor; + // tile ops pass bipartite permutations here even if the result is a plain + // tensor [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; - // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); - if constexpr (!is_tot) { - using result_value_type = decltype(op( - std::declval(), std::declval&>())); - using result_allocator_type = typename std::allocator_traits< - Allocator>::template rebind_alloc; - using ResultTensor = Tensor; + constexpr bool result_is_tot = detail::is_tensor_of_tensor_v; + + if constexpr (!result_is_tot) { if constexpr (is_bperm) { + TA_ASSERT(!inner(perm)); // ensure this is a plain permutation since + // ResultTensor is plain return ResultTensor(*this, right, op, outer(perm)); } else return ResultTensor(*this, right, op, perm); @@ -1410,7 +1412,12 @@ class Tensor { // which won't work for ToTs. 
auto temp = binary(right, std::forward(op)); Permute p; - return p(temp, perm); + if constexpr (is_bperm) { + TA_ASSERT(!inner(perm)); // ensure this is a plain permutation since + // ResultTensor is plain + return p(temp, outer(perm)); + } else + return p(temp, perm); } abort(); // unreachable } From ede00a68e0944e81872a135d29264d6aa85359f0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 30 Jan 2024 08:15:08 -0500 Subject: [PATCH 318/592] [skip ci] more than one tiles used in tested arrays. all tests pass locally. --- tests/einsum.cpp | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 57aa75b7ba..82319aec2f 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -31,8 +31,8 @@ template >> bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { - auto ref = TA::einsum(annot, A, B); - auto out = manual_eval(annot, A, B); + auto out = TA::einsum(annot, A, B); + auto ref = manual_eval(annot, A, B); return ToTArrayFixture::are_equal(ref, out); } @@ -241,8 +241,6 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {{0, 2, 5}, {0, 3, 7}}, // {2, 5}))); - // todo: bug fix in expression layer for following tests to pass - { // these tests do not involve permutation of inner tensors // H+C @@ -261,42 +259,42 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { } // H - BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // - {{0, 2}, {0, 1}}, // - {{0, 1}, {0, 2}}, // - {2, 2}))); + BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // + {{0, 2, 4, 6}, {0, 3}}, // + {{0, 3}, {0, 2, 4, 6}}, // + {4, 2}))); // H (reversed arguments) - BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // - {{0, 1}, {0, 2}}, // - {{0, 2}, {0, 1}}, // - {2, 2}))); + BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // + {{0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 3, 5}}, // + {1, 2}))); // C - BOOST_REQUIRE((check_manual_eval("ij;m,j->i;m", // - {{0, 2}, {0, 2}}, // - 
{{0, 2}}, // + BOOST_REQUIRE((check_manual_eval("ij;m,j->i;m", // + {{0, 5}, {0, 2, 3}}, // + {{0, 2, 3}}, // {3}))); // C (reversed arguments) BOOST_REQUIRE((check_manual_eval("j,ij;m->i;m", // {{0, 2}}, // - {{0, 2}, {0, 2}}, // - {3}))); + {{0, 1}, {0, 2}}, // + {3, 5}))); // H+C - BOOST_REQUIRE( - (check_manual_eval("ik;mn,ijk->ij;nm", // - {{0, 2}, {0, 3}}, // - {{0, 2}, {0, 2}, {0, 3}}, // - {2, 2}))); + BOOST_REQUIRE(( + check_manual_eval("ik;mn,ijk->ij;nm", // + {{0, 2}, {0, 3, 5}}, // + {{0, 2}, {0, 2, 4, 6}, {0, 3, 5}}, // + {2, 2}))); // H+C (reversed arguments) BOOST_REQUIRE( (check_manual_eval("ijk,ik;mn->ij;nm", // - {{0, 2}, {0, 2}, {0, 3}}, // + {{0, 2}, {0, 4}, {0, 3}}, // {{0, 2}, {0, 3}}, // - {2, 2}))); + {2}))); } BOOST_AUTO_TEST_SUITE_END() From 5fc5e0cefaaa4f7495af37aef95cfcc7c9d21194 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 30 Jan 2024 09:57:17 -0500 Subject: [PATCH 319/592] [skip ci] typos. --- tests/einsum.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 82319aec2f..755ef89275 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -280,7 +280,7 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { BOOST_REQUIRE((check_manual_eval("j,ij;m->i;m", // {{0, 2}}, // {{0, 1}, {0, 2}}, // - {3, 5}))); + {3}))); // H+C BOOST_REQUIRE(( @@ -294,7 +294,7 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { (check_manual_eval("ijk,ik;mn->ij;nm", // {{0, 2}, {0, 4}, {0, 3}}, // {{0, 2}, {0, 3}}, // - {2}))); + {2, 4}))); } BOOST_AUTO_TEST_SUITE_END() From 02db2728c96e84b061e934fd893bae7585e503a0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 31 Jan 2024 14:19:38 -0500 Subject: [PATCH 320/592] TA::detail::norm should actually be called TA::detail::squared_norm --- src/TiledArray/tensor/complex.h | 14 +++++++------- src/TiledArray/tensor/tensor.h | 2 +- src/TiledArray/tensor/tensor_interface.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git 
a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index 69a8971bf6..676327427f 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -81,30 +81,30 @@ TILEDARRAY_FORCE_INLINE auto inner_product(const L l, const R r) { return TiledArray::detail::conj(l) * r; } -/// Wrapper function for `std::norm` +/// Squared norm of a real number /// This function disables the call to `std::conj` for real values to /// prevent the result from being converted into a complex value. /// \tparam R A real scalar type /// \param r The real scalar -/// \return `r` +/// \return squared norm of `z` `r*r` template && !is_complex::value>::type* = nullptr> -TILEDARRAY_FORCE_INLINE R norm(const R r) { +TILEDARRAY_FORCE_INLINE R squared_norm(const R r) { return r * r; } -/// Compute the norm of a complex number `z` +/// Compute the squared norm of a complex number `z` /// \f[ -/// {\rm norm}(z) = zz^* = {\rm Re}(z)^2 + {\rm Im}(z)^2 +/// {\rm norm}(z)^2 = zz^* = {\rm Re}(z)^2 + {\rm Im}(z)^2 /// \f] /// \tparam R The scalar type /// \param z The complex scalar -/// \return The complex conjugate of `z` +/// \return squared norm of `z` template -TILEDARRAY_FORCE_INLINE R norm(const std::complex z) { +TILEDARRAY_FORCE_INLINE R squared_norm(const std::complex z) { const R real = z.real(); const R imag = z.imag(); return real * real + imag * imag; diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 8e812bdaf5..af66f66dad 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2382,7 +2382,7 @@ class Tensor { scalar_type squared_norm() const { auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { - res += TiledArray::detail::norm(arg); + res += TiledArray::detail::squared_norm(arg); }; auto sum_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { res += arg; diff --git a/src/TiledArray/tensor/tensor_interface.h 
b/src/TiledArray/tensor/tensor_interface.h index a514959cab..7a23307036 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -1066,7 +1066,7 @@ class TensorInterface { scalar_type squared_norm() const { auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { - res += TiledArray::detail::norm(arg); + res += TiledArray::detail::squared_norm(arg); }; auto sum_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { res += arg; From 8f38e3b664dddc9cbb306078e2c570f2f1f41514 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 5 Feb 2024 09:22:57 -0500 Subject: [PATCH 321/592] making manual_eval runnable on multiple processes. --- tests/tot_array_fixture.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index b63209daf5..3b7408caf1 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -496,10 +496,12 @@ auto general_product(TA::DistArray A, } using TileC = typename decltype(result_tensor)::value_type; + TA::DistArray C(world, result_trange); - C.make_replicated(); - world.gop.fence(); - for (auto it : C) it = result_tensor(it.index()); + + for (auto it : C) { + if (C.is_local(it.index())) it = result_tensor(it.index()); + } return C; } From 1314b63864d20491ab3271074a0ccdcda6c53ada Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 08:58:28 -0500 Subject: [PATCH 322/592] Permutation: range check in operator[] + improved dox --- src/TiledArray/permutation.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index 96b0081643..0a64bc492f 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -271,7 +271,10 @@ class Permutation { /// \param i The element index /// \return The i-th element - index_type operator[](unsigned int i) const { return p_[i]; } + index_type 
operator[](unsigned int i) const { + TA_ASSERT(i < p_.size()); + return p_[i]; + } /// Cycles decomposition @@ -409,11 +412,13 @@ class Permutation { /// Bool conversion /// \return \c true if the permutation is not empty, otherwise \c false. + /// \note equivalent to `this->size() != 0` explicit operator bool() const { return !p_.empty(); } /// Not operator /// \return \c true if the permutation is empty, otherwise \c false. + /// \note equivalent to `this->size() == 0` bool operator!() const { return p_.empty(); } /// Permutation data accessor From 7105bfcf503310a688af57d3440f3b8b39ba88da Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 08:59:42 -0500 Subject: [PATCH 323/592] Tensor touchup --- src/TiledArray/tensor/tensor.h | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index af66f66dad..d8d3678999 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -464,10 +464,8 @@ class Tensor { // If we actually have a ToT the inner permutation was not applied above so // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; - constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); + constexpr bool is_bperm = detail::is_bipartite_permutation_v; if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { auto inner_perm = inner(perm); @@ -512,10 +510,8 @@ class Tensor { // If we actually have a ToT the inner permutation was not applied above so // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; - constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does 
- // not match Tensor"); + constexpr bool is_bperm = detail::is_bipartite_permutation_v; if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { auto inner_perm = inner(perm); @@ -1297,9 +1293,6 @@ class Tensor { template >> Tensor permute(const Perm& perm) const { - constexpr bool is_tot = detail::is_tensor_of_tensor_v; - [[maybe_unused]] constexpr bool is_bperm = - detail::is_bipartite_permutation_v; return Tensor(*this, perm); } @@ -1371,7 +1364,8 @@ class Tensor { std::declval(), std::declval&>())); using result_allocator_type = typename std::allocator_traits< Allocator>::template rebind_alloc; - return Tensor(*this, right, op); + using ResultTensor = Tensor; + return ResultTensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor From b2e1e0662a2f3e1a7091b9930f2eaea9591669d5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 09:13:01 -0500 Subject: [PATCH 324/592] dox typo [skip ci] --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index a59bcac16d..0dcc3e6c06 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -126,7 +126,7 @@ class ContEngine : public BinaryEngine { ///< nested tensor expressions) std::function - element_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns + element_return_op_; ///< Same as element_nonreturn_op_ but returns ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction From de05229c2fe4871a37c99005213efaf9a43b6fc1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 09:58:02 -0500 Subject: [PATCH 325/592] make detail::is_{bipartite_,}permutation work for const T --- src/TiledArray/tensor/type_traits.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git 
a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 2a624ebf76..b325752c7a 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -370,6 +370,9 @@ using default_permutation_t = typename default_permutation::type; template struct is_permutation : public std::false_type {}; +template +struct is_permutation : public is_permutation {}; + template <> struct is_permutation : public std::true_type {}; @@ -386,7 +389,8 @@ static constexpr const auto is_permutation_v = is_permutation::value; template static constexpr const auto is_bipartite_permutation_v = - std::is_same_v; + std::is_same_v || + std::is_same_v; template static constexpr const auto is_bipartite_permutable_v = From 76c4b52a4f1dedb01d954e9affce1fbcbf63ca47 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 09:16:38 -0500 Subject: [PATCH 326/592] MultEngine: if inner permutation was fused into tile op, should not try to apply inner part of permutation again ... 
needs to be tested --- src/TiledArray/expressions/mult_engine.h | 21 ++++++++++++++------- src/TiledArray/tensor/tensor.h | 7 +------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 88b9ffb7df..553b80e8ed 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -423,9 +423,10 @@ class MultEngine : public ContEngine> { /// \param perm The permutation to be applied to the result /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { if constexpr (TiledArray::detail::is_tensor_of_tensor_v< value_type>) { // nested tensors const auto inner_prod = this->inner_product_type(); @@ -433,15 +434,21 @@ class MultEngine : public ContEngine> { TA_ASSERT(this->product_type() == inner_prod); // Hadamard automatically works for inner // dimensions as well - return op_type(op_base_type(), perm); + return op_type(op_base_type(), std::forward(perm)); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->element_return_op_), perm); + // inner permutation, if needed, was fused into inner op, do not apply + // inner part of the perm again + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); } else if (inner_prod == TensorProduct::Scale) { - return op_type(op_base_type(this->element_return_op_), perm); + // inner permutation, if needed, was fused into inner op, do not apply + // inner part of the perm again + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); } else abort(); } else { // plain tensor - return op_type(op_base_type(), perm); + return op_type(op_base_type(), std::forward(perm)); } abort(); // unreachable } diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index d8d3678999..48e69c96e3 100644 --- 
a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1406,12 +1406,7 @@ class Tensor { // which won't work for ToTs. auto temp = binary(right, std::forward(op)); Permute p; - if constexpr (is_bperm) { - TA_ASSERT(!inner(perm)); // ensure this is a plain permutation since - // ResultTensor is plain - return p(temp, outer(perm)); - } else - return p(temp, perm); + return p(temp, std::forward(perm)); } abort(); // unreachable } From 642d54abfe4b3c62ae339cad78a96c84721de267 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 10:01:17 -0500 Subject: [PATCH 327/592] perfect forward permutations in most (all?) places that matters --- src/TiledArray/dist_eval/array_eval.h | 30 ++++++++++-------- src/TiledArray/expressions/add_engine.h | 18 ++++++----- src/TiledArray/expressions/blk_tsr_engine.h | 19 +++++++----- src/TiledArray/expressions/cont_engine.h | 16 +++++----- src/TiledArray/expressions/mult_engine.h | 10 +++--- src/TiledArray/expressions/scal_engine.h | 9 +++--- src/TiledArray/expressions/scal_tsr_engine.h | 9 +++--- src/TiledArray/expressions/subt_engine.h | 18 ++++++----- src/TiledArray/expressions/tsr_engine.h | 9 +++--- src/TiledArray/tensor/tensor.h | 30 +++++++++--------- src/TiledArray/tile_op/binary_wrapper.h | 8 +++-- src/TiledArray/tile_op/contract_reduce.h | 32 +++++++++++--------- 12 files changed, 114 insertions(+), 94 deletions(-) diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index a4cbdc47b1..2eaad01a9b 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -228,13 +228,15 @@ class ArrayEvalImpl /// \param pmap The process map for the result tensor tiles /// \param perm The permutation that is applied to the tile coordinate index /// \param op The operation that will be used to evaluate the tiles of array - template >> + template >>> ArrayEvalImpl(const array_type& array, World& world, const trange_type& trange, const 
shape_type& shape, - const std::shared_ptr& pmap, - const Perm& perm, const op_type& op) - : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), + const std::shared_ptr& pmap, Perm&& perm, + const op_type& op) + : DistEvalImpl_(world, trange, shape, pmap, + outer(std::forward(perm))), array_(array), op_(std::make_shared(op)), block_range_() @@ -273,17 +275,19 @@ class ArrayEvalImpl /// \param op The operation that will be used to evaluate the tiles of array /// \param lower_bound The sub-block lower bound /// \param upper_bound The sub-block upper bound - template && - TiledArray::detail::is_integral_range_v && - TiledArray::detail::is_permutation_v>> + template < + typename Index1, typename Index2, typename Perm, + typename = std::enable_if_t< + TiledArray::detail::is_integral_range_v && + TiledArray::detail::is_integral_range_v && + TiledArray::detail::is_permutation_v>>> ArrayEvalImpl(const array_type& array, World& world, const trange_type& trange, const shape_type& shape, - const std::shared_ptr& pmap, - const Perm& perm, const op_type& op, const Index1& lower_bound, + const std::shared_ptr& pmap, Perm&& perm, + const op_type& op, const Index1& lower_bound, const Index2& upper_bound) - : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), + : DistEvalImpl_(world, trange, shape, pmap, + outer(std::forward(perm))), array_(array), op_(std::make_shared(op)), block_range_(array.trange().tiles_range(), lower_bound, upper_bound) diff --git a/src/TiledArray/expressions/add_engine.h b/src/TiledArray/expressions/add_engine.h index 9421f6ffb2..f4a879365a 100644 --- a/src/TiledArray/expressions/add_engine.h +++ b/src/TiledArray/expressions/add_engine.h @@ -195,10 +195,11 @@ class AddEngine : public BinaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + 
return op_type(op_base_type(), std::forward(perm)); } /// Expression identification tag @@ -296,10 +297,11 @@ class ScalAddEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(factor_), std::forward(perm)); } /// Scaling factor accessor diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index 5cb9009460..a7aba8e8c5 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -393,9 +393,10 @@ class BlkTsrEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { const unsigned int rank = trange_.tiles_range().rank(); // Construct and allocate memory for the shift range @@ -415,7 +416,7 @@ class BlkTsrEngine range_shift[perm_d] = -base_d; } - return op_type(op_base_type(range_shift), perm); + return op_type(op_base_type(range_shift), std::forward(perm)); } /// Expression identification tag @@ -540,9 +541,10 @@ class ScalBlkTsrEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { const unsigned int rank = trange_.tiles_range().rank(); // Construct and allocate memory for the shift range @@ -562,7 +564,8 @@ class ScalBlkTsrEngine range_shift[perm_d] = -base_d; } - return op_type(op_base_type(range_shift, factor_), perm); + return op_type(op_base_type(range_shift, factor_), + std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/cont_engine.h 
b/src/TiledArray/expressions/cont_engine.h index 0dcc3e6c06..7f6af354d9 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -274,15 +274,17 @@ class ContEngine : public BinaryEngine { const auto outer_perm = outer(perm_); // Initialize permuted structure if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { - op_ = op_type(left_op, right_op, factor_, outer_size(indices_), - outer_size(left_indices_), outer_size(right_indices_), - (!implicit_permute_outer_ ? outer_perm : Permutation{})); + op_ = op_type( + left_op, right_op, factor_, outer_size(indices_), + outer_size(left_indices_), outer_size(right_indices_), + (!implicit_permute_outer_ ? std::move(outer_perm) : Permutation{})); } else { // factor_ is absorbed into inner_tile_nonreturn_op_ - op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), - outer_size(left_indices_), outer_size(right_indices_), - (!implicit_permute_outer_ ? outer_perm : Permutation{}), - this->element_nonreturn_op_); + op_ = op_type( + left_op, right_op, scalar_type(1), outer_size(indices_), + outer_size(left_indices_), outer_size(right_indices_), + (!implicit_permute_outer_ ? 
std::move(outer_perm) : Permutation{}), + this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer_perm); shape_ = ContEngine_::make_shape(outer_perm); diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 553b80e8ed..84d11bd4c0 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -685,10 +685,12 @@ class ScalMultEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(ContEngine_::factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(ContEngine_::factor_), + std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/scal_engine.h b/src/TiledArray/expressions/scal_engine.h index a2312fccb7..2c0d33bf33 100644 --- a/src/TiledArray/expressions/scal_engine.h +++ b/src/TiledArray/expressions/scal_engine.h @@ -146,10 +146,11 @@ class ScalEngine : public UnaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(perm, factor_); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(std::forward(perm), factor_); } /// Expression identification tag diff --git a/src/TiledArray/expressions/scal_tsr_engine.h b/src/TiledArray/expressions/scal_tsr_engine.h index 8dfcc596d9..8b38362740 100644 --- a/src/TiledArray/expressions/scal_tsr_engine.h +++ b/src/TiledArray/expressions/scal_tsr_engine.h @@ -140,10 +140,11 @@ class ScalTsrEngine : public LeafEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + 
return op_type(op_base_type(factor_), std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/subt_engine.h b/src/TiledArray/expressions/subt_engine.h index ab93dde1ea..3750a199c5 100644 --- a/src/TiledArray/expressions/subt_engine.h +++ b/src/TiledArray/expressions/subt_engine.h @@ -195,10 +195,11 @@ class SubtEngine : public BinaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + return op_type(op_base_type(), std::forward(perm)); } /// Expression identification tag @@ -296,10 +297,11 @@ class ScalSubtEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(factor_), std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/tsr_engine.h b/src/TiledArray/expressions/tsr_engine.h index 5219af37ca..20b893ead3 100644 --- a/src/TiledArray/expressions/tsr_engine.h +++ b/src/TiledArray/expressions/tsr_engine.h @@ -126,10 +126,11 @@ class TsrEngine : public LeafEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + return op_type(op_base_type(), std::forward(perm)); } }; // class TsrEngine diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 48e69c96e3..a787765655 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -421,8 +421,6 @@ class Tensor { constexpr bool is_tot = 
detail::is_tensor_of_tensor_v; constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { const auto inner_perm = inner(perm); @@ -1378,11 +1376,12 @@ class Tensor { /// \param perm The permutation to be applied to this tensor /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i],other[i]) - template < - typename Right, typename Op, typename Perm, - typename std::enable_if::value && - detail::is_permutation_v>::type* = nullptr> - auto binary(const Right& right, Op&& op, const Perm& perm) const { + template ::value && + detail::is_permutation_v< + std::remove_reference_t>>::type* = + nullptr> + auto binary(const Right& right, Op&& op, Perm&& perm) const { using result_value_type = decltype(op( std::declval(), std::declval&>())); using result_allocator_type = typename std::allocator_traits< @@ -1398,9 +1397,9 @@ class Tensor { if constexpr (is_bperm) { TA_ASSERT(!inner(perm)); // ensure this is a plain permutation since // ResultTensor is plain - return ResultTensor(*this, right, op, outer(perm)); + return ResultTensor(*this, right, op, outer(std::forward(perm))); } else - return ResultTensor(*this, right, op, perm); + return ResultTensor(*this, right, op, std::forward(perm)); } else { // AFAIK the other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. @@ -1454,24 +1453,23 @@ class Tensor { /// \throw TiledArray::Exception The dimension of \c perm does not match /// that of this tensor. 
template >> - Tensor unary(Op&& op, const Perm& perm) const { + typename = std::enable_if_t< + detail::is_permutation_v>>> + Tensor unary(Op&& op, Perm&& perm) const { constexpr bool is_tot = detail::is_tensor_of_tensor_v; [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); if constexpr (!is_tot) { if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, op, outer(perm)); + return Tensor(*this, op, outer(std::forward(perm))); } else - return Tensor(*this, op, perm); + return Tensor(*this, op, std::forward(perm)); } else { auto temp = unary(std::forward(op)); Permute p; - return p(temp, perm); + return p(temp, std::forward(perm)); } abort(); // unreachable } diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index 4c02b84318..33d021f2b0 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -129,9 +129,11 @@ class BinaryWrapper { BinaryWrapper& operator=(const BinaryWrapper&) = default; BinaryWrapper& operator=(BinaryWrapper&&) = default; - template >> - BinaryWrapper(const Op& op, const Perm& perm) : op_(op), perm_(perm) {} + template >>> + BinaryWrapper(const Op& op, Perm&& perm) + : op_(op), perm_(std::forward(perm)) {} BinaryWrapper(const Op& op) : op_(op), perm_() {} diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index d9d87d59c8..94c7107343 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -85,17 +85,18 @@ class ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + 
TiledArray::detail::is_permutation_v< + std::remove_reference_t> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> Impl(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : gemm_helper_(left_op, right_op, result_rank, left_rank, right_rank), alpha_(alpha), - perm_(perm), + perm_(std::forward(perm)), elem_muladd_op_(std::forward(elem_muladd_op)) { // non-unit alpha must be absorbed into elem_muladd_op if (elem_muladd_op_) TA_ASSERT(alpha == scalar_type(1)); @@ -141,7 +142,7 @@ class ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> @@ -149,10 +150,11 @@ class ContractReduceBase { const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, - const unsigned int right_rank, const Perm& perm = {}, + const unsigned int right_rank, Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : pimpl_(std::make_shared( - left_op, right_op, alpha, result_rank, left_rank, right_rank, perm, + left_op, right_op, alpha, result_rank, left_rank, right_rank, + std::forward(perm), std::forward(elem_muladd_op))) {} /// Gemm meta data accessor @@ -276,16 +278,16 @@ class ContractReduce : public ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, 
const right_value_type&>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, - right_rank, perm, + right_rank, std::forward(perm), std::forward(elem_muladd_op)) {} /// Create a result type object @@ -404,16 +406,16 @@ class ContractReduce, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, - right_rank, perm, + right_rank, std::forward(perm), std::forward(elem_muladd_op)) {} /// Create a result type object @@ -530,16 +532,16 @@ class ContractReduce, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, - right_rank, perm, + right_rank, std::forward(perm), 
std::forward(elem_muladd_op)) {} /// Create a result type object From dd6509fda337587ea439d3054266d672f32f6d52 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 10:06:58 -0500 Subject: [PATCH 328/592] Permutation/BipartitePermutation is a bit more movable + {inner,outer}(Perm&&) move when possible --- src/TiledArray/permutation.h | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index 0a64bc492f..d70b283034 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -733,6 +733,11 @@ class BipartitePermutation { init(); } + BipartitePermutation(Permutation&& p, index_type second_partition_size = 0) + : base_(std::move(p)), second_size_(second_partition_size) { + init(); + } + BipartitePermutation(const Permutation& first, const Permutation& second) : second_size_(second.size()) { vector base; @@ -790,9 +795,14 @@ class BipartitePermutation { } /// \return reference to the first partition - const Permutation& first() const { return first_; } + const Permutation& first() const& { return first_; } + /// \return reference to the second partition + const Permutation& second() const& { return second_; } + + /// \return rvalue-reference to the first partition + Permutation&& first() && { return std::move(first_); } /// \return reference to the second partition - const Permutation& second() const { return second_; } + Permutation&& second() && { return std::move(second_); } /// \return the size of the first partition index_type first_size() const { return this->size() - second_size_; } @@ -870,6 +880,8 @@ inline auto inner(const Permutation& p) { // temporary inline auto outer(const Permutation& p) { return p; } +inline Permutation&& outer(Permutation&& p) { return std::move(p); } + inline auto inner_size(const Permutation& p) { abort(); return 0; @@ -879,8 +891,16 @@ inline auto outer_size(const Permutation& p) { return p.size(); } 
inline auto inner(const BipartitePermutation& p) { return p.second(); } +inline Permutation&& inner(BipartitePermutation&& p) { + return std::move(p).second(); +} + inline auto outer(const BipartitePermutation& p) { return p.first(); } +inline Permutation&& outer(BipartitePermutation&& p) { + return std::move(p).first(); +} + inline auto inner_size(const BipartitePermutation& p) { return p.second_size(); } From fa30a2da1b3102179057c5a5862db60ab10d0b02 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 12:11:33 -0500 Subject: [PATCH 329/592] dox cleanup --- src/TiledArray/expressions/expr_engine.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/expressions/expr_engine.h b/src/TiledArray/expressions/expr_engine.h index 7ff7e16b2d..a502857af9 100644 --- a/src/TiledArray/expressions/expr_engine.h +++ b/src/TiledArray/expressions/expr_engine.h @@ -273,10 +273,11 @@ class ExprEngine : private NO_DEFAULTS { /// some consuming operations (like GEMM) permutation can perform some /// permutation types implicitly. 
setting this to true indicates that the /// result tiles' outer modes do not need to be permuted and permutation will - /// be performed implicitly by the consuming operation \param status The new - /// value for the implicit permute flag (true => will not permute outer modes - /// of result tiles; false => will permute outer modes of result tiles if - /// needed) \note for plain tensors, i.e., tensor-of-scalars, any mode is + /// be performed implicitly by the consuming operation + /// \param status The new value for the implicit permute flag + /// (true => will not permute outer modes of result tiles; + /// false => will permute outer modes of result tiles if needed) + /// \note for plain tensors, i.e., tensor-of-scalars, any mode is /// outer void implicit_permute_outer(const bool status) { implicit_permute_outer_ = status; @@ -287,10 +288,11 @@ class ExprEngine : private NO_DEFAULTS { /// some consuming operations (like GEMM) permutation can perform some /// permutation types implicitly. 
setting this to true indicates that the /// result tiles' inner modes do not need to be permuted and permutation will - /// be performed implicitly by the consuming operation \param status The new - /// value for the implicit permute flag (true => will not permute inner modes - /// of result tiles; false => will permute inner modes of result tiles if - /// needed) \note for plain tensors, i.e., tensor-of-scalars, there are no + /// be performed implicitly by the consuming operation + /// \param status The new value for the implicit permute flag + /// (true => will not permute inner modes of result tiles; + /// false => will permute inner modes of result tiles if needed) + /// \note for plain tensors, i.e., tensor-of-scalars, there are no /// inner modes and this should not be used void implicit_permute_inner(const bool status) { TA_ASSERT(TiledArray::detail::is_tensor_of_tensor_v); From 8a6218f39a9ef2c7bb6541b0f48520cd5547b19a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 14:23:45 -0500 Subject: [PATCH 330/592] [unit] reduce elapsed times in map tests --- tests/blocked_pmap.cpp | 10 +++++----- tests/cyclic_pmap.cpp | 14 +++++++------- tests/hash_pmap.cpp | 8 ++++---- tests/replicated_pmap.cpp | 13 +++++-------- tests/round_robin_pmap.cpp | 8 ++++---- 5 files changed, 25 insertions(+), 28 deletions(-) diff --git a/tests/blocked_pmap.cpp b/tests/blocked_pmap.cpp index 4ad055d885..80ab449570 100644 --- a/tests/blocked_pmap.cpp +++ b/tests/blocked_pmap.cpp @@ -25,7 +25,7 @@ using namespace TiledArray; struct BlockedPmapFixture { - BlockedPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -34,7 +34,7 @@ struct BlockedPmapFixture { BOOST_FIXTURE_TEST_SUITE(blocked_pmap_suite, BlockedPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) 
{ BOOST_REQUIRE_NO_THROW( TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles)); TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); @@ -51,7 +51,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -71,7 +71,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); std::size_t total_size = pmap.local_size(); @@ -87,7 +87,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp index 4d8d76da1f..a2eddcf4f8 100644 --- a/tests/cyclic_pmap.cpp +++ b/tests/cyclic_pmap.cpp @@ -24,7 +24,7 @@ using namespace TiledArray; struct CyclicPmapFixture { - CyclicPmapFixture() {} + constexpr static std::size_t max_ntiles_per_dim = 4ul; }; // ============================================================================= @@ -92,8 +92,8 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( 
((GlobalFixture::world->size() + y - 1ul) / y), 1ul); @@ -129,8 +129,8 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( ((GlobalFixture::world->size() + y - 1ul) / y), 1ul); @@ -162,8 +162,8 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( ((GlobalFixture::world->size() + y - 1ul) / y), 1ul); diff --git a/tests/hash_pmap.cpp b/tests/hash_pmap.cpp index a9b573802c..06d721dceb 100644 --- a/tests/hash_pmap.cpp +++ b/tests/hash_pmap.cpp @@ -24,7 +24,7 @@ using namespace TiledArray; struct HashPmapFixture { - HashPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -33,7 +33,7 @@ struct HashPmapFixture { BOOST_FIXTURE_TEST_SUITE(hash_pmap_suite, HashPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { BOOST_REQUIRE_NO_THROW( TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles)); TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); @@ -50,7 +50,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { 
TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -77,7 +77,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/replicated_pmap.cpp b/tests/replicated_pmap.cpp index 1a06b85ea4..f9c8b45618 100644 --- a/tests/replicated_pmap.cpp +++ b/tests/replicated_pmap.cpp @@ -27,16 +27,13 @@ #include "unit_test_config.h" struct ReplicatedPmapFixture { - ReplicatedPmapFixture() {} - - ~ReplicatedPmapFixture() {} - + constexpr static std::size_t max_ntiles = 10ul; }; // Fixture BOOST_FIXTURE_TEST_SUITE(replicated_pmap_suite, ReplicatedPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { BOOST_REQUIRE_NO_THROW( TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles)); TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); @@ -50,7 +47,7 @@ BOOST_AUTO_TEST_CASE(owner) { const std::size_t rank = GlobalFixture::world->rank(); // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -60,7 +57,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); // Check that the total number of elements in all local groups is equal to @@ -71,7 +68,7 @@ 
BOOST_AUTO_TEST_CASE(local_size) { } BOOST_AUTO_TEST_CASE(local_group) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/round_robin_pmap.cpp b/tests/round_robin_pmap.cpp index 4851c5b5b1..7c601d4bfd 100644 --- a/tests/round_robin_pmap.cpp +++ b/tests/round_robin_pmap.cpp @@ -25,7 +25,7 @@ using namespace TiledArray; struct RoundRobinPmapFixture { - RoundRobinPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -51,7 +51,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID *p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -71,7 +71,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); std::size_t total_size = pmap.local_size(); @@ -87,7 +87,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank From 8ac800befff339b6304c2abe3671b24554ceeb1e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 13 Feb 2024 16:42:39 -0500 Subject: [PATCH 331/592] bugfix for gcc gcc compiles Einsum::index::Index v{Einsum::index::Index} to 
implicit conversion of Einsum::index::Index to std::string, then using it as part of an initializer_list ... solution is to replace {} by () --- src/TiledArray/einsum/index.h | 2 +- src/TiledArray/tensor/kernels.h | 6 +++--- tests/tot_array_fixture.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/einsum/index.h b/src/TiledArray/einsum/index.h index 080158c57b..67e9d6c1a0 100644 --- a/src/TiledArray/einsum/index.h +++ b/src/TiledArray/einsum/index.h @@ -33,7 +33,7 @@ class Index { Index() = default; Index(const container_type &s) : data_(s) {} - Index(const std::initializer_list &s) : data_(s) {} + explicit Index(const std::initializer_list &s) : data_(s) {} template Index(const S &s) { diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 379439856f..0baa4781f5 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -1250,9 +1250,9 @@ auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, AB, // permutes A to B AC, // permutes A to C BC; // permutes B to C - } const perm{permutation(Indices{aA}, Indices{aB}), - permutation(Indices{aA}, Indices{aC}), - permutation(Indices{aB}, Indices{aC})}; + } const perm{permutation(Indices(aA), Indices(aB)), + permutation(Indices(aA), Indices(aC)), + permutation(Indices(aB), Indices(aC))}; struct { bool no_perm, perm_to_c, perm_a, perm_b; diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 3b7408caf1..f41697dc2e 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -303,8 +303,8 @@ struct ProductSetup { // A, B, C tensor indices // H, E, I Hadamard, external, and internal indices Indices A, B, C, H, E, I; - } const ixs{Indices{aA}, Indices{aB}, - Indices{aC}, (ixs.A & ixs.B & ixs.C), + } const ixs{Indices(aA), Indices(aB), + Indices(aC), (ixs.A & ixs.B & ixs.C), (ixs.A ^ ixs.B), ((ixs.A & ixs.B) - ixs.H)}; rank_A = ixs.A.size(); From 
92c6981861f955d7d5cbe2e07d23693e3d8a1bbe Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 09:19:23 -0500 Subject: [PATCH 332/592] [unit] fix range1_suite/constructors + add corner case to range1_suite/accessors --- tests/range1.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/range1.cpp b/tests/range1.cpp index bc2fabdd6c..ba49515cd7 100644 --- a/tests/range1.cpp +++ b/tests/range1.cpp @@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_CHECK_NO_THROW((Range1{1, 1})); BOOST_CHECK_NO_THROW(Range1(1, 1)); BOOST_CHECK_EQUAL(Range1(1, 1).first, 1); - BOOST_CHECK_EQUAL(Range1(1, 1).first, 1); + BOOST_CHECK_EQUAL(Range1(1, 1).second, 1); BOOST_CHECK_NO_THROW((Range1{-11, 13})); BOOST_CHECK_EQUAL(Range1(-11, 13).first, -11); @@ -86,6 +86,15 @@ BOOST_AUTO_TEST_CASE(accessors) { BOOST_CHECK_EQUAL(r.upbound(), 10); BOOST_CHECK_NO_THROW(r.extent()); BOOST_CHECK_EQUAL(r.extent(), 9); + + // corner case: empty range + Range1 r1{1, 1}; + BOOST_CHECK_NO_THROW(r1.lobound()); + BOOST_CHECK_EQUAL(r1.lobound(), 1); + BOOST_CHECK_NO_THROW(r1.upbound()); + BOOST_CHECK_EQUAL(r1.upbound(), 1); + BOOST_CHECK_NO_THROW(r.extent()); + BOOST_CHECK_EQUAL(r1.extent(), 0); } BOOST_AUTO_TEST_CASE(iteration) { @@ -134,13 +143,13 @@ BOOST_AUTO_TEST_CASE(serialization) { std::size_t buf_size = sizeof(Range1); unsigned char* buf = new unsigned char[buf_size]; madness::archive::BufferOutputArchive oar(buf, buf_size); - oar& r; + oar & r; std::size_t nbyte = oar.size(); oar.close(); Range1 rs; madness::archive::BufferInputArchive iar(buf, nbyte); - iar& rs; + iar & rs; iar.close(); delete[] buf; From f82330fb1a713fa9ac45e5451d4d3badca28d824 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 09:57:25 -0500 Subject: [PATCH 333/592] TiledRange1 supports empty tiles --- src/TiledArray/tiled_range1.h | 20 +++++++++++------- tests/tiled_range1.cpp | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 52 
insertions(+), 8 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index f1dc2369a8..69e5a5eea3 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -38,9 +38,11 @@ namespace TiledArray { /// TiledRange1 class defines a non-uniformly-tiled, contiguous, one-dimensional /// range. The tiling data is constructed with and stored in an array with -/// the format {a0, a1, a2, ...}, where 0 <= a0 < a1 < a2 < ... Each tile is +/// the format {a0, a1, a2, ...}, where a0 <= a1 <= a2 <= ... Each tile is /// defined as [a0,a1), [a1,a2), ... The number of tiles in the range will be /// equal to one less than the number of elements in the array. +/// \note if TiledArray was configured with `TA_SIGNED_1INDEX_TYPE=OFF` then the +/// tile boundaries must be non-negative. class TiledRange1 { private: struct Enabler {}; @@ -230,6 +232,7 @@ class TiledRange1 { if (!elem2tile_) { init_elem2tile_(); } + // N.B. only track elements in this range return elem2tile_[i - elements_range_.first]; } @@ -312,17 +315,17 @@ class TiledRange1 { TA_ASSERT((std::distance(first, last) >= 2) && "TiledRange1 construction failed: You need at least 2 " "elements in the tile boundary list."); - // Verify the requirement that a0 < a1 < a2 < ... + // Verify the requirement that a0 <= a1 <= a2 <= ... for (; first != (last - 1); ++first) { TA_ASSERT( - *first < *(first + 1) && + *first <= *(first + 1) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. "); TA_ASSERT( - static_cast(*first) < + static_cast(*first) <= static_cast(*(first + 1)) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. 
"); } } @@ -362,9 +365,10 @@ class TiledRange1 { // #endif const auto end = extent(range_); for (index1_type t = 0; t < end; ++t) - for (index1_type e = tiles_ranges_[t].first; - e < tiles_ranges_[t].second; ++e) + for (auto e : tiles_ranges_[t]) { + // only track elements in this range e2t[e - elements_range_.first] = t + range_.first; + } auto e2t_const = std::const_pointer_cast(e2t); // commit the changes std::swap(elem2tile_, e2t_const); diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 043e4b96ac..eb94091e59 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -152,6 +152,32 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // corner cases + { + // range with 1 empty tile + { + TiledRange1 r{0, 0}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 1); + BOOST_CHECK_EQUAL(r.elements_range().first, 0); + BOOST_CHECK_EQUAL(r.elements_range().second, 0); + BOOST_CHECK(r.tile(0) == Range1(0, 0)); + } + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 4); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 5); + // test tiles + BOOST_CHECK(r.tile(0) == Range1(1, 3)); + BOOST_CHECK(r.tile(1) == Range1(3, 3)); + BOOST_CHECK(r.tile(2) == Range1(3, 5)); + BOOST_CHECK(r.tile(3) == Range1(5, 5)); + } + } + // Check that invalid input throws an exception. #ifndef NDEBUG { @@ -195,6 +221,20 @@ BOOST_AUTO_TEST_CASE(element_to_tile) { // Check that the expected and internal element to tile maps match. 
BOOST_CHECK_EQUAL_COLLECTIONS(c.begin(), c.end(), e.begin(), e.end()); + + // corner case: empty tiles + { + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_TA_ASSERT(r.element_to_tile(0), Exception); + BOOST_CHECK_EQUAL(r.element_to_tile(1), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(2), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(3), 2); + BOOST_CHECK_EQUAL(r.element_to_tile(4), 2); + BOOST_CHECK_TA_ASSERT(r.element_to_tile(5), Exception); + } + } } BOOST_AUTO_TEST_CASE(comparison) { From 03b67de05e2062d1fa80ac0a5012899306f29e4a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 10:03:55 -0500 Subject: [PATCH 334/592] [unit] tiled_range_suite/constructor: test corner case with TiledRange1 composed of empty tiles only --- tests/tiled_range.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 14e47e3557..76702831a3 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -58,6 +58,14 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); } + // construct with ranges containing empty tiles only + { + BOOST_REQUIRE_NO_THROW(TiledRange r1({dims[0], TiledRange1{1, 1, 1}})); + TiledRange r1{dims[0], TiledRange1{1, 1, 1}}; + BOOST_CHECK_EQUAL(r1.tiles_range().area(), dims[0].tile_extent() * 2); + BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); + } + // check initializer list of initializer list constructor { TiledRange r1{ From c3f8b80b7d5c44b3a0b17915c2d8257177fcdd19 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 10:05:13 -0500 Subject: [PATCH 335/592] dox typo --- src/TiledArray/expressions/cont_engine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 2a658dc886..8795e699c6 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -350,7 +350,7 
@@ class ContEngine : public BinaryEngine { left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); - // Initialize the process map in not already defined + // Initialize the process map if not already defined if (!pmap) pmap = proc_grid_.make_pmap(); ExprEngine_::init_distribution(world, pmap); } From a586733320f9ad216ef67cbe9b913153df5683df Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:06:58 -0500 Subject: [PATCH 336/592] SparseShape::sparsity() returns 0 for zero-volume ranges --- src/TiledArray/sparse_shape.h | 19 +-- tests/sparse_shape.cpp | 222 +++++++++++++++++++++++----------- 2 files changed, 161 insertions(+), 80 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index b589dc73cf..f7cc9355f7 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -516,10 +516,13 @@ class SparseShape { /// Sparsity of the shape - /// \return The fraction of tiles that are zero. + /// \return The fraction of tiles that are zero. Always returns 0 if + /// `this->data().size()` is zero. float sparsity() const { TA_ASSERT(!tile_norms_.empty()); - return float(zero_tile_count_) / float(tile_norms_.size()); + return tile_norms_.size() != 0 + ? 
float(zero_tile_count_) / float(tile_norms_.size()) + : 0.f; } // clang-format off @@ -1679,23 +1682,23 @@ class SparseShape { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); // allocate size_vectors_ size_vectors_ = std::move(std::shared_ptr( new vector_type[dim], std::default_delete())); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } private: diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 0112f0dac6..64116dd687 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -121,9 +121,12 @@ BOOST_AUTO_TEST_CASE(non_comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK(x.nnz() == x.data().size() - zero_tile_count); // use the sparse ctor @@ -194,9 +197,12 @@ BOOST_AUTO_TEST_CASE(comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(x.nnz(), x.data().size() - zero_tile_count); // use the sparse ctor @@ -321,7 +327,9 @@ BOOST_AUTO_TEST_CASE(block) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -413,7 +421,9 @@ BOOST_AUTO_TEST_CASE(block_scale) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -513,7 +523,9 @@ BOOST_AUTO_TEST_CASE(block_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -614,7 +626,9 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -706,9 +720,12 @@ BOOST_AUTO_TEST_CASE(transform) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mask) { @@ -745,9 +762,12 @@ BOOST_AUTO_TEST_CASE(mask) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale) { @@ -778,9 +798,12 @@ BOOST_AUTO_TEST_CASE(scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale_perm) { @@ -812,9 +835,12 @@ BOOST_AUTO_TEST_CASE(scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add) { @@ -848,9 +874,12 @@ BOOST_AUTO_TEST_CASE(add) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(result.nnz(), result.data().size() - zero_tile_count); } @@ -885,9 +914,12 @@ BOOST_AUTO_TEST_CASE(add_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_perm) { @@ -922,9 +954,12 @@ BOOST_AUTO_TEST_CASE(add_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_scale_perm) { @@ -959,9 +994,12 @@ BOOST_AUTO_TEST_CASE(add_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const) { @@ -998,9 +1036,12 @@ BOOST_AUTO_TEST_CASE(add_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const_perm) { @@ -1037,9 +1078,12 @@ BOOST_AUTO_TEST_CASE(add_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt) { @@ -1073,9 +1117,12 @@ BOOST_AUTO_TEST_CASE(subt) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale) { @@ -1109,9 +1156,12 @@ BOOST_AUTO_TEST_CASE(subt_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_perm) { @@ -1146,9 +1196,12 @@ BOOST_AUTO_TEST_CASE(subt_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale_perm) { @@ -1183,9 +1236,12 @@ BOOST_AUTO_TEST_CASE(subt_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const) { @@ -1220,9 +1276,12 @@ BOOST_AUTO_TEST_CASE(subt_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const_perm) { @@ -1260,9 +1319,12 @@ BOOST_AUTO_TEST_CASE(subt_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult) { @@ -1295,9 +1357,12 @@ BOOST_AUTO_TEST_CASE(mult) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale) { @@ -1330,9 +1395,12 @@ BOOST_AUTO_TEST_CASE(mult_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_perm) { @@ -1368,9 +1436,12 @@ BOOST_AUTO_TEST_CASE(mult_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale_perm) { @@ -1406,9 +1477,12 @@ BOOST_AUTO_TEST_CASE(mult_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(gemm) { @@ -1470,7 +1544,9 @@ BOOST_AUTO_TEST_CASE(gemm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? 
float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } @@ -1538,7 +1614,9 @@ BOOST_AUTO_TEST_CASE(gemm_perm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } From e05d908469e738469ff2209ad650e221010099a6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:07:42 -0500 Subject: [PATCH 337/592] detail::permute works for zero-volume result/arg --- src/TiledArray/tensor/permute.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 43fbfc9328..4d46907172 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -127,6 +127,9 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, const unsigned int ndim1 = ndim - 1; const auto volume = arg0.range().volume(); + // handle the corner case of empty result/args + if (volume == 0) return; + // Get pointer to arg extent const auto* MADNESS_RESTRICT const arg0_extent = arg0.range().extent_data(); From cffb7cbf803b5924fc7c38e20dde03a605e349f2 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 15 Feb 2024 19:14:36 -0500 Subject: [PATCH 338/592] [skip ci][wip] functions to replicate and increase rank of Tensor and Array. 
--- src/TiledArray/conversions/make_array.h | 14 ++++- src/TiledArray/einsum/tiledarray.h | 81 +++++++++++++++++++++++++ 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index 6f5ada0bba..d9dfad5be3 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -85,7 +85,12 @@ inline Array make_array( auto tile = world.taskq.add( [=](const range_type& range) -> value_type { value_type tile; - op(tile, range); + if constexpr (std::is_invocable_v>) + op(tile, range, index); + else + op(tile, range); return tile; }, trange.make_tile_range(index)); @@ -155,7 +160,12 @@ inline Array make_array( int task_count = 0; auto task = [&](const ordinal_type index) -> value_type { value_type tile; - tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); + if constexpr (std::is_invocable_v) + tile_norms.at_ordinal(index) = + op(tile, trange.make_tile_range(index), index); + else + tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); ++counter; return tile; }; diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 3550b6489f..11ff79e3ae 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -84,6 +84,87 @@ constexpr bool AreArraySame = } // namespace +namespace { + +/// +/// \brief This function replicates a tensor B into a tensor A such that +/// A(a_1,...a_k,i_1,...,i_l) = B(i_1,...,i_l). Evidently, the +/// extents of i_n modes must match in both A and B. +/// +/// \tparam Tensor TiledArray::Tensor type. +/// \param to The target tensor. +/// \param from The source tensor that will be replicated into \c to. 
+/// +template >> +void replicate_tensor(Tensor &to, Tensor const &from) { + // assert that corresponding modes have the same extents + TA_ASSERT(std::equal(from.range().extent().rbegin(), + from.range().extent().rend(), + to.range().extent().rbegin())); + + // number of elements to be copied + // (same as the number of elements in @c from) + auto const N = from.range().volume(); + for (auto i = 0; i < to.range().volume(); i += N) + std::copy(from.begin(), from.end(), to.data()+i); +} + +template >> +auto replicate_array(Array from, TiledRange const& prepend_trng) { + auto const result_rank = prepend_trng.rank() + rank(from); + container::svector tr1s; + tr1s.reserve(result_rank); + for (auto const& r : prepend_trng) tr1s.emplace_back(r); + for (auto const& r : from.trange()) tr1s.emplace_back(r); + auto const result_trange = TiledRange(tr1s); + + from.make_replicated(); + + auto result = make_array( + get_default_world(), result_trange, + [from, res_tr = result_trange.tiles_range(), + delta_rank = prepend_trng.rank()](auto& tile, auto const& res_rng, + auto res_ord) { + using std::begin; + using std::end; + using std::next; + + typename Array::value_type repped(res_rng); + auto res_coord_ix = res_tr.idx(res_ord); + auto from_coord_ix = decltype(res_coord_ix)( + next(begin(res_coord_ix), delta_rank), end(res_coord_ix)); + replicate_tensor(repped, from.find_local(from_coord_ix).get(false)); + tile = repped; + }); + + //clang-format off + // using std::begin; + // using std::next; + // using std::end; + // + // Array result(get_default_world(), result_trange); + // + // for (auto tile : result) { + // auto res_tix = tile.index(); + // auto from_tix = decltype(res_tix)(next(begin(res_tix), + // prepend_trng.rank()), end(res_tix)); + // if (result.is_local(res_tix) && !result.is_zero(res_tix) && + // !from.is_zero(from_tix)) { + // typename Array::value_type + // repped(result.trange().make_tile_range(res_tix)); auto found = + // from.find_local(from_tix).get(false); 
replicate_tensor(repped, found); + // tile = repped; + // } + // } + //clang-format on + + return result; +} + +} // namespace + template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, From ae41f770241a5e4a3a97b424c6b002b7e9128fdb Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 15:52:19 -0500 Subject: [PATCH 339/592] BlockRange extended for zero-volume Range --- src/TiledArray/block_range.h | 27 +++++++++++++++++---------- tests/block_range.cpp | 26 ++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/block_range.h b/src/TiledArray/block_range.h index 08096c96ea..06f8ecd629 100644 --- a/src/TiledArray/block_range.h +++ b/src/TiledArray/block_range.h @@ -85,7 +85,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -132,7 +132,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -177,9 +177,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound A sequence of lower bounds for each dimension /// \param upper_bound A sequence of upper bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. 
- /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -204,9 +205,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound An initializer list of lower bounds for each dimension /// \param upper_bound An initializer list of upper bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. - /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -247,7 +249,8 @@ class BlockRange : public Range { /// \endcode /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v ) /// \param bounds A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bounds[i].lower>=bounds[i].upper` for any \c i . + /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -264,8 +267,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {std::make_pair(0,4), std::pair{1,6}, std::pair(2,8)}); /// \endcode /// \tparam GPair a generalized pair of integral types - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. 
`bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template BlockRange(const Range& range, const std::initializer_list& bounds, @@ -290,8 +294,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {{0,4}, {1,6}, {2,8}}); /// \endcode /// \tparam Index An integral type - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode is possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -354,6 +359,8 @@ class BlockRange : public Range { /// \return The ordinal index in the /// \throw TiledArray::Exception When \c index is not included in this range ordinal_type ordinal(ordinal_type ord) const { + // ordinals are useless for zero-volume ranges + TA_ASSERT(volume() != 0); // Check that ord is contained by this range. 
TA_ASSERT(Range::includes_ordinal(ord)); @@ -414,7 +421,7 @@ class BlockRange : public Range { template void serialize(Archive& ar) const { Range::serialize(ar); - ar& block_offset_; + ar & block_offset_; } }; // BlockRange diff --git a/tests/block_range.cpp b/tests/block_range.cpp index 135c36d0b4..5d8431fa41 100644 --- a/tests/block_range.cpp +++ b/tests/block_range.cpp @@ -72,7 +72,7 @@ BOOST_AUTO_TEST_CASE(block_zero_lower_bound) { for (unsigned int i = 0u; i < upper.size(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r0) { return l < r0; })) { + [](std::size_t l, std::size_t r0) { return l <= r0; })) { if (count_valid == target_count) continue; ++count_valid; @@ -141,7 +141,7 @@ BOOST_AUTO_TEST_CASE(block) { for (unsigned int i = 0u; i < r.rank(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r) { return l < r; })) { + [](std::size_t l, std::size_t r) { return l <= r; })) { if (count_valid == target_count) continue; ++count_valid; @@ -269,4 +269,26 @@ BOOST_AUTO_TEST_CASE(block) { end:; } +BOOST_AUTO_TEST_CASE(empty_trange1) { + using TiledArray::eigen::iv; + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW(BlockRange(r, iv(3, 3, 3), iv(4, 3, 5))); + BlockRange br(r, iv(3, 3, 3), iv(4, 3, 5)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } + + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW( + BlockRange(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1))); + BlockRange br(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } +} + BOOST_AUTO_TEST_SUITE_END() From 463f6a75254dcc325e9b0a37c21f84a3723dfd0f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: 
Thu, 15 Feb 2024 16:47:08 -0500 Subject: [PATCH 340/592] SparseShape::block() supports zero-volume blocks --- src/TiledArray/sparse_shape.h | 4 ++-- tests/sparse_shape.cpp | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index f7cc9355f7..a7df1c520c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -840,7 +840,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i @@ -874,7 +874,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 64116dd687..a79d7ceb8e 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -276,7 +276,7 @@ BOOST_AUTO_TEST_CASE(block) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); for (auto lower_it = tr.tiles_range().begin(); lower_it != tr.tiles_range().end(); ++lower_it) { @@ -287,7 +287,7 @@ BOOST_AUTO_TEST_CASE(block) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper)); @@ -369,7 +369,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { // change default 
threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; for (auto lower_it = tr.tiles_range().begin(); @@ -381,7 +381,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = @@ -468,7 +468,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const auto inv_perm = perm.inv(); for (auto lower_it = tr.tiles_range().begin(); @@ -480,7 +480,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper, perm)); @@ -569,7 +569,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; const auto inv_perm = perm.inv(); @@ -582,7 +582,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an 
exception SparseShape result; BOOST_REQUIRE_NO_THROW( From a8d39125da85229a57444b62be9df0b7997c41e0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 14:15:51 -0500 Subject: [PATCH 341/592] [unit] expressions suite: tensor_factories tests DistArrays with empty trange1 --- src/TiledArray/expressions/blk_tsr_engine.h | 81 ++++++++++++++------- src/TiledArray/expressions/blk_tsr_expr.h | 2 +- tests/expressions_fixture.h | 12 ++- tests/expressions_impl.h | 26 +++++++ 4 files changed, 91 insertions(+), 30 deletions(-) diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index 2d16172dbe..31ad29ee74 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -194,16 +194,19 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_d = lower[d]; const auto upper_d = upper[d]; - // Copy and shift the tiling for the block - auto i = lower_d; - const auto base_d = trange[d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_d; ++i) - trange1_data.emplace_back(trange[d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + // Copy and shift the tiling for the block, if nonempty + if (lower_d != upper_d) { + auto i = lower_d; + const auto base_d = trange[d].tile(i).first; + trange1_data.emplace_back(0ul); + for (; i < upper_d; ++i) + trange1_data.emplace_back(trange[d].tile(i).second - base_d); + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else { + trange_data.emplace_back(); + } } return TiledRange(trange_data.begin(), trange_data.end()); @@ -233,16 +236,19 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_i = lower[inv_perm_d]; const auto upper_i = upper[inv_perm_d]; - // Copy, shift, and permute the 
tiling of the block - auto i = lower_i; - const auto base_d = trange[inv_perm_d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_i; ++i) - trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + if (lower_i != upper_i) { + // Copy, shift, and permute the tiling of the block + auto i = lower_i; + const auto base_d = trange[inv_perm_d].tile(i).first; + trange1_data.emplace_back(0ul); + for (; i < upper_i; ++i) + trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); + + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else + trange_data.emplace_back(); } return TiledRange(trange_data.begin(), trange_data.end()); @@ -376,12 +382,18 @@ class BlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the range shift vector for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift.emplace_back(-base_d); + } else { + range_shift.emplace_back(0l); + } } return op_type(op_base_type(range_shift)); @@ -402,6 +414,7 @@ class BlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -409,8 
+422,11 @@ class BlkTsrEngine for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift[perm_d] = -base_d; + } } return op_type(op_base_type(range_shift), perm); @@ -522,12 +538,17 @@ class ScalBlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Construct the inverse permutation for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift.emplace_back(-base_d); + } else + range_shift.emplace_back(0); } return op_type(op_base_type(range_shift, factor_)); @@ -548,6 +569,7 @@ class ScalBlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -555,8 +577,11 @@ class ScalBlkTsrEngine for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + const auto base_d = trange[d].tile(lower_d).first; + range_shift[perm_d] = -base_d; + } } return op_type(op_base_type(range_shift, factor_), perm); diff --git 
a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h index d32603b58f..5d6612d5cc 100644 --- a/src/TiledArray/expressions/blk_tsr_expr.h +++ b/src/TiledArray/expressions/blk_tsr_expr.h @@ -179,7 +179,7 @@ class BlkTsrExprBase : public Expr { const bool lower_upper_bound_check = std::equal(std::begin(lower_bound_), std::end(lower_bound_), std::begin(upper_bound_), - [](std::size_t l, std::size_t r) { return l < r; }); + [](std::size_t l, std::size_t r) { return l <= r; }); if (!lower_upper_bound_check) { if (TiledArray::get_default_world().rank() == 0) { using TiledArray::operator<<; diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 3e493b1200..a4f7e4cd0b 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -62,9 +62,11 @@ struct ExpressionsFixture : public TiledRangeFixture { s_tr1_1(make_random_sparseshape(trange1)), s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), + s_trC(make_random_sparseshape(trangeC)), a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), + aC(*GlobalFixture::world, trangeC, s_trC), u(*GlobalFixture::world, trange1, s_tr1_1), v(*GlobalFixture::world, trange1, s_tr1_2), w(*GlobalFixture::world, trange2, s_tr2) { @@ -72,6 +74,7 @@ struct ExpressionsFixture : public TiledRangeFixture { random_fill(b); random_fill(u); random_fill(v); + random_fill(aC); GlobalFixture::world->gop.fence(); a.truncate(); b.truncate(); @@ -88,11 +91,13 @@ struct ExpressionsFixture : public TiledRangeFixture { c(*GlobalFixture::world, tr), u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), - w(*GlobalFixture::world, trange2) { + w(*GlobalFixture::world, trange2), + aC(*GlobalFixture::world, trangeC) { random_fill(a); random_fill(b); random_fill(u); random_fill(v); + random_fill(aC); GlobalFixture::world->gop.fence(); } @@ -213,17 +218,22 @@ struct ExpressionsFixture : 
public TiledRangeFixture { const TiledRange trange1{{0, 2, 5, 10, 17, 28, 41}}; const TiledRange trange2{{0, 2, 5, 10, 17, 28, 41}, {0, 3, 6, 11, 18, 29, 42}}; + // contains empty trange1 + const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{}, + TiledRange1{0, 2, 7, 11}}; SparseShape s_tr_1; SparseShape s_tr_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; + SparseShape s_trC; TArray a; TArray b; TArray c; TArray u; TArray v; TArray w; + TArray aC; }; // ExpressionsFixture #endif // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 91bcb10cc4..6fdfc2ce0e 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -31,6 +31,7 @@ constexpr int nrepeats = 5; BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { auto& a = F::a; auto& c = F::c; + auto& aC = F::aC; const auto& ca = a; const std::array lobound{{3, 3, 3}}; @@ -2941,6 +2942,31 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) { BOOST_CHECK_EQUAL(result, expected); } +// corner case: expressions involving array with empty trange1 +BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { + auto& c = F::c; + auto& aC = F::aC; + + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + + using TiledArray::eigen::iv; + const std::array lobound{{0, 0, 1}}; + const std::array upbound{{1, 0, 2}}; + + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); + BOOST_CHECK_NO_THROW(c("a,b,c") += + 2 * aC("a,b,c").block(lobound, upbound).conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = + 2 * 
conj(aC("a,c,b").block(lobound, upbound))); +} + BOOST_AUTO_TEST_SUITE_END() #endif // TILEDARRAY_TEST_EXPRESSIONS_IMPL_H From df7e0c804dfa6f5901fa4e6bedbeb1993e2a5286 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 15 Feb 2024 23:46:31 -0500 Subject: [PATCH 342/592] GEMM ld{a,b,c} must be >= 1 see https://netlib.org/lapack/explore-html/dd/d09/group__gemm_ga1e899f8453bcbfde78e91a86a2dab984.html#ga1e899f8453bcbfde78e91a86a2dab984 --- src/TiledArray/device/btas.h | 26 ++++++++++++++++---------- src/TiledArray/external/btas.h | 26 ++++++++++++++++---------- src/TiledArray/tensor/tensor.h | 14 +++++++++----- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h index acd42341fd..b30fdd4edd 100644 --- a/src/TiledArray/device/btas.h +++ b/src/TiledArray/device/btas.h @@ -77,10 +77,12 @@ ::btas::Tensor gemm( gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? 
n : k)); T factor_t = T(factor); T zero(0); @@ -112,10 +114,11 @@ ::btas::Tensor gemm( static_assert(::btas::boxrange_iteration_order::value == ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), - lda, zero, device_data(result.storage()), n, queue); + lda, zero, device_data(result.storage()), ldc, queue); device::sync_madness_task_with(stream); } @@ -185,10 +188,12 @@ void gemm(::btas::Tensor &result, gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? 
n : k)); auto &queue = blasqueue_for(result.range()); const auto stream = device::Stream(queue.device(), queue.stream()); @@ -207,10 +212,11 @@ void gemm(::btas::Tensor &result, static_assert(::btas::boxrange_iteration_order::value == ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), gemm_helper.left_op(), n, m, k, factor_t, device_data(right.storage()), ldb, device_data(left.storage()), - lda, one, device_data(result.storage()), n, queue); + lda, one, device_data(result.storage()), ldc, queue); device::sync_madness_task_with(stream); } } diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index 11971c269e..fe84e6f0c6 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -661,16 +661,19 @@ inline btas::Tensor gemm( gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(0), result.data(), n); + ldb, T(0), result.data(), ldc); return result; } @@ -736,16 +739,19 @@ inline void gemm(btas::Tensor& result, gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. 
- const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(1), result.data(), n); + ldb, T(1), result.data(), ldc); } // sum of the hyperdiagonal elements diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 9c36b071cc..c12c2c15d1 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2648,10 +2648,13 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
n + : k)); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -2719,8 +2722,9 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, } } #else // TA_ENABLE_TILE_OPS_LOGGING + const integer ldc = std::max(integer{1}, n); math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, - alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); + alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); #endif // TA_ENABLE_TILE_OPS_LOGGING } } From e1883fecc61ecc9ff4cb341cc18d374cd8763a45 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 00:01:53 -0500 Subject: [PATCH 343/592] [unit] expression suite: can contract + reduce zero-volume DistArrays --- src/TiledArray/dist_eval/contraction_eval.h | 8 ++-- src/TiledArray/pmap/cyclic_pmap.h | 4 -- src/TiledArray/proc_grid.h | 6 --- tests/cyclic_pmap.cpp | 6 --- tests/expressions_impl.h | 50 +++++++++++++++------ 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 2da66628fc..4755538689 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -891,9 +891,9 @@ class Summa ordinal_type initialize(const DenseShape&) { // Construct static broadcast groups for dense arguments const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); - col_group_ = proc_grid_.make_col_group(col_did); + if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did); const madness::DistributedID row_did(DistEvalImpl_::id(), k_); - row_group_ = proc_grid_.make_row_group(row_did); + if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE std::stringstream ss; @@ -1347,7 +1347,6 @@ class Summa template void make_next_step_tasks(Derived* task, ordinal_type depth) { - TA_ASSERT(depth > 0); // Set the depth to be no greater than the maximum number 
steps if (depth > owner_->k_) depth = owner_->k_; @@ -1706,6 +1705,9 @@ class Summa std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols())); + // corner case: empty result + if (k_ == 0) return 0; + // Construct the first SUMMA iteration task if (TensorImpl_::shape().is_dense()) { // We cannot have more iterations than there are blocks in the k diff --git a/src/TiledArray/pmap/cyclic_pmap.h b/src/TiledArray/pmap/cyclic_pmap.h index 6d2df0088b..250b4f677b 100644 --- a/src/TiledArray/pmap/cyclic_pmap.h +++ b/src/TiledArray/pmap/cyclic_pmap.h @@ -84,10 +84,6 @@ class CyclicPmap : public Pmap { cols_(cols), proc_cols_(proc_cols), proc_rows_(proc_rows) { - // Check that the size is non-zero - TA_ASSERT(rows_ >= 1ul); - TA_ASSERT(cols_ >= 1ul); - // Check limits of process rows and columns TA_ASSERT(proc_rows_ >= 1ul); TA_ASSERT(proc_cols_ >= 1ul); diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h index a401e0ac1e..cd15c1b73e 100644 --- a/src/TiledArray/proc_grid.h +++ b/src/TiledArray/proc_grid.h @@ -288,12 +288,6 @@ class ProcGrid { local_rows_(0ul), local_cols_(0ul), local_size_(0ul) { - // Check for non-zero sizes - TA_ASSERT(rows_ >= 1u); - TA_ASSERT(cols_ >= 1u); - TA_ASSERT(row_size >= 1ul); - TA_ASSERT(col_size >= 1ul); - init(world_->rank(), world_->size(), row_size, col_size); } diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp index 4d8d76da1f..b8c2b9670c 100644 --- a/tests/cyclic_pmap.cpp +++ b/tests/cyclic_pmap.cpp @@ -60,12 +60,6 @@ BOOST_AUTO_TEST_CASE(constructor) { ProcessID size = GlobalFixture::world->size(); - BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 0ul, 10ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 0ul, 1, 1), - TiledArray::Exception); BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( *GlobalFixture::world, 10ul, 10ul, 0, 1), TiledArray::Exception); 
diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 6fdfc2ce0e..ea4beab3d6 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -2947,24 +2947,48 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { auto& c = F::c; auto& aC = F::aC; - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); - BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); - BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + // unary/binary expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + } using TiledArray::eigen::iv; const std::array lobound{{0, 0, 1}}; const std::array upbound{{1, 0, 2}}; - BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); - BOOST_CHECK_NO_THROW(c("a,b,c") += - 2 * aC("a,b,c").block(lobound, upbound).conj()); - BOOST_CHECK_NO_THROW(c("a,b,c") = - 2 * conj(aC("a,c,b").block(lobound, upbound))); + // unary/binary block expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); + BOOST_CHECK_NO_THROW(c("a,b,c") += + 2 * aC("a,b,c").block(lobound, upbound).conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = + 2 * conj(aC("a,c,b").block(lobound, upbound))); + } + + // contraction expressions + { + std::decay_t t2, t4; + // contraction over empty dim + BOOST_CHECK_NO_THROW(t4("a,c,e,d") = aC("a,b,c") * 
aC("d,b,e")); + // contraction over empty and nonempty dims + BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c")); + // contraction over nonempty dims + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c")); + } + + // reduction expressions + { + // contraction over empty dim + BOOST_CHECK_NO_THROW(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get()); + BOOST_CHECK_EQUAL(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get(), 0); + } } BOOST_AUTO_TEST_SUITE_END() From fa830a183178a68013e42abc6ecda3f329eff4e8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 09:24:02 -0500 Subject: [PATCH 344/592] expression suite: fixups for contractions w/ zero-volume contraction range and w/ nonzero-volume contraction range producing zero-volume result --- src/TiledArray/dist_eval/contraction_eval.h | 184 ++++++++++++-------- src/TiledArray/expressions/cont_engine.h | 30 ++-- tests/expressions_fixture.h | 12 +- tests/expressions_impl.h | 3 + 4 files changed, 146 insertions(+), 83 deletions(-) diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index 4755538689..a747c0748b 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -889,45 +889,63 @@ class Summa /// Initialize reduce tasks and construct broadcast groups ordinal_type initialize(const DenseShape&) { - // Construct static broadcast groups for dense arguments - const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); - if (k_ > 0) col_group_ = proc_grid_.make_col_group(col_did); - const madness::DistributedID row_did(DistEvalImpl_::id(), k_); - if (k_ > 0) row_group_ = proc_grid_.make_row_group(row_did); + // if contraction is over zero-volume range just initialize tiles to zero + if (k_ == 0) { + ordinal_type tile_count = 0; + const auto& tiles_range = this->trange().tiles_range(); + for (auto&& tile_idx : tiles_range) { + auto tile_ord = tiles_range.ordinal(tile_idx); + if 
(this->is_local(tile_ord)) { + this->world().taskq.add([this, tile_ord, tile_idx]() { + this->set_tile(tile_ord, + value_type(this->trange().tile(tile_idx), + typename value_type::value_type{})); + }); + ++tile_count; + } + } + return tile_count; + } else { + // Construct static broadcast groups for dense arguments + const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); + col_group_ = proc_grid_.make_col_group(col_did); + const madness::DistributedID row_did(DistEvalImpl_::id(), k_); + row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - std::stringstream ss; - ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" - << col_did.first << ", " << col_did.second << ") { "; - for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) - ss << col_group_.world_rank(gproc) << " "; - ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second - << ") { "; - for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) - ss << row_group_.world_rank(gproc) << " "; - ss << "}\n"; - printf(ss.str().c_str()); + std::stringstream ss; + ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" + << col_did.first << ", " << col_did.second << ") { "; + for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) + ss << col_group_.world_rank(gproc) << " "; + ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second + << ") { "; + for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) + ss << row_group_.world_rank(gproc) << " "; + ss << "}\n"; + printf(ss.str().c_str()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - // Allocate memory for the reduce pair tasks. - std::allocator> alloc; - reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); + // Allocate memory for the reduce pair tasks. 
+ std::allocator> alloc; + reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); - // Iterate over all local tiles - const ordinal_type n = proc_grid_.local_size(); - for (ordinal_type t = 0ul; t < n; ++t) { - // Initialize the reduction task - ReducePairTask* MADNESS_RESTRICT const reduce_task = - reduce_tasks_ + t; - new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ + // Iterate over all local tiles + const ordinal_type n = proc_grid_.local_size(); + for (ordinal_type t = 0ul; t < n; ++t) { + // Initialize the reduction task + ReducePairTask* MADNESS_RESTRICT const reduce_task = + reduce_tasks_ + t; + new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - , - nullptr, t + , + nullptr, t #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - ); - } + ); + } - return proc_grid_.local_size(); + return proc_grid_.local_size(); + } } /// Initialize reduce tasks @@ -938,6 +956,9 @@ class Summa ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ "; #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE + // fast return if there is no work to do + if (k_ == 0) return 0; + // Allocate memory for the reduce pair tasks. std::allocator> alloc; reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); @@ -1705,60 +1726,79 @@ class Summa std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols())); - // corner case: empty result - if (k_ == 0) return 0; - - // Construct the first SUMMA iteration task - if (TensorImpl_::shape().is_dense()) { - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; - - // Modify the number of concurrent iterations based on the available - // memory. 
- depth = mem_bound_depth(depth, 0.0f, 0.0f); - - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); - - TensorImpl_::world().taskq.add( - new DenseStepTask(shared_from_this(), depth)); - } else { - // Increase the depth based on the amount of sparsity in an iteration. + // watch out for the corner case: contraction over zero-volume range + // producing nonzero-volume result ... in that case there is nothing to do + // the appropriate initialization was performed in the initialize() method + if (k_ != 0) { + // Construct the first SUMMA iteration task + if (TensorImpl_::shape().is_dense()) { + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; + + // Modify the number of concurrent iterations based on the available + // memory. + depth = mem_bound_depth(depth, 0.0f, 0.0f); + + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); + + TensorImpl_::world().taskq.add( + new DenseStepTask(shared_from_this(), depth)); + } else { + // Increase the depth based on the amount of sparsity in an iteration. - // Get the sparsity fractions for the left- and right-hand arguments. - const float left_sparsity = left_.shape().sparsity(); - const float right_sparsity = right_.shape().sparsity(); + // Get the sparsity fractions for the left- and right-hand arguments. + const float left_sparsity = left_.shape().sparsity(); + const float right_sparsity = right_.shape().sparsity(); - // Compute the fraction of non-zero result tiles in a single SUMMA - // iteration. - const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * - (1.0f - std::min(right_sparsity, 0.9f)); + // Compute the fraction of non-zero result tiles in a single SUMMA + // iteration. 
+ const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * + (1.0f - std::min(right_sparsity, 0.9f)); - // Compute the new depth based on sparsity of the arguments - depth = - float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f; + // Compute the new depth based on sparsity of the arguments + depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + + 0.5f; - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; - // Modify the number of concurrent iterations based on the available - // memory and sparsity of the argument tensors. - depth = mem_bound_depth(depth, left_sparsity, right_sparsity); + // Modify the number of concurrent iterations based on the available + // memory and sparsity of the argument tensors. + depth = mem_bound_depth(depth, left_sparsity, right_sparsity); - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); - TensorImpl_::world().taskq.add( - new SparseStepTask(shared_from_this(), depth)); - } + TensorImpl_::world().taskq.add( + new SparseStepTask(shared_from_this(), depth)); + } + } // k_ != 0 } #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL + // corner case: if left or right are zero-volume no tasks were scheduled, so + // need to discard all of their tiles manually + if (left_.range().volume() == 0) { + for (auto&& tile_idx : right_.range()) { + auto tile_ord = right_.range().ordinal(tile_idx); + if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord)) + right_.discard(tile_ord); + } + } + if (right_.range().volume() == 0) { + for (auto&& tile_idx : left_.range()) { + auto tile_ord = 
left_.range().ordinal(tile_idx); + if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord)) + left_.discard(tile_ord); + } + } + // Wait for child tensors to be evaluated, and process tasks while waiting. left_.wait(); right_.wait(); diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 8795e699c6..94562b5154 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -343,16 +343,26 @@ class ContEngine : public BinaryEngine { n *= right_element_size[i]; } - // Construct the process grid. - proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); - - // Initialize children - left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); - right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); - - // Initialize the process map if not already defined - if (!pmap) pmap = proc_grid_.make_pmap(); - ExprEngine_::init_distribution(world, pmap); + // corner case: zero-volume result ... easier to skip proc_grid_ + // construction alltogether + if (M == 0 || N == 0) { + left_.init_distribution(world, {}); + right_.init_distribution(world, {}); + ExprEngine_::init_distribution( + world, (pmap ? pmap : policy::default_pmap(*world, M * N))); + } else { // M!=0 && N!=0 + + // Construct the process grid. 
+ proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); + + // Initialize children + left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); + right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); + + // Initialize the process map if not already defined + if (!pmap) pmap = proc_grid_.make_pmap(); + ExprEngine_::init_distribution(world, pmap); + } } /// Tiled range factory function diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index a4f7e4cd0b..8e527465d1 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -63,10 +63,12 @@ struct ExpressionsFixture : public TiledRangeFixture { s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), s_trC(make_random_sparseshape(trangeC)), + s_trC_f(make_random_sparseshape(trangeC_f)), a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), aC(*GlobalFixture::world, trangeC, s_trC), + aC_f(*GlobalFixture::world, trangeC_f, s_trC_f), u(*GlobalFixture::world, trange1, s_tr1_1), v(*GlobalFixture::world, trange1, s_tr1_2), w(*GlobalFixture::world, trange2, s_tr2) { @@ -92,12 +94,14 @@ struct ExpressionsFixture : public TiledRangeFixture { u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), w(*GlobalFixture::world, trange2), - aC(*GlobalFixture::world, trangeC) { + aC(*GlobalFixture::world, trangeC), + aC_f(*GlobalFixture::world, trangeC_f) { random_fill(a); random_fill(b); random_fill(u); random_fill(v); random_fill(aC); + random_fill(aC_f); GlobalFixture::world->gop.fence(); } @@ -221,12 +225,17 @@ struct ExpressionsFixture : public TiledRangeFixture { // contains empty trange1 const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{}, TiledRange1{0, 2, 7, 11}}; + // like trC, but with all dimension nonempty + const TiledRange trangeC_f{trangeC.dim(0), TiledRange1{0, 4, 7}, + trangeC.dim(2)}; + SparseShape s_tr_1; SparseShape s_tr_2; 
SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; SparseShape s_trC; + SparseShape s_trC_f; TArray a; TArray b; TArray c; @@ -234,6 +243,7 @@ struct ExpressionsFixture : public TiledRangeFixture { TArray v; TArray w; TArray aC; + TArray aC_f; }; // ExpressionsFixture #endif // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index ea4beab3d6..268b118568 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -2946,6 +2946,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) { BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { auto& c = F::c; auto& aC = F::aC; + auto& aC_f = F::aC_f; // unary/binary expressions { @@ -2981,6 +2982,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c")); // contraction over nonempty dims BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c")); + // contraction over nonempty dims, involving expressions with nonzero-volume + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * (2. * aC_f("d,e,c"))); } // reduction expressions From f0112af07423ce85dfba1cf1f8325a84111b1894 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 10:00:03 -0500 Subject: [PATCH 345/592] [unit] re-add block_range_suite --- tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3bcf8de967..76bb14e4b1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ set(executable ta_test) set(ta_test_src_files ta_test.cpp range1.cpp range.cpp + block_range.cpp type_traits.cpp tensor.cpp tensor_of_tensor.cpp From 087202a0e9b00b10638f78a9926431e9be981f11 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 16 Feb 2024 11:46:31 -0500 Subject: [PATCH 346/592] one more zero-volume corner case, in Expr::eval_to(BlkTsrExpr..) 
--- src/TiledArray/expressions/expr.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index f77d13dbad..c3fdd6423b 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -504,8 +504,11 @@ class Expr { // Move the data from dist_eval into the sub-block of result array. // This step may involve communication when the tiles are moved from the // sub-block distribution to the array distribution. - { + // N.B. handle the corner case of zero-volume host array, then no data needs + // to be moved + if (tsr.array().trange().tiles_range().volume() != 0) { // N.B. must deep copy + TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound())); const container::svector shift = tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(); From 3b6269b3f730d56d7c6e096f805431e54baabbcd Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 16 Feb 2024 13:07:53 -0500 Subject: [PATCH 347/592] [skip ci] `ia,i->ia` family tests pass on a single rank --- src/TiledArray/einsum/tiledarray.h | 50 +++++++++++++++++++++++++++--- tests/einsum.cpp | 22 +++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 11ff79e3ae..150484fd32 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -107,17 +107,17 @@ void replicate_tensor(Tensor &to, Tensor const &from) { // (same as the number of elements in @c from) auto const N = from.range().volume(); for (auto i = 0; i < to.range().volume(); i += N) - std::copy(from.begin(), from.end(), to.data()+i); + std::copy(from.begin(), from.end(), to.data() + i); } template >> -auto replicate_array(Array from, TiledRange const& prepend_trng) { +auto replicate_array(Array from, TiledRange const &prepend_trng) { auto const result_rank = prepend_trng.rank() + rank(from); container::svector 
tr1s; tr1s.reserve(result_rank); - for (auto const& r : prepend_trng) tr1s.emplace_back(r); - for (auto const& r : from.trange()) tr1s.emplace_back(r); + for (auto const &r : prepend_trng) tr1s.emplace_back(r); + for (auto const &r : from.trange()) tr1s.emplace_back(r); auto const result_trange = TiledRange(tr1s); from.make_replicated(); @@ -125,7 +125,7 @@ auto replicate_array(Array from, TiledRange const& prepend_trng) { auto result = make_array( get_default_world(), result_trange, [from, res_tr = result_trange.tiles_range(), - delta_rank = prepend_trng.rank()](auto& tile, auto const& res_rng, + delta_rank = prepend_trng.rank()](auto &tile, auto const &res_rng, auto res_ord) { using std::begin; using std::end; @@ -137,6 +137,7 @@ auto replicate_array(Array from, TiledRange const& prepend_trng) { next(begin(res_coord_ix), delta_rank), end(res_coord_ix)); replicate_tensor(repped, from.find_local(from_coord_ix).get(false)); tile = repped; + return tile.norm(); }); //clang-format off @@ -163,6 +164,14 @@ auto replicate_array(Array from, TiledRange const& prepend_trng) { return result; } +template +TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { + container::svector tr1s; + tr1s.reserve(ixs.size()); + for (auto &&i : ixs) tr1s.emplace_back(map[i]); + return TiledRange(tr1s); +} + } // namespace template @@ -231,6 +240,37 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, auto range_map = (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + auto perm_and_rank_replicate = [delta_trng = make_trange(range_map, e)]( + auto pre, // + std::string const &pre_annot, // + std::string const &permed_annot) { + decltype(pre) permed; + permed(permed_annot) = pre(pre_annot); + return replicate_array(permed, delta_trng); + }; + + // special Hadamard + if (h.size() == a.size() || h.size() == b.size()) { + TA_ASSERT(!i && e); + bool small_a = h.size() == a.size(); + std::string const eh_annot = (e | h); + std::string const permed_annot = + 
std::string(h) + (small_a ? inner.a : inner.b); + std::string const C_annot = std::string(c) + inner.c; + std::string const temp_annot = std::string(e) + "," + permed_annot; + ArrayC C; + if (small_a) { + auto temp = + perm_and_rank_replicate(A.array(), A.annotation(), permed_annot); + C(C_annot) = temp(temp_annot) * B; + } else { + auto temp = + perm_and_rank_replicate(B.array(), B.annotation(), permed_annot); + C(C_annot) = A * temp(temp_annot); + } + return C; + } + using ::Einsum::index::permutation; using TiledArray::Permutation; diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 755ef89275..d76d041865 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -297,6 +297,28 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {2, 4}))); } +BOOST_AUTO_TEST_CASE(corner_cases) { + using T = TA::Tensor; + using ToT = TA::Tensor; + using ArrayT = TA::DistArray; + + BOOST_REQUIRE(check_manual_eval("ia,i->ia", // + {{0, 2, 5}, {0, 7, 11, 16}}, // + {{0, 2, 5}})); + + BOOST_REQUIRE(check_manual_eval("i,ai->ia", // + {{0, 2, 5}}, // + {{0, 7, 11, 16}, {0, 2, 5}})); + + BOOST_REQUIRE(check_manual_eval("ijk,kj->kij", // + {{0, 2, 5}, {0, 3, 6}, {0, 2, 7}}, // + {{0, 2, 7}, {0, 3, 6}})); + + BOOST_REQUIRE(check_manual_eval("kj,ijk->kij", // + {{0, 2, 7}, {0, 3, 6}}, // + {{0, 2, 5}, {0, 3, 6}, {0, 2, 7}})); +} + BOOST_AUTO_TEST_SUITE_END() using namespace TiledArray; From 2561d5a02d9456fc1bf2b7fa1f12a21fc94415da Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 16 Feb 2024 16:57:08 -0500 Subject: [PATCH 348/592] revert back `make_array` touch-ups from previous commit. 
--- src/TiledArray/conversions/make_array.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index d9dfad5be3..6f5ada0bba 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -85,12 +85,7 @@ inline Array make_array( auto tile = world.taskq.add( [=](const range_type& range) -> value_type { value_type tile; - if constexpr (std::is_invocable_v>) - op(tile, range, index); - else - op(tile, range); + op(tile, range); return tile; }, trange.make_tile_range(index)); @@ -160,12 +155,7 @@ inline Array make_array( int task_count = 0; auto task = [&](const ordinal_type index) -> value_type { value_type tile; - if constexpr (std::is_invocable_v) - tile_norms.at_ordinal(index) = - op(tile, trange.make_tile_range(index), index); - else - tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); + tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); ++counter; return tile; }; From 06445db4f816c9310bef3fa625bca140bf570a8f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 16 Feb 2024 17:17:13 -0500 Subject: [PATCH 349/592] Cleanup. --- src/TiledArray/einsum/tiledarray.h | 44 ++++++++++++------------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 150484fd32..a429bc0a6e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -110,6 +110,17 @@ void replicate_tensor(Tensor &to, Tensor const &from) { std::copy(from.begin(), from.end(), to.data() + i); } +/// +/// \brief This function is the @c DistArray counterpart of the function +/// @c replicate_tensor(TA::Tensor&, TA::Tensor const&). +/// +/// \tparam Array +/// \param from The DistArray to be by-rank replicated. 
+/// \parama prepend_trng TiledRange1's in this argument will be prepended to the +/// `TiledRange` of the argument array. +/// \return An array whose rank is increased by `prepend_trng.rank()`. +/// \see `replicate_tensor` +/// template >> auto replicate_array(Array from, TiledRange const &prepend_trng) { @@ -121,46 +132,25 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { auto const result_trange = TiledRange(tr1s); from.make_replicated(); + auto &world = from.world(); + world.gop.fence(); auto result = make_array( - get_default_world(), result_trange, - [from, res_tr = result_trange.tiles_range(), - delta_rank = prepend_trng.rank()](auto &tile, auto const &res_rng, - auto res_ord) { + world, result_trange, + [from, res_tr = result_trange, delta_rank = prepend_trng.rank()]( + auto &tile, auto const &res_rng) { using std::begin; using std::end; using std::next; typename Array::value_type repped(res_rng); - auto res_coord_ix = res_tr.idx(res_ord); + auto res_coord_ix = res_tr.element_to_tile(res_rng.lobound()); auto from_coord_ix = decltype(res_coord_ix)( next(begin(res_coord_ix), delta_rank), end(res_coord_ix)); replicate_tensor(repped, from.find_local(from_coord_ix).get(false)); tile = repped; return tile.norm(); }); - - //clang-format off - // using std::begin; - // using std::next; - // using std::end; - // - // Array result(get_default_world(), result_trange); - // - // for (auto tile : result) { - // auto res_tix = tile.index(); - // auto from_tix = decltype(res_tix)(next(begin(res_tix), - // prepend_trng.rank()), end(res_tix)); - // if (result.is_local(res_tix) && !result.is_zero(res_tix) && - // !from.is_zero(from_tix)) { - // typename Array::value_type - // repped(result.trange().make_tile_range(res_tix)); auto found = - // from.find_local(from_tix).get(false); replicate_tensor(repped, found); - // tile = repped; - // } - // } - //clang-format on - return result; } From 924c15fc57058b8d9a4dd4a41b5e1e8736c2e339 Mon Sep 17 00:00:00 2001 
From: Bimal Gaudel Date: Fri, 16 Feb 2024 17:33:20 -0500 Subject: [PATCH 350/592] Add more tests. --- tests/einsum.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index d76d041865..92a00a1149 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -301,6 +301,7 @@ BOOST_AUTO_TEST_CASE(corner_cases) { using T = TA::Tensor; using ToT = TA::Tensor; using ArrayT = TA::DistArray; + using ArrayToT = TA::DistArray; BOOST_REQUIRE(check_manual_eval("ia,i->ia", // {{0, 2, 5}, {0, 7, 11, 16}}, // @@ -317,6 +318,23 @@ BOOST_AUTO_TEST_CASE(corner_cases) { BOOST_REQUIRE(check_manual_eval("kj,ijk->kij", // {{0, 2, 7}, {0, 3, 6}}, // {{0, 2, 5}, {0, 3, 6}, {0, 2, 7}})); + + BOOST_REQUIRE(check_manual_eval("kij;ab,kj;bc->kji;ac", // + {{0, 2}, {0, 3, 5}, {0, 4, 7}}, // + {{0, 2}, {0, 4, 7}}, // + {3, 5}, {5, 2})); + + BOOST_REQUIRE( + (check_manual_eval("ijk;ab,kj->kij;ba", // + {{0, 2}, {0, 4, 6}, {0, 3, 5}}, // + {{0, 3, 5}, {0, 4, 6}}, // + {7, 5}))); + + BOOST_REQUIRE( + (check_manual_eval("ij,jik;ab->kji;ab", // + {{0, 3, 5}, {0, 3, 8}}, // + {{0, 3, 8}, {0, 3, 5}, {0, 2}}, // + {3, 9}))); } BOOST_AUTO_TEST_SUITE_END() From d73534e50da1456774fa5337c7682f3888702643 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 18 Feb 2024 10:48:37 -0500 Subject: [PATCH 351/592] [skip ci] Another corner case. 
--- tests/einsum.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 92a00a1149..3bfdd5e603 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -337,6 +337,17 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {3, 9}))); } +BOOST_AUTO_TEST_CASE(debug) { + using T = TA::Tensor; + using ToT = TA::Tensor; + using ArrayToT = TA::DistArray; + BOOST_REQUIRE(check_manual_eval("ijk;dcb,ik;bc->ij;d", // + {{0, 3}, {0, 4}, {0, 5}}, // + {{0, 3}, {0, 5}}, // + {2, 3, 4}, // + {4, 3})); +} + BOOST_AUTO_TEST_SUITE_END() using namespace TiledArray; From 780d524ae5168c63d12fd2222001397b00d8cccc Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 22 Feb 2024 07:40:08 -0500 Subject: [PATCH 352/592] Amend `Tensor(Tensor, Perm)` constructor to handle batched tensors. --- src/TiledArray/tensor/tensor.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index a787765655..cd0e7e97f1 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -406,7 +406,8 @@ class Tensor { typename std::enable_if && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) - : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { + : Tensor(outer(perm) * other.range(), other.nbatch(), + default_construct{false}) { const auto outer_perm = outer(perm); if (outer_perm) { detail::tensor_init(value_converter, outer_perm, @@ -425,7 +426,12 @@ class Tensor { if (inner_size(perm) != 0) { const auto inner_perm = inner(perm); Permute p; - for (auto& x : *this) x = p(x, inner_perm); + + auto volume = total_size(); + for (decltype(volume) i = 0; i < volume; ++i) { + auto& el = *(data() + i); + el = p(el, inner_perm); + } } } } From dd88d59cd286860188f660c4c917e9bae1b38b3b Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 22 Feb 2024 07:41:44 -0500 Subject: [PATCH 353/592] `tensor_init` function can handle 
batched tensors. --- src/TiledArray/tensor/kernels.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 0baa4781f5..8172587d66 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -662,21 +662,20 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename... Ts, - typename std::enable_if<(is_nested_tensor::value && - !is_tensor::value) && - is_contiguous_tensor::value>::type* = nullptr> + typename std::enable_if< + (is_nested_tensor::value && !is_tensor::value) && + is_contiguous_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto volume = result.range().volume(); - if constexpr (std::is_invocable_r_v) { result = std::forward(op)(tensors...); } else { - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { + const auto volume = result.total_size(); + for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { new (result.data() + ord) typename TR::value_type( - tensor_op(op, tensors.at_ordinal(ord)...)); + tensor_op(op, (*(tensors.data() + ord))...)); } } } From dd65656627f9ccfaf25267d7d4972d6722865987 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 22 Feb 2024 07:43:17 -0500 Subject: [PATCH 354/592] Debugged and moved a test case involving ToTs contraction on both inner and outer. 
--- tests/einsum.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 3bfdd5e603..c161020c6d 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -161,6 +161,14 @@ BOOST_AUTO_TEST_CASE(equal_nested_ranks) { {{0, 2, 4}, {0, 2}}, // {2, 2}, // {2, 2})); + + // C;C + BOOST_REQUIRE(check_manual_eval("ijk;dcb,ik;bc->ij;d", // + {{0, 3}, {0, 4}, {0, 5}}, // + {{0, 3}, {0, 5}}, // + {2, 3, 4}, // + {4, 3})); + // H+C;H BOOST_REQUIRE(check_manual_eval("ijk;mn,ijk;nm->ij;mn", // {{0, 2}, {0, 3}, {0, 2}}, // @@ -337,17 +345,6 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {3, 9}))); } -BOOST_AUTO_TEST_CASE(debug) { - using T = TA::Tensor; - using ToT = TA::Tensor; - using ArrayToT = TA::DistArray; - BOOST_REQUIRE(check_manual_eval("ijk;dcb,ik;bc->ij;d", // - {{0, 3}, {0, 4}, {0, 5}}, // - {{0, 3}, {0, 5}}, // - {2, 3, 4}, // - {4, 3})); -} - BOOST_AUTO_TEST_SUITE_END() using namespace TiledArray; From 29e0b477cbd0c5c75e866ef2956f0c915cf0f111 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 23 Feb 2024 14:06:55 -0500 Subject: [PATCH 355/592] Setup nested-rank reduction evaluations in test fixture [wip] --- tests/einsum.cpp | 3 +- tests/tot_array_fixture.h | 232 +++++++++++++++++++++++--------------- 2 files changed, 140 insertions(+), 95 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index c161020c6d..acfc06332a 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -125,7 +125,8 @@ BOOST_AUTO_TEST_CASE(general) { Tensor A(TA::Range{2, 3}, {1, 2, 3, 4, 5, 6}); Tensor B(TA::Range{2}, {2, 10}); Tensor C(TA::Range{2, 3}, {2, 4, 6, 40, 50, 60}); - BOOST_REQUIRE(C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); + BOOST_REQUIRE( + C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); } BOOST_AUTO_TEST_CASE(equal_nested_ranks) { diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index f41697dc2e..09d8c83985 100644 --- a/tests/tot_array_fixture.h +++ 
b/tests/tot_array_fixture.h @@ -272,9 +272,10 @@ class OuterInnerIndices { [[nodiscard]] auto const& innerC() const noexcept { return inner_[2]; } }; +enum struct TensorProduct { General, Dot, Invalid }; + struct ProductSetup { - TA::expressions::TensorProduct product_type{ - TA::expressions::TensorProduct::Invalid}; + TensorProduct product_type{TensorProduct::Invalid}; PartialPerm // - {} index at kth position in C appears at vth position in A @@ -292,7 +293,7 @@ struct ProductSetup { rank_E, // rank_I; - ProductSetup() = default; + // ProductSetup() = default; template >> @@ -320,14 +321,13 @@ struct ProductSetup { I_to_B = partial_perm(ixs.I, ixs.B); using TP = decltype(product_type); - if (!(ixs.E || ixs.H)) - product_type = TP::Invalid; // no target indices - else if (!(ixs.E || ixs.I)) - product_type = TP::Hadamard; - else if (!ixs.H) - product_type = TP::Contraction; - else if (ixs.H && (ixs.E || ixs.I)) + + if (rank_A + rank_B != 0 && rank_C != 0) product_type = TP::General; + else if (rank_A == rank_B && rank_B != 0 && rank_C == 0) + product_type = TP::Dot; + else + product_type = TP::Invalid; } template (arr), std::get<1>(arr), std::get<2>(arr)) {} [[nodiscard]] bool valid() const noexcept { - return (rank_A + rank_B) != 0 && (rank_E + rank_H) != 0; + return product_type != TensorProduct::Invalid; } }; @@ -349,112 +349,154 @@ auto make_perm(PartialPerm const& pp) { return TA::Permutation(p); } -template -inline auto general_product(Tensor const& t, typename Tensor::numeric_type s, - ProductSetup const& setup, Setups const&... args) { +template >> +inline Result general_product(Tensor const& t, typename Tensor::numeric_type s, + ProductSetup const& setup, + Setups const&... 
args) { + static_assert(std::is_same_v); static_assert(sizeof...(args) == 0, "To-Do: Only scalar times once-nested tensor supported now"); return t.scale(s, make_perm(setup.C_to_A).inv()); } -template -inline auto general_product(typename Tensor::numeric_type s, Tensor const& t, - ProductSetup const& setup, Setups const&... args) { +template >> +inline Result general_product(typename Tensor::numeric_type s, Tensor const& t, + ProductSetup const& setup, + Setups const&... args) { + static_assert(std::is_same_v); static_assert(sizeof...(args) == 0, "To-Do: Only scalar times once-nested tensor supported now"); return t.scale(s, make_perm(setup.C_to_B).inv()); } + } // namespace -template // - && sizeof...(Setups) == - TA::detail::max_nested_rank - 1, - bool> = true> -auto general_product(TensorA const& A, TensorB const& B, - ProductSetup const& setup, Setups const&... args) { +template < + typename Result, typename TensorA, typename TensorB, typename... Setups, + typename = + std::enable_if_t>> +Result general_product(TensorA const& A, TensorB const& B, + ProductSetup const& setup, Setups const&... args) { + using TA::detail::max_nested_rank; + using TA::detail::nested_rank; + static_assert(std::is_same_v); - using TensorC = std::conditional_t<(TA::detail::nested_rank > - TA::detail::nested_rank), - TensorA, TensorB>; + + static_assert(max_nested_rank == sizeof...(args) + 1); + TA_ASSERT(setup.valid()); - constexpr bool is_tot = TA::detail::max_nested_rank > 1; - - // creating the contracted TA::Range - TA::Range const rng_I = [&setup, &A, &B]() { - TA::container::svector rng1_I(setup.rank_I, TA::Range1{}); - for (auto [f, t] : setup.I_to_A) - // I_to_A implies I[f] == A[t] - rng1_I[f] = A.range().dim(t); - - return TA::Range(rng1_I); - }(); - - // creating the target (ie. C's) TA::Range. 
- TA::Range const rng_C = [&setup, &A, &B]() { - TA::container::svector rng1_C(setup.rank_C, TA::Range1{0, 0}); - for (auto [f, t] : setup.C_to_A) - // C_to_A implies C[f] = A[t] - rng1_C[f] = A.range().dim(t); - - for (auto [f, t] : setup.C_to_B) - // C_to_B implies C[f] = B[t] - rng1_C[f] = B.range().dim(t); - - auto zero_r1 = [](TA::Range1 const& r) { return r == TA::Range1{0, 0}; }; - - TA_ASSERT(std::none_of(rng1_C.begin(), rng1_C.end(), zero_r1)); - - return TA::Range(rng1_C); - }(); - - TensorC C{rng_C}; - - // do the computation - for (auto ix_C : rng_C) { - // finding corresponding indices of A, and B. - TA::Range::index_type ix_A(setup.rank_A, 0), ix_B(setup.rank_B, 0); - apply_partial_perm(ix_A, ix_C, setup.C_to_A); - apply_partial_perm(ix_B, ix_C, setup.C_to_B); - - if (setup.rank_I == 0) - if constexpr (is_tot) - C(ix_C) = general_product(A(ix_A), B(ix_B), args...); - else { - TA_ASSERT(!(ix_A.empty() && ix_B.empty())); - C(ix_C) = ix_A.empty() ? B(ix_B) - : ix_B.empty() ? A(ix_B) - : A(ix_A) * B(ix_B); - } + constexpr bool is_tot = max_nested_rank > 1; + + if constexpr (std::is_same_v) { + // + // tensor dot product evaluation + // T * T -> scalar + // ToT * ToT -> scalar + // + static_assert(nested_rank == nested_rank); - else { - typename TensorC::value_type temp{}; - for (auto ix_I : rng_I) { - apply_partial_perm(ix_A, ix_I, setup.I_to_A); - apply_partial_perm(ix_B, ix_I, setup.I_to_B); - if constexpr (is_tot) - temp += general_product(A(ix_A), B(ix_B), args...); - else { - TA_ASSERT(!(ix_A.empty() || ix_B.empty())); - temp += A(ix_A) * B(ix_B); + TA_ASSERT(setup.rank_C == 0 && + "Attempted to evaluate dot product when the product setup does " + "not allow"); + + TA_ASSERT(false && "Dot product not yet supported!"); + + } else { + // + // general product: + // T * T -> T + // ToT * T -> ToT + // ToT * ToT -> ToT + // ToT * ToT -> T + // + + static_assert(nested_rank <= max_nested_rank, + "Tensor product not supported with increased nested rank in " 
+ "the result"); + + constexpr bool de_nest = + nested_rank < max_nested_rank; + + // creating the contracted TA::Range + TA::Range const rng_I = [&setup, &A, &B]() { + TA::container::svector rng1_I(setup.rank_I, TA::Range1{}); + for (auto [f, t] : setup.I_to_A) + // I_to_A implies I[f] == A[t] + rng1_I[f] = A.range().dim(t); + + return TA::Range(rng1_I); + }(); + + // creating the target TA::Range. + TA::Range const rng_C = [&setup, &A, &B]() { + TA::container::svector rng1_C(setup.rank_C, TA::Range1{0, 0}); + for (auto [f, t] : setup.C_to_A) + // C_to_A implies C[f] = A[t] + rng1_C[f] = A.range().dim(t); + + for (auto [f, t] : setup.C_to_B) + // C_to_B implies C[f] = B[t] + rng1_C[f] = B.range().dim(t); + + auto zero_r1 = [](TA::Range1 const& r) { return r == TA::Range1{0, 0}; }; + + TA_ASSERT(std::none_of(rng1_C.begin(), rng1_C.end(), zero_r1)); + + return TA::Range(rng1_C); + }(); + + Result C{rng_C}; + + // do the computation + for (auto ix_C : rng_C) { + // finding corresponding indices of A, and B. + TA::Range::index_type ix_A(setup.rank_A, 0), ix_B(setup.rank_B, 0); + apply_partial_perm(ix_A, ix_C, setup.C_to_A); + apply_partial_perm(ix_B, ix_C, setup.C_to_B); + + if (setup.rank_I == 0) { + if constexpr (is_tot) { + C(ix_C) = general_product( + A(ix_A), B(ix_B), args...); + } else { + TA_ASSERT(!(ix_A.empty() && ix_B.empty())); + C(ix_C) = ix_A.empty() ? B(ix_B) + : ix_B.empty() ? A(ix_B) + : A(ix_A) * B(ix_B); } + } else { + typename Result::value_type temp{}; + for (auto ix_I : rng_I) { + apply_partial_perm(ix_A, ix_I, setup.I_to_A); + apply_partial_perm(ix_B, ix_I, setup.I_to_B); + if constexpr (is_tot) + temp += general_product( + A(ix_A), B(ix_B), args...); + else { + TA_ASSERT(!(ix_A.empty() || ix_B.empty())); + temp += A(ix_A) * B(ix_B); + } + } + C(ix_C) = temp; } - C(ix_C) = temp; } - } - return C; + return C; + } } template auto general_product(TA::DistArray A, TA::DistArray B, ProductSetup const& setup, Setups const&... 
args) { + using TA::detail::nested_rank; + static_assert(!TA::detail::is_scalar_v, + "Dot product of DistArrays not yet supported!"); + TA_ASSERT(setup.valid()); auto& world = TA::get_default_world(); @@ -469,7 +511,11 @@ auto general_product(TA::DistArray A, TA::Tensor tensorB{B.trange().tiles_range()}; for (auto&& ix : tensorB.range()) tensorB(ix) = B.find_local(ix).get(false); - auto result_tensor = general_product(tensorA, tensorB, setup, setup, args...); + using TileC = std::conditional_t<(nested_rank < nested_rank), + TileB, TileA>; + + auto result_tensor = general_product>( + tensorA, tensorB, setup, setup, args...); TA::TiledRange result_trange; { @@ -495,8 +541,6 @@ auto general_product(TA::DistArray A, result_trange = TA::TiledRange(tr1s_explicit); } - using TileC = typename decltype(result_tensor)::value_type; - TA::DistArray C(world, result_trange); for (auto it : C) { From 1c79b4a2f73461503b46524c25154bf9e39092d0 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 24 Feb 2024 15:16:11 -0500 Subject: [PATCH 356/592] Nested-rank reduction including dot products of `TA::Tensor` in test fixture implemented. 
--- tests/tot_array_fixture.h | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 09d8c83985..94486f3366 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -403,8 +403,21 @@ Result general_product(TensorA const& A, TensorB const& B, "Attempted to evaluate dot product when the product setup does " "not allow"); - TA_ASSERT(false && "Dot product not yet supported!"); + Result result{}; + + for (auto&& ix_A : A.range()) { + TA::Range::index_type ix_B(setup.rank_B, 0); + apply_partial_perm(ix_B, ix_A, setup.I_to_B); + + if constexpr (is_tot) { + auto const& lhs = A(ix_A); + auto const& rhs = B(ix_B); + result += general_product(lhs, rhs, args...); + } else + result += A(ix_A) * B(ix_B); + } + return result; } else { // // general product: @@ -494,8 +507,6 @@ auto general_product(TA::DistArray A, TA::DistArray B, ProductSetup const& setup, Setups const&... args) { using TA::detail::nested_rank; - static_assert(!TA::detail::is_scalar_v, - "Dot product of DistArrays not yet supported!"); TA_ASSERT(setup.valid()); From 41bc078aea0a1dae47ca032fffb73eaa0778822f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sat, 24 Feb 2024 15:28:15 -0500 Subject: [PATCH 357/592] Cleanup unused variable. 
--- tests/tot_array_fixture.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 94486f3366..eab97063f4 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -431,9 +431,6 @@ Result general_product(TensorA const& A, TensorB const& B, "Tensor product not supported with increased nested rank in " "the result"); - constexpr bool de_nest = - nested_rank < max_nested_rank; - // creating the contracted TA::Range TA::Range const rng_I = [&setup, &A, &B]() { TA::container::svector rng1_I(setup.rank_I, TA::Range1{}); From f527baa1c104ad499d2f846827e556a5df487126 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 28 Feb 2024 11:08:27 -0500 Subject: [PATCH 358/592] nested-rank reduced result in DistArray result supported by test fixture. --- tests/tot_array_fixture.h | 35 +++++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index eab97063f4..a86173b02f 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -499,13 +499,15 @@ Result general_product(TensorA const& A, TensorB const& B, } } -template +template auto general_product(TA::DistArray A, TA::DistArray B, ProductSetup const& setup, Setups const&... 
args) { + using TA::detail::max_nested_rank; using TA::detail::nested_rank; - - TA_ASSERT(setup.valid()); + static_assert(nested_rank <= max_nested_rank); + static_assert(nested_rank != 0); + TA_ASSERT(setup.product_type == TensorProduct::General); auto& world = TA::get_default_world(); @@ -519,9 +521,6 @@ auto general_product(TA::DistArray A, TA::Tensor tensorB{B.trange().tiles_range()}; for (auto&& ix : tensorB.range()) tensorB(ix) = B.find_local(ix).get(false); - using TileC = std::conditional_t<(nested_rank < nested_rank), - TileB, TileA>; - auto result_tensor = general_product>( tensorA, tensorB, setup, setup, args...); @@ -557,7 +556,19 @@ auto general_product(TA::DistArray A, return C; } -template +auto general_product(TA::DistArray A, + TA::DistArray B, + Setups const&... args) { + using TA::detail::nested_rank; + using TileC = std::conditional_t<(nested_rank > nested_rank), + TileB, TileA>; + return general_product(A, B, args...); +} + +enum struct DeNest { True, False }; + +template >> auto manual_eval(OuterInnerIndices const& oixs, ArrayA A, ArrayB B) { constexpr auto mnr = TA::detail::max_nested_rank; @@ -570,7 +581,15 @@ auto manual_eval(OuterInnerIndices const& oixs, ArrayA A, ArrayB B) { if constexpr (mnr == 2) { auto const inner_setup = ProductSetup(oixs.inner()); TA_ASSERT(inner_setup.valid()); - return general_product(A, B, outer_setup, inner_setup); + if constexpr (DeNestFlag == DeNest::True) { + // reduced nested rank in result + using TA::detail::nested_rank; + static_assert(nested_rank == nested_rank); + TA_ASSERT(inner_setup.rank_C == 0); + using TileC = typename ArrayA::value_type::value_type; + return general_product(A, B, outer_setup, inner_setup); + } else + return general_product(A, B, outer_setup, inner_setup); } else { return general_product(A, B, outer_setup); } From f36245e97dea8d926aeaed5991da8b690ff561ed Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 29 Feb 2024 16:24:33 -0500 Subject: [PATCH 359/592] [wip] supporting 
denested result as from a general product of two ToTs. --- src/TiledArray/einsum/tiledarray.h | 75 +++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index a429bc0a6e..7e4b02e62d 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -82,6 +82,15 @@ template constexpr bool AreArraySame = AreArrayT || AreArrayToT; +template +using DeNestedArray = DistArray; + +template +using MaxNestedArray = std::conditional_t<(detail::nested_rank > + detail::nested_rank), + Array2, Array1>; + } // namespace namespace { @@ -164,15 +173,24 @@ TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { } // namespace -template +enum struct DeNest { True, False }; + +template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { using ArrayA = std::remove_cv_t; using ArrayB = std::remove_cv_t; - using ArrayC = std::conditional_t< - AreArraySame, ArrayA, - std::conditional_t, ArrayA, ArrayB>>; + + if constexpr (DeNestFlag == DeNest::True) + static_assert(detail::nested_rank == detail::nested_rank && + detail::nested_rank == 2); + + using ArrayC = + std::conditional_t, + MaxNestedArray>; + using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; @@ -185,27 +203,48 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // Hadamard, external, internal indices for inner tensor Einsum::Index A, B, C, h, e, i; } inner; - if constexpr (std::tuple_size::value == 2) { - if constexpr (IsArrayToT) - inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); - if constexpr (IsArrayToT) - inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + if constexpr (IsArrayToT) { + inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + inner.A = std::get<1>(Einsum::idx(A)); + } - static_assert(IsArrayToT || IsArrayToT); - inner.c = ";" + 
(std::string)std::get<1>(cs); + if constexpr (IsArrayToT) { + inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.B = std::get<1>(Einsum::idx(B)); + } - Einsum::Index a_idx, b_idx, c_idx; - if constexpr (IsArrayToT) inner.A = std::get<1>(Einsum::idx(A)); - if constexpr (IsArrayToT) inner.B = std::get<1>(Einsum::idx(B)); - if constexpr (IsArrayToT || IsArrayToT) - inner.C = std::get<1>(cs); + if constexpr (std::tuple_size::value == 2) { + static_assert(IsArrayToT); + inner.c = ";" + (std::string)std::get<1>(cs); + inner.C = std::get<1>(cs); + } + { inner.h = inner.A & inner.B & inner.C; inner.e = (inner.A ^ inner.B); inner.i = (inner.A & inner.B) - inner.h; - TA_ASSERT(!(inner.h && (inner.i || inner.e)) && - "General product between inner tensors not supported"); + if constexpr (IsArrayToT) + TA_ASSERT(!(inner.h && (inner.i || inner.e)) && + "General product between inner tensors not supported"); + } + + if constexpr (DeNestFlag == DeNest::True) { + TA_ASSERT(!inner.C && + "Denested result cannot have inner-tensor annotation"); + // Step I: A * B -> C' + // Step II: C' -> C + // + // At "Step I", a general product (without reduction) in outer indices, + // and pure Hadamard product in inner indices is carried out. + // Then at "Step II", the inner tensors are reduced with a unary function. + // The reducing function is determined by looking at the contracting and + // non-contracting outer indices. + // + // eg. A(i,j,k;a,b) * B(k,j;a,b) -> C(i,j) involves following two steps: + // Step I: A(i,j,k;a,b) * B(k,j;a,b) -> C'(i,j,k;a,b) + // Step II: C'(i,j,k;a,b) -> C(i,j) + TA_ASSERT(false && "Denesting not yet implemented!"); } // these are "Hadamard" (fused) indices From 1076270e7f661cfec98906460ea5705416d09bd6 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 5 Mar 2024 11:41:54 -0500 Subject: [PATCH 360/592] [wip] complete 'ToT x ToT -> ToS' implementation in `einsum`. Tests will be added after CI passes. 
--- src/TiledArray/einsum/tiledarray.h | 571 +++++++++++++++++------------ tests/tot_array_fixture.h | 5 +- 2 files changed, 336 insertions(+), 240 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7e4b02e62d..bef0e24f47 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -9,6 +9,10 @@ #include "TiledArray/tiled_range.h" #include "TiledArray/tiled_range1.h" +namespace TiledArray { +enum struct DeNest { True, False }; +} + namespace TiledArray::Einsum { using ::Einsum::index::small_vector; @@ -173,8 +177,6 @@ TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { } // namespace -enum struct DeNest { True, False }; - template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, @@ -183,10 +185,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ArrayA = std::remove_cv_t; using ArrayB = std::remove_cv_t; - if constexpr (DeNestFlag == DeNest::True) - static_assert(detail::nested_rank == detail::nested_rank && - detail::nested_rank == 2); - using ArrayC = std::conditional_t, MaxNestedArray>; @@ -230,8 +228,17 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } if constexpr (DeNestFlag == DeNest::True) { + static_assert(detail::nested_rank == detail::nested_rank && + detail::nested_rank == 2); + TA_ASSERT(!inner.C && "Denested result cannot have inner-tensor annotation"); + + TA_ASSERT(inner.i.size() == inner.A.size() && + inner.i.size() == inner.B.size() && + "Nested-rank-reduction only supported when the inner tensor " + "ranks match on the arguments"); + // Step I: A * B -> C' // Step II: C' -> C // @@ -244,275 +251,358 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // eg. 
A(i,j,k;a,b) * B(k,j;a,b) -> C(i,j) involves following two steps: // Step I: A(i,j,k;a,b) * B(k,j;a,b) -> C'(i,j,k;a,b) // Step II: C'(i,j,k;a,b) -> C(i,j) - TA_ASSERT(false && "Denesting not yet implemented!"); - } - // these are "Hadamard" (fused) indices - auto h = a & b & c; + using PartialPerm = TA::container::svector>; + auto partial_perm = [](auto const &from, auto const &to) { + PartialPerm result; + for (auto i = 0; i < from.size(); ++i) + if (auto found = to.find(from[i]); found != to.end()) + result.emplace_back(i, std::distance(to.begin(), found)); + return result; + }; - // external indices - auto e = (a ^ b); + auto apply_partial_perm = [](auto &to, auto const &from, + PartialPerm const &p) { + for (auto [f, t] : p) { + TA_ASSERT(f < from.size() && t < to.size() && + "Invalid permutation used"); + to[t] = from[f]; + } + }; - // contracted indices - auto i = (a & b) - h; + auto ix_outer_Cp = (a | b); - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - if (!h || (h && !(i || e))) { - ArrayC C; - C(std::string(c) + inner.c) = A * B; - return C; - } + auto C_to_Cp = partial_perm(c, ix_outer_Cp); + auto I_to_Cp = partial_perm(ix_outer_Cp - c, ix_outer_Cp); - TA_ASSERT(e || h); + auto Cp = + einsum(A, B, std::string(ix_outer_Cp) + ";" + std::string(inner.i)); - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - - auto perm_and_rank_replicate = [delta_trng = make_trange(range_map, e)]( - auto pre, // - std::string const &pre_annot, // - std::string const &permed_annot) { - decltype(pre) permed; - permed(permed_annot) = pre(pre_annot); - return replicate_array(permed, delta_trng); - }; - - // special Hadamard - if (h.size() == a.size() || h.size() == b.size()) { - TA_ASSERT(!i && e); - bool small_a = h.size() == a.size(); - std::string const eh_annot = (e | h); - std::string const permed_annot = - std::string(h) + (small_a ? 
inner.a : inner.b); - std::string const C_annot = std::string(c) + inner.c; - std::string const temp_annot = std::string(e) + "," + permed_annot; - ArrayC C; - if (small_a) { - auto temp = - perm_and_rank_replicate(A.array(), A.annotation(), permed_annot); - C(C_annot) = temp(temp_annot) * B; - } else { - auto temp = - perm_and_rank_replicate(B.array(), B.annotation(), permed_annot); - C(C_annot) = A * temp(temp_annot); - } - return C; - } + auto make_tile = [Cp, apply_partial_perm, C_to_Cp, I_to_Cp]( + auto &target, TA::Range const &rng) { + typename ArrayC::value_type result(rng); - using ::Einsum::index::permutation; - using TiledArray::Permutation; + // ijk;ab * ijk;ab -> ij + // ijk;ab - std::tuple, ArrayTerm> AB{{A.array(), a}, - {B.array(), b}}; + for (auto rix : rng) { + // eg. C'(0,0,0,0,0) + container::svector lannot(rank(Cp), 0); - auto update_perm_and_indices = [&e = std::as_const(e), &i = std::as_const(i), - &h = std::as_const(h)](auto &term) { - auto ei = (e + i & term.idx); - if (term.idx != h + ei) { - term.permutation = permutation(term.idx, h + ei); + // eg. C'(i,0,j,0,k) + apply_partial_perm(lannot, rng.lobound(), C_to_Cp); + + // eg. 
find C'(i,0,j,0,k) tile + auto ltile = Cp.find(lannot).get(false); + + // set the lannot now to the actual element of lhs argument + apply_partial_perm(lannot, rix, C_to_Cp); + + // creating the traced TA::Range + TA::Range const rng_I = [<ile, &I_to_Cp]() { + container::svector rng1_I(I_to_Cp.size(), TA::Range1{}); + for (auto [f, t] : I_to_Cp) + // I_to_Cp implies I[f] == Cp[t] + rng1_I[f] = ltile.range().dim(t); + + return TA::Range(rng1_I); + }(); + + if (rng_I.rank() == 0) { + result(rix) = ltile(lannot).sum(); + } else { + for (auto iix : rng_I) { + auto lannot_ = lannot; + apply_partial_perm(lannot_, iix, I_to_Cp); + result(rix) += ltile(lannot_).sum(); + } + } + } + + target = result; + return result.norm(); + }; + + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + container::svector result_tr1s; + for (auto const &ix : c) result_tr1s.emplace_back(range_map[ix]); + + return make_array( + Cp.world(), TiledRange(result_tr1s.begin(), result_tr1s.end()), + make_tile); + } else { + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + // external indices + auto e = (a ^ b); + + // contracted indices + auto i = (a & b) - h; + + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + if (!h || (h && !(i || e))) { + ArrayC C; + C(std::string(c) + inner.c) = A * B; + return C; } - term.expr = ei; - }; - std::invoke(update_perm_and_indices, std::get<0>(AB)); - std::invoke(update_perm_and_indices, std::get<1>(AB)); + TA_ASSERT(e || h); - ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - std::get<0>(AB).expr += inner.a; - std::get<1>(AB).expr += inner.b; + auto perm_and_rank_replicate = [delta_trng = 
make_trange(range_map, e)]( + auto pre, // + std::string const &pre_annot, // + std::string const &permed_annot) { + decltype(pre) permed; + permed(permed_annot) = pre(pre_annot); + return replicate_array(permed, delta_trng); + }; - C.expr += inner.c; + // special Hadamard + if (h.size() == a.size() || h.size() == b.size()) { + TA_ASSERT(!i && e); + bool small_a = h.size() == a.size(); + std::string const eh_annot = (e | h); + std::string const permed_annot = + std::string(h) + (small_a ? inner.a : inner.b); + std::string const C_annot = std::string(c) + inner.c; + std::string const temp_annot = std::string(e) + "," + permed_annot; + ArrayC C; + if (small_a) { + auto temp = + perm_and_rank_replicate(A.array(), A.annotation(), permed_annot); + C(C_annot) = temp(temp_annot) * B; + } else { + auto temp = + perm_and_rank_replicate(B.array(), B.annotation(), permed_annot); + C(C_annot) = A * temp(temp_annot); + } + return C; + } - struct { - RangeProduct tiles; - std::vector> batch; - } H; + using ::Einsum::index::permutation; + using TiledArray::Permutation; - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; + + auto update_perm_and_indices = [&e = std::as_const(e), + &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { + auto ei = (e + i & term.idx); + if (term.idx != h + ei) { + term.permutation = permutation(term.idx, h + ei); + } + term.expr = ei; + }; + + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); + + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { + C.tiles *= Range(range_map[idx].tiles_range()); } - } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; - using Index = Einsum::Index; + std::get<0>(AB).expr += inner.a; + 
std::get<1>(AB).expr += inner.b; - if constexpr (AreArraySame) { - if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); + C.expr += inner.c; + + struct { RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); + std::vector> batch; + } H; + + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + } + + using Index = Einsum::Index; + + if constexpr (AreArraySame && + AreArraySame) { + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - using Ix = ::Einsum::Index; - if constexpr (AreArrayToT) { - auto aik = ai.batch(k); - auto bik = bi.batch(k); - auto vol = aik.total_size(); - TA_ASSERT(vol == bik.total_size()); - - auto &el = tile({k}); - using TensorT = std::remove_reference_t; - - auto mult_op = [&inner](auto const &l, auto const &r) -> TensorT { - return inner.h ? 
TA::detail::tensor_hadamard(l, inner.A, r, - inner.B, inner.C) - : TA::detail::tensor_contract(l, inner.A, r, - inner.B, inner.C); - }; - - for (auto i = 0; i < vol; ++i) - el.add_to(mult_op(aik.data()[i], bik.data()[i])); - - } else { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) + continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + using Ix = ::Einsum::Index; + if constexpr (AreArrayToT) { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + using TensorT = std::remove_reference_t; + + auto mult_op = [&inner](auto const &l, + auto const &r) -> TensorT { + return inner.h ? 
TA::detail::tensor_hadamard(l, inner.A, r, + inner.B, inner.C) + : TA::detail::tensor_contract( + l, inner.A, r, inner.B, inner.C); + }; + + for (auto i = 0; i < vol; ++i) + el.add_to(mult_op(aik.data()[i], bik.data()[i])); + + } else { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } } } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; } - } - // generalized contraction + // generalized contraction - auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), - &range_map = std::as_const(range_map)](auto &term) { - auto ei = (e + i & term.idx); - term.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) { - term.tiles *= Range(range_map[idx].tiles_range()); - } - }; + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { + auto ei = (e + i & term.idx); + term.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) { + term.tiles *= Range(range_map[idx].tiles_range()); + } + }; - std::invoke(update_tr, std::get<0>(AB)); - std::invoke(update_tr, std::get<1>(AB)); + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); - std::vector> worlds; - std::vector> local_tiles; + std::vector> worlds; + std::vector> local_tiles; - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &[A, B] = AB; - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - 
- auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { - term.local_tiles.clear(); - const Permutation &P = term.permutation; + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &[A, B] = AB; + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } - for (Index ei : term.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!term.array.is_local(idx)) continue; - if (term.array.is_zero(idx)) continue; + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { + term.local_tiles.clear(); + const Permutation &P = term.permutation; + + for (Index ei : term.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!term.array.is_local(idx)) continue; + if (term.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = term.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = term.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + term.local_tiles.push_back({ei, tile}); + } + bool replicated = term.array.pmap()->is_replicated(); + term.ei = TiledArray::make_array( + *owners, term.ei_tiled_range, term.local_tiles.begin(), + term.local_tiles.end(), replicated); + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); + + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayA(); + B.ei = ArrayB(); + // why omitting this fence leads to deadlock? 
+ owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; // TODO no need for immediate evaluation - auto tile = term.array.find_local(idx).get(); + auto tile = C.ei.find_local(e).get(); + assert(tile.nbatch() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); if (P) tile = tile.permute(P); - auto shape = term.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - term.local_tiles.push_back({ei, tile}); + local_tiles.push_back({c, tile}); } - bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( - *owners, term.ei_tiled_range, term.local_tiles.begin(), - term.local_tiles.end(), replicated); - }; - std::invoke(retile, std::get<0>(AB)); - std::invoke(retile, std::get<1>(AB)); - - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = ArrayA(); - B.ei = ArrayB(); - // why omitting this fence leads to deadlock? 
- owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.nbatch() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); + // mark for lazy deletion + C.ei = ArrayC(); + } + + if constexpr (!ResultShape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } - // mark for lazy deletion - C.ei = ArrayC(); - } - if constexpr (!ResultShape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); } - ResultShape shape(world, tile_norms, tiled_range); - C.array = ArrayC(world, TiledRange(range_map[c]), shape); - } - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } + for (auto &w : worlds) { + w->gop.fence(); + } - for (auto &w : worlds) { - w->gop.fence(); + return C.array; } - - return C.array; } /// Computes ternary tensor product whose result @@ -651,13 +741,19 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { /// @param[in] r result indices /// @warning just as in the plain expression code, reductions are a special /// case; use Expr::reduce() -template +template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { using ECT = 
expressions::TsrExpr; using ECU = expressions::TsrExpr; - using ResultExprT = std::conditional_t, T, U>; - return Einsum::einsum(ECT(A), ECU(B), Einsum::idx(cs), world); + + using ResultExprT = + std::conditional_t, + Einsum::MaxNestedArray>; + + return Einsum::einsum(ECT(A), ECU(B), + Einsum::idx(cs), world); } template @@ -676,7 +772,8 @@ namespace TiledArray { using expressions::dot; using expressions::einsum; -template +template auto einsum(const std::string &expr, const DistArray &A, const DistArray &B, World &world = get_default_world()) { using ::Einsum::string::join; @@ -712,7 +809,7 @@ auto einsum(const std::string &expr, const DistArray &A, annot.C = combine(outer.C, inner.C); } - return einsum(A(annot.A), B(annot.B), annot.C, world); + return einsum(A(annot.A), B(annot.B), annot.C, world); } /// Computes ternary tensor product whose result diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index a86173b02f..94f57b0930 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -19,7 +19,8 @@ #ifndef TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED #define TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED -#include "tiledarray.h" +#include +#include #include "unit_test_config.h" #ifdef TILEDARRAY_HAS_BTAS #include @@ -566,8 +567,6 @@ auto general_product(TA::DistArray A, return general_product(A, B, args...); } -enum struct DeNest { True, False }; - template >> auto manual_eval(OuterInnerIndices const& oixs, ArrayA A, ArrayB B) { From a710a8ff67e55b7c2d4fd475353ff130d87b5eba Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 5 Mar 2024 13:21:27 -0500 Subject: [PATCH 361/592] `nested_rank` type trait supports const tensor and const distarrays. 
--- src/TiledArray/tensor/type_traits.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index b325752c7a..a32de32e4a 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -216,9 +216,17 @@ constexpr size_t nested_rank = 0; template constexpr size_t nested_rank> = 1 + nested_rank; +template +constexpr size_t nested_rank> = + nested_rank>; + template constexpr size_t nested_rank> = nested_rank; +template +constexpr size_t nested_rank> = + nested_rank>; + template constexpr size_t max_nested_rank = 0; From 748d5e252107262db55ce16b511dbaccab37e0fe Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 6 Mar 2024 14:04:35 -0500 Subject: [PATCH 362/592] Bug fix. --- src/TiledArray/einsum/tiledarray.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index bef0e24f47..663ef826ff 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -293,7 +293,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, apply_partial_perm(lannot, rng.lobound(), C_to_Cp); // eg. find C'(i,0,j,0,k) tile - auto ltile = Cp.find(lannot).get(false); + auto ltile = Cp.find(Cp.trange().element_to_tile(lannot)).get(false); // set the lannot now to the actual element of lhs argument apply_partial_perm(lannot, rix, C_to_Cp); From 19889a51116b0fc2af205e532b31f71cd38c28ab Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 6 Mar 2024 14:40:07 -0500 Subject: [PATCH 363/592] Use TA::foreach in ToT x ToT -> ToS evaluations. 
--- src/TiledArray/einsum/tiledarray.h | 89 ++++-------------------------- 1 file changed, 12 insertions(+), 77 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 663ef826ff..d2d8b2c9ba 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -249,88 +249,23 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // non-contracting outer indices. // // eg. A(i,j,k;a,b) * B(k,j;a,b) -> C(i,j) involves following two steps: - // Step I: A(i,j,k;a,b) * B(k,j;a,b) -> C'(i,j,k;a,b) - // Step II: C'(i,j,k;a,b) -> C(i,j) - - using PartialPerm = TA::container::svector>; - auto partial_perm = [](auto const &from, auto const &to) { - PartialPerm result; - for (auto i = 0; i < from.size(); ++i) - if (auto found = to.find(from[i]); found != to.end()) - result.emplace_back(i, std::distance(to.begin(), found)); - return result; - }; - - auto apply_partial_perm = [](auto &to, auto const &from, - PartialPerm const &p) { - for (auto [f, t] : p) { - TA_ASSERT(f < from.size() && t < to.size() && - "Invalid permutation used"); - to[t] = from[f]; - } - }; - - auto ix_outer_Cp = (a | b); - - auto C_to_Cp = partial_perm(c, ix_outer_Cp); - auto I_to_Cp = partial_perm(ix_outer_Cp - c, ix_outer_Cp); - - auto Cp = - einsum(A, B, std::string(ix_outer_Cp) + ";" + std::string(inner.i)); - - auto make_tile = [Cp, apply_partial_perm, C_to_Cp, I_to_Cp]( - auto &target, TA::Range const &rng) { - typename ArrayC::value_type result(rng); - - // ijk;ab * ijk;ab -> ij - // ijk;ab - - for (auto rix : rng) { - // eg. C'(0,0,0,0,0) - container::svector lannot(rank(Cp), 0); + // Step I: A(i,j,k;a,b) * B(k,j;a,b) -> C'(i,j;a,b) + // Step II: C'(i,j;a,b) -> C(i,j) - // eg. C'(i,0,j,0,k) - apply_partial_perm(lannot, rng.lobound(), C_to_Cp); + auto Cp = einsum(A, B, std::string(c) + ";" + std::string(inner.i)); - // eg. 
find C'(i,0,j,0,k) tile - auto ltile = Cp.find(Cp.trange().element_to_tile(lannot)).get(false); - - // set the lannot now to the actual element of lhs argument - apply_partial_perm(lannot, rix, C_to_Cp); - - // creating the traced TA::Range - TA::Range const rng_I = [<ile, &I_to_Cp]() { - container::svector rng1_I(I_to_Cp.size(), TA::Range1{}); - for (auto [f, t] : I_to_Cp) - // I_to_Cp implies I[f] == Cp[t] - rng1_I[f] = ltile.range().dim(t); - - return TA::Range(rng1_I); - }(); - - if (rng_I.rank() == 0) { - result(rix) = ltile(lannot).sum(); - } else { - for (auto iix : rng_I) { - auto lannot_ = lannot; - apply_partial_perm(lannot_, iix, I_to_Cp); - result(rix) += ltile(lannot_).sum(); - } - } - } - - target = result; - return result.norm(); + auto sum_tot_2_tos = [](auto const &tot) { + typename std::remove_reference_t::value_type result( + tot.range(), [tot](auto &&ix) { return tot(ix).sum(); }); + return result; }; - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - container::svector result_tr1s; - for (auto const &ix : c) result_tr1s.emplace_back(range_map[ix]); + auto result = TA::foreach( + Cp, [sum_tot_2_tos](auto &out_tile, auto const &in_tile) { + out_tile = sum_tot_2_tos(in_tile); + }); - return make_array( - Cp.world(), TiledRange(result_tr1s.begin(), result_tr1s.end()), - make_tile); + return result; } else { // these are "Hadamard" (fused) indices auto h = a & b & c; From 145850121c911378e0af508a6e93a2e7fd1d82fc Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 6 Mar 2024 14:41:18 -0500 Subject: [PATCH 364/592] Add a test for ToT x ToT -> ToS --- tests/einsum.cpp | 110 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 82 insertions(+), 28 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index acfc06332a..5d7e6024e5 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -27,54 +27,95 @@ BOOST_AUTO_TEST_SUITE(manual) -template >; +using il_extent = std::initializer_list; +} // 
namespace + +template >> bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { - auto out = TA::einsum(annot, A, B); - auto ref = manual_eval(annot, A, B); + auto out = TA::einsum(annot, A, B); + auto ref = manual_eval(annot, A, B); return ToTArrayFixture::are_equal(ref, out); } -template -bool check_manual_eval( - std::string const& annot, - std::initializer_list> trangeA, - std::initializer_list> trangeB) { +template >> +bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { + return check_manual_eval(annot, A, B); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB) { + static_assert(detail::is_array_v && + detail::is_tensor_v); auto A = random_array(TA::TiledRange(trangeA)); auto B = random_array(TA::TiledRange(trangeB)); - return check_manual_eval(annot, A, B); + return check_manual_eval(annot, A, B); } -template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB) { + return check_manual_eval(annot, trangeA, + trangeB); +} + +template -bool check_manual_eval( - std::string const& annot, - std::initializer_list> trangeA, - std::initializer_list> trangeB, - std::initializer_list inner_extents) { - if constexpr (TA::detail::is_tensor_of_tensor_v) - return check_manual_eval( +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extents) { + static_assert(detail::is_array_v); + + if constexpr (detail::is_tensor_of_tensor_v) { + static_assert(!detail::is_tensor_of_tensor_v); + return check_manual_eval( annot, random_array(trangeA, inner_extents), random_array(trangeB)); - else - return check_manual_eval( + } else { + static_assert(detail::is_tensor_of_tensor_v); + return check_manual_eval( annot, random_array(trangeA), random_array(trangeB, inner_extents)); + } } -template -bool check_manual_eval( - std::string const& annot, - std::initializer_list> trangeA, - std::initializer_list> trangeB, - 
std::initializer_list inner_extentsA, - std::initializer_list inner_extentsB) { - return check_manual_eval( +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extents) { + return check_manual_eval( + annot, trangeA, trangeB); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extentsA, + il_extent inner_extentsB) { + static_assert(detail::is_array_v && + detail::is_tensor_of_tensor_v); + return check_manual_eval( annot, random_array(trangeA, inner_extentsA), random_array(trangeB, inner_extentsB)); } +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extentsA, + il_extent inner_extentsB) { + return check_manual_eval( + annot, trangeA, trangeB, inner_extentsA, inner_extentsB); +} + BOOST_AUTO_TEST_CASE(contract) { using Array = TA::Array; @@ -306,6 +347,19 @@ BOOST_AUTO_TEST_CASE(different_nested_ranks) { {2, 4}))); } +BOOST_AUTO_TEST_CASE(nested_rank_reduction) { + using T = TA::Tensor; + using ToT = TA::Tensor; + using Array = TA::DistArray; + using ArrayToT = TA::DistArray; + BOOST_REQUIRE( + (check_manual_eval("ij;ab,ij;ab->ij", // + {{0, 2, 4}, {0, 4}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}, // + {3, 2}))); +} + BOOST_AUTO_TEST_CASE(corner_cases) { using T = TA::Tensor; using ToT = TA::Tensor; From f157e0ef2ad2230acbf09d3f062b71b85fdc433d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 6 Mar 2024 14:54:15 -0500 Subject: [PATCH 365/592] [unit_test] ToT x ToT -> ToS with contraction on outer indices --- tests/einsum.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 5d7e6024e5..a671657cbf 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -358,6 +358,12 @@ BOOST_AUTO_TEST_CASE(nested_rank_reduction) { {{0, 2, 4}, {0, 4}}, // {3, 2}, // {3, 2}))); + BOOST_REQUIRE( + (check_manual_eval("ij;ab,ij;ab->i", // + 
{{0, 2, 4}, {0, 4}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}, // + {3, 2}))); } BOOST_AUTO_TEST_CASE(corner_cases) { From 6a134f31f38a1e643583d5c8f19ed85fb61b04ae Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 6 Mar 2024 16:17:28 -0500 Subject: [PATCH 366/592] typo. --- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a671657cbf..cfae7b5925 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -33,7 +33,7 @@ using il_extent = std::initializer_list; } // namespace template >> bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { From 5809ff25258a837783983121a8df8ca2d89f4b32 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 1 Apr 2024 23:29:45 -0400 Subject: [PATCH 367/592] typo --- examples/dgemm/ta_blas.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dgemm/ta_blas.cpp b/examples/dgemm/ta_blas.cpp index 0a4feff383..aeefaae908 100644 --- a/examples/dgemm/ta_blas.cpp +++ b/examples/dgemm/ta_blas.cpp @@ -69,7 +69,7 @@ int main(int argc, char** argv) { // Start clock const double wall_time_start = madness::wall_time(); - // Do matrix multiplcation + // Do matrix multiplication // Note: If TiledArray has not been configured with blas, this will be an // eigen call. 
for (int i = 0; i < repeat; ++i) { From 0dbd0eee07a44f7b577fdf11038addcc1d466883 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 00:00:50 -0400 Subject: [PATCH 368/592] introduced patcher for Intel MKL's unfair dispatch, enabled by configuring with IntelMKL_FAIR_DISPATCH=ON see https://www.agner.org/optimize/intel_dispatch_patch.zip --- CMakeLists.txt | 3 + INSTALL.md | 1 + src/CMakeLists.txt | 10 +++ src/TiledArray/config.h.in | 2 + .../agnerfog/intel_cpu_feature_patch.c | 48 +++++++++++ .../external/agnerfog/intel_mkl_cpuid_patch.c | 61 ++++++++++++++ .../agnerfog/intel_mkl_feature_patch.c | 49 +++++++++++ src/TiledArray/external/agnerfog/readme.txt | 84 +++++++++++++++++++ src/TiledArray/tiledarray.cpp | 7 ++ 9 files changed, 265 insertions(+) create mode 100644 src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c create mode 100644 src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c create mode 100644 src/TiledArray/external/agnerfog/readme.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index a97e6561f8..7bc524337b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,9 @@ add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") option(TA_TTG "Enable search/build of TTG library" OFF) add_feature_info(TA_TTG TA_TTG "TTG library") +option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF) +add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL") + # Enable shared library support options redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF) add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED diff --git a/INSTALL.md b/INSTALL.md index 6e7c6fc746..3f669073f0 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -423,6 +423,7 @@ support may be added. 
* `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout. * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray. * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s. +* `IntelMKL_FAIR_DISPATCH` -- If want to use Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF]. # Build TiledArray diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9bb82bf537..0167aab636 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -313,6 +313,16 @@ if( TARGET ttg-parsec ) list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec) endif() +if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL) + message(WARNING "created tiledarray_mkl_dispatch") + add_library(tiledarray_mkl_dispatch OBJECT + TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c + TiledArray/external/agnerfog/intel_mkl_feature_patch.c + ) + # N.B. 
--allow-multiple-definition is a GNU linker extension + list(APPEND _TILEDARRAY_DEPENDENCIES $ -Wl,--allow-multiple-definition) +endif() + # cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on") diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 79f9f0932a..483847067f 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -113,6 +113,8 @@ #endif // !defined(TILEDARRAY_HAS_BTAS) #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL) # define TILEDARRAY_HAS_INTEL_MKL +/* use fair dispatch in Intel MKL? */ +#cmakedefine IntelMKL_FAIR_DISPATCH #endif /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies. */ diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c new file mode 100644 index 0000000000..f3706ef1fa --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c @@ -0,0 +1,48 @@ +/*********************** intel_cpu_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel compiler version 13.0 and later, including the general + * libraries, LIBM and SVML, but not MKL and VML. + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of generated code with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * librarise (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). + * + * Include this code in your C or C++ program and call intel_cpu_patch(); + * before any call to the library functions. + * + * Copyright (c) 2014-2019. 
BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to Intel libraries +extern int64_t __intel_cpu_feature_indicator; // CPU feature bits +extern int64_t __intel_cpu_feature_indicator_x; // CPU feature bits +void __intel_cpu_features_init(); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_cpu_features_init_x(); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_cpu_patch() { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_cpu_feature_indicator = 0; + __intel_cpu_feature_indicator_x = 0; + __intel_cpu_features_init_x(); + __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x; +} diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c new file mode 100644 index 0000000000..b88a1807f7 --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c @@ -0,0 +1,61 @@ +/*********************** intel_mkl_cpuid_patch.c ************************** + * Author: Agner Fog + * Date created: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to override Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * + * Include this code in your C or C++ program and make sure it is linked before + * any Intel libraries. You may need to include intel_mkl_feature_patch.c as + *well. + * + * Copyright (c) 2019. 
BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// detect if Intel CPU +int mkl_serv_intel_cpu() { return 1; } + +// detect if Intel CPU +int mkl_serv_intel_cpu_true() { return 1; } + +int mkl_serv_cpuhaspnr_true() { return 1; } + +int mkl_serv_cpuhaspnr() { return 1; } + +int mkl_serv_cpuhasnhm() { return 1; } + +int mkl_serv_cpuisbulldozer() { return 0; } + +int mkl_serv_cpuiszen() { return 0; } + +int mkl_serv_cpuisatomsse4_2() { return 0; } + +int mkl_serv_cpuisatomssse3() { return 0; } + +int mkl_serv_cpuisitbarcelona() { return 0; } + +int mkl_serv_cpuisskl() { return 0; } + +int mkl_serv_cpuisknm() { return 0; } + +int mkl_serv_cpuisclx() { return 0; } + +int mkl_serv_get_microarchitecture() { + // I don't know what this number means + return 33; +} + +#ifdef __cplusplus +} // end of extern "C" +#endif diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c new file mode 100644 index 0000000000..4844f2621d --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c @@ -0,0 +1,49 @@ +/*********************** intel_mkl_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except + * the Vector Math Library (VML). + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of Intel function libraries with non-Intel processors. + * In Windows: Use the static link libraries (*.lib), not the dynamic link + * librarise (*.DLL). + * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so). 
+ * + * Include this code in your C or C++ program and call intel_mkl_patch(); + * before any call to the MKL functions. You may need to include + * intel_mkl_cpuid_patch.c as well. + * + * Copyright (c) 2014-2019. BSD License 2.0 + ******************************************************************************/ +#include + +#ifdef __cplusplus // use C-style linking +extern "C" { +#endif + +// link to MKL libraries +extern int64_t __intel_mkl_feature_indicator; // CPU feature bits +extern int64_t __intel_mkl_feature_indicator_x; // CPU feature bits +void __intel_mkl_features_init(); // unfair dispatcher: checks CPU features for + // Intel CPU's only +void __intel_mkl_features_init_x(); // fair dispatcher: checks CPU features + // without discriminating by CPU brand + +#ifdef __cplusplus +} // end of extern "C" +#endif + +void intel_mkl_use_fair_dispatch() { + // force a re-evaluation of the CPU features without discriminating by CPU + // brand + __intel_mkl_feature_indicator = 0; + __intel_mkl_feature_indicator_x = 0; + __intel_mkl_features_init_x(); + __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x; +} diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt new file mode 100644 index 0000000000..0f891c9ed3 --- /dev/null +++ b/src/TiledArray/external/agnerfog/readme.txt @@ -0,0 +1,84 @@ + intel_dispatch_patch.zip + ======================== + +By Agner Fog, Technical University of Denmark, 2019. + +Intel's compilers are generating code that will run slower than necessary when +the code is executed on a CPU that is not produced by Intel. This has been +observed with Intel C, C++, and Fortran compilers. + +The same happens when certain function libraries produced by Intel are used, +even if the code is compiled with another compiler, such as Microsoft, Gnu +or Clang compilers. 
+ +This problem is affecting several commonly used software programs such as +Matlab, because they are using Intel software libraries. + +The library code and the code generated by an Intel compiler may contain +multiple versions, each optimized for a particular instruction set extension. +A so-called CPU dispatcher is chosing the optimal version of the code at +runtime, based on which CPU it is running on. + +CPU dispatchers can be fair or unfair. A fair CPU dispatcher is chosing the +optimal code based only on which instruction set extensions are supported +by the CPU. An unfair dispatcher first checks the CPU brand. If the brand +is not Intel, then the unfair dispatcher will chose the "generic" version +of the code, i.e. the slowest version that is compatible with old CPUs +without the relevant instruction set extensions. + +The CPU dispatchers in many Intel function libraries have two versions, a +fair and an unfair one. It is not clear when the fair dispatcher is used +and when the unfair dispatcher is used. My observations about fair and +unfair CPU dispatching are as follows: + +* Code compiled with an Intel compiler will usually have unfair CPU dispatching. + +* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives) + function libraries from Intel are using the fair CPU dispatcher when used + with a non-Intel compiler. + +* The MKL (Math Kernel Library) library contains both fair and unfair + dispatchers. It is not clear which dispatcher is used on each function. + +The code examples contained herein may be used for circumventing unfair CPU +dispatching in order to improve compatibility with non-Intel CPUs. + +The following files are contained: + +intel_cpu_feature_patch.c +------------------------- +This code makes sure the fair dispatcher is called instead of the unfair +one for code generated with an Intel compiler and for general Intel +function libraries. 
+ +intel_mkl_feature_patch.c +------------------------- +This does the same for the Intel MKL library. + +intel_mkl_cpuid_patch.c +----------------------- +This code example is overriding CPU detection functions in Intel's MKL +function library. The mkl_serv_intel_cpu() function in MKL is returning +1 when running on an Intel CPU and 0 when running on any other brand of +CPU. You may include this code to replace this function in MKL with a +function that returns 1 regardless of CPU brand. + +It may be necessary to use both intel_mkl_feature_patch.c and +intel_mkl_cpuid_patch.c when using the MKL library in software that +may run on any brand of CPU. + +An alternative method is to set the environment variable + MKL_DEBUG_CPU_TYPE=5 +when running on an AMD processor. This may be useful when you do not have +access to the source code, for example when running Matlab software. + +The patches provided here are based on undocumented features in Intel +function libraries. Use them at your own risk, and make sure to test your +code properly to make sure it works as intended. + +The most reliable solution is, of course, to avoid Intel compilers and +Intel function libraries in code that may run on other CPU brands such +as AMD and VIA. You may find other function libraries on the web, or +you may make your own functions. My vector class library (VCL) is useful +for making mathematical functions that process multiple data in parallel, +using the vector processing features of modern CPUs. 
diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 38bf61e86e..2a4b3d1199 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -16,6 +16,10 @@ #include #endif +#ifdef IntelMKL_FAIR_DISPATCH +extern "C" void intel_mkl_use_fair_dispatch(); +#endif + #include #include #include @@ -100,6 +104,9 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, TiledArray::set_default_world(default_world); #ifdef TILEDARRAY_HAS_DEVICE TiledArray::device_initialize(); +#endif +#ifdef IntelMKL_FAIR_DISPATCH + intel_mkl_use_fair_dispatch(); #endif TiledArray::max_threads = TiledArray::get_num_threads(); TiledArray::set_num_threads(1); From 1af2613553a2c8853253921d28c3d7e1b7994a9b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 11:53:40 -0400 Subject: [PATCH 369/592] introduced duration deque and statistics computation --- src/TiledArray/util/time.h | 83 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/TiledArray/util/time.h b/src/TiledArray/util/time.h index aa0639bc0a..8ae649a6af 100644 --- a/src/TiledArray/util/time.h +++ b/src/TiledArray/util/time.h @@ -26,7 +26,10 @@ #ifndef TILEDARRAY_UTIL_TIME_H__INCLUDED #define TILEDARRAY_UTIL_TIME_H__INCLUDED +#include #include +#include +#include namespace TiledArray { @@ -46,6 +49,86 @@ inline int64_t duration_in_ns(time_point const &t0, time_point const &t1) { return std::chrono::duration_cast(t1 - t0).count(); } +namespace detail { +inline std::deque &call_durations_accessor() { + static std::deque call_durations; + return call_durations; +} +} // namespace detail + +/// Access recorded durations +inline const std::deque &durations() { + return detail::call_durations_accessor(); +} + +/// Clear recorded durations +inline void clear_durations() { detail::call_durations_accessor().clear(); } + +/// Record duration since the given time point +/// \param tp_start The start time point +inline void 
record_duration_since(const time_point &tp_start) { + detail::call_durations_accessor().push_back(duration_in_s(tp_start, now())); +} + +/// Record duration of a single function call +template +void record_duration(F &&f, Args &&...args) { + auto tp_start = now(); + std::forward(f)(std::forward(args)...); + record_duration_since(tp_start); +} + +/// Statistics of recorded durations +struct duration_stats_t { + double min = 0.0; + double max = 0.0; + double mean = 0.0; + double stddev = 0.0; + double median = 0.0; + double mean_reciprocal = 0.0; +}; + +/// Compute statistics of recorded durations +/// \return Statistics of recorded durations +inline duration_stats_t duration_statistics() { + duration_stats_t stats; + auto &durations = detail::call_durations_accessor(); + if (durations.empty()) return stats; + + stats.min = durations.front(); + stats.max = durations.front(); + stats.mean = durations.front(); + stats.mean_reciprocal = 1.0 / durations.front(); + double total = stats.mean; + double total_reciprocal = stats.mean_reciprocal; + for (size_t i = 1; i < durations.size(); ++i) { + total += durations[i]; + total_reciprocal += 1. / durations[i]; + stats.min = std::min(stats.min, durations[i]); + stats.max = std::max(stats.max, durations[i]); + } + stats.mean = total / durations.size(); + stats.mean_reciprocal = total_reciprocal / durations.size(); + + double sum_sq = 0.0; + for (size_t i = 0; i < durations.size(); ++i) { + sum_sq += (durations[i] - stats.mean) * (durations[i] - stats.mean); + } + stats.stddev = + durations.size() > 1 ? 
std::sqrt(sum_sq / (durations.size() - 1)) : 0.0; + + std::sort(durations.begin(), durations.end()); + stats.median = durations[durations.size() / 2]; + + return stats; +} + } // namespace TiledArray +#ifndef TA_RECORD_DURATION +/// Record duration of a statement +#define TA_RECORD_DURATION(statement) \ + TiledArray::record_duration([&] { statement; }); +#endif // !defined(TA_RECORD_DURATION) + #endif // TILEDARRAY_UTIL_TIME_H__INCLUDED From c49adefd5ef40a8a401cc25f997019d9b69c0445 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 11:55:24 -0400 Subject: [PATCH 370/592] examples/dgemm -> examples/gemm + misc cleanup - cleanup examples to use centralized timers/statistics facilities - remove dense_new_tile --- examples/CMakeLists.txt | 2 +- examples/dgemm/ta_dense_new_tile.cpp | 168 ------------------ examples/{dgemm => gemm}/CMakeLists.txt | 2 +- examples/{dgemm => gemm}/README | 4 +- .../block_size_data_process.py | 0 examples/{dgemm => gemm}/block_size_scan.sh | 0 examples/{dgemm => gemm}/ta_band.cpp | 32 ++-- examples/{dgemm => gemm}/ta_blas.cpp | 29 ++- examples/{dgemm => gemm}/ta_cc_abcd.cpp | 41 +++-- examples/{dgemm => gemm}/ta_dense.cpp | 29 ++- examples/{dgemm => gemm}/ta_dense_asymm.cpp | 28 ++- .../{dgemm => gemm}/ta_dense_nonuniform.cpp | 27 +-- examples/{dgemm => gemm}/ta_eigen.cpp | 23 +-- examples/{dgemm => gemm}/ta_sparse.cpp | 0 examples/{dgemm => gemm}/ta_sparse_grow.cpp | 0 15 files changed, 99 insertions(+), 286 deletions(-) delete mode 100644 examples/dgemm/ta_dense_new_tile.cpp rename examples/{dgemm => gemm}/CMakeLists.txt (94%) rename examples/{dgemm => gemm}/README (92%) rename examples/{dgemm => gemm}/block_size_data_process.py (100%) rename examples/{dgemm => gemm}/block_size_scan.sh (100%) rename examples/{dgemm => gemm}/ta_band.cpp (86%) rename examples/{dgemm => gemm}/ta_blas.cpp (80%) rename examples/{dgemm => gemm}/ta_cc_abcd.cpp (93%) rename examples/{dgemm => gemm}/ta_dense.cpp (89%) rename examples/{dgemm => 
gemm}/ta_dense_asymm.cpp (92%) rename examples/{dgemm => gemm}/ta_dense_nonuniform.cpp (87%) rename examples/{dgemm => gemm}/ta_eigen.cpp (76%) rename examples/{dgemm => gemm}/ta_sparse.cpp (100%) rename examples/{dgemm => gemm}/ta_sparse_grow.cpp (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 99edd4e33b..d240192893 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -30,7 +30,7 @@ endif() # Add Subdirectories add_subdirectory (cc) add_subdirectory (device) -add_subdirectory (dgemm) +add_subdirectory (gemm) add_subdirectory (demo) add_subdirectory (scalapack) add_subdirectory (fock) diff --git a/examples/dgemm/ta_dense_new_tile.cpp b/examples/dgemm/ta_dense_new_tile.cpp deleted file mode 100644 index 79dae8a579..0000000000 --- a/examples/dgemm/ta_dense_new_tile.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2013 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - */ - -#include -#include -#include - -using Tile_t = TiledArray::Tile>; -using Array_t = TiledArray::DistArray; - -void set_tiles(double val, Array_t& a) { - auto const& trange = a.trange(); - - auto pmap = a.pmap(); - const auto end = pmap->end(); - for (auto it = pmap->begin(); it != end; ++it) { - auto range = trange.make_tile_range(*it); - a.set(*it, Tile_t(TiledArray::Tensor(range, val))); - } -} - -int main(int argc, char** argv) { - int rc = 0; - - try { - // Initialize runtime - TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv); - - // Get command line arguments - if (argc < 2) { - std::cout << "Usage: " << argv[0] - << " matrix_size block_size [repetitions]\n"; - return 0; - } - const long matrix_size = atol(argv[1]); - const long block_size = atol(argv[2]); - if (matrix_size <= 0) { - std::cerr << "Error: matrix size must be greater than zero.\n"; - return 1; - } - if (block_size <= 0) { - std::cerr << "Error: block size must be greater than zero.\n"; - return 1; - } - if ((matrix_size % block_size) != 0ul) { - std::cerr << "Error: matrix size must be evenly divisible by block " - "size.\n"; - return 1; - } - const long repeat = (argc >= 4 ? atol(argv[3]) : 5); - if (repeat <= 0) { - std::cerr << "Error: number of repetitions must be greater than zero.\n"; - return 1; - } - - const std::size_t num_blocks = matrix_size / block_size; - const std::size_t block_count = num_blocks * num_blocks; - - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test..." 
- << "\nGit description: " << TiledArray::git_description() - << "\nNumber of nodes = " << world.size() - << "\nMatrix size = " << matrix_size << "x" - << matrix_size << "\nBlock size = " << block_size - << "x" << block_size << "\nMemory per matrix = " - << double(matrix_size * matrix_size * sizeof(double)) / 1.0e9 - << " GB\nNumber of blocks = " << block_count - << "\nAverage blocks/node = " - << double(block_count) / double(world.size()) << "\n"; - - const double flop = - 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; - - // Construct TiledRange - std::vector blocking; - blocking.reserve(num_blocks + 1); - for (long i = 0l; i <= matrix_size; i += block_size) blocking.push_back(i); - - std::vector blocking2( - 2, TiledArray::TiledRange1(blocking.begin(), blocking.end())); - - TiledArray::TiledRange trange(blocking2.begin(), blocking2.end()); - - // Construct and initialize arrays - Array_t a(world, trange); - Array_t b(world, trange); - Array_t c(world, trange); - set_tiles(1.0, a); - set_tiles(1.0, b); - - TiledArray::TArrayD a_check(world, trange); - TiledArray::TArrayD b_check(world, trange); - TiledArray::TArrayD c_check(world, trange); - a_check.fill(1.0); - b_check.fill(1.0); - - // Start clock - world.gop.fence(); - if (world.rank() == 0) - std::cout << "Starting iterations: " - << "\n"; - - double total_time = 0.0; - - // Do matrix multiplication - for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - c_check("m,n") = a_check("m,k") * b_check("k,n"); - // world.gop.fence(); - const double time = madness::wall_time() - start; - total_time += time; - if (world.rank() == 0) - std::cout << "Iteration " << i + 1 << " time=" << time - << " GFLOPS=" << flop / time << "\n"; - auto check_it = c_check.begin(); - for (auto it = c.begin(); it != c.end() && check_it != c_check.end(); - ++it, ++check_it) { - auto tile_diff = it->get().tensor().subt(check_it->get()).norm(); - if (tile_diff >= 
1e-15) { - std::cout << "Tile " << it.ordinal() << " failed test " - << " with norm diff " << tile_diff << std::endl; - assert(false); - } - } - } - - // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * flop / total_time << "\n"; - - } catch (TiledArray::Exception& e) { - std::cerr << "!! TiledArray exception: " << e.what() << "\n"; - rc = 1; - } catch (madness::MadnessException& e) { - std::cerr << "!! MADNESS exception: " << e.what() << "\n"; - rc = 1; - } catch (SafeMPI::Exception& e) { - std::cerr << "!! SafeMPI exception: " << e.what() << "\n"; - rc = 1; - } catch (std::exception& e) { - std::cerr << "!! std exception: " << e.what() << "\n"; - rc = 1; - } catch (...) { - std::cerr << "!! exception: unknown exception\n"; - rc = 1; - } - - return rc; -} diff --git a/examples/dgemm/CMakeLists.txt b/examples/gemm/CMakeLists.txt similarity index 94% rename from examples/dgemm/CMakeLists.txt rename to examples/gemm/CMakeLists.txt index 47df67bf36..5808cdec6e 100644 --- a/examples/dgemm/CMakeLists.txt +++ b/examples/gemm/CMakeLists.txt @@ -26,7 +26,7 @@ # Create example executable foreach(_exec ta_blas ta_eigen ta_band ta_dense ta_sparse ta_dense_nonuniform - ta_dense_asymm ta_sparse_grow ta_dense_new_tile + ta_dense_asymm ta_sparse_grow ta_cc_abcd) # Add executable diff --git a/examples/dgemm/README b/examples/gemm/README similarity index 92% rename from examples/dgemm/README rename to examples/gemm/README index bbb80e88c0..de156f154d 100644 --- a/examples/dgemm/README +++ b/examples/gemm/README @@ -12,9 +12,9 @@ Applications usage: ta_band matrix_size block_size band_width [repetitions] - blas matrix_size [repetitions] + ta_blas matrix_size [repetitions] - eigen matrix_size [repetitions] + ta_eigen matrix_size [repetitions] Argument definitions: diff --git a/examples/dgemm/block_size_data_process.py b/examples/gemm/block_size_data_process.py similarity 
index 100% rename from examples/dgemm/block_size_data_process.py rename to examples/gemm/block_size_data_process.py diff --git a/examples/dgemm/block_size_scan.sh b/examples/gemm/block_size_scan.sh similarity index 100% rename from examples/dgemm/block_size_scan.sh rename to examples/gemm/block_size_scan.sh diff --git a/examples/dgemm/ta_band.cpp b/examples/gemm/ta_band.cpp similarity index 86% rename from examples/dgemm/ta_band.cpp rename to examples/gemm/ta_band.cpp index d55550cebd..0743ef734b 100644 --- a/examples/dgemm/ta_band.cpp +++ b/examples/gemm/ta_band.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include @@ -104,38 +105,33 @@ int main(int argc, char** argv) { for (; j < j_end; ++j, ++ij) shape_tensor[ij] = 1.0; } - TiledArray::SparseShape shape(shape_tensor, trange); + TiledArray::SparseShape shape( + shape_tensor, trange, /* per_element_norms_already = */ true); // Construct and initialize arrays TiledArray::TSpArrayD a(world, trange, shape); TiledArray::TSpArrayD b(world, trange, shape); - TiledArray::TSpArrayD c(world, trange); + TiledArray::TSpArrayD c; a.fill(1.0); b.fill(1.0); - // Start clock - world.gop.fence(); - const double wall_time_start = madness::wall_time(); - // Do matrix multiplication + world.gop.fence(); for (int i = 0; i < repeat; ++i) { - c("m,n") = a("m,k") * b("k,n"); - world.gop.fence(); + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();) if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n"; } - // Stop clock - const double wall_time_stop = madness::wall_time(); - // Print results - const long flop = 2.0 * c("m,n").sum().get(); + const auto gflops_per_call = 2.0 * c("m,n").sum().get() / 1.e9; if (world.rank() == 0) { - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * double(flop) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout 
<< "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; } } catch (TiledArray::Exception& e) { diff --git a/examples/dgemm/ta_blas.cpp b/examples/gemm/ta_blas.cpp similarity index 80% rename from examples/dgemm/ta_blas.cpp rename to examples/gemm/ta_blas.cpp index aeefaae908..c97f5bbedc 100644 --- a/examples/dgemm/ta_blas.cpp +++ b/examples/gemm/ta_blas.cpp @@ -17,13 +17,14 @@ * */ +#include #include #include int main(int argc, char** argv) { // Get command line arguments if (argc < 2) { - std::cout << "Usage: " << argv[0] << " matrix_size [repetitions]\n"; + std::cout << "Usage: " << argv[0] << " matrix_size [repetitions = 5]\n"; return 0; } const long matrix_size = atol(argv[1]); @@ -66,31 +67,25 @@ int main(int argc, char** argv) { const integer m = matrix_size, n = matrix_size, k = matrix_size; const integer lda = matrix_size, ldb = matrix_size, ldc = matrix_size; - // Start clock - const double wall_time_start = madness::wall_time(); - // Do matrix multiplication - // Note: If TiledArray has not been configured with blas, this will be an - // eigen call. 
for (int i = 0; i < repeat; ++i) { - gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + TA_RECORD_DURATION( + gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)); } - - // Stop clock - const double wall_time_stop = madness::wall_time(); + auto durations = TiledArray::duration_statistics(); // Cleanup memory free(a); free(b); free(c); - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * 2.0 * - double(matrix_size * matrix_size * matrix_size) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + const auto gflops_per_call = + 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; + std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << "\nMedian GFLOPS = " << gflops_per_call / durations.median + << std::endl; return 0; } diff --git a/examples/dgemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp similarity index 93% rename from examples/dgemm/ta_cc_abcd.cpp rename to examples/gemm/ta_cc_abcd.cpp index c1881063d4..df05e78c4a 100644 --- a/examples/dgemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -35,13 +36,13 @@ bool to_bool(const char* str) { // the last tile absorbs the remainder std::vector make_tiling(unsigned int range_size, unsigned int ntiles) { - const auto average_tile_size = range_size / ntiles; - TA_ASSERT(average_tile_size > ntiles); + const int average_tile_size = range_size / ntiles; std::vector result(ntiles + 1); result[0] = 0; for (long t = 0; t != ntiles - 1; ++t) { - result[t + 1] = - result[t] + average_tile_size + ((t % 2 == 0) ? (t + 1) : (-t)); + result[t + 1] = result[t] + average_tile_size + + std::max(static_cast((t % 2 == 0) ? 
(t + 1) : (-t)), + 1 - average_tile_size); } result[ntiles] = range_size; return result; @@ -174,8 +175,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double flops_per_fma = (complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real - const double n_gflop = flops_per_fma * std::pow(n_occ, 2) * - std::pow(n_uocc, 4) / std::pow(1024., 3); + const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) * + std::pow(n_uocc, 4) / std::pow(1024., 3); // Construct tensors TA::TArrayD t2(world, trange_oovv); @@ -196,13 +197,9 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - + auto tp_start = TiledArray::now(); // this is how the user would express this contraction if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); @@ -223,23 +220,25 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, << error("i,j,a,b").squared_norm().get() << std::endl; } } + TiledArray::record_duration_since(tp_start); - const double stop = madness::wall_time(); - const double time = stop - start; - total_time += time; - const double gflop_rate = n_gflop / time; - total_gflop_rate += gflop_rate; + const double time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; } // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " - << total_time / static_cast(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / static_cast(repeat) << "\n"; + if (world.rank() == 0) { + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " 
s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } template diff --git a/examples/dgemm/ta_dense.cpp b/examples/gemm/ta_dense.cpp similarity index 89% rename from examples/dgemm/ta_dense.cpp rename to examples/gemm/ta_dense.cpp index 82506b1d0d..c0ffebd4dc 100644 --- a/examples/dgemm/ta_dense.cpp +++ b/examples/gemm/ta_dense.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -129,7 +130,7 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, const auto n = trange.elements_range().extent()[0]; const auto complex_T = TiledArray::detail::is_complex::value; - const double gflop = + const double gflops_per_call = (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real * double(n * n * n) / 1.0e9; @@ -168,28 +169,26 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - memtrace("c=a*b"); - const double time = madness::wall_time() - start; - total_time += time; - const double gflop_rate = gflop / time; - total_gflop_rate += gflop_rate; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");) + const auto time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; } // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / double(repeat) << "\n"; + if (world.rank() == 0) { + auto durations = 
TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } // array lifetime scope memtrace("stop"); diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp similarity index 92% rename from examples/dgemm/ta_dense_asymm.cpp rename to examples/gemm/ta_dense_asymm.cpp index 40183603bb..acef959c7a 100644 --- a/examples/dgemm/ta_dense_asymm.cpp +++ b/examples/gemm/ta_dense_asymm.cpp @@ -18,6 +18,7 @@ */ #include +#include #include #include @@ -125,11 +126,11 @@ int main(int argc, char** argv) { using Array = std::decay_t>; using T = TiledArray::detail::numeric_t; const auto complex_T = TiledArray::detail::is_complex_v; - const std::int64_t nflops = + const double gflops_per_call = (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real * static_cast(Nn) * static_cast(Nm) * - static_cast(Nk); + static_cast(Nk) / 1.e9; if (world.rank() == 0) std::cout << "TiledArray: dense matrix multiply test...\n" @@ -182,18 +183,11 @@ int main(int argc, char** argv) { std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - memtrace("c=a*b"); - const double time = madness::wall_time() - start; - total_time += time; - const double gflop_rate = double(nflops) / (time * 1.e9); - total_gflop_rate += gflop_rate; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");) + const double time = TiledArray::durations().back(); + const double gflop_rate = gflops_per_call / time; if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << 
gflop_rate << "\n"; @@ -203,9 +197,13 @@ int main(int argc, char** argv) { const double wall_time_stop = madness::wall_time(); if (world.rank() == 0) { - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << total_gflop_rate / double(repeat) << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; } } // array lifetime scope diff --git a/examples/dgemm/ta_dense_nonuniform.cpp b/examples/gemm/ta_dense_nonuniform.cpp similarity index 87% rename from examples/dgemm/ta_dense_nonuniform.cpp rename to examples/gemm/ta_dense_nonuniform.cpp index c01a4ece11..20e8cce712 100644 --- a/examples/dgemm/ta_dense_nonuniform.cpp +++ b/examples/gemm/ta_dense_nonuniform.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include #include @@ -58,7 +59,7 @@ int main(int argc, char** argv) { const long num_blocks = matrix_size / block_size; const long block_count = num_blocks * num_blocks; - const double flop = + const double gflops_per_call = 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; // Construct TiledRange @@ -121,25 +122,25 @@ int main(int argc, char** argv) { std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - // world.gop.fence(); - const double time = madness::wall_time() - start; - total_time += time; + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();) + const double time = TiledArray::durations().back(); if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time - << " GFLOPS=" << flop / time << "\n"; + << " GFLOPS=" << gflops_per_call / time << "\n"; 
} // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * flop / total_time << "\n"; + if (world.rank() == 0) { + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; + } } catch (TiledArray::Exception& e) { std::cerr << "!! TiledArray exception: " << e.what() << "\n"; diff --git a/examples/dgemm/ta_eigen.cpp b/examples/gemm/ta_eigen.cpp similarity index 76% rename from examples/dgemm/ta_eigen.cpp rename to examples/gemm/ta_eigen.cpp index 0aa5474cd6..018de9a81f 100644 --- a/examples/dgemm/ta_eigen.cpp +++ b/examples/gemm/ta_eigen.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include @@ -50,24 +51,16 @@ int main(int argc, char** argv) { b.fill(1.0); c.fill(0.0); - // Start clock - const double wall_time_start = madness::wall_time(); - - // Do matrix multiplcation + // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - c.noalias() = 1.0 * a * b + 0.0 * c; + TA_RECORD_DURATION(c.noalias() = 1.0 * a * b + 0.0 * c); } - // Stop clock - const double wall_time_stop = madness::wall_time(); - - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * 2.0 * - double(matrix_size * matrix_size * matrix_size) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = " + << (2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9) * + durations.mean_reciprocal + << std::endl; return 0; } diff --git a/examples/dgemm/ta_sparse.cpp b/examples/gemm/ta_sparse.cpp similarity index 100% rename 
from examples/dgemm/ta_sparse.cpp rename to examples/gemm/ta_sparse.cpp diff --git a/examples/dgemm/ta_sparse_grow.cpp b/examples/gemm/ta_sparse_grow.cpp similarity index 100% rename from examples/dgemm/ta_sparse_grow.cpp rename to examples/gemm/ta_sparse_grow.cpp From 1ce2c33dcaa37cd688205aab9679515d0b3e3f6f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 2 Apr 2024 17:36:17 -0400 Subject: [PATCH 371/592] fetching eigen3 via ExternalProject_Add's needs to specify STAMP_DIR and TMP_DIR just like librett and umpire --- external/eigen.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/external/eigen.cmake b/external/eigen.cmake index f2d28076dd..1489b53fe6 100644 --- a/external/eigen.cmake +++ b/external/eigen.cmake @@ -104,7 +104,9 @@ else() ExternalProject_Add(eigen3 PREFIX ${CMAKE_INSTALL_PREFIX} - #--Download step-------------- + STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts + TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable + #--Download step-------------- DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} URL ${EIGEN3_URL} URL_HASH ${EIGEN3_URL_HASH} From 5e22878f2775f711286fcb03b4ddfaf46f089100 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 3 Apr 2024 08:30:25 -0400 Subject: [PATCH 372/592] to avoid premature creation of install directory ExternalProject_add will use FETCHCONTENT_BASE_DIR as PREFIX --- external/eigen.cmake | 2 +- external/librett.cmake | 2 +- external/umpire.cmake | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/external/eigen.cmake b/external/eigen.cmake index 1489b53fe6..57bbead90d 100644 --- a/external/eigen.cmake +++ b/external/eigen.cmake @@ -103,7 +103,7 @@ else() message("** Will build Eigen from ${EIGEN3_URL}") ExternalProject_Add(eigen3 - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts # needed in 
case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- diff --git a/external/librett.cmake b/external/librett.cmake index c04cf56b38..afebabb486 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -109,7 +109,7 @@ else() message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}") ExternalProject_Add(librett - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- diff --git a/external/umpire.cmake b/external/umpire.cmake index c7a02d65bf..57675ca189 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -163,7 +163,7 @@ else() message(STATUS "custom target Umpire is expected to build these byproducts: ${UMPIRE_BUILD_BYPRODUCTS}") ExternalProject_Add(Umpire - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- From c9d16a98715d640192ac422248674ae0f4032743 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 5 Apr 2024 11:13:21 -0400 Subject: [PATCH 373/592] Pick up changes from the following commit: df7e0c804dfa6f5901fa4e6bedbeb1993e2a5286 --- src/TiledArray/tensor/kernels.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 8172587d66..876ed00feb 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -148,10 +148,13 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); // Get the leading dimension for left and right matrices. 
- const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? n + : k)); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -219,8 +222,9 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, } } #else // TA_ENABLE_TILE_OPS_LOGGING + const integer ldc = std::max(integer{1}, n); math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, - alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); + alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); #endif // TA_ENABLE_TILE_OPS_LOGGING } } From 154d42703990c796613d60c8de1a0479e95a31e1 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 16 Apr 2024 11:26:22 -0400 Subject: [PATCH 374/592] typo. --- src/TiledArray/tensor/kernels.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 876ed00feb..699496d77e 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -1280,7 +1280,6 @@ auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, } else { auto pA = A.permute(perm.AC); return pA.mult_to(B.permute(perm.BC)); - return pA; } } From 2bfd5aa63d446e0a2b1fc65113b988d1bc0f85e4 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 17 Apr 2024 17:58:04 -0400 Subject: [PATCH 375/592] More corner cases of ToT evaluations supported. 
--- src/TiledArray/einsum/tiledarray.h | 117 +++++++++++++++++++++++++++++ tests/einsum.cpp | 20 +++++ 2 files changed, 137 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index d2d8b2c9ba..c0fdbd3e66 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -167,6 +167,101 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { return result; } +template +auto reduce_modes(Tensor const &orig, size_t drank) { + TA_ASSERT(orig.nbatch() == 1); + auto const orig_rng = orig.range(); + TA_ASSERT(orig_rng.rank() > drank); + + auto const result_rng = [orig_rng, drank]() { + container::vector r1s; + for (auto i = 0; i < orig_rng.rank() - drank; ++i) + r1s.emplace_back(orig_rng.dim(i)); + return TA::Range(r1s); + }(); + + auto const delta_rng = [orig_rng, drank]() { + container::vector r1s; + for (auto i = orig_rng.rank() - drank; i < orig_rng.rank(); ++i) + r1s.emplace_back(orig_rng.dim(i)); + return TA::Range(r1s); + }(); + + auto const delta_vol = delta_rng.volume(); + + auto reducer = [orig, delta_vol, delta_rng](auto const &ix) { + auto orig_ix = ix; + std::copy(delta_rng.lobound().begin(), // + delta_rng.lobound().end(), // + std::back_inserter(orig_ix)); + + auto beg = orig.data() + orig.range().ordinal(orig_ix); + auto end = beg + delta_vol; + + // cannot get it done this way: return std::reduce(beg, end); + + typename std::iterator_traits::value_type sum{}; + for (; beg != end; ++beg) sum += *beg; + return sum; + }; + + return Tensor(result_rng, reducer); +} + +/// +/// \param orig Input DistArray. +/// \param drank Reduce this many modes from the end as implied in the +/// tiled range of the input array. +/// \return Array with reduced rank.
+/// +template +auto reduce_modes(TA::DistArray orig, size_t drank) { + TA_ASSERT(orig.trange().rank() > drank); + + auto const result_trange = [orig, drank]() { + container::svector tr1s; + for (auto i = 0; i < (orig.trange().rank() - drank); ++i) + tr1s.emplace_back(orig.trange().at(i)); + return TiledRange(tr1s); + }(); + + auto const delta_trange = [orig, drank]() { + container::svector tr1s; + for (auto i = orig.trange().rank() - drank; i < orig.trange().rank(); ++i) + tr1s.emplace_back(orig.trange().at(i)); + return TiledRange(tr1s); + }(); + + orig.make_replicated(); + orig.world().gop.fence(); + + auto make_tile = [orig, delta_trange, drank](auto &tile, auto const &rng) { + using tile_type = std::remove_reference_t; + + tile_type res(rng, typename tile_type::value_type{}); + + for (auto &&r : delta_trange.tiles_range()) { + container::svector ix1s = rng.lobound(); + + { + auto dlo = delta_trange.make_tile_range(r).lobound(); + std::copy(dlo.begin(), dlo.end(), std::back_inserter(ix1s)); + } + + auto tix = orig.trange().element_to_tile(ix1s); + auto got = orig.find_local(tix).get(false); + + res += reduce_modes(got, drank); + } + + tile = res; + return res.norm(); + }; + + return make_array>(orig.world(), result_trange, + make_tile); +} + template TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { container::svector tr1s; @@ -320,6 +415,28 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C; } + // + // special Hadamard + contraction + // when ToT times T implied and T's indices are contraction AND Hadamard + // BUT not externals + // + if constexpr (!AreArraySame && + DeNestFlag == DeNest::False) { + auto hi_size = h.size() + i.size(); + if (hi_size != h.size() && hi_size != i.size() && + ((hi_size == a.size() && IsArrayT) || + (hi_size == b.size() && IsArrayT))) { + auto annot_c = std::string(h + e + i) + inner.c; + auto temp1 = einsum(A, B, idx(annot_c), world); + auto temp2 = reduce_modes(temp1, i.size()); + + auto annot_c_ 
= std::string(h + e) + inner.c; + decltype(temp2) result; + result(std::string(c) + inner.c) = temp2(annot_c_); + return result; + } + } + using ::Einsum::index::permutation; using TiledArray::Permutation; diff --git a/tests/einsum.cpp b/tests/einsum.cpp index cfae7b5925..8e76fc08a4 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -404,6 +404,26 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {{0, 3, 5}, {0, 3, 8}}, // {{0, 3, 8}, {0, 3, 5}, {0, 2}}, // {3, 9}))); + + BOOST_REQUIRE(check_manual_eval("bi,bi->i", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}})); + + BOOST_REQUIRE(check_manual_eval("bi;a,bi;a->i;a", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}}, // + {3}, {3})); + + BOOST_REQUIRE( + (check_manual_eval("jk;a,ijk->i;a", // + {{0, 2}, {0, 4}}, // + {{0, 3}, {0, 2}, {0, 4}}, // + {5}))); + + BOOST_REQUIRE((check_manual_eval("bi;a,bi->i;a", // + {{0, 4, 8}, {0, 4}}, // + {{0, 4, 8}, {0, 4}}, // + {8}))); } BOOST_AUTO_TEST_SUITE_END() From 92e416e3bfb83b13493a31d13a2aad88f70a9ab3 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 1 May 2024 11:54:10 -0400 Subject: [PATCH 376/592] Fix `ToT times ToT into T` kind of evaluations so that varying inner tensor extents are supported. --- src/TiledArray/einsum/tiledarray.h | 148 ++++++++++++++++++++++++++--- 1 file changed, 133 insertions(+), 15 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c0fdbd3e66..4953141fbd 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -26,6 +26,102 @@ using ::Einsum::index::IndexMap; using ::Einsum::index::Permutation; using ::Einsum::index::permutation; +/// +/// \tparam T A type that parameterizes ::Einsum::Index. +/// +/// This class makes it easier to work with indices involved in a binary +/// tensor multiplication. Also defines a canonical order of the indices. 
+/// +/// Consider an arbitrary binary tensor multiplication annotated as: +/// A(a_1,...,a_m) * B(b_1,...,b_n) -> C(c_1,...,c_l) +/// Note that {c_1,...,c_l} is subset of ({a_1,...,a_m} union {b_1,...,b_n}). +/// +/// We define following index types. +/// * Hadamard index: An index that annotates A, B, and C. +/// * Contracted index: An index that annotates A and B but not C. +/// * External index of A: An index that annotates A and C but not B. +/// * External index of B: An index that annotates B and C but not A. +/// +/// Defining canonical index ordering. +/// * Hadamard indices are canonically ordered if they appear in the same +/// order in A's annotation. +/// * Contracted indices are canonically ordered if they appear in the same +/// order in A's annotation. +/// * External indices of A are canonically ordered if they appear in the +/// same order in A's annotation. +/// * External indices of B are canonically ordered if they appear in the +/// same order in B's annotation. +/// * Tensor A's indices are canonically ordered if Hadamard, external +/// indices of A, and contracted indices appear in that order and all +/// three index groups are themselves canonically ordered. +/// * Tensor B's indices are canonically ordered if Hadamard, external +/// indices of B, and contracted indices appear in that order and all +/// three index groups are themselves canonically ordered. +/// * Tensor C's indices are canonically ordered if Hadamard, external +/// indices of A and external indices of B appear in that order and all +/// three index groups are themselves canonically ordered. +/// +/// Example: Consider the evaluation: A(i,j,p,a,b) * B(j,i,q,b,a) -> C(i,p,j,q). +/// - Hadamard indices: {i,j} +/// - External indices of A: {p} +/// - External indices of B: {q} +/// - Contracted indices: {a, b} +/// All index groups above are canonically ordered. +/// Writing C's indices in canonical order would give: {i,j,p,q}. 
+/// +template +class TensorOpIndices { + public: + using index_t = ::Einsum::Index; + + TensorOpIndices(index_t const &ixA, index_t const &ixB, index_t const &ixC) + : orig_indices_({ixA, ixB, ixC}) { + hadamard_ = ixA & ixB & ixC; + contracted_ = (ixA & ixB) - ixC; + external_A_ = (ixA - ixB) & ixC; + external_B_ = (ixB - ixA) & ixC; + } + + [[nodiscard]] index_t const &ix_A() const { return orig_indices_[A]; } + [[nodiscard]] index_t const &ix_B() const { return orig_indices_[B]; } + [[nodiscard]] index_t const &ix_C() const { return orig_indices_[C]; } + + [[nodiscard]] index_t ix_A_canon() const { + return hadamard() + external_A() + contracted(); + } + + [[nodiscard]] index_t ix_B_canon() const { + return hadamard() + external_B() + contracted(); + } + + [[nodiscard]] index_t ix_C_canon() const { + return hadamard() + external_A() + external_B(); + } + + [[nodiscard]] index_t const &hadamard() const { return hadamard_; } + [[nodiscard]] index_t const &contracted() const { return contracted_; } + [[nodiscard]] index_t const &external_A() const { return external_A_; } + [[nodiscard]] index_t const &external_B() const { return external_B_; } + + [[nodiscard]] Permutation to_canon_A() const { + return ::Einsum::index::permutation(ix_A(), ix_A_canon()); + } + + [[nodiscard]] Permutation to_canon_B() const { + return ::Einsum::index::permutation(ix_B(), ix_B_canon()); + } + + [[nodiscard]] Permutation to_canon_C() const { + return ::Einsum::index::permutation(ix_C(), ix_C_canon()); + } + + private: + enum { A, B, C, ABC }; + std::array orig_indices_; + + index_t hadamard_, contracted_, external_A_, external_B_; +}; + /// converts the annotation of an expression to an Index template auto idx(const std::string &s) { @@ -334,20 +430,22 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, "Nested-rank-reduction only supported when the inner tensor " "ranks match on the arguments"); - // Step I: A * B -> C' - // Step II: C' -> C // - // At "Step I", a general 
product (without reduction) in outer indices, - // and pure Hadamard product in inner indices is carried out. - // Then at "Step II", the inner tensors are reduced with a unary function. - // The reducing function is determined by looking at the contracting and - // non-contracting outer indices. + // Illustration of steps by an example. // - // eg. A(i,j,k;a,b) * B(k,j;a,b) -> C(i,j) involves following two steps: - // Step I: A(i,j,k;a,b) * B(k,j;a,b) -> C'(i,j;a,b) - // Step II: C'(i,j;a,b) -> C(i,j) - - auto Cp = einsum(A, B, std::string(c) + ";" + std::string(inner.i)); + // Consider the evaluation: A(ijpab;xy) * B(jiqba;yx) -> C(ipjq). + // + // Note for the outer indices: + // - Hadamard: 'ij' + // - External A: 'p' + // - External B: 'q' + // - Contracted: 'ab' + // + // Now C is evaluated in the following steps. + // Step I: A(ijpab;xy) * B(jiqba;yx) -> C0(ijpqab;xy) + // Step II: C0(ijpqab;xy) -> C1(ijpqab) + // Step III: C1(ijpqab) -> C2(ijpq) + // Step IV: C2(ijpq) -> C(ipjq) auto sum_tot_2_tos = [](auto const &tot) { typename std::remove_reference_t::value_type result( @@ -355,12 +453,32 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return result; }; - auto result = TA::foreach( - Cp, [sum_tot_2_tos](auto &out_tile, auto const &in_tile) { + auto const oixs = TensorOpIndices(a, b, c); + + struct { + std::string C0, C1, C2; + } const Cn_annot{ + std::string(oixs.ix_C_canon() + oixs.contracted()) + inner.a, + {oixs.ix_C_canon() + oixs.contracted()}, + {oixs.ix_C_canon()}}; + + // Step I: A(ijpab;xy) * B(jiqba;yx) -> C0(ijpqab;xy) + auto C0 = einsum(A, B, Cn_annot.C0); + + // Step II: C0(ijpqab;xy) -> C1(ijpqab) + auto C1 = TA::foreach( + C0, [sum_tot_2_tos](auto &out_tile, auto const &in_tile) { out_tile = sum_tot_2_tos(in_tile); }); - return result; + // Step III: C1(ijpqab) -> C2(ijpq) + auto C2 = reduce_modes(C1, oixs.contracted().size()); + + // Step IV: C2(ijpq) -> C(ipjq) + ArrayC C; + C(c) = C2(Cn_annot.C2); + return C; + } else { 
// these are "Hadamard" (fused) indices auto h = a & b & c; From 93f6986f15d5a514d1db1f14c9cb23cd837fe505 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 1 May 2024 13:48:01 -0400 Subject: [PATCH 377/592] Bug fix. --- src/TiledArray/einsum/tiledarray.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 4953141fbd..cff0e2cd7b 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -265,6 +265,7 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { template auto reduce_modes(Tensor const &orig, size_t drank) { + if (drank == 0) return orig; TA_ASSERT(orig.nbatch() == 1); auto const orig_rng = orig.range(); TA_ASSERT(orig_rng.rank() > drank); @@ -313,6 +314,7 @@ auto reduce_modes(Tensor const &orig, size_t drank) { template auto reduce_modes(TA::DistArray orig, size_t drank) { TA_ASSERT(orig.trange().rank() > drank); + if (drank == 0) return orig; auto const result_trange = [orig, drank]() { container::svector tr1s; From 2e67af6243476f41cf137cc9bede3ee18aa0905f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 7 May 2024 17:00:04 -0400 Subject: [PATCH 378/592] Simplify `(H+E,H)->H+E` logic. 
--- src/TiledArray/einsum/tiledarray.h | 31 ++++++++++-------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index cff0e2cd7b..3da230ca19 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -504,34 +504,23 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, auto range_map = (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); - auto perm_and_rank_replicate = [delta_trng = make_trange(range_map, e)]( - auto pre, // - std::string const &pre_annot, // - std::string const &permed_annot) { - decltype(pre) permed; - permed(permed_annot) = pre(pre_annot); - return replicate_array(permed, delta_trng); - }; - // special Hadamard if (h.size() == a.size() || h.size() == b.size()) { TA_ASSERT(!i && e); - bool small_a = h.size() == a.size(); - std::string const eh_annot = (e | h); - std::string const permed_annot = - std::string(h) + (small_a ? 
inner.a : inner.b); - std::string const C_annot = std::string(c) + inner.c; - std::string const temp_annot = std::string(e) + "," + permed_annot; + bool const small_a = h.size() == a.size(); + auto const delta_trng = make_trange(range_map, e); + std::string target_layout = std::string(c) + inner.c; ArrayC C; if (small_a) { - auto temp = - perm_and_rank_replicate(A.array(), A.annotation(), permed_annot); - C(C_annot) = temp(temp_annot) * B; + auto temp = replicate_array(A.array(), delta_trng); + std::string temp_layout = std::string(e) + "," + A.annotation(); + C(target_layout) = temp(temp_layout) * B; } else { - auto temp = - perm_and_rank_replicate(B.array(), B.annotation(), permed_annot); - C(C_annot) = A * temp(temp_annot); + auto temp = replicate_array(B.array(), delta_trng); + std::string temp_layout = std::string(e) + "," + B.annotation(); + C(target_layout) = A * temp(temp_layout); } + return C; } From 1e07eb065e0c4795d41c7e75a20e0ab680f084b6 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 8 May 2024 12:58:52 -0400 Subject: [PATCH 379/592] [skip ci] Add corner case of outer Hadamard and inner outer-product kind of ToT eval. --- tests/einsum.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8e76fc08a4..6ca0a611ff 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -232,6 +232,10 @@ BOOST_AUTO_TEST_CASE(equal_nested_ranks) { {3}, // {2})); // H+C;H+C not supported + + // H;C(op) + BOOST_REQUIRE(check_manual_eval( + "ijk;bc,j;d->kji;dcb", {{0, 1}, {0, 1}, {0, 1}}, {{0, 1}}, {2, 3}, {4})); } BOOST_AUTO_TEST_CASE(different_nested_ranks) { From 3a68f0c54e97f68e21f50e942472a18cbf5e636e Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 8 May 2024 14:37:39 -0400 Subject: [PATCH 380/592] bug fix. 
--- src/TiledArray/expressions/cont_engine.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index d40e9c88fc..58d7b9ad57 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -513,6 +513,7 @@ class ContEngine : public BinaryEngine { const left_tile_element_type& left, const right_tile_element_type& right) { contrreduce_op(result, left, right); + result = contrreduce_op(result); // permutations of result are applied as "postprocessing" }; } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { From 694212027c13080ad0d3c89b812312d079cf9d25 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Fri, 10 May 2024 14:54:53 -0400 Subject: [PATCH 381/592] Outer tensor contraction logic update. When the contraction occurs in the outer tensor (between two ToTs or between T or ToT), the contraction step cannot be passed to the expression layer. Example: `J = TA::einsum(I("i,j,k,l;eik,fjl"), S_1("i,j,k;aij,eik"), "i,j,l;aij,fjl");` --- src/TiledArray/einsum/tiledarray.h | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 3da230ca19..911591bd13 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -525,25 +525,18 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, } // - // special Hadamard + contraction - // when ToT times T implied and T's indices are contraction AND Hadamard - // BUT not externals + // when contraction happens in the outer tensor + // need to evaluate specially.. 
// - if constexpr (!AreArraySame && - DeNestFlag == DeNest::False) { - auto hi_size = h.size() + i.size(); - if (hi_size != h.size() && hi_size != i.size() && - ((hi_size == a.size() && IsArrayT) || - (hi_size == b.size() && IsArrayT))) { - auto annot_c = std::string(h + e + i) + inner.c; - auto temp1 = einsum(A, B, idx(annot_c), world); - auto temp2 = reduce_modes(temp1, i.size()); - - auto annot_c_ = std::string(h + e) + inner.c; - decltype(temp2) result; - result(std::string(c) + inner.c) = temp2(annot_c_); - return result; - } + if (IsArrayToT && i.size() > 0) { + auto annot_c = std::string(h + e + i) + inner.c; + auto temp1 = einsum(A, B, idx(annot_c), world); + auto temp2 = reduce_modes(temp1, i.size()); + + auto annot_c_ = std::string(h + e) + inner.c; + decltype(temp2) result; + result(std::string(c) + inner.c) = temp2(annot_c_); + return result; } using ::Einsum::index::permutation; From 95bdee5f0add895dd3315f6281ed56b0a2447e03 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 15 May 2024 14:25:21 -0400 Subject: [PATCH 382/592] Try using latest-stable xcode in CI. 
--- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7753a3436d..35acea8182 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,7 +35,7 @@ jobs: - uses: maxim-lobanov/setup-xcode@v1 with: - xcode-version: '<14' + xcode-version: 'latest-stable' - name: Host system info shell: bash From 1607ab4d15c4cc3389284f29d06d008588d294e2 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 15 May 2024 14:36:35 -0400 Subject: [PATCH 383/592] use gcc@11 in CI --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 35acea8182..6b899e6b31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ jobs: cxx : [ clang++, /usr/local/bin/g++-10 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] - prerequisites : [ gcc@10 boost eigen open-mpi bison scalapack ] + prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ] name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }} ${{ matrix.task_backend }}" runs-on: ${{ matrix.os }} From ecd4caf365387ed6e869468952a40fbe11b81256 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 15 May 2024 14:43:03 -0400 Subject: [PATCH 384/592] use gcc@11 in CI [ammend] --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6b899e6b31..b10b6f1ada 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: os : [ macos-latest ] - cxx : [ clang++, /usr/local/bin/g++-10 ] + cxx : [ clang++, /usr/local/bin/g++-11 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ] From ec51ab0f7e8948a5edd9fda0c38a402674752a57 Mon Sep 17 00:00:00 2001 From: 
Bimal Gaudel Date: Wed, 15 May 2024 15:24:14 -0400 Subject: [PATCH 385/592] Try setting GNU/gcc compiler from `/opt/homebrew/bin`. --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b10b6f1ada..2339070e54 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: os : [ macos-latest ] - cxx : [ clang++, /usr/local/bin/g++-11 ] + cxx : [ clang++, /opt/homebrew/bin/g++-11 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ] From 40705d7c26ce7a00cd620f0cceac25010aaccfc9 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 16 May 2024 11:18:33 -0400 Subject: [PATCH 386/592] Avoid using deprecated `TA::Array` typedef. --- tests/einsum.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 6ca0a611ff..d1afaf74e6 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -117,7 +117,7 @@ bool check_manual_eval(std::string const& annot, il_trange trangeA, } BOOST_AUTO_TEST_CASE(contract) { - using Array = TA::Array; + using Array = TA::TArrayI; BOOST_REQUIRE(check_manual_eval("ij,j->i", {{0, 2, 4}, {0, 4, 8}}, // A's trange @@ -136,7 +136,7 @@ BOOST_AUTO_TEST_CASE(contract) { } BOOST_AUTO_TEST_CASE(hadamard) { - using Array = TA::Array; + using Array = TA::TArrayI; BOOST_REQUIRE(check_manual_eval("i,i->i", // {{0, 1}}, // {{0, 1}} // @@ -153,13 +153,11 @@ BOOST_AUTO_TEST_CASE(hadamard) { } BOOST_AUTO_TEST_CASE(general) { - using Array = TA::Array; + using Array = TA::TArrayI; BOOST_REQUIRE(check_manual_eval("ijk,kil->ijl", // {{0, 2}, {0, 3, 5}, {0, 2, 4}}, // {{0, 2, 4}, {0, 2}, {0, 1}} // )); - - using Array = TA::Array; using Tensor = typename Array::value_type; using namespace std::string_literals; From 6e864ebddac3c7dd45d3888e9a0cb9441c0bed12 Mon Sep 17 00:00:00 2001 
From: Bimal Gaudel Date: Thu, 16 May 2024 14:57:38 -0400 Subject: [PATCH 387/592] `reduce_modes` function impl. amended to handle sparse dist-arrays. --- src/TiledArray/einsum/tiledarray.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 911591bd13..6c1e52c5fc 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -338,6 +338,7 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { tile_type res(rng, typename tile_type::value_type{}); + bool all_summed_tiles_zeros{true}; for (auto &&r : delta_trange.tiles_range()) { container::svector ix1s = rng.lobound(); @@ -347,11 +348,18 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { } auto tix = orig.trange().element_to_tile(ix1s); + if constexpr (std::is_same_v::policy_type, + SparsePolicy>) + if (orig.is_zero(tix)) continue; auto got = orig.find_local(tix).get(false); res += reduce_modes(got, drank); + all_summed_tiles_zeros = false; } + if (all_summed_tiles_zeros) + return typename std::remove_reference_t::scalar_type{0}; + tile = res; return res.norm(); }; From 02c9ba90e8ea74fe5a55aff4bb89b0573b82c9a9 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 May 2024 12:18:32 -0400 Subject: [PATCH 388/592] Refactor OuterInnerIndices to OuterInnerSetup. --- tests/tot_array_fixture.h | 129 ++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 54 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 94f57b0930..4710ab79d7 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -229,50 +229,6 @@ void apply_partial_perm(T& to, T const& from, PartialPerm const& p) { } } -/// -/// Example: To represent A("ik;ac") * B("kj;cb") -> C("ij;ab"), -/// construct with std::string("ij;ac,kj;cb->ij;ab"); -/// outer_indices;inner_indices annotates a single object (DistArray, Tensor -/// etc.) 
A_indices,B_indices annotates first(A) and second(B) object -/// '->' separates argument objects' annotation from the result's annotation -/// -class OuterInnerIndices { - // array[0] annotes A - // array[1] annotes B - // array[2] annotes C - std::array outer_, inner_; - - public: - OuterInnerIndices(std::string const& annot) { - using ::Einsum::string::split2; - - constexpr size_t A = 0; - constexpr size_t B = 1; - constexpr size_t C = 2; - - auto [ab, aC] = split2(annot, "->"); - std::tie(outer_[C], inner_[C]) = split2(aC, ";"); - - auto [aA, aB] = split2(ab, ","); - - std::tie(outer_[A], inner_[A]) = split2(aA, ";"); - std::tie(outer_[B], inner_[B]) = split2(aB, ";"); - } - - template - OuterInnerIndices(const char (&s)[N]) : OuterInnerIndices{std::string(s)} {} - - [[nodiscard]] auto const& outer() const noexcept { return outer_; } - [[nodiscard]] auto const& inner() const noexcept { return inner_; } - - [[nodiscard]] auto const& outerA() const noexcept { return outer_[0]; } - [[nodiscard]] auto const& outerB() const noexcept { return outer_[1]; } - [[nodiscard]] auto const& outerC() const noexcept { return outer_[2]; } - [[nodiscard]] auto const& innerA() const noexcept { return inner_[0]; } - [[nodiscard]] auto const& innerB() const noexcept { return inner_[1]; } - [[nodiscard]] auto const& innerC() const noexcept { return inner_[2]; } -}; - enum struct TensorProduct { General, Dot, Invalid }; struct ProductSetup { @@ -294,7 +250,7 @@ struct ProductSetup { rank_E, // rank_I; - // ProductSetup() = default; + ProductSetup() = default; template >> @@ -342,6 +298,71 @@ struct ProductSetup { } }; +/// +/// Example: To represent A("ik;ac") * B("kj;cb") -> C("ij;ab") +/// +/// Method 1: +/// --- +/// construct with a single argument std::string("ij;ac,kj;cb->ij;ab"); +/// - the substring ";" +/// annotates a single object (DistArray, Tensor etc.) 
+/// - "," implies two distinct annotations (for A and B) +/// separated by a comma +/// - the right hand side of '->' annotates the result. +/// - Note: the only use of comma is to separate A's and B's annotations. +/// +/// Method 2: +/// --- +/// construct with three arguments: +/// std::string("i,k;a,c"), std::string("k,j;c,b"), std::string("i,j;a,b") +/// - Note the use of comma. +/// +class OuterInnerSetup { + ProductSetup outer_; + ProductSetup inner_; + + public: + OuterInnerSetup(std::string const& annot) { + using ::Einsum::string::split2; + using Ix = ::Einsum::index::Index; + + enum { A, B, C }; + std::array O; + std::array I; + + auto [ab, aC] = split2(annot, "->"); + std::tie(O[C], I[C]) = split2(aC, ";"); + + auto [aA, aB] = split2(ab, ","); + std::tie(O[A], I[A]) = split2(aA, ";"); + std::tie(O[B], I[B]) = split2(aB, ";"); + outer_ = ProductSetup(Ix(O[A]), Ix(O[B]), Ix(O[C])); + inner_ = ProductSetup(Ix(I[A]), Ix(I[B]), Ix(I[C])); + } + + template + OuterInnerSetup(const char (&s)[N]) : OuterInnerSetup{std::string(s)} {} + + OuterInnerSetup(std::string const& annotA, std::string const& annotB, + std::string const& annotC) { + using ::Einsum::string::split2; + using Ix = ::Einsum::index::Index; + + enum { A, B, C }; + std::array O; + std::array I; + std::tie(O[A], I[A]) = split2(annotA, ";"); + std::tie(O[B], I[B]) = split2(annotB, ";"); + std::tie(O[C], I[C]) = split2(annotC, ";"); + outer_ = ProductSetup(Ix(O[A]), Ix(O[B]), Ix(O[C])); + inner_ = ProductSetup(Ix(I[A]), Ix(I[B]), Ix(I[C])); + } + + [[nodiscard]] auto const& outer() const noexcept { return outer_; } + + [[nodiscard]] auto const& inner() const noexcept { return inner_; } +}; + namespace { auto make_perm(PartialPerm const& pp) { @@ -569,28 +590,28 @@ auto general_product(TA::DistArray A, template >> -auto manual_eval(OuterInnerIndices const& oixs, ArrayA A, ArrayB B) { +auto manual_eval(OuterInnerSetup const& setups, ArrayA A, ArrayB B) { constexpr auto mnr = 
TA::detail::max_nested_rank; static_assert(mnr == 1 || mnr == 2); - auto const outer_setup = ProductSetup(oixs.outer()); + auto const& outer = setups.outer(); + auto const& inner = setups.inner(); - TA_ASSERT(outer_setup.valid()); + TA_ASSERT(outer.valid()); if constexpr (mnr == 2) { - auto const inner_setup = ProductSetup(oixs.inner()); - TA_ASSERT(inner_setup.valid()); + TA_ASSERT(inner.valid()); if constexpr (DeNestFlag == DeNest::True) { // reduced nested rank in result using TA::detail::nested_rank; static_assert(nested_rank == nested_rank); - TA_ASSERT(inner_setup.rank_C == 0); + TA_ASSERT(inner.rank_C == 0); using TileC = typename ArrayA::value_type::value_type; - return general_product(A, B, outer_setup, inner_setup); + return general_product(A, B, outer, inner); } else - return general_product(A, B, outer_setup, inner_setup); + return general_product(A, B, outer, inner); } else { - return general_product(A, B, outer_setup); + return general_product(A, B, outer); } } From f00c2dc55a84c6722765bebb501e0f4d62da9f3a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 20 May 2024 13:02:43 -0400 Subject: [PATCH 389/592] Make dense_array.set(..) more generic. --- src/TiledArray/conversions/sparse_to_dense.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/conversions/sparse_to_dense.h b/src/TiledArray/conversions/sparse_to_dense.h index c5bdd812c5..89c45a2cb6 100644 --- a/src/TiledArray/conversions/sparse_to_dense.h +++ b/src/TiledArray/conversions/sparse_to_dense.h @@ -53,7 +53,7 @@ to_dense(DistArray const& sparse_array) { dense_array.set(ord, tile); } else { // see DistArray::set(ordinal, element_type) - dense_array.set(ord, 0); + dense_array.set(ord, typename ArrayType::value_type{}); } } From 4855438d810b24fbb3d3af09c4be04ac3ec20214 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 21 May 2024 08:45:31 -0400 Subject: [PATCH 390/592] Amend make dense_array.set(..) more generic. 
--- src/TiledArray/conversions/sparse_to_dense.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/conversions/sparse_to_dense.h b/src/TiledArray/conversions/sparse_to_dense.h index 89c45a2cb6..4c8728bc94 100644 --- a/src/TiledArray/conversions/sparse_to_dense.h +++ b/src/TiledArray/conversions/sparse_to_dense.h @@ -52,8 +52,13 @@ to_dense(DistArray const& sparse_array) { Tile tile(sparse_array.find(ord).get().clone()); dense_array.set(ord, tile); } else { - // see DistArray::set(ordinal, element_type) - dense_array.set(ord, typename ArrayType::value_type{}); + if constexpr (detail::is_tensor_of_tensor_v) { + // `zero' tiles that satisfy detail::is_tensor_of_tensor_v + // will be left uninitialized + } else { + // see DistArray::set(ordinal, element_type) + dense_array.set(ord, 0); + } } } From 00663f80e39beafcf2a316d56d6c6a90d0ec9711 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 21 May 2024 09:07:45 -0400 Subject: [PATCH 391/592] Amend make dense_array.set(..) more generic. 
--- src/TiledArray/conversions/sparse_to_dense.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/TiledArray/conversions/sparse_to_dense.h b/src/TiledArray/conversions/sparse_to_dense.h index 4c8728bc94..7ee6e92049 100644 --- a/src/TiledArray/conversions/sparse_to_dense.h +++ b/src/TiledArray/conversions/sparse_to_dense.h @@ -52,13 +52,7 @@ to_dense(DistArray const& sparse_array) { Tile tile(sparse_array.find(ord).get().clone()); dense_array.set(ord, tile); } else { - if constexpr (detail::is_tensor_of_tensor_v) { - // `zero' tiles that satisfy detail::is_tensor_of_tensor_v - // will be left uninitialized - } else { - // see DistArray::set(ordinal, element_type) - dense_array.set(ord, 0); - } + dense_array.set(ord, typename Tile::value_type{}); } } From 4b3f39f6fd77c2f6f43926aea2f7139434e5e421 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 21 May 2024 11:45:48 -0400 Subject: [PATCH 392/592] Remove use of deprecated typedefs.. (moar) --- tests/einsum.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index d1afaf74e6..8c9e0ae057 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -117,7 +117,7 @@ bool check_manual_eval(std::string const& annot, il_trange trangeA, } BOOST_AUTO_TEST_CASE(contract) { - using Array = TA::TArrayI; + using Array = TA::DistArray>; BOOST_REQUIRE(check_manual_eval("ij,j->i", {{0, 2, 4}, {0, 4, 8}}, // A's trange @@ -136,7 +136,7 @@ BOOST_AUTO_TEST_CASE(contract) { } BOOST_AUTO_TEST_CASE(hadamard) { - using Array = TA::TArrayI; + using Array = TA::DistArray>; BOOST_REQUIRE(check_manual_eval("i,i->i", // {{0, 1}}, // {{0, 1}} // @@ -153,7 +153,7 @@ BOOST_AUTO_TEST_CASE(hadamard) { } BOOST_AUTO_TEST_CASE(general) { - using Array = TA::TArrayI; + using Array = TA::DistArray>; BOOST_REQUIRE(check_manual_eval("ijk,kil->ijl", // {{0, 2}, {0, 3, 5}, {0, 2, 4}}, // {{0, 2, 4}, {0, 2}, {0, 1}} // From c94488edbf930456ccd489ff5dc8823530a134ed Mon Sep 17 
00:00:00 2001 From: Bimal Gaudel Date: Tue, 21 May 2024 12:57:25 -0400 Subject: [PATCH 393/592] Use of `ArrayIterator::ordinal()` changed to `ArrayIterator::index()` to allow rank-1 array handling. --- src/TiledArray/conversions/dense_to_sparse.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/conversions/dense_to_sparse.h b/src/TiledArray/conversions/dense_to_sparse.h index e5c23cf5ba..6147c01a56 100644 --- a/src/TiledArray/conversions/dense_to_sparse.h +++ b/src/TiledArray/conversions/dense_to_sparse.h @@ -27,7 +27,7 @@ to_sparse(DistArray const &dense_array) { const auto begin = dense_array.begin(); for (auto it = begin; it != end; ++it) { // write the norm of each local tile to the tensor - norm(it->get(), tile_norms[it.ordinal()]); + norm(it->get(), tile_norms[it.index()]); } // Construct a sparse shape the constructor will handle communicating the @@ -40,9 +40,9 @@ to_sparse(DistArray const &dense_array) { // sparse_array set the sparse array tile with a clone so as not to hold // a pointer to the original tile. for (auto it = begin; it != end; ++it) { - const auto ord = it.ordinal(); - if (!sparse_array.is_zero(ord)) { - sparse_array.set(ord, it->get().clone()); + const auto ix = it.index(); + if (!sparse_array.is_zero(ix)) { + sparse_array.set(ix, it->get().clone()); } } From 1dea3466d80101edbc75630c1992810673cabac4 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 22 May 2024 13:36:06 -0400 Subject: [PATCH 394/592] Bug fix. 
--- tests/einsum.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 8c9e0ae057..e58e17b9e8 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -64,7 +64,7 @@ template bool check_manual_eval(std::string const& annot, il_trange trangeA, il_trange trangeB) { - return check_manual_eval(annot, trangeA, + return check_manual_eval(annot, trangeA, trangeB); } From 44f8ec37d2eda4097eb43da0281e6578b87fdfe7 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 28 May 2024 10:09:33 -0400 Subject: [PATCH 395/592] Compare shape by default on Sparse policy distarrays. --- tests/einsum.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/einsum.cpp b/tests/einsum.cpp index e58e17b9e8..a5980c7446 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -33,12 +33,18 @@ using il_extent = std::initializer_list; } // namespace template >> bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { auto out = TA::einsum(annot, A, B); auto ref = manual_eval(annot, A, B); + + using Policy = typename decltype(out)::policy_type; + if constexpr (ShapeCompFlag == ShapeComp::True && + std::is_same_v) { + out.truncate(); + } return ToTArrayFixture::are_equal(ref, out); } @@ -50,7 +56,7 @@ bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { } template + ShapeComp ShapeCompFlag = ShapeComp::True> bool check_manual_eval(std::string const& annot, il_trange trangeA, il_trange trangeB) { static_assert(detail::is_array_v && @@ -69,7 +75,7 @@ bool check_manual_eval(std::string const& annot, il_trange trangeA, } template + ShapeComp ShapeCompFlag = ShapeComp::True> bool check_manual_eval(std::string const& annot, il_trange trangeA, il_trange trangeB, il_extent inner_extents) { static_assert(detail::is_array_v); @@ -96,7 +102,7 @@ bool check_manual_eval(std::string const& annot, il_trange trangeA, } template + ShapeComp ShapeCompFlag = ShapeComp::True> bool 
check_manual_eval(std::string const& annot, il_trange trangeA, il_trange trangeB, il_extent inner_extentsA, il_extent inner_extentsB) { From c9cc69ffa0958d5d188c704d7941730c7d58cd85 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 May 2024 07:57:14 -0400 Subject: [PATCH 396/592] Return zero norm for uninitialized tensor-of-tensors tile. --- src/TiledArray/tensor/tensor.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index cd0e7e97f1..38f0e65ff9 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -2373,6 +2373,18 @@ class Tensor { /// \return The vector norm of this tensor scalar_type squared_norm() const { + + if constexpr (detail::is_tensor_v) { + // If uninitialized tensor of tensor return zero. + // All elements of this->data() are empty tensors in this case, + // however, we only look at the first element. + // Because + // - It is expensive to look at all elements. + // - The state of the array having only some empty elements + // is ill-defined and should never happen. + if (detail::empty(*data())) return 0; + } + auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res += TiledArray::detail::squared_norm(arg); From fa242b421464904712261c309847c4194d0b44d6 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 May 2024 08:31:46 -0400 Subject: [PATCH 397/592] Recompute shape for specially handled sparse arrays in einsum. 
--- src/TiledArray/einsum/tiledarray.h | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 6c1e52c5fc..6d1acd784f 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -260,6 +260,10 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { tile = repped; return tile.norm(); }); + + if constexpr (std::is_same_v) + result.truncate(); + return result; } @@ -311,8 +315,8 @@ auto reduce_modes(Tensor const &orig, size_t drank) { /// tiled range of the input array. /// \return Array with reduced rank. /// -template -auto reduce_modes(TA::DistArray orig, size_t drank) { +template +auto reduce_modes(TA::DistArray orig, size_t drank) { TA_ASSERT(orig.trange().rank() > drank); if (drank == 0) return orig; @@ -348,8 +352,7 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { } auto tix = orig.trange().element_to_tile(ix1s); - if constexpr (std::is_same_v::policy_type, - SparsePolicy>) + if constexpr (std::is_same_v) if (orig.is_zero(tix)) continue; auto got = orig.find_local(tix).get(false); @@ -364,8 +367,11 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { return res.norm(); }; - return make_array>(orig.world(), result_trange, - make_tile); + auto result = + make_array>(orig.world(), result_trange, make_tile); + if constexpr (std::is_same_v) result.truncate(); + + return result; } template From 64dbcb14c71b53e30953089a6e3366301136da93 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 29 May 2024 10:28:21 -0400 Subject: [PATCH 398/592] Bug fix. 
--- src/TiledArray/einsum/tiledarray.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 6d1acd784f..6a45c1e891 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -256,6 +256,7 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { auto res_coord_ix = res_tr.element_to_tile(res_rng.lobound()); auto from_coord_ix = decltype(res_coord_ix)( next(begin(res_coord_ix), delta_rank), end(res_coord_ix)); + if (from.is_zero(from_coord_ix)) return typename Array::scalar_type{0}; replicate_tensor(repped, from.find_local(from_coord_ix).get(false)); tile = repped; return tile.norm(); From 179e1d39f0b8635f37d72be6d3c91f315a99eaea Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 3 Jun 2024 13:58:46 -0400 Subject: [PATCH 399/592] Sparse array evaluation support in manual eval from tot_array_fixture. --- tests/tot_array_fixture.h | 307 ++++++++++++++++++++++++++------------ 1 file changed, 209 insertions(+), 98 deletions(-) diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 4710ab79d7..5d0a0ce4dd 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -93,102 +93,203 @@ using output_archive_type = madness::archive::BinaryFstreamOutputArchive; enum class ShapeComp { True, False }; -template , bool> = true> -auto random_tensor(TA::Range const& rng) { - using NumericT = typename TensorT::numeric_type; - TensorT result{rng}; - - std::generate(/*std::execution::par, */ - result.begin(), result.end(), - TA::detail::MakeRandom::generate_value); - return result; +namespace fixture { +namespace { + +template +constexpr bool maps_index_to_range_v{}; + +template +constexpr bool maps_index_to_range_v< + Invocable, + std::enable_if_t>>>{ + true}; + +using il_range = std::initializer_list; +using il_trange = std::initializer_list; + +} // namespace + +/// +/// \tparam T Non cv-qualified TA::Tensor type. 
+/// \tparam Rng TA::Range should be constructible from Rng type. +/// Eg. TA::Range, std::initializer_list. +/// \param rng The range of the result tensor. +/// TA::Range(rng) will be called explicitly. +/// \return A TA::Tensor of a numeric type with random elements. +/// +template >, + typename = std::enable_if_t>> +auto random_tensor(Rng rng) { + using numeric_type = typename T::numeric_type; + + auto gen = [](auto&&) { + return detail::MakeRandom::generate_value(); + }; + + return T(TA::Range(rng), gen); } -template -auto random_tensor(std::initializer_list const& extents) { - auto lobounds = TA::container::svector(extents.size(), 0); - return random_tensor(TA::Range{lobounds, extents}); +/// +/// \tparam T Non cv-qualified +/// TA::Tensor,...> type. +/// \tparam RngO TA::Range should be constructible from RngO type. +/// Eg. TA::Range, std::initializer_list. +/// \tparam RngI TA::Range should be constructible from RngI type. +/// Eg. TA::Range, std::initializer_list. +/// \param rngo The range of the result tensor (ie the outer tensor). +/// TA::Range(rngo) will be called explicitly. +/// \param rngi The range of the inner tensors. Note that ALL inner tensors +/// will have an EQUAL range. TA::Range(rngi) will be +/// called explicitly. +/// \return A TA::Tensor of TA::Tensor with random +/// numeric_type elements. 
+/// +template < + typename T, typename RngO, typename RngI, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>> +auto random_tensor(RngO rngo, RngI rngi) { + using numeric_type = typename T::numeric_type; + using Inner = typename T::value_type; + + auto gen_inner = [](auto&&) { + return detail::MakeRandom::generate_value(); + }; + + auto gen_outer = [gen_inner, rngi](auto&&) { + return Inner(TA::Range(rngi), gen_inner); + }; + + return T(TA::Range(rngo), gen_outer); } -// -// note: all the inner tensors (elements of the outer tensor) -// have the same @c inner_rng -// +/// +/// \tparam T Non cv-qualified +/// TA::Tensor,...> type. +/// \tparam RngO TA::Range should be constructible from RngO type. +/// Eg. TA::Range, std::initializer_list. +/// \tparam IxMap An invocable type that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \param rngo The range of the result tensor (ie the outer tensor). +/// TA::Range(rngo) will be called explicitly. +/// \param ixmap An invocable that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \return A TA::Tensor of TA::Tensor with random +/// numeric_type elements. 
+/// template < - typename TensorT, - std::enable_if_t, bool> = true> -auto random_tensor(TA::Range const& outer_rng, TA::Range const& inner_rng) { - using InnerTensorT = typename TensorT::value_type; - TensorT result{outer_rng}; + typename T, typename RngO, typename IxMap, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>, // + std::enable_if_t, bool> = true> +auto random_tensor(RngO rngo, IxMap ixmap) { + using numeric_type = typename T::numeric_type; + + auto gen_inner = [](auto&&) { + return TA::detail::MakeRandom::generate_value(); + }; - std::generate(/*std::execution::par,*/ - result.begin(), result.end(), [inner_rng]() { - return random_tensor(inner_rng); - }); + auto gen_outer = [gen_inner, ixmap](auto const& oix) { + auto inner_rng = TA::Range(ixmap(oix)); + return typename T::value_type(inner_rng, gen_inner); + }; - return result; + return T(TA::Range(rngo), gen_outer); } -template -auto random_tensor(TA::Range const& outer_rng, - std::initializer_list const& inner_extents) { - TA::container::svector lobounds(inner_extents.size(), 0); - return random_tensor(outer_rng, TA::Range(lobounds, inner_extents)); +/// +/// \tparam Array Non cv-qualified TA::DistArray type that has non-nested +/// tile type. Eg. TA::DistArray> +/// \tparam Rng TA::TiledRange should be constructible from Rng type. +/// \param rng The TA::TiledRange of the result TA::DistArray. +/// \return A TA::DistArray of non-nested tile type with random elements. +/// +template < + typename Array, typename Rng = il_trange, + typename = std::enable_if_t == 1>, + typename = std::enable_if_t>> +auto random_array(Rng rng) { + using T = typename Array::value_type; + + auto make_tile = [](auto& tile, auto const& rng) { + tile = random_tensor(rng); + return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), TA::TiledRange(rng), + make_tile); } /// -/// \tparam Array The type of DistArray to be generated. Cannot be cv-qualified -/// or reference type. 
-/// \tparam Args TA::Range type for inner tensor if the tile type of the result -/// is a tensor-of-tensor. -/// \param trange The TiledRange of the result DistArray. -/// \param args Either exactly one TA::Range type when the tile type of Array is -/// tensor-of-tensor or nothing. -/// \return Returns a DistArray of type Array whose elements are randomly -/// generated. -/// @note: -/// - Although DistArrays with Sparse policy can be generated all of their -/// tiles are initialized with random values -- technically the returned value -/// is dense. -/// - In case of arrays with tensor-of-tensor tiles, all the inner tensors have -/// the same rank and the same extent of corresponding modes. +/// \tparam Array Non cv-qualified TA::DistArray type that has a nested +/// tile type. +/// Eg. TA::DistArray>> +/// \tparam RngO TA::TiledRange should be constructible form RngO type. +/// \tparam RngI TA::Range should be constructible from RngI type. +/// \param rngo The TA::TiledRange of the result TA::DistArray. +/// \param rngi The range of the inner tensors. Note that ALL inner tensors +/// will have an EQUAL range. TA::Range(rngi) will be +/// called explicitly. +/// \return A TA::DistArray of nested tile type with random elements. /// template < - typename Array, typename... Args, - typename = - std::void_t, - std::enable_if_t, - bool> = true> -auto random_array(TA::TiledRange const& trange, Args const&... args) { - static_assert( - (sizeof...(Args) == 0 && - TA::detail::is_tensor_v) || - (sizeof...(Args) == 1) && - (TA::detail::is_tensor_of_tensor_v)); - - using TensorT = typename Array::value_type; - using PolicyT = typename Array::policy_type; - - auto make_tile_meta = [](auto&&... 
args) { - return [=](TensorT& tile, TA::Range const& rng) { - tile = random_tensor(rng, args...); - if constexpr (std::is_same_v) - return tile.norm(); - }; + typename Array, typename RngO = il_trange, typename RngI = il_range, + typename = std::enable_if_t == 2>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +auto random_array(RngO rngo, RngI rngi) { + using T = typename Array::value_type; + + auto make_tile = [rngi](auto& tile, auto const& rng) { + tile = random_tensor(rng, rngi); + return tile.norm(); }; - return TA::make_array(TA::get_default_world(), trange, - make_tile_meta(args...)); + return TA::make_array(TA::get_default_world(), TA::TiledRange(rngo), + make_tile); } -template -auto random_array(std::initializer_list> trange, - Args&&... args) { - return random_array(TA::TiledRange(trange), - std::forward(args)...); +/// +/// \tparam Array Non cv-qualified TA::DistArray type that has a nested +/// tile type. +/// Eg. TA::DistArray>> +/// \tparam RngO TA::TiledRange should be constructible form RngO type. +/// \tparam IxMap An invocable type that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \param rngo The TA::TiledRange of the result TA::DistArray. +/// \param ixmap An invocable that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \return A TA::DistArray of nested tile type with random elements. 
+template < + typename Array, typename RngO, typename IxMap, + typename = std::enable_if_t == 2>, + typename = std::enable_if_t>, + std::enable_if_t, bool> = true> +auto random_array(RngO rngo, IxMap ixmap) { + using T = typename Array::value_type; + + auto make_tile = [ixmap](auto& tile, auto const& rng) { + tile = random_tensor(rng, ixmap); + return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), TA::TiledRange(rngo), + make_tile); } +} // namespace fixture + +using fixture::random_array; +using fixture::random_tensor; + /// /// Succinctly call TA::detail::tensor_contract /// @@ -404,6 +505,9 @@ Result general_product(TensorA const& A, TensorB const& B, using TA::detail::max_nested_rank; using TA::detail::nested_rank; + // empty tensors + if (A.empty() || B.empty()) return Result{}; + static_assert(std::is_same_v); @@ -502,12 +606,22 @@ Result general_product(TensorA const& A, TensorB const& B, } } else { typename Result::value_type temp{}; - for (auto ix_I : rng_I) { + for (auto const& ix_I : rng_I) { apply_partial_perm(ix_A, ix_I, setup.I_to_A); apply_partial_perm(ix_B, ix_I, setup.I_to_B); - if constexpr (is_tot) - temp += general_product( + if constexpr (is_tot) { + auto temp_ = general_product( A(ix_A), B(ix_B), args...); + if constexpr (TA::detail::is_nested_tensor_v< + typename Result::value_type>) { + if (temp.empty()) + temp = std::move(temp_); + else + temp += temp_; + } else { + temp += temp_; + } + } else { TA_ASSERT(!(ix_A.empty() || ix_B.empty())); temp += A(ix_A) * B(ix_B); @@ -548,26 +662,14 @@ auto general_product(TA::DistArray A, TA::TiledRange result_trange; { - auto const rank = result_tensor.range().rank(); - auto const result_range = result_tensor.range(); - - TA::container::svector> tr1s(rank, {0}); - - TA::container::svector const ix_hi(result_range.upbound()); - for (auto d = 0; d < rank; ++d) { - TA::container::svector ix(result_range.lobound()); - for (auto& i = ix[d]; i < ix_hi[d]; ++i) { - auto const& elem_tensor 
= result_tensor(ix); - auto& tr1 = tr1s[d]; - tr1.emplace_back(tr1.back() + elem_tensor.range().extent(d)); - } + TA::container::svector tr1s(setup.rank_C); + for (auto [t, f] : setup.C_to_A) { + tr1s.at(t) = A.trange().at(f); } - - TA::container::svector tr1s_explicit; - tr1s_explicit.reserve(tr1s.size()); - for (auto const& v : tr1s) tr1s_explicit.emplace_back(v.begin(), v.end()); - - result_trange = TA::TiledRange(tr1s_explicit); + for (auto [t, f] : setup.C_to_B) { + tr1s.at(t) = B.trange().at(f); + } + result_trange = TiledRange(tr1s); } TA::DistArray C(world, result_trange); @@ -588,6 +690,15 @@ auto general_product(TA::DistArray A, return general_product(A, B, args...); } +template +auto general_product(TA::DistArray A, + TA::DistArray B, + Setups const&... args) { + auto A_dense = to_dense(A); + auto B_dense = to_dense(B); + return TA::to_sparse(general_product(A_dense, B_dense, args...)); +} + template >> auto manual_eval(OuterInnerSetup const& setups, ArrayA A, ArrayB B) { From 59fbafb9ed4dca8340f00902d8583883430a8218 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 3 Jun 2024 15:23:50 -0400 Subject: [PATCH 400/592] Add dox. --- src/TiledArray/einsum/tiledarray.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 6a45c1e891..5d68770506 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -268,6 +268,18 @@ auto replicate_array(Array from, TiledRange const &prepend_trng) { return result; } +/// +/// Given a rank-N tensor and a ∂-rank such that ∂ in [0,N), returns a new +/// rank-N' tensor (where N' = N - ∂) by summing over the ∂ ranks from the +/// end of the input tensor's range. For example, reduce_modes(A, 2) where +/// A.range().rank() == 5 will result into a new tensor (B) of rank-3 such that +/// B(i,j,k) = Σ_l Σ_m A(i,j,k,l,m). +/// +/// \param orig Input Tensor. 
+/// \param dmodes Reduce this many modes from the end as implied in the +/// range of the input tensor. +/// \return Tensor with reduced rank. +/// template auto reduce_modes(Tensor const &orig, size_t drank) { if (drank == 0) return orig; @@ -315,6 +327,7 @@ auto reduce_modes(Tensor const &orig, size_t drank) { /// \param dmodes Reduce this many modes from the end as implied in the /// tiled range of the input array. /// \return Array with reduced rank. +/// \see reduce_modes(Tensor, size_t) /// template auto reduce_modes(TA::DistArray orig, size_t drank) { @@ -375,6 +388,12 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { return result; } +/// +/// \tparam Ixs Iterable of indices. +/// \param map A map from the index type of \c Ixs to TiledRange1. +/// \param ixs Iterable of indices. +/// \return TiledRange object. +/// template TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { container::svector tr1s; From 5032ac5c3e0ed6d14d4c847cab494e5c9392c287 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Jun 2024 21:56:22 -0400 Subject: [PATCH 401/592] typo --- src/TiledArray/util/bug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index ea4b980d55..2d217ceaee 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -344,9 +344,9 @@ class Debugger { /// This calls handle(int) with all of the major signals. virtual void handle_defaults(); - /// This sets a prefix which preceeds all messages printing by Debugger. + /// This sets a prefix which precedes all messages printing by Debugger. virtual void set_prefix(const char *p); - /// Set the prefix to the decimal represention of p followed by a ": ". + /// Set the prefix to the decimal representation of p followed by a ": ". 
virtual void set_prefix(int p); // clang-format off From b4c880423dbe05272c812f1b9ff8c4b58045ffb9 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Jun 2024 21:56:33 -0400 Subject: [PATCH 402/592] added solver adaptors for eigen matrix block --- src/TiledArray/math/linalg/basic.h | 62 +++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index 9fec71f41e..856c915bbe 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ -135,7 +135,8 @@ using TiledArray::math::linalg::set_linalg_crossover_to_distributed; namespace Eigen { -// freestanding adaptors for Eigen::MatrixBase needed by solvers like DIIS +// freestanding adaptors for Eigen::MatrixBase and Eigen::Block +// needed by solvers like DIIS template inline void vec_multiply(Eigen::MatrixBase& a1, @@ -143,15 +144,39 @@ inline void vec_multiply(Eigen::MatrixBase& a1, a1.array() *= a2.array(); } +template +inline void vec_multiply( + Eigen::Block& a1, + const Eigen::Block& a2) { + a1.array() *= a2.array(); +} + template inline void scale(Eigen::MatrixBase& a, S scaling_factor) { using numeric_type = typename Eigen::MatrixBase::value_type; a.array() *= numeric_type(scaling_factor); } +template +inline void scale( + Eigen::Block& a, + S scaling_factor) { + using numeric_type = typename Eigen::Block::value_type; + a.array() *= numeric_type(scaling_factor); +} + template inline void zero(Eigen::MatrixBase& a) { - a = Derived::Zero(a.rows(), a.cols()); + a.fill(0); +} + +template +inline void zero( + Eigen::Block& a) { + a.fill(0); } template @@ -161,23 +186,56 @@ inline void axpy(Eigen::MatrixBase& y, S alpha, y.array() += numeric_type(alpha) * x.array(); } +template +inline void axpy( + Eigen::Block& y, S alpha, + const Eigen::Block& x) { + using numeric_type = typename Eigen::Block::value_type; + y.array() += numeric_type(alpha) * x.array(); +} + template inline 
auto dot(const Eigen::MatrixBase& l, const Eigen::MatrixBase& r) { return l.adjoint().dot(r); } +template +inline auto dot( + const Eigen::Block& l, + const Eigen::Block& r) { + return l.adjoint().dot(r); +} + template inline auto inner_product(const Eigen::MatrixBase& l, const Eigen::MatrixBase& r) { return l.dot(r); } +template +inline auto inner_product( + const Eigen::Block& l, + const Eigen::Block& r) { + return l.dot(r); +} + template inline auto norm2(const Eigen::MatrixBase& m) { return m.template lpNorm<2>(); } +template +inline auto norm2( + const Eigen::Block& m) { + return m.template lpNorm<2>(); +} + } // namespace Eigen #ifndef TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG From beed33cea52373e7e2cd4efea71a2d4ea0078dae Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 18 Jun 2024 23:14:08 -0400 Subject: [PATCH 403/592] make Range-V3 mandatory + bump its tag to 0.12.0 --- CMakeLists.txt | 10 ++-------- INSTALL.md | 3 +-- cmake/tiledarray-config.cmake.in | 5 +++++ external/versions.cmake | 4 ++-- tests/CMakeLists.txt | 7 ++----- tests/block_range.cpp | 4 ---- tests/expressions_fixture.h | 2 -- tests/range.cpp | 20 +++++--------------- tests/sparse_shape.cpp | 10 ---------- tests/tensor.cpp | 4 ---- 10 files changed, 17 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bc524337b..101b1b0d16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,9 +169,6 @@ option(TA_TRACE_GLOBAL_COMM_STATS "Enable tracing of communication stats of glob add_feature_info(TASK_TRACE_DEBUG TA_TRACE_GLOBAL_COMM_STATS "Debug communication stats of global objects (DistEval's and DIstributedStorage) TiledArray") set(TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ${TA_TRACE_GLOBAL_COMM_STATS}) -option(TA_RANGEV3 "Enable Range-V3 library" OFF) -add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library") - option(TA_TTG "Enable search/build of TTG library" OFF) add_feature_info(TA_TTG TA_TTG "TTG library") @@ -310,6 +307,7 @@ endif() if(ENABLE_HIP) 
include(external/hip.cmake) endif() +include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake) if (TA_TTG) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) @@ -347,11 +345,7 @@ if(CCACHE) set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") endif(CCACHE) -# 2. range-v3 -if (TA_RANGEV3) - include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake) -endif(TA_RANGEV3) -# 3. TTG +# 2. TTG # N.B. make sure TA configures MADNESS correctly #if (TA_TTG) # include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) diff --git a/INSTALL.md b/INSTALL.md index 3f669073f0..d18ee99025 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,6 +40,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* +- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db . 
@@ -74,7 +75,6 @@ Optional prerequisites: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings -- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. - [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 3fe4a06dbf4b05091269488aab38223da1f8cb8e). Many of these dependencies can be installed with a package manager, @@ -416,7 +416,6 @@ support may be added. * `TA_ASSERT_POLICY` -- Set to `TA_ASSERT_IGNORE` to disable `TA_ASSERT` assertions, `TA_ASSERT_THROW` to cause `TA_ASSERT` assertions to throw, `TA_ASSERT_ABORT` to cause `TA_ASSERT` assertions to abort. The default is `TA_ASSERT_IGNORE` if CMake uses a single-configuration generator and`CMAKE_BUILD_TYPE` is set to `Release` or `MinSizeRel`, else the default is `TA_ASSERT_THROW`. * `BUILD_TESTING` -- Set of `OFF` to disable building unit tests. The default is `ON`. * `TA_TRACE_TASKS` -- Set to `ON` to enable tracing of MADNESS tasks using custom task tracer. Note that standard profilers/tracers are generally useless (except in the trivial cases) with MADWorld-based programs since the submission context of tasks is not captured by standard tracing tools; this makes it impossible in a nontrivial program to attribute tasks to source code. WARNING: task tracing his will greatly increase the memory requirements. [Default=OFF]. 
-* `TA_RANGEV3` -- Set to `ON` to find or fetch the Range-V3 library and enable additional tests of TA components with constructs anticipated to be supported in the future. [Default=OFF]. * `TA_TTG` -- Set to `ON` to find or fetch the TTG library. [Default=OFF]. * `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates. * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`. diff --git a/cmake/tiledarray-config.cmake.in b/cmake/tiledarray-config.cmake.in index 3d1484013b..c6d0a49822 100644 --- a/cmake/tiledarray-config.cmake.in +++ b/cmake/tiledarray-config.cmake.in @@ -18,6 +18,11 @@ include(CMakeFindDependencyMacro) @Boost_CONFIG_FILE_CONTENTS@ +if (NOT TARGET range-v3::range-v3) + get_filename_component(range-v3_DIR "@range-v3_CONFIG@" DIRECTORY) + find_dependency(range-v3 QUIET REQUIRED HINTS "${range-v3_DIR}") +endif(NOT TARGET range-v3::range-v3) + if (NOT TARGET BTAS::BTAS) get_filename_component(BTAS_DIR "@BTAS_CONFIG@" DIRECTORY) find_dependency(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}") diff --git a/external/versions.cmake b/external/versions.cmake index e04b066573..e6656d7c24 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -28,8 +28,8 @@ set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) -set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) -set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) +set(TA_TRACKED_RANGEV3_TAG 0.12.0) +set(TA_TRACKED_RANGEV3_PREVIOUS_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) 
set(TA_TRACKED_TTG_TAG 3fe4a06dbf4b05091269488aab38223da1f8cb8e) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 76bb14e4b1..afb1e1c6a6 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -148,11 +148,8 @@ target_include_directories(${executable} PRIVATE # is too late to do this here; must set TA_ERROR=throw if want to run unit tests target_compile_definitions(${executable} PRIVATE TILEDARRAY_NO_USER_ERROR_MESSAGES=1 MADNESS_DISPLAY_EXCEPTION_BREAK_MESSAGE=0) -# optional dependencies -if (TARGET range-v3::range-v3) - target_link_libraries(${executable} PRIVATE range-v3::range-v3) - target_compile_definitions(${executable} PRIVATE TILEDARRAY_HAS_RANGEV3=1) -endif (TARGET range-v3::range-v3) +# always test range-v3 +target_link_libraries(${executable} PRIVATE range-v3::range-v3) # Add targets add_test(tiledarray/unit/build "${CMAKE_COMMAND}" --build ${PROJECT_BINARY_DIR} --target ${executable}) diff --git a/tests/block_range.cpp b/tests/block_range.cpp index 5d8431fa41..47f9d88e8f 100644 --- a/tests/block_range.cpp +++ b/tests/block_range.cpp @@ -25,9 +25,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include "TiledArray/block_range.h" #include "range_fixture.h" @@ -229,14 +227,12 @@ BOOST_AUTO_TEST_CASE(block) { BlockRange br2(r, boost::combine(lobounds, upbounds)); BOOST_CHECK_EQUAL(br2, bref); -#ifdef TILEDARRAY_HAS_RANGEV3 // using zipped ranges of bounds (using Ranges-V3) // need to #include BOOST_CHECK_NO_THROW( BlockRange br3(r, ranges::views::zip(lobounds, upbounds))); BlockRange br3(r, ranges::views::zip(lobounds, upbounds)); BOOST_CHECK_EQUAL(br3, bref); -#endif // using nested initializer_list BOOST_CHECK_NO_THROW(BlockRange br4(r, {{0, 4}, {1, 6}, {2, 8}})); diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 8e527465d1..94c09a7449 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -28,9 +28,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include 
-#endif #include diff --git a/tests/range.cpp b/tests/range.cpp index a20f185d44..71f20aeb3f 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -19,9 +19,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include #include @@ -169,10 +167,8 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_REQUIRE_NO_THROW(Range r2(p2, f2)); // uses index containers BOOST_REQUIRE_NO_THROW( Range r(boost::combine(p2, f2))); // uses zipped range of p2 and f2 -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( Range r(ranges::views::zip(p2, f2))); // uses zipped range of p2 and f2 -#endif BOOST_CHECK_THROW(Range r2(f2, p2), Exception); // lobound > upbound Range r2(p2, f2); @@ -190,11 +186,9 @@ BOOST_AUTO_TEST_CASE(constructors) { Range should_be_copy_of_r2( boost::combine(p2, f2)); // uses zipped range of p2 and f2 BOOST_CHECK_EQUAL(r2, should_be_copy_of_r2); -#ifdef TILEDARRAY_HAS_RANGEV3 Range should_be_another_copy_of_r2( ranges::views::zip(p2, f2)); // uses zipped range of p2 and f2 BOOST_CHECK_EQUAL(r2, should_be_another_copy_of_r2); -#endif // test the rest of bound-based ctors { @@ -243,10 +237,8 @@ BOOST_AUTO_TEST_CASE(constructors) { // uses zipped bounds Range r7(boost::combine(std::vector{0, 1, 2}, std::array{4, 6, 8})); BOOST_CHECK_EQUAL(ref, r7); -#ifdef TILEDARRAY_HAS_RANGEV3 -// Range r8(ranges::views::zip(std::array{0, 1, 2}, std::vector{4, 6, 8})); -// BOOST_CHECK_EQUAL(ref, r8); -#endif + // Range r8(ranges::views::zip(std::array{0, 1, 2}, std::vector{4, 6, + // 8})); BOOST_CHECK_EQUAL(ref, r8); // zipped bounds with Eigen vectors { @@ -278,11 +270,9 @@ BOOST_AUTO_TEST_CASE(constructors) { Range r14(boost::combine(iv({0, 1, 2}), iv(iv({0, 1, 2}) + iv(4, 5, 6)))); BOOST_CHECK_EQUAL(ref, r14); -#ifdef TILEDARRAY_HAS_RANGEV3 -// this requires Eigen ~3.4 (3.3.90 docs suggest it should be sufficient) -// Range r15(ranges::views::zip(iv(0, 1, 2), iv(4, 6, 8))); -// BOOST_CHECK_EQUAL(ref, r15); -#endif + // this requires Eigen ~3.4 (3.3.90 docs 
suggest it should be sufficient) + // Range r15(ranges::views::zip(iv(0, 1, 2), iv(4, 6, 8))); + // BOOST_CHECK_EQUAL(ref, r15); } // container::svector as bounds diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index a79d7ceb8e..8bf1c4ae3b 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -24,9 +24,7 @@ */ #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include "TiledArray/sparse_shape.h" #include "sparse_shape_fixture.h" @@ -350,12 +348,10 @@ BOOST_AUTO_TEST_CASE(block) { sparse_shape.block(boost::combine(lower, upper))); auto result3 = sparse_shape.block(boost::combine(lower, upper)); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper))); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper)); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), @@ -447,13 +443,11 @@ BOOST_AUTO_TEST_CASE(block_scale) { sparse_shape.block(boost::combine(lower, upper), factor)); auto result3 = sparse_shape.block(boost::combine(lower, upper), factor); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), factor)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), factor); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range @@ -548,13 +542,11 @@ BOOST_AUTO_TEST_CASE(block_perm) { sparse_shape.block(boost::combine(lower, upper), perm)); auto result3 = sparse_shape.block(boost::combine(lower, upper), perm); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), perm)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), perm); BOOST_CHECK_EQUAL(result, 
result4); -#endif } else { // Check that block throws an exception with a bad block range @@ -653,13 +645,11 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { auto result3 = sparse_shape.block(boost::combine(lower, upper), factor, perm); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), factor, perm)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), factor, perm); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range diff --git a/tests/tensor.cpp b/tests/tensor.cpp index be214ef841..99b10fc7b7 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -18,9 +18,7 @@ */ #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include #include "TiledArray/math/gemm_helper.h" @@ -709,9 +707,7 @@ BOOST_AUTO_TEST_CASE(block) { // need to #include BOOST_CHECK_NO_THROW(s.block(boost::combine(lobound, upbound))); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_CHECK_NO_THROW(s.block(ranges::views::zip(lobound, upbound))); -#endif auto sview0 = s.block(lobound, upbound); BOOST_CHECK(sview0.range().includes(lobound)); From 20a4b78389ceb13bd4d6b0d6e7cb6e2d27d27d2b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 19 Jun 2024 12:51:00 -0400 Subject: [PATCH 404/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/539 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index d18ee99025..f8ad366009 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,7 +43,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . 
If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index e6656d7c24..11b3d4ac3e 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 87715d98a244bff5cbff0bd2c644a8a00d882989) +set(TA_TRACKED_MADNESS_TAG 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From d5cbd040e9ef4d26bb31b42bd8b1e75664795727 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 21 Jun 2024 09:21:34 -0400 Subject: [PATCH 405/592] bump MADNESS tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/541 needed to build with C++20 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index f8ad366009..b0705e6f1e 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,7 +43,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 96ac90e8f193ccfaf16f346b4652927d2d362e75 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index 11b3d4ac3e..8443052d37 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) +set(TA_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From e3485f4ca55f582ad08de495c1d9ed5c4deef045 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 21 Jun 2024 09:23:45 -0400 Subject: [PATCH 406/592] std::shared_ptr::unique() removed in C++20 https://en.cppreference.com/w/cpp/memory/shared_ptr/unique --- src/TiledArray/dist_array.h | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c6d6cddb79..2caa7e321d 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -634,7 +634,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// Checks if this is a unique handle to the implementation object /// \return true if this is a unique handle to the implementation object - bool is_unique() const { return pimpl_.unique(); } + bool is_unique() const { return pimpl_.use_count() == 1; } /// Wait for lazy tile cleanup @@ -1952,7 +1952,7 @@ DistArray replicated(const DistArray& a) { // Put the replicator pointer in the deferred cleanup object so it will // be deleted at the end of the next fence. - TA_ASSERT(replicator.unique()); // Required for deferred_cleanup + TA_ASSERT(replicator.use_count() == 1); // Required for deferred_cleanup madness::detail::deferred_cleanup(world, replicator); return result; From f5fd63e31ed4f07ac7f84abeb3b823356e6e56e1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 21 Jun 2024 12:57:13 -0400 Subject: [PATCH 407/592] [ci] try sending most of the CI jobs to SaaS runners --- .gitlab-ci.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 264b42f0bb..b57a210430 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,7 +29,6 @@ before_script: ubuntu: stage: build tags: - - docker - ${RUNNER_TAGS} timeout: 3h image: valeevgroup/${IMAGE} @@ -65,12 +64,12 @@ ubuntu: BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL - RUNNER_TAGS: [ linux ] + RUNNER_TAGS: [ saas-linux-small-amd64 ] - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] CXX: [ g++, clang++-13 ] BUILD_TYPE : [ "Release", "Debug" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - RUNNER_TAGS: [ linux ] + 
RUNNER_TAGS: [ saas-linux-small-amd64 ] - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] CXX: [ g++ ] BUILD_TYPE : [ "Release", "Debug" ] From f8ba0e401862b480ab4984814ec5181667e38d83 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 24 Jun 2024 13:36:38 -0400 Subject: [PATCH 408/592] Update the `volume(DistArray)` function to support sparse arrays as well as arrays with tensor-of-tensor tiles. --- src/TiledArray/dist_array.h | 78 +++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 29 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c6d6cddb79..c0c9aac78e 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -869,9 +869,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// first minimally contains the same number of elements as /// the tile. /// \throw TiledArray::Exception if the tile is already initialized. - template )&&detail:: - is_input_iterator::value>> + template < + typename Integer, typename InIter, + typename = std::enable_if_t<(std::is_integral_v) && + detail::is_input_iterator::value>> typename std::enable_if::value>::type set( const std::initializer_list& i, InIter first) { set>(i, first); @@ -964,10 +965,9 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw TiledArray::Exception if index \c i has the wrong rank. Strong /// throw guarantee. /// \throw TiledArray::Exception if tile \c i is already set. 
- template < - typename Index, typename Value, - typename = std::enable_if_t< - (std::is_integral_v)&&is_value_or_future_to_value_v>> + template ) && + is_value_or_future_to_value_v>> void set(const std::initializer_list& i, Value&& v) { set>(i, std::forward(v)); } @@ -1459,7 +1459,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { shape() & typeid(pmap().get()).hash_code(); int64_t count = 0; for (auto it = begin(); it != end(); ++it) ++count; - ar& count; + ar & count; for (auto it = begin(); it != end(); ++it) ar & it->get(); } @@ -1476,14 +1476,14 @@ class DistArray : public madness::archive::ParallelSerializableObject { auto& world = TiledArray::get_default_world(); std::size_t typeid_hash = 0l; - ar& typeid_hash; + ar & typeid_hash; if (typeid_hash != typeid(*this).hash_code()) TA_EXCEPTION( "DistArray::serialize: source DistArray type != this DistArray type"); ProcessID world_size = -1; ProcessID world_rank = -1; - ar& world_size& world_rank; + ar & world_size & world_rank; if (world_size != world.size() || world_rank != world.rank()) TA_EXCEPTION( "DistArray::serialize: source DistArray world != this DistArray " @@ -1491,13 +1491,13 @@ class DistArray : public madness::archive::ParallelSerializableObject { trange_type trange; shape_type shape; - ar& trange& shape; + ar & trange & shape; // use default pmap, ensure it's the same pmap used to serialize auto volume = trange.tiles_range().volume(); auto pmap = detail::policy_t::default_pmap(world, volume); size_t pmap_hash_code = 0; - ar& pmap_hash_code; + ar & pmap_hash_code; if (pmap_hash_code != typeid(pmap.get()).hash_code()) TA_EXCEPTION( "DistArray::serialize: source DistArray pmap != this DistArray pmap"); @@ -1505,10 +1505,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { new impl_type(world, std::move(trange), std::move(shape), pmap)); int64_t count = 0; - ar& count; + ar & count; for (auto it = begin(); it != end(); ++it, --count) { Tile tile; - 
ar& tile; + ar & tile; this->set(it.ordinal(), std::move(tile)); } if (count != 0) @@ -1541,27 +1541,27 @@ class DistArray : public madness::archive::ParallelSerializableObject { // make sure source data matches the expected type // TODO would be nice to be able to convert the data upon reading std::size_t typeid_hash = 0l; - localar& typeid_hash; + localar & typeid_hash; if (typeid_hash != typeid(*this).hash_code()) TA_EXCEPTION( "DistArray::load: source DistArray type != this DistArray type"); // make sure same number of clients for every I/O node int num_io_clients = 0; - localar& num_io_clients; + localar & num_io_clients; if (num_io_clients != ar.num_io_clients()) TA_EXCEPTION("DistArray::load: invalid parallel archive"); trange_type trange; shape_type shape; - localar& trange& shape; + localar & trange & shape; // send trange and shape to every client for (ProcessID p = 0; p < world.size(); ++p) { if (p != me && ar.io_node(p) == me) { world.mpi.Send(int(1), p, tag); // Tell client to expect the data madness::archive::MPIOutputArchive dest(world, p); - dest& trange& shape; + dest & trange & shape; dest.flush(); } } @@ -1573,13 +1573,13 @@ class DistArray : public madness::archive::ParallelSerializableObject { new impl_type(world, std::move(trange), std::move(shape), pmap)); int64_t count = 0; - localar& count; + localar & count; for (size_t ord = 0; ord != volume; ++ord) { if (!is_zero(ord)) { auto owner_rank = pmap->owner(ord); if (ar.io_node(owner_rank) == me) { Tile tile; - localar& tile; + localar & tile; this->set(ord, std::move(tile)); --count; } @@ -1598,7 +1598,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { world.mpi.Recv(flag, p, tag); TA_ASSERT(flag == 1); madness::archive::MPIInputArchive source(world, p); - source& trange& shape; + source & trange & shape; // use default pmap auto volume = trange.tiles_range().volume(); @@ -1643,7 +1643,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { } } } 
- localar& count; + localar & count; for (size_t ord = 0; ord != volume; ++ord) { if (!is_zero(ord)) { auto owner_rank = pmap()->owner(ord); @@ -1857,12 +1857,32 @@ auto rank(const DistArray& a) { return a.trange().tiles_range().rank(); } +/// +/// \brief Get the total elements in the non-zero tiles of an array. +/// For tensor-of-tensor tiles, the total is the sum of the elements +/// of the inner tensors in non-zero tiles. +/// template -size_t volume(const DistArray& a) { - // this is the number of tiles - if (a.size() > 0) // assuming dense shape - return a.trange().elements_range().volume(); - return 0; +size_t volume(const DistArray& array) { + std::atomic vol = 0; + + auto local_vol = [&vol](Tile const& in_tile) { + if constexpr (detail::is_tensor_of_tensor_v) { + in_tile.unary([&vol](auto const& el) { vol += el.total_size(); }); + } else + vol += in_tile.total_size(); + }; + + for (auto&& tix : array.tiles_range()) + if (!array.is_zero(tix) && array.is_local(tix)) + array.world().taskq.add(std::move(local_vol), array.find_local(tix).get()); + + array.world().gop.fence(); + + size_t vol_ = vol; + array.world().gop.sum(&vol_, 1); + + return vol_; } template @@ -2002,13 +2022,13 @@ template void save(const TiledArray::DistArray& x, const std::string name) { archive::ParallelOutputArchive<> ar2(x.world(), name.c_str(), 1); - ar2& x; + ar2 & x; } template void load(TiledArray::DistArray& x, const std::string name) { archive::ParallelInputArchive<> ar2(x.world(), name.c_str(), 1); - ar2& x; + ar2 & x; } } // namespace madness From 08854201787720bda96783b7d9378d6411135af1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 24 Jun 2024 13:49:46 -0400 Subject: [PATCH 409/592] amend computation of the root dir of CUDAToolkit: CUDAToolkit_INCLUDE_DIR is not defined, but CUDAToolkit_LIBRARY_DIR is --- external/cuda.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/cuda.cmake b/external/cuda.cmake index 00a8b17477..2e757b60c4 100644 
--- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -28,7 +28,7 @@ foreach (library cublas;nvToolsExt) endforeach() if (NOT DEFINED CUDAToolkit_ROOT) - get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_INCLUDE_DIR}/../" ABSOLUTE CACHE) + get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_LIBRARY_DIR}/../" ABSOLUTE CACHE) endif(NOT DEFINED CUDAToolkit_ROOT) # sanitize implicit dirs if CUDA host compiler != C++ compiler From b9b663a8bf920d6eaccc0bd29236cbf35f883828 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 24 Jun 2024 13:57:29 -0400 Subject: [PATCH 410/592] bump Umpire tag to v2024.02.1 --- INSTALL.md | 2 +- external/umpire.cmake | 2 +- external/versions.cmake | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index b0705e6f1e..96e7259ed5 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -69,7 +69,7 @@ Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). 
- - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite diff --git a/external/umpire.cmake b/external/umpire.cmake index 57675ca189..37152e98d2 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -215,7 +215,7 @@ else() TiledArray_UMPIRE PROPERTIES INTERFACE_INCLUDE_DIRECTORIES - "$;$;$;$;$;$" + "$;$;$;$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" ) diff --git a/external/versions.cmake b/external/versions.cmake index 8443052d37..e0680a6d48 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -22,8 +22,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2023.06.0) +set(TA_TRACKED_UMPIRE_TAG v2024.02.1) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) From 9433749fec3fb3602f2f0e752b4c2bc6afb73cbb Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 24 Jun 2024 14:04:07 -0400 Subject: [PATCH 411/592] typo. 
--- src/TiledArray/dist_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c0c9aac78e..2c1640cf5f 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1875,7 +1875,7 @@ size_t volume(const DistArray& array) { for (auto&& tix : array.tiles_range()) if (!array.is_zero(tix) && array.is_local(tix)) - array.world().taskq.add(std::move(local_vol), array.find_local(tix).get()); + array.world().taskq.add(local_vol, array.find_local(tix).get()); array.world().gop.fence(); From a197d31a54bc0fc7798513cb5b696c3c7944fa24 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 25 Jun 2024 15:49:08 -0400 Subject: [PATCH 412/592] Add test and bug fix `TA::volume` function. --- src/TiledArray/dist_array.h | 4 ++- tests/dist_array.cpp | 53 +++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 2c1640cf5f..65e2b83dce 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1868,7 +1868,9 @@ size_t volume(const DistArray& array) { auto local_vol = [&vol](Tile const& in_tile) { if constexpr (detail::is_tensor_of_tensor_v) { - in_tile.unary([&vol](auto const& el) { vol += el.total_size(); }); + vol += std::accumulate( + in_tile.data(), in_tile.data() + in_tile.total_size(), size_t{0}, + [](auto t, auto const& inner) { return t + inner.total_size(); }); } else vol += in_tile.total_size(); }; diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index c2ac8262d0..288deabd20 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -830,4 +830,57 @@ BOOST_AUTO_TEST_CASE(rebind) { std::is_same_v, SpArrayTZ>); } +BOOST_AUTO_TEST_CASE(volume) { + using T = Tensor; + using ToT = Tensor; + using Policy = SparsePolicy; + using ArrayToT = DistArray; + + size_t constexpr nrows = 3; + size_t constexpr ncols = 4; + TiledRange const trange({{0, 2, 5, 
7}, {0, 5, 7, 10, 12}}); + TA_ASSERT(trange.tiles_range().extent().at(0) == nrows && + trange.tiles_range().extent().at(1) == ncols, + "Following code depends on this condition."); + + // this Range is used to construct all inner tensors of the tile with + // tile index @c tix. + auto inner_dims = [nrows, ncols](Range::index_type const& tix) -> Range { + static std::array const rows{7, 8, 9}; + static std::array const cols{7, 8, 9, 10}; + + TA_ASSERT(tix.size() == 2, "Only rank-2 tensor expected."); + return Range({rows[tix.at(0) % nrows], cols[tix.at(1) % ncols]}); + }; + + // let's make all 'diagonal' tiles zero + auto zero_tile = [](Range::index_type const& tix) -> bool { + return tix.at(0) == tix.at(1); + }; + + auto make_tile = [inner_dims, zero_tile, &trange](auto& tile, + auto const& rng) { + auto&& tix = trange.element_to_tile(rng.lobound()); + if (zero_tile(tix)) + return 0.; + else { + tile = ToT(rng, [inner_rng = inner_dims(tix)](auto&&) { + return T(inner_rng, 0.1); + }); + return tile.norm(); + } + }; + + auto& world = get_default_world(); + auto array = make_array(world, trange, make_tile); + + // manually compute the volume of array + size_t vol = 0; + for (auto&& tix : trange.tiles_range()) + if (!zero_tile(tix)) + vol += trange.tile(tix).volume() * inner_dims(tix).volume(); + + BOOST_REQUIRE(vol == TA::volume(array)); +} + BOOST_AUTO_TEST_SUITE_END() From d2dd697528172327cead2c5146dbe1bcd61529e7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 25 Jun 2024 16:06:48 -0400 Subject: [PATCH 413/592] if CMAKE_CUDA_HOST_COMPILER is not set, set it to CMAKE_CXX_COMPILER in case it's not in PATH --- external/cuda.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/external/cuda.cmake b/external/cuda.cmake index 2e757b60c4..aa1e51e53e 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -12,6 +12,11 @@ if (DEFINED CMAKE_CUDA_FLAGS) else() set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr") endif() +# if CMAKE_CUDA_HOST_COMPILER not 
set, set it to CMAKE_CXX_COMPILER, else NVCC will grab something from PATH +if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER) + set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "The host C++ compiler to be used by the CUDA compiler") +endif() + enable_language(CUDA) set(CUDA_FOUND TRUE) From 9a0492b76367d8ca5c2d7a5623e0569cd74f1acb Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 25 Jun 2024 17:15:53 -0400 Subject: [PATCH 414/592] Cleanup. --- src/TiledArray/dist_array.h | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 65e2b83dce..7059b77333 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1859,8 +1859,8 @@ auto rank(const DistArray& a) { /// /// \brief Get the total elements in the non-zero tiles of an array. -/// For tensor-of-tensor tiles, the total is the sum of the elements -/// of the inner tensors in non-zero tiles. +/// For tensor-of-tensor tiles, the total is the sum of the number of +/// elements in the inner tensors of non-zero tiles. 
/// template size_t volume(const DistArray& array) { @@ -1868,16 +1868,19 @@ size_t volume(const DistArray& array) { auto local_vol = [&vol](Tile const& in_tile) { if constexpr (detail::is_tensor_of_tensor_v) { - vol += std::accumulate( - in_tile.data(), in_tile.data() + in_tile.total_size(), size_t{0}, - [](auto t, auto const& inner) { return t + inner.total_size(); }); + auto reduce_op = [](size_t& MADNESS_RESTRICT result, auto&& arg) { + result += arg->total_size(); + }; + auto join_op = [](auto& MADNESS_RESTRICT result, size_t count) { + result += count; + }; + vol += in_tile.reduce(reduce_op, join_op, size_t{0}); } else vol += in_tile.total_size(); }; - for (auto&& tix : array.tiles_range()) - if (!array.is_zero(tix) && array.is_local(tix)) - array.world().taskq.add(local_vol, array.find_local(tix).get()); + for (auto&& local_tile_future : array) + array.world().taskq.add(local_vol, local_tile_future.get()); array.world().gop.fence(); From 9decfc25800db7d86d0132525925d8ea52bf9537 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 26 Jun 2024 07:29:13 -0400 Subject: [PATCH 415/592] amends 577fda29 to always use the old implementation of ta_tensor_to_um_tensor when element conversion is needed --- src/TiledArray/device/btas_um_tensor.h | 70 +++++++++++++++++--------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/src/TiledArray/device/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h index 45f9b63731..dec80dcaf1 100644 --- a/src/TiledArray/device/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -67,7 +67,7 @@ struct ArchiveLoadImpl> { TiledArray::btasUMTensorVarray &t) { TiledArray::Range range{}; TiledArray::device_um_btas_varray store{}; - ar &range &store; + ar & range & store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id()); // auto &stream = device::stream_for(range); @@ -83,7 +83,7 @@ struct ArchiveStoreImpl> { 
auto stream = TiledArray::device::stream_for(t.range()); TiledArray::to_execution_space( t.storage(), stream); - ar &t.range() & t.storage(); + ar & t.range() & t.storage(); } }; @@ -674,25 +674,12 @@ template typename std::enable_if::value, TiledArray::DistArray>::type ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { - auto convert_tile_memcpy = [](const TATensor &tile) { - /// UMTensor must be wrapped into TA::Tile - - using Tensor = typename UMTensor::tensor_type; - - auto stream = device::stream_for(tile.range()); - typename Tensor::storage_type storage; - make_device_storage(storage, tile.range().area(), stream); - Tensor result(tile.range(), std::move(storage)); - - DeviceSafeCall( - device::memcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename Tensor::value_type), - device::MemcpyDefault, stream)); - - device::sync_madness_task_with(stream); - return TiledArray::Tile(std::move(result)); - }; + using inT = typename TATensor::value_type; + using outT = typename UMTensor::value_type; + // check if element conversion is necessary + constexpr bool T_conversion = !std::is_same_v; + // this is safe even when need to convert element types, but less efficient auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile @@ -711,14 +698,47 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { TiledArray::to_execution_space( result.storage(), stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host return TiledArray::Tile(std::move(result)); }; - const char *use_legacy_conversion = - std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); - auto um_array = use_legacy_conversion - ? 
to_new_tile_type(array, convert_tile_um) - : to_new_tile_type(array, convert_tile_memcpy); + TiledArray::DistArray um_array; + if constexpr (T_conversion) { + um_array = to_new_tile_type(array, convert_tile_um); + } else { + // this is more efficient for copying: + // - avoids copy on host followed by UM transfer, instead uses direct copy + // - replaced unneeded copy (which also caused D-to-H transfer due to + // calling UM allocator construct() on the host) by move + // This eliminates all spurious UM traffic in (T) W3 contractions + auto convert_tile_memcpy = [](const TATensor &tile) { + /// UMTensor must be wrapped into TA::Tile + + using Tensor = typename UMTensor::tensor_type; + + auto stream = device::stream_for(tile.range()); + typename Tensor::storage_type storage; + make_device_storage(storage, tile.range().area(), stream); + Tensor result(tile.range(), std::move(storage)); + + DeviceSafeCall( + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename Tensor::value_type), + device::MemcpyDefault, stream)); + + device::sync_madness_task_with(stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host + return TiledArray::Tile(std::move(result)); + }; + + const char *use_legacy_conversion = + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); + um_array = use_legacy_conversion + ? to_new_tile_type(array, convert_tile_um) + : to_new_tile_type(array, convert_tile_memcpy); + } array.world().gop.fence(); return um_array; From 051b3b9855062a24cc2c120cb234dccee2860e3f Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 26 Jun 2024 09:00:17 -0400 Subject: [PATCH 416/592] Generalize `TA::squared_norm` to be applicable for tensor-of-tensor arrays as well. 
--- src/TiledArray/dist_array.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index f75b79f083..63ed48e795 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1213,6 +1213,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { return TiledArray::expressions::TsrExpr(*this, vars); } + /// + /// \brief This method creates a tensor expression but does not insist the + /// annotation to be bipartite (outer and inner tensor annotations). + /// \param vars Annotation for the tensor expression. + /// \note Only use for unary evaluations when the indexing of the inner + /// tensors is not significant, eg. norm computation. + /// + auto index_unchecked_tensor_expression(const std::string& vars) const { + return TiledArray::expressions::TsrExpr(*this, vars); + } + /// Create a tensor expression /// \param vars A string with a comma-separated list of variables @@ -1917,7 +1928,8 @@ auto inner_product(const DistArray& a, template auto squared_norm(const DistArray& a) { - return a(detail::dummy_annotation(rank(a))).squared_norm(); + return a.index_unchecked_tensor_expression(detail::dummy_annotation(rank(a))) + .squared_norm(); } template From 01c5684aee54269858eb386b45dfaf509a85450c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 26 Jun 2024 09:13:10 -0400 Subject: [PATCH 417/592] Cleanup. 
--- src/TiledArray/dist_array.h | 40 +++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 63ed48e795..a5bced3bc4 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1213,17 +1213,6 @@ class DistArray : public madness::archive::ParallelSerializableObject { return TiledArray::expressions::TsrExpr(*this, vars); } - /// - /// \brief This method creates a tensor expression but does not insist the - /// annotation to be bipartite (outer and inner tensor annotations). - /// \param vars Annotation for the tensor expression. - /// \note Only use for unary evaluations when the indexing of the inner - /// tensors is not significant, eg. norm computation. - /// - auto index_unchecked_tensor_expression(const std::string& vars) const { - return TiledArray::expressions::TsrExpr(*this, vars); - } - /// Create a tensor expression /// \param vars A string with a comma-separated list of variables @@ -1235,6 +1224,32 @@ class DistArray : public madness::archive::ParallelSerializableObject { return TiledArray::expressions::TsrExpr(*this, vars); } + /// Create a tensor expression from an annotation (possibly free of + /// inner-tensor sub-annotation). + + /// \brief This method creates a tensor expression but does not insist the + /// annotation to be bipartite (outer and inner tensor annotations). + /// \param vars A string with a comma-separated list of variables. + /// \note Only use for unary evaluations when the indexing of the inner + /// tensors is not significant, eg. norm computation. + /// + auto make_tsrexpr(const std::string& vars) { + return TiledArray::expressions::TsrExpr(*this, vars); + } + + /// Create a tensor expression from an annotation (possibly free of + /// inner-tensor sub-annotation). 
+ + /// \brief This method creates a tensor expression but does not insist the + /// annotation to be bipartite (outer and inner tensor annotations). + /// \param vars A string with a comma-separated list of variables. + /// \note Only use for unary evaluations when the indexing of the inner + /// tensors is not significant, eg. norm computation. + /// + auto make_tsrexpr(const std::string& vars) const { + return TiledArray::expressions::TsrExpr(*this, vars); + } + /// \deprecated use DistArray::world() [[deprecated]] World& get_world() const { return world(); } @@ -1928,8 +1943,7 @@ auto inner_product(const DistArray& a, template auto squared_norm(const DistArray& a) { - return a.index_unchecked_tensor_expression(detail::dummy_annotation(rank(a))) - .squared_norm(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).squared_norm(); } template From fc89883eb6f44e2b762e86403720c848da507247 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 26 Jun 2024 11:31:02 -0400 Subject: [PATCH 418/592] More unary reductions do not require inner tensor annotation. 
--- src/TiledArray/dist_array.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index a5bced3bc4..167464ccfb 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1918,27 +1918,27 @@ size_t volume(const DistArray& array) { template auto abs_min(const DistArray& a) { - return a(detail::dummy_annotation(rank(a))).abs_min(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).abs_min(); } template auto abs_max(const DistArray& a) { - return a(detail::dummy_annotation(rank(a))).abs_max(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).abs_max(); } template auto dot(const DistArray& a, const DistArray& b) { - return (a(detail::dummy_annotation(rank(a))) - .dot(b(detail::dummy_annotation(rank(b))))) - .get(); + auto&& expr_a = a.make_tsrexpr(detail::dummy_annotation(rank(a))); + auto&& expr_b = b.make_tsrexpr(detail::dummy_annotation(rank(b))); + return expr_a.dot(expr_b).get(); } template auto inner_product(const DistArray& a, const DistArray& b) { - return (a(detail::dummy_annotation(rank(a))) - .inner_product(b(detail::dummy_annotation(rank(b))))) - .get(); + auto&& expr_a = a.make_tsrexpr(detail::dummy_annotation(rank(a))); + auto&& expr_b = b.make_tsrexpr(detail::dummy_annotation(rank(b))); + return expr_a.inner_product(expr_b).get(); } template From 512464eaff23243951195551480b9d0ebfe5cb9d Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 27 Jun 2024 10:10:07 -0400 Subject: [PATCH 419/592] Unit tests for unary reductions involving ToT. 
--- tests/dist_array.cpp | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 288deabd20..2f72bda763 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -883,4 +883,65 @@ BOOST_AUTO_TEST_CASE(volume) { BOOST_REQUIRE(vol == TA::volume(array)); } +BOOST_AUTO_TEST_CASE(unary_reduction_tot) { + using Numeric = double; + using T = Tensor; + using ToT = Tensor; + using Policy = SparsePolicy; + using ArrayToT = DistArray; + + auto unit_T = [](Range const& rng) { return T(rng, Numeric{1}); }; + + auto unit_ToT = [unit_T](Range const& rngo, Range const& rngi) { + return ToT(rngo, unit_T(rngi)); + }; + + size_t constexpr nrows = 3; + size_t constexpr ncols = 4; + TiledRange const trange({{0, 2, 5, 7}, {0, 5, 7, 10, 12}}); + TA_ASSERT(trange.tiles_range().extent().at(0) == nrows && + trange.tiles_range().extent().at(1) == ncols, + "Following code depends on this condition."); + + // this Range is used to construct all inner tensors of the tile with + // tile index @c tix. + auto inner_dims = [nrows, ncols](Range::index_type const& tix) -> Range { + static std::array const rows{7, 8, 9}; + static std::array const cols{7, 8, 9, 10}; + + TA_ASSERT(tix.size() == 2, "Only rank-2 tensor expected."); + return Range({rows[tix.at(0) % nrows], cols[tix.at(1) % ncols]}); + }; + + // let's make all 'diagonal' tiles zero + auto zero_tile = [](Range::index_type const& tix) -> bool { + return tix.at(0) == tix.at(1); + }; + + auto make_tile = [inner_dims, // + zero_tile, // + &trange, // + unit_ToT](auto& tile, auto const& rng) { + auto&& tix = trange.element_to_tile(rng.lobound()); + if (zero_tile(tix)) + return 0.; + else { + tile = unit_ToT(rng, inner_dims(tix)); + return tile.norm(); + } + }; + + auto& world = get_default_world(); + + // all non-zero inner tensors of this ToT array are unit (ie all + // inner tensors' elements are 1.) 
+ auto array = make_array(world, trange, make_tile); + + // since all inner tensors are filled with 1. + double array_norm = std::sqrt(TA::volume(array)); + + BOOST_REQUIRE(array_norm == TA::norm2(array)); + BOOST_REQUIRE(array_norm = std::sqrt(TA::dot(array, array))); +} + BOOST_AUTO_TEST_SUITE_END() From 85581295703bb0a72c682929f53fc1ef91ccccf8 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 27 Jun 2024 11:21:49 -0400 Subject: [PATCH 420/592] [skip ci] rename a test case. --- tests/dist_array.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 2f72bda763..998b0d8f9f 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -883,7 +883,7 @@ BOOST_AUTO_TEST_CASE(volume) { BOOST_REQUIRE(vol == TA::volume(array)); } -BOOST_AUTO_TEST_CASE(unary_reduction_tot) { +BOOST_AUTO_TEST_CASE(reduction) { using Numeric = double; using T = Tensor; using ToT = Tensor; From 644f0e966e1d533f7c82d5b716c86157bee2116a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 7 Jul 2024 15:10:51 -0400 Subject: [PATCH 421/592] Tests and fixes one more corner case of ToT x ToT evaluation. --- src/TiledArray/einsum/tiledarray.h | 10 +++++++--- tests/einsum.cpp | 7 +++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 5d68770506..4777a8656e 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -525,9 +525,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // contracted indices auto i = (a & b) - h; - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - if (!h || (h && !(i || e))) { + // + // - no Hadamard indices for non-nested DistArray imply evaluation can be + // delegated to expression layer. 
+ // - only Hadamard indices for nested and non-nested DistArray imply + // evaluation can be delegated to expression layer. + // + if ((!IsArrayToT && !h) || (h && !(i || e))) { ArrayC C; C(std::string(c) + inner.c) = A * B; return C; diff --git a/tests/einsum.cpp b/tests/einsum.cpp index a5980c7446..646d636b65 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -432,6 +432,13 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {{0, 4, 8}, {0, 4}}, // {{0, 4, 8}, {0, 4}}, // {8}))); + + BOOST_REQUIRE( + check_manual_eval("ijkl;abecdf,k;e->ijl;bafdc", // + {{0, 2}, {0, 3}, {0, 4}, {0, 5}}, // + {{0, 4}}, // + {2, 3, 6, 4, 5, 7}, // + {6})); } BOOST_AUTO_TEST_SUITE_END() From 52b09600924d68092d85c766daa3455be74040ba Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 7 Jul 2024 15:16:06 -0400 Subject: [PATCH 422/592] ToT support for `math/linalg` functions and `concat` function. --- src/TiledArray/conversions/concat.h | 5 +++-- src/TiledArray/math/linalg/basic.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index 7c440c54e2..398a5dc7b3 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -92,8 +92,9 @@ DistArray concat( DistArray result(*target_world, tr); const auto annot = detail::dummy_annotation(r); for (auto i = 0ul; i != arrays.size(); ++i) { - result(annot).block(tile_begin_end[i].first, tile_begin_end[i].second) = - arrays[i](annot); + result.make_tsrexpr(annot).block(tile_begin_end[i].first, + tile_begin_end[i].second) = + arrays[i].make_tsrexpr(annot); } result.world().gop.fence(); diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index 856c915bbe..c00a363286 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ -79,14 +79,14 @@ template inline void vec_multiply(DistArray& a1, const DistArray& a2) { auto vars = 
TiledArray::detail::dummy_annotation(rank(a1)); - a1(vars) = a1(vars) * a2(vars); + a1.make_tsrexpr(vars) = a1.make_tsrexpr(vars) * a2.make_tsrexpr(vars); } template inline void scale(DistArray& a, S scaling_factor) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(a)); - a(vars) = numeric_type(scaling_factor) * a(vars); + a.make_tsrexpr(vars) = numeric_type(scaling_factor) * a.make_tsrexpr(vars); } template @@ -99,7 +99,8 @@ inline void axpy(DistArray& y, S alpha, const DistArray& x) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(y)); - y(vars) = y(vars) + numeric_type(alpha) * x(vars); + y.make_tsrexpr(vars) = + y.make_tsrexpr(vars) + numeric_type(alpha) * x.make_tsrexpr(vars); } /// selector for concat From 35541916ba08646497a10b1353b15092f8fde0f6 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 8 Jul 2024 07:40:35 -0400 Subject: [PATCH 423/592] Revert "Tests and fixes one more corner case of ToT x ToT evaluation." This reverts commit 644f0e966e1d533f7c82d5b716c86157bee2116a. --- src/TiledArray/einsum/tiledarray.h | 10 +++------- tests/einsum.cpp | 7 ------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 4777a8656e..5d68770506 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -525,13 +525,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // contracted indices auto i = (a & b) - h; - // - // - no Hadamard indices for non-nested DistArray imply evaluation can be - // delegated to expression layer. - // - only Hadamard indices for nested and non-nested DistArray imply - // evaluation can be delegated to expression layer. 
- // - if ((!IsArrayToT && !h) || (h && !(i || e))) { + // no Hadamard indices => standard contraction (or even outer product) + // same a, b, and c => pure Hadamard + if (!h || (h && !(i || e))) { ArrayC C; C(std::string(c) + inner.c) = A * B; return C; diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 646d636b65..a5980c7446 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -432,13 +432,6 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {{0, 4, 8}, {0, 4}}, // {{0, 4, 8}, {0, 4}}, // {8}))); - - BOOST_REQUIRE( - check_manual_eval("ijkl;abecdf,k;e->ijl;bafdc", // - {{0, 2}, {0, 3}, {0, 4}, {0, 5}}, // - {{0, 4}}, // - {2, 3, 6, 4, 5, 7}, // - {6})); } BOOST_AUTO_TEST_SUITE_END() From d19df41498955ae7602ee29a9dc0d68d2867a803 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 8 Jul 2024 07:41:21 -0400 Subject: [PATCH 424/592] Revert "ToT support for `math/linalg` functions and `concat` function." This reverts commit 52b09600924d68092d85c766daa3455be74040ba. --- src/TiledArray/conversions/concat.h | 5 ++--- src/TiledArray/math/linalg/basic.h | 7 +++---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index 398a5dc7b3..7c440c54e2 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -92,9 +92,8 @@ DistArray concat( DistArray result(*target_world, tr); const auto annot = detail::dummy_annotation(r); for (auto i = 0ul; i != arrays.size(); ++i) { - result.make_tsrexpr(annot).block(tile_begin_end[i].first, - tile_begin_end[i].second) = - arrays[i].make_tsrexpr(annot); + result(annot).block(tile_begin_end[i].first, tile_begin_end[i].second) = + arrays[i](annot); } result.world().gop.fence(); diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index c00a363286..856c915bbe 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ -79,14 +79,14 @@ template inline void 
vec_multiply(DistArray& a1, const DistArray& a2) { auto vars = TiledArray::detail::dummy_annotation(rank(a1)); - a1.make_tsrexpr(vars) = a1.make_tsrexpr(vars) * a2.make_tsrexpr(vars); + a1(vars) = a1(vars) * a2(vars); } template inline void scale(DistArray& a, S scaling_factor) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(a)); - a.make_tsrexpr(vars) = numeric_type(scaling_factor) * a.make_tsrexpr(vars); + a(vars) = numeric_type(scaling_factor) * a(vars); } template @@ -99,8 +99,7 @@ inline void axpy(DistArray& y, S alpha, const DistArray& x) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(y)); - y.make_tsrexpr(vars) = - y.make_tsrexpr(vars) + numeric_type(alpha) * x.make_tsrexpr(vars); + y(vars) = y(vars) + numeric_type(alpha) * x(vars); } /// selector for concat From 8ae0cdaa81fa4f837ca9ffd4f90437b218a5a504 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 8 Jul 2024 10:59:51 -0400 Subject: [PATCH 425/592] Bug fix the logic in einsum function that delegates evaluation to the expression layer. --- src/TiledArray/einsum/tiledarray.h | 19 ++++++++++++++----- tests/einsum.cpp | 6 ++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 4777a8656e..72dea21231 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -526,12 +526,21 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, auto i = (a & b) - h; // - // - no Hadamard indices for non-nested DistArray imply evaluation can be - // delegated to expression layer. - // - only Hadamard indices for nested and non-nested DistArray imply - // evaluation can be delegated to expression layer. 
+ // *) Pure Hadamard indices: (h && !(i || e)) is true implies + // the evaluation can be delegated to the expression layer + // for distarrays of both nested and non-nested tensor tiles. + // *) If no Hadamard indices are present (!h) the evaluation + // can be delegated to the expression _only_ for distarrays with + // non-nested tensor tiles. + // This is because even if Hadamard indices are not present, a contracted + // index might be present pertinent to the outer tensor in case of a + // nested-tile distarray, which is especially handled within this + // function because expression layer cannot handle that yet. // - if ((!IsArrayToT && !h) || (h && !(i || e))) { + if ((h && !(i || e)) // pure Hadamard + || (IsArrayToT && !(i || h)) // ToT result from outer-product + || (IsArrayT && !h) // T from general product without Hadamard + ) { ArrayC C; C(std::string(c) + inner.c) = A * B; return C; diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 646d636b65..6be4a4a99d 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -433,6 +433,12 @@ BOOST_AUTO_TEST_CASE(corner_cases) { {{0, 4, 8}, {0, 4}}, // {8}))); + BOOST_REQUIRE(check_manual_eval("il;bae,il;e->li;ab", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}}, // + {4, 2, 3}, // + {3})); + BOOST_REQUIRE( check_manual_eval("ijkl;abecdf,k;e->ijl;bafdc", // {{0, 2}, {0, 3}, {0, 4}, {0, 5}}, // From 6994163c27395b5247f92580953e0b2ca3697e29 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Sun, 7 Jul 2024 15:16:06 -0400 Subject: [PATCH 426/592] ToT support for `math/linalg` functions and `concat` function. 
--- src/TiledArray/conversions/concat.h | 5 +++-- src/TiledArray/math/linalg/basic.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index 7c440c54e2..398a5dc7b3 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -92,8 +92,9 @@ DistArray concat( DistArray result(*target_world, tr); const auto annot = detail::dummy_annotation(r); for (auto i = 0ul; i != arrays.size(); ++i) { - result(annot).block(tile_begin_end[i].first, tile_begin_end[i].second) = - arrays[i](annot); + result.make_tsrexpr(annot).block(tile_begin_end[i].first, + tile_begin_end[i].second) = + arrays[i].make_tsrexpr(annot); } result.world().gop.fence(); diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index 856c915bbe..c00a363286 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ -79,14 +79,14 @@ template inline void vec_multiply(DistArray& a1, const DistArray& a2) { auto vars = TiledArray::detail::dummy_annotation(rank(a1)); - a1(vars) = a1(vars) * a2(vars); + a1.make_tsrexpr(vars) = a1.make_tsrexpr(vars) * a2.make_tsrexpr(vars); } template inline void scale(DistArray& a, S scaling_factor) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(a)); - a(vars) = numeric_type(scaling_factor) * a(vars); + a.make_tsrexpr(vars) = numeric_type(scaling_factor) * a.make_tsrexpr(vars); } template @@ -99,7 +99,8 @@ inline void axpy(DistArray& y, S alpha, const DistArray& x) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(y)); - y(vars) = y(vars) + numeric_type(alpha) * x(vars); + y.make_tsrexpr(vars) = + y.make_tsrexpr(vars) + numeric_type(alpha) * x.make_tsrexpr(vars); } /// selector for concat From 5b6b4eac4a6de1d29d32975a87ff5cb780cfdc94 Mon Sep 17 00:00:00 2001 From: Eduard 
Valeyev Date: Mon, 22 Jul 2024 09:11:19 -0400 Subject: [PATCH 427/592] typo --- examples/gemm/ta_cc_abcd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp index df05e78c4a..f2b612b7aa 100644 --- a/examples/gemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -72,7 +72,7 @@ int main(int argc, char** argv) { // Get command line arguments if (argc < 5) { - std::cout << "Mocks t2(i,a,j,b) * v(a,b,c,d) term in CC amplitude eqs" + std::cout << "Mocks t2(i,j,a,b) * v(a,b,c,d) term in CC amplitude eqs" << std::endl << "Usage: " << argv[0] << " occ_size occ_nblocks uocc_size " From ca1bd999c4e1da097eaae0fbc38732159d367d98 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jul 2024 09:55:54 -0400 Subject: [PATCH 428/592] ta_dense_asymm supports extents not evenly divisible by tile sizes --- examples/gemm/ta_dense_asymm.cpp | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/gemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp index acef959c7a..ef178319cd 100644 --- a/examples/gemm/ta_dense_asymm.cpp +++ b/examples/gemm/ta_dense_asymm.cpp @@ -50,11 +50,6 @@ int main(int argc, char** argv) { std::cerr << "Error: block sizes must be greater than zero.\n"; return 1; } - if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) { - std::cerr - << "Error: dimension size must be evenly divisible by block size.\n"; - return 1; - } const long repeat = (argc >= 8 ? atol(argv[7]) : 5); if (repeat <= 0) { std::cerr << "Error: number of repetitions must be greater than zero.\n"; @@ -72,22 +67,22 @@ int main(int argc, char** argv) { const bool do_memtrace = (argc >= 10 ? 
std::atol(argv[9]) : false); - const std::size_t Tm = Nm / Bm; - const std::size_t Tn = Nn / Bn; - const std::size_t Tk = Nk / Bk; - // Construct TiledRange std::vector blocking_m; - blocking_m.reserve(Tm + 1); for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i); + if (blocking_m.back() != Nm) blocking_m.push_back(Nm); std::vector blocking_n; - blocking_n.reserve(Tn + 1); for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i); + if (blocking_n.back() != Nn) blocking_n.push_back(Nn); std::vector blocking_k; - blocking_k.reserve(Tk + 1); for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i); + if (blocking_k.back() != Nk) blocking_k.push_back(Nk); + + const std::size_t Tm = blocking_m.size() - 1; + const std::size_t Tn = blocking_n.size() - 1; + const std::size_t Tk = blocking_k.size() - 1; // Structure of c std::vector blocking_C; @@ -138,13 +133,13 @@ int main(int argc, char** argv) { << "\nScalar type = " << scalar_type_str << "\nSize of A = " << Nm << "x" << Nk << " (" << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk + << "\nSize of (largest) A block = " << Bm << "x" << Bk << "\nSize of B = " << Nk << "x" << Nn << " (" << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn + << "\nSize of (largest) B block = " << Bk << "x" << Bn << "\nSize of C = " << Nm << "x" << Nn << " (" << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn + << "\nSize of (largest) C block = " << Bm << "x" << Bn << "\n# of blocks of C = " << Tm * Tn << "\nAverage # of blocks of C/node = " << double(Tm * Tn) / double(world.size()) << "\n"; From df29f8d80a24784787157d491e6c785297c692bc Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jul 2024 09:56:51 -0400 Subject: [PATCH 429/592] ta_cc_abcd supports all 4 scalar types, uses target tile size as argument (instead of number of tiles), and support uniform tiling --- 
examples/gemm/ta_cc_abcd.cpp | 122 +++++++++++++++++++++++------------ 1 file changed, 79 insertions(+), 43 deletions(-) diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp index f2b612b7aa..e948e9de72 100644 --- a/examples/gemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -34,17 +34,30 @@ bool to_bool(const char* str) { // if n = average tile size // this will produce tiles of these sizes: n+1, n-1, n+2, n-2, etc. // the last tile absorbs the remainder -std::vector make_tiling(unsigned int range_size, - unsigned int ntiles) { - const int average_tile_size = range_size / ntiles; - std::vector result(ntiles + 1); - result[0] = 0; - for (long t = 0; t != ntiles - 1; ++t) { - result[t + 1] = result[t] + average_tile_size + - std::max(static_cast((t % 2 == 0) ? (t + 1) : (-t)), - 1 - average_tile_size); +std::vector make_nonuniform_tiling(unsigned int range_size, + int tile_size) { + std::vector result; + result.push_back(0); + for (long t = 0; true; ++t) { + unsigned int next_tile_boundary = + result.back() + tile_size + + std::max(static_cast((t % 2 == 0) ? 
(t + 1) : (-t)), + 1 - tile_size); + if (next_tile_boundary >= range_size) break; + result.push_back(next_tile_boundary); } - result[ntiles] = range_size; + if (result.back() != range_size) result.push_back(range_size); + return result; +} + +// makes tiles as uniform as possible +std::vector make_uniform_tiling(unsigned int range_size, + int tile_size) { + std::vector result; + for (unsigned int t = 0; t <= range_size; t += tile_size) { + result.push_back(t); + } + if (result.back() != range_size) result.push_back(range_size); return result; } @@ -72,40 +85,33 @@ int main(int argc, char** argv) { // Get command line arguments if (argc < 5) { - std::cout << "Mocks t2(i,j,a,b) * v(a,b,c,d) term in CC amplitude eqs" - << std::endl - << "Usage: " << argv[0] - << " occ_size occ_nblocks uocc_size " - "uocc_nblocks [repetitions] [use_complex]" - << std::endl; + std::cout + << "Mocks t2(i,j,a,b) * v(a,b,c,d) term in CC amplitude eqs" + << std::endl + << "Usage: " << argv[0] + << " occ_size occ_tilesize uocc_size " + "uocc_tilesize [repetitions] [scalar=double] [uniform_tiling=1]" + << std::endl; return 0; } const long n_occ = atol(argv[1]); - const long nblk_occ = atol(argv[2]); + const long b_occ = atol(argv[2]); const long n_uocc = atol(argv[3]); - const long nblk_uocc = atol(argv[4]); + const long b_uocc = atol(argv[4]); if (n_occ <= 0) { std::cerr << "Error: occ_size must be greater than zero.\n"; return 1; } - if (nblk_occ <= 0) { - std::cerr << "Error: occ_nblocks must be greater than zero.\n"; + if (b_occ <= 0) { + std::cerr << "Error: occ_tilesize must be greater than zero.\n"; return 1; } if (n_uocc <= 0) { std::cerr << "Error: uocc_size must be greater than zero.\n"; return 1; } - if (nblk_uocc <= 0) { - std::cerr << "Error: uocc_nblocks must be greater than zero.\n"; - return 1; - } - if ((n_occ < nblk_occ) != 0ul) { - std::cerr << "Error: occ_size must be greater than occ_nblocks.\n"; - return 1; - } - if ((n_uocc < nblk_uocc) != 0ul) { - std::cerr << "Error: 
uocc_size must be greater than uocc_nblocks.\n"; + if (b_uocc <= 0) { + std::cerr << "Error: uocc_tilesize must be greater than zero.\n"; return 1; } const long repeat = (argc >= 6 ? atol(argv[5]) : 5); @@ -113,29 +119,59 @@ int main(int argc, char** argv) { std::cerr << "Error: number of repetitions must be greater than zero.\n"; return 1; } - const bool use_complex = (argc >= 7 ? to_bool(argv[6]) : false); + + const std::string scalar_type_str = (argc >= 7 ? argv[6] : "double"); + if (scalar_type_str != "double" && scalar_type_str != "float" && + scalar_type_str != "zdouble" && scalar_type_str != "zfloat") { + std::cerr << "Error: invalid real type " << scalar_type_str << ".\n"; + std::cerr << " valid real types are \"double\", \"float\", " + "\"zdouble\", and \"zfloat\".\n"; + return 1; + } + + const bool uniform_tiling = (argc >= 8 ? std::atol(argv[7]) : true); if (world.rank() == 0) std::cout << "TiledArray: CC T2.V term test..." << "\nGit description: " << TiledArray::git_description() << "\nNumber of nodes = " << world.size() << "\nocc size = " << n_occ - << "\nocc nblocks = " << nblk_occ + << "\nocc tilesize = " << b_occ << "\nuocc size = " << n_uocc - << "\nuocc nblocks = " << nblk_uocc - << "\nComplex = " - << (use_complex ? "true" : "false") << "\n"; + << "\nuocc tilesize = " << b_uocc + << "\nscalar type = " << scalar_type_str + << "\nuniform tiling = " + << (uniform_tiling ? "true" : "false") << std::endl; // Construct TiledRange1's - std::vector tiling_occ = make_tiling(n_occ, nblk_occ); - std::vector tiling_uocc = make_tiling(n_uocc, nblk_uocc); + std::vector tiling_occ = + uniform_tiling ? make_uniform_tiling(n_occ, b_occ) + : make_nonuniform_tiling(n_occ, b_occ); + std::vector tiling_uocc = + uniform_tiling ? 
make_uniform_tiling(n_uocc, b_uocc) + : make_nonuniform_tiling(n_uocc, b_uocc); auto trange_occ = TA::TiledRange1(tiling_occ.begin(), tiling_occ.end()); auto trange_uocc = TA::TiledRange1(tiling_uocc.begin(), tiling_uocc.end()); - - if (use_complex) - cc_abcd>(world, trange_occ, trange_uocc, repeat); - else + auto print_tile_sizes = [](const auto& tiling) { + auto b = tiling.begin(); + for (auto current = b + 1; current != tiling.end(); ++current) { + std::cout << *current - *(current - 1) << " "; + } + std::cout << std::endl; + }; + std::cout << " occ tile sizes: "; + print_tile_sizes(tiling_occ); + std::cout << "uocc tile sizes: "; + print_tile_sizes(tiling_uocc); + + if (scalar_type_str == "double") cc_abcd(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "zdouble") + cc_abcd>(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "float") + cc_abcd(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "zfloat") + cc_abcd>(world, trange_occ, trange_uocc, repeat); TA::finalize(); @@ -201,11 +237,11 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, for (int i = 0; i < repeat; ++i) { auto tp_start = TiledArray::now(); // this is how the user would express this contraction - if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); + if (true) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); // this demonstrates to the PaRSEC team what happens under the hood of the // expression above - if (true) { + if (false) { tensor_contract_444(t2_v, t2, v); // to validate replace: false -> true From c9302ed8bf4900179fcacf2f8cd472338685e7b2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jul 2024 15:30:48 -0400 Subject: [PATCH 430/592] ta_cc_abcd: fence inside the loop --- examples/gemm/ta_cc_abcd.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp index e948e9de72..66b48f96d5 100644 --- a/examples/gemm/ta_cc_abcd.cpp +++ 
b/examples/gemm/ta_cc_abcd.cpp @@ -256,6 +256,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, << error("i,j,a,b").squared_norm().get() << std::endl; } } + t2_v.world().gop.fence(); TiledArray::record_duration_since(tp_start); const double time = TiledArray::durations().back(); From 269c59bfed2e28be0a5a14c667fcd0f675977719 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jul 2024 16:18:16 -0400 Subject: [PATCH 431/592] ta_cc_abcd: GFLOP = 10^9 FLOP, not 12^30 FLOP --- examples/device/ta_cc_abcd_device.cpp | 4 ++-- examples/gemm/ta_cc_abcd.cpp | 4 ++-- examples/gemm/ta_dense_asymm.cpp | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/device/ta_cc_abcd_device.cpp b/examples/device/ta_cc_abcd_device.cpp index 7a2046a5ef..02d7781b12 100644 --- a/examples/device/ta_cc_abcd_device.cpp +++ b/examples/device/ta_cc_abcd_device.cpp @@ -182,8 +182,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double flops_per_fma = (complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real - const double n_gflop = flops_per_fma * std::pow(n_occ, 2) * - std::pow(n_uocc, 4) / std::pow(1024., 3); + const double n_gflop = + flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9; using deviceTile = btas::Tensor>; diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp index 66b48f96d5..c1ad166ca5 100644 --- a/examples/gemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -211,8 +211,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double flops_per_fma = (complex_T ? 
8 : 2); // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real - const double gflops_per_call = flops_per_fma * std::pow(n_occ, 2) * - std::pow(n_uocc, 4) / std::pow(1024., 3); + const double gflops_per_call = + flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9; // Construct tensors TA::TArrayD t2(world, trange_oovv); diff --git a/examples/gemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp index ef178319cd..356d838ec0 100644 --- a/examples/gemm/ta_dense_asymm.cpp +++ b/examples/gemm/ta_dense_asymm.cpp @@ -148,10 +148,11 @@ int main(int argc, char** argv) { if (do_memtrace) { world.gop.fence(); madness::print_meminfo(world.rank(), str); + } else { + world.gop.fence(); } #ifdef TA_TENSOR_MEM_PROFILE { - world.gop.fence(); std::cout << str << ": TA::Tensor allocated " << TA::hostEnv::instance()->host_allocator_getActualHighWatermark() From 0b5244f1e2db8d80442bc21af8956cb53730459f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 22 Jul 2024 17:45:49 -0400 Subject: [PATCH 432/592] ta_cc_abcd: do not assume real double everywhere! 
--- examples/gemm/ta_cc_abcd.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/gemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp index c1ad166ca5..f038f09ea0 100644 --- a/examples/gemm/ta_cc_abcd.cpp +++ b/examples/gemm/ta_cc_abcd.cpp @@ -215,9 +215,9 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9; // Construct tensors - TA::TArrayD t2(world, trange_oovv); - TA::TArrayD v(world, trange_vvvv); - TA::TArrayD t2_v; + TA::TSpArray t2(world, trange_oovv); + TA::TSpArray v(world, trange_vvvv); + TA::TSpArray t2_v; // To validate, fill input tensors with random data, otherwise just with 1s if (do_validate) { rand_fill_array(t2); @@ -247,9 +247,9 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, // to validate replace: false -> true if (do_validate) { // obtain reference result using the high-level DSL - TA::TArrayD t2_v_ref; + TA::TSpArray t2_v_ref; t2_v_ref("i,j,a,b") = t2("i,j,c,d") * v("c,d,a,b"); - TA::TArrayD error; + TA::TSpArray error; error("i,j,a,b") = t2_v_ref("i,j,a,b") - t2_v("i,j,a,b"); std::cout << "Validating the result (ignore the timings/performance!): " "||ref_result - result||_2^2 = " @@ -394,6 +394,7 @@ template void tensor_contract_444(TA::DistArray& tv, const TA::DistArray& t, const TA::DistArray& v) { + using Shape = typename Policy::shape_type; // for convenience, obtain the tiled ranges for the two kinds of dimensions // used to define t, v, and tv auto trange_occ = t.trange().dim(0); // the first dimension of t is occ @@ -415,10 +416,10 @@ void tensor_contract_444(TA::DistArray& tv, auto ncols = n_uocc * n_uocc; TA::detail::ProcGrid proc_grid(world, nrowtiles, ncoltiles, nrows, ncols); std::shared_ptr pmap; - auto t_eval = make_array_eval(t, t.world(), TA::DenseShape(), + auto t_eval = make_array_eval(t, t.world(), Shape(), proc_grid.make_row_phase_pmap(ninttiles), TA::Permutation(), 
make_array_noop()); - auto v_eval = make_array_eval(v, v.world(), TA::DenseShape(), + auto v_eval = make_array_eval(v, v.world(), Shape(), proc_grid.make_col_phase_pmap(ninttiles), TA::Permutation(), make_array_noop()); @@ -437,7 +438,7 @@ void tensor_contract_444(TA::DistArray& tv, // 2. there will be a dummy output ArrayEval, its Futures will be set by the // PTG auto contract = - make_contract_eval(t_eval, v_eval, world, TA::DenseShape(), pmap, + make_contract_eval(t_eval, v_eval, world, Shape(), pmap, TA::Permutation(), make_contract(4u, 4u, 4u)); // eval() just schedules the Summa task and proceeds From 9d5ad2331fb0e079ff846abd272030644b64ad0f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 Jul 2024 05:53:13 -0400 Subject: [PATCH 433/592] ta_dense_device supports extents not evenly divisible by tile sizes --- examples/device/ta_dense_device.cpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index d30bf5079c..e975f8759d 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -36,10 +36,6 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, using RT = TiledArray::detail::scalar_t; constexpr auto complex_T = TiledArray::detail::is_complex_v; - const std::size_t Tm = Nm / Bm; - const std::size_t Tn = Nn / Bn; - const std::size_t Tk = Nk / Bk; - const std::int64_t nflops = (complex_T ? 
8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real @@ -64,16 +60,16 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, // Construct TiledRange std::vector blocking_m; - blocking_m.reserve(Tm + 1); for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i); + const std::size_t Tm = blocking_m.size() - 1; std::vector blocking_n; - blocking_n.reserve(Tn + 1); for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i); + const std::size_t Tn = blocking_n.size() - 1; std::vector blocking_k; - blocking_k.reserve(Tk + 1); for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i); + const std::size_t Tk = blocking_k.size(); // Structure of c std::vector blocking_C; @@ -255,11 +251,6 @@ int try_main(int argc, char **argv) { std::cerr << "Error: block sizes must be greater than zero.\n"; return 1; } - if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) { - std::cerr - << "Error: dimension size must be evenly divisible by block size.\n"; - return 1; - } const long nrepeat = (argc >= 8 ? 
atol(argv[7]) : 5); if (nrepeat <= 0) { std::cerr << "Error: number of repetitions must be greater than zero.\n"; From 5204c06cf978892ee04503b476162d1c5cefd9de Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 23 Jul 2024 12:03:20 -0400 Subject: [PATCH 434/592] amends 9d5ad2331fb0e079ff846abd272030644b64ad0f --- examples/device/ta_dense_device.cpp | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index e975f8759d..4900072e8a 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -42,22 +42,6 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, * static_cast(Nn) * static_cast(Nm) * static_cast(Nk); - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test...\n" - << "Number of nodes = " << world.size() - << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk - << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn - << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn - << "\n# of blocks of C = " << Tm * Tn - << "\nAverage # of blocks of C/node = " - << double(Tm * Tn) / double(world.size()) << "\n"; - // Construct TiledRange std::vector blocking_m; for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i); @@ -71,6 +55,22 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i); const std::size_t Tk = blocking_k.size(); + if (world.rank() == 0) + std::cout << "TiledArray: dense matrix multiply test...\n" + << "Number of nodes = " << world.size() + << "\nSize of A = " << Nm << "x" << Nk << " (" + << double(Nm 
* Nk * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of (largest) A block = " << Bm << "x" << Bk + << "\nSize of B = " << Nk << "x" << Nn << " (" + << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of (largest) B block = " << Bk << "x" << Bn + << "\nSize of C = " << Nm << "x" << Nn << " (" + << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)" + << "\nSize of (largest) C block = " << Bm << "x" << Bn + << "\n# of blocks of C = " << Tm * Tn + << "\nAverage # of blocks of C/node = " + << double(Tm * Tn) / double(world.size()) << "\n"; + // Structure of c std::vector blocking_C; blocking_C.reserve(2); From 2509075b01e3df7ac38691dc3e1b1143740d5974 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 22 Aug 2024 18:41:49 -0400 Subject: [PATCH 435/592] SVD computes full sets of vectors, not partial --- src/TiledArray/math/linalg/rank-local.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index f7db4abd01..625807663a 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -56,8 +56,7 @@ void svd(Job jobu, Job jobvt, Matrix &A, template void svd(Matrix &A, std::vector> &S, Matrix *U, Matrix *VT) { - svd(U ? Job::SomeVec : Job::NoVec, VT ? Job::SomeVec : Job::NoVec, A, S, U, - VT); + svd(U ? Job::AllVec : Job::NoVec, VT ? 
Job::AllVec : Job::NoVec, A, S, U, VT); } template @@ -82,10 +81,10 @@ struct ArchiveSerializeImpl { MAD_ARCHIVE_DEBUG(std::cout << "(de)serialize lapack::Error" << std::endl); if constexpr (is_output_archive_v) { // serialize const std::string msg = e.what(); - ar &msg; + ar & msg; } else { std::string msg; - ar &msg; + ar & msg; e = lapack::Error(msg); } } From f50e580a9665c04d28cce7f9705675afda87f634 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 23 Aug 2024 09:18:18 -0400 Subject: [PATCH 436/592] concat(arrays) can handle zero-volume arrays --- src/TiledArray/conversions/concat.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index 398a5dc7b3..cc55f91e17 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -92,9 +92,15 @@ DistArray concat( DistArray result(*target_world, tr); const auto annot = detail::dummy_annotation(r); for (auto i = 0ul; i != arrays.size(); ++i) { - result.make_tsrexpr(annot).block(tile_begin_end[i].first, - tile_begin_end[i].second) = - arrays[i].make_tsrexpr(annot); + if (arrays[i].trange().tiles_range().volume() != + 0) { // N.B. 
empty block range expression bug workaround + result.make_tsrexpr(annot).block(tile_begin_end[i].first, + tile_begin_end[i].second) = + arrays[i].make_tsrexpr(annot); + result.make_tsrexpr(annot).block(tile_begin_end[i].first, + tile_begin_end[i].second) = + arrays[i].make_tsrexpr(annot); + } } result.world().gop.fence(); From af1d41cc3fee60326b6e1951d82fdff6a54e2daa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 24 Aug 2024 10:07:36 -0400 Subject: [PATCH 437/592] KroneckerDeltaTile -> KroneckerDeltaTile --- src/TiledArray/special/kronecker_delta.h | 105 ++++++++++------------- tests/expressions_mixed.cpp | 3 +- 2 files changed, 48 insertions(+), 60 deletions(-) diff --git a/src/TiledArray/special/kronecker_delta.h b/src/TiledArray/special/kronecker_delta.h index 2b1df03294..24d49e5cd0 100644 --- a/src/TiledArray/special/kronecker_delta.h +++ b/src/TiledArray/special/kronecker_delta.h @@ -39,20 +39,13 @@ /// *generalized* (asymmetric) Kronecker delta -/// *generalized* (asymmetric) Kronecker delta is a product of \c _N ordinary -/// Kronecker deltas Definition: KroneckerDeltaTile(b,k) = (b==k) ? 1 : 0 -/// KroneckerDeltaTile(b0,k0,b1,k1,b2,k2...bN,kN) = KroneckerDeltaTile(b0,k0) -/// KroneckerDeltaTile(b1,k1) ...`KroneckerDeltaTile(bN,kN) -/// -/// \note This is a stateful data tile. Meant to be generated by its (stateless) -/// lazy generator, \c LazyKroneckerDeltaTile. -/// -/// \tparam _N the number of ordinal Kronecker deltas in this product -template +/// *generalized* (asymmetric) Kronecker delta is a product of `N` ordinary +/// Kronecker deltas +/// Definition: `KroneckerDeltaTile(b,k) = (b==k) ? 1 : 0` and +/// `KroneckerDeltaTile(b0,k0,b1,k1,b2,k2...bN,kN) = KroneckerDeltaTile(b0,k0) +/// KroneckerDeltaTile(b1,k1) ... 
KroneckerDeltaTile(bN,kN)` class KroneckerDeltaTile { public: - // Constants - static constexpr unsigned N = _N; // Concept typedefs typedef TiledArray::Range range_type; // range type typedef int value_type; // Element type @@ -61,7 +54,7 @@ class KroneckerDeltaTile { typedef size_t size_type; // Size type private: - range_type range_; + range_type range_; // range_.rank() = 2*N bool empty_; public: @@ -69,8 +62,13 @@ class KroneckerDeltaTile { KroneckerDeltaTile() : empty_(true) {} /// Productive ctor 1 + /// \param[in] range the range of the tile, by definition must be even-order + /// such that the number of Kronecker deltas `N` is `range.rank() / 2` \pre + /// `range.rank() % 2 == 1` KroneckerDeltaTile(const range_type& range) - : range_(range), empty_(is_empty(range_)) {} + : range_(range), empty_(is_empty(range_)) { + TA_ASSERT(range.rank() % 2 == 0); + } /// copy constructor (= deep copy) KroneckerDeltaTile(const KroneckerDeltaTile&) = default; @@ -88,6 +86,9 @@ class KroneckerDeltaTile { bool empty() const { return empty_; } + /// \return the number of Kronecker deltas in the product + unsigned int N() const { return range_.rank() / 2; } + /// MADNESS compliant serialization template void serialize(Archive& ar) { @@ -100,7 +101,8 @@ class KroneckerDeltaTile { /// @return false if contains any nonzeros static bool is_empty(const range_type& range) { bool empty = false; - TA_ASSERT(range.rank() == 2 * N); + TA_ASSERT(range.rank() % 2 == 0); + const auto N = range.rank() / 2; auto lobound = range.lobound_data(); auto upbound = range.upbound_data(); for (auto i = 0; i != 2 * N && not empty; i += 2) @@ -115,84 +117,70 @@ class KroneckerDeltaTile { // these are to satisfy interfaces, but not needed, actually // Sum of hyper diagonal elements -template -typename KroneckerDeltaTile<_N>::numeric_type trace( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type trace(const KroneckerDeltaTile& arg); // foreach(i) result += arg[i] -template 
-typename KroneckerDeltaTile<_N>::numeric_type sum( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type sum(const KroneckerDeltaTile& arg); // foreach(i) result *= arg[i] -template -typename KroneckerDeltaTile<_N>::numeric_type product( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type product( + const KroneckerDeltaTile& arg); // foreach(i) result += arg[i] * arg[i] -template -typename KroneckerDeltaTile<_N>::numeric_type squared_norm( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type squared_norm( + const KroneckerDeltaTile& arg); // foreach(i) result = min(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type min( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type min(const KroneckerDeltaTile& arg); // foreach(i) result = max(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type max( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type max(const KroneckerDeltaTile& arg); // foreach(i) result = abs_min(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type abs_min( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type abs_min( + const KroneckerDeltaTile& arg); // foreach(i) result = abs_max(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type abs_max( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type abs_max( + const KroneckerDeltaTile& arg); // Permutation operation // returns a tile for which result[perm ^ i] = tile[i] -template < - unsigned N, typename Perm, - typename = std::enable_if_t>> -KroneckerDeltaTile permute(const KroneckerDeltaTile& tile, - const Perm& perm) { +template >> +KroneckerDeltaTile permute(const KroneckerDeltaTile& tile, const Perm& perm) { abort(); } // dense_result[i] = dense_arg1[i] * sparse_arg2[i] -template -TiledArray::Tensor mult(const 
KroneckerDeltaTile<_N>& arg1, +template +TiledArray::Tensor mult(const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2) { abort(); } // dense_result[perm ^ i] = dense_arg1[i] * sparse_arg2[i] template < - typename T, unsigned _N, typename Perm, + typename T, typename Perm, typename = std::enable_if_t>> -TiledArray::Tensor mult(const KroneckerDeltaTile<_N>& arg1, +TiledArray::Tensor mult(const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, const Perm& perm) { abort(); } // dense_result[i] *= sparse_arg1[i] -template +template TiledArray::Tensor& mult_to(TiledArray::Tensor& result, - const KroneckerDeltaTile& arg1) { + const KroneckerDeltaTile& arg1) { abort(); return result; } // dense_result[i] = binary(dense_arg1[i], sparse_arg2[i], op) -template -TiledArray::Tensor binary(const KroneckerDeltaTile<_N>& arg1, +template +TiledArray::Tensor binary(const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, Op&& op) { abort(); } // dense_result[perm ^ i] = binary(dense_arg1[i], sparse_arg2[i], op) template < - typename T, unsigned _N, typename Op, typename Perm, + typename T, typename Op, typename Perm, typename = std::enable_if_t>> -TiledArray::Tensor binary(const KroneckerDeltaTile<_N>& arg1, +TiledArray::Tensor binary(const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, Op&& op, const Perm& perm) { abort(); @@ -202,9 +190,9 @@ TiledArray::Tensor binary(const KroneckerDeltaTile<_N>& arg1, // GEMM operation with fused indices as defined by gemm_config: // dense_result[i,j] = dense_arg1[i,k] * sparse_arg2[k,j] -template +template TiledArray::Tensor gemm( - const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, + const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, const typename TiledArray::Tensor::numeric_type factor, const TiledArray::math::GemmHelper& gemm_config) { // preconditions: @@ -224,6 +212,7 @@ TiledArray::Tensor gemm( auto arg2_volume = arg2_range.volume(); if (not arg1.empty()) { + const auto N = 
arg1.N(); switch (N) { case 1: { auto i0_range = std::min(arg1_extents[0], arg1_extents[1]); @@ -258,8 +247,8 @@ TiledArray::Tensor gemm( } // GEMM operation with fused indices as defined by gemm_config: // dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] -template -void gemm(TiledArray::Tensor& result, const KroneckerDeltaTile& arg1, +template +void gemm(TiledArray::Tensor& result, const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, const typename TiledArray::Tensor::numeric_type factor, const TiledArray::math::GemmHelper& gemm_config) { diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 6b92cec695..872b4086f2 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -37,8 +37,7 @@ struct tag {}; struct MixedExpressionsFixture : public TiledRangeFixture { typedef DistArray>, DensePolicy> TArrayDS1; typedef DistArray>, DensePolicy> TArrayDS2; - typedef DistArray, DensePolicy> - ArrayKronDelta1; // will be turned into SparsePolicy next + typedef DistArray ArrayKronDelta1; MixedExpressionsFixture() : u(*GlobalFixture::world, trange2), From 828699e9d073e7d10e770e50cf746c7122aa5ed0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 24 Aug 2024 10:08:10 -0400 Subject: [PATCH 438/592] test outer product with sparse KroneckerDeltaTile --- tests/expressions_mixed.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 872b4086f2..54d725cba6 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -23,6 +23,7 @@ * */ +#include "TiledArray/special/diagonal_array.h" #include "TiledArray/special/kronecker_delta.h" #include "range_fixture.h" #include "sparse_tile.h" @@ -38,6 +39,7 @@ struct MixedExpressionsFixture : public TiledRangeFixture { typedef DistArray>, DensePolicy> TArrayDS1; typedef DistArray>, DensePolicy> TArrayDS2; typedef DistArray ArrayKronDelta1; + typedef DistArray 
SpArrayKronDelta1; MixedExpressionsFixture() : u(*GlobalFixture::world, trange2), @@ -51,7 +53,12 @@ struct MixedExpressionsFixture : public TiledRangeFixture { *GlobalFixture::world, trange2.tiles_range().volume())), delta1e(*GlobalFixture::world, trange2e, DenseShape(), std::make_shared( - *GlobalFixture::world, trange2e.tiles_range().volume())) { + *GlobalFixture::world, trange2e.tiles_range().volume())), + spe2(*GlobalFixture::world, trange2e), + spdelta1(*GlobalFixture::world, trange2, + SparseShape(detail::diagonal_shape(trange2, 1), trange2), + std::make_shared( + *GlobalFixture::world, trange2.tiles_range().volume())) { random_fill(u); random_fill(v); u2.fill(0); @@ -59,11 +66,13 @@ struct MixedExpressionsFixture : public TiledRangeFixture { e4.fill(0); init_kronecker_delta(delta1); init_kronecker_delta(delta1e); + random_fill(spe2); + init_kronecker_delta(spdelta1); GlobalFixture::world->gop.fence(); } - template - static void random_fill(DistArray& array) { + template + static void random_fill(DistArray& array) { array.fill_random(); } @@ -133,6 +142,8 @@ struct MixedExpressionsFixture : public TiledRangeFixture { TArrayDS2 w; ArrayKronDelta1 delta1; ArrayKronDelta1 delta1e; + TSpArrayD spe2; + SpArrayKronDelta1 spdelta1; }; // MixedExpressionsFixture // Instantiate static variables for fixture @@ -194,6 +205,10 @@ BOOST_AUTO_TEST_CASE(outer_product_factories) { // ok BOOST_CHECK_NO_THROW(u2("a,b,c,d") += delta1("a,b") * u("c,d")); + // ok + TSpArrayD tmp; + BOOST_CHECK_NO_THROW(tmp("a,b,c,d") = spdelta1("a,b") * spe2("c,d")); + // ok BOOST_CHECK_NO_THROW(e4("a,c,b,d") += delta1e("a,b") * e2("c,d")); } From 7e2eb43080b95baf3986c1988116029b1b0f2f23 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 24 Aug 2024 15:10:49 -0400 Subject: [PATCH 439/592] KroneckerDeltaTile moved to TiledArray namespace --- src/TiledArray/special/kronecker_delta.h | 60 +++++++++++------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git 
a/src/TiledArray/special/kronecker_delta.h b/src/TiledArray/special/kronecker_delta.h index 24d49e5cd0..ff1907a49e 100644 --- a/src/TiledArray/special/kronecker_delta.h +++ b/src/TiledArray/special/kronecker_delta.h @@ -37,6 +37,8 @@ #include #include +namespace TiledArray { + /// *generalized* (asymmetric) Kronecker delta /// *generalized* (asymmetric) Kronecker delta is a product of `N` ordinary @@ -47,8 +49,8 @@ class KroneckerDeltaTile { public: // Concept typedefs - typedef TiledArray::Range range_type; // range type - typedef int value_type; // Element type + typedef Range range_type; // range type + typedef int value_type; // Element type typedef value_type numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type @@ -140,49 +142,43 @@ typename KroneckerDeltaTile::numeric_type abs_max( // Permutation operation // returns a tile for which result[perm ^ i] = tile[i] -template >> +template >> KroneckerDeltaTile permute(const KroneckerDeltaTile& tile, const Perm& perm) { abort(); } // dense_result[i] = dense_arg1[i] * sparse_arg2[i] template -TiledArray::Tensor mult(const KroneckerDeltaTile& arg1, - const TiledArray::Tensor& arg2) { +Tensor mult(const KroneckerDeltaTile& arg1, const Tensor& arg2) { abort(); } // dense_result[perm ^ i] = dense_arg1[i] * sparse_arg2[i] -template < - typename T, typename Perm, - typename = std::enable_if_t>> -TiledArray::Tensor mult(const KroneckerDeltaTile& arg1, - const TiledArray::Tensor& arg2, - const Perm& perm) { +template >> +Tensor mult(const KroneckerDeltaTile& arg1, const Tensor& arg2, + const Perm& perm) { abort(); } // dense_result[i] *= sparse_arg1[i] template -TiledArray::Tensor& mult_to(TiledArray::Tensor& result, - const KroneckerDeltaTile& arg1) { +Tensor& mult_to(Tensor& result, const KroneckerDeltaTile& arg1) { abort(); return result; } // dense_result[i] = binary(dense_arg1[i], sparse_arg2[i], op) template -TiledArray::Tensor binary(const KroneckerDeltaTile& arg1, 
- const TiledArray::Tensor& arg2, Op&& op) { +Tensor binary(const KroneckerDeltaTile& arg1, const Tensor& arg2, + Op&& op) { abort(); } // dense_result[perm ^ i] = binary(dense_arg1[i], sparse_arg2[i], op) -template < - typename T, typename Op, typename Perm, - typename = std::enable_if_t>> -TiledArray::Tensor binary(const KroneckerDeltaTile& arg1, - const TiledArray::Tensor& arg2, Op&& op, - const Perm& perm) { +template >> +Tensor binary(const KroneckerDeltaTile& arg1, const Tensor& arg2, Op&& op, + const Perm& perm) { abort(); } @@ -191,10 +187,9 @@ TiledArray::Tensor binary(const KroneckerDeltaTile& arg1, // GEMM operation with fused indices as defined by gemm_config: // dense_result[i,j] = dense_arg1[i,k] * sparse_arg2[k,j] template -TiledArray::Tensor gemm( - const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, - const typename TiledArray::Tensor::numeric_type factor, - const TiledArray::math::GemmHelper& gemm_config) { +Tensor gemm(const KroneckerDeltaTile& arg1, const Tensor& arg2, + const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { // preconditions: // 1. 
implemented only outer product assert(gemm_config.result_rank() == @@ -203,8 +198,8 @@ TiledArray::Tensor gemm( auto arg1_range = arg1.range(); auto arg2_range = arg2.range(); auto result_range = - gemm_config.make_result_range(arg1_range, arg2_range); - TiledArray::Tensor result(result_range, 0); + gemm_config.make_result_range(arg1_range, arg2_range); + Tensor result(result_range, 0); auto result_data = result.data(); auto arg1_extents = arg1_range.extent_data(); @@ -248,11 +243,12 @@ TiledArray::Tensor gemm( // GEMM operation with fused indices as defined by gemm_config: // dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] template -void gemm(TiledArray::Tensor& result, const KroneckerDeltaTile& arg1, - const TiledArray::Tensor& arg2, - const typename TiledArray::Tensor::numeric_type factor, - const TiledArray::math::GemmHelper& gemm_config) { +void gemm(Tensor& result, const KroneckerDeltaTile& arg1, + const Tensor& arg2, const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { abort(); } +} // namespace TiledArray + #endif // TILEDARRAY_TEST_SPARSE_TILE_H__INCLUDED From 19474027acb1b5b152871a0020099856230b6233 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 25 Aug 2024 11:52:36 -0400 Subject: [PATCH 440/592] contents of namespace TiledArray::meta moved to TiledArray::detail, TiledArray/meta.h deprecated in favor of TiledArray/util/invoke.h --- src/CMakeLists.txt | 1 + src/TiledArray/meta.h | 67 ++--------------------- src/TiledArray/tile_interface/cast.h | 12 ++--- src/TiledArray/tile_op/binary_wrapper.h | 14 ++--- src/TiledArray/tile_op/unary_wrapper.h | 15 +++--- src/TiledArray/util/invoke.h | 70 +++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 84 deletions(-) create mode 100644 src/TiledArray/util/invoke.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0167aab636..ffceb74017 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -193,6 +193,7 @@ TiledArray/util/backtrace.h 
TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h +TiledArray/util/invoke.h TiledArray/util/logger.h TiledArray/util/ptr_registry.h TiledArray/util/random.h diff --git a/src/TiledArray/meta.h b/src/TiledArray/meta.h index 9dc4ac9f55..18a1bf69f9 100644 --- a/src/TiledArray/meta.h +++ b/src/TiledArray/meta.h @@ -1,70 +1,9 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2017 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * - * meta.h - * April 11, 2017 - * - */ - #ifndef SRC_TILEDARRAY_META_H_ #define SRC_TILEDARRAY_META_H_ -#include -#include -#include -#include - -namespace TiledArray { -namespace meta { - -/// ||'s bools -template -struct or_reduce { - static constexpr bool value = head || or_reduce::value; -}; - -template -struct or_reduce { - static constexpr bool value = b; -}; - -// is any argument a Future? -// - yes: async launch -// - no: direct launch -template -auto invoke(Function&& fn, Args&&... args) -> typename std::enable_if< - !or_reduce>::value...>::value, - decltype(fn(args...))>::type { - return fn(std::forward(args)...); -} - -template < - typename Function, typename... Args, - typename = typename std::enable_if>::value...>::value>::type> -auto invoke(Function&& fn, Args&&... 
args) { - return TiledArray::get_default_world().taskq.add(std::forward(fn), - std::forward(args)...); -} +#pragma message( \ + "Header `TiledArray/meta.h` is deprecated, use `TiledArray/util/invoke.h` instead.") -} // namespace meta -} // namespace TiledArray +#include #endif // SRC_TILEDARRAY_META_H_ diff --git a/src/TiledArray/tile_interface/cast.h b/src/TiledArray/tile_interface/cast.h index c22b97b051..52c7a550be 100644 --- a/src/TiledArray/tile_interface/cast.h +++ b/src/TiledArray/tile_interface/cast.h @@ -26,8 +26,8 @@ #ifndef TILEDARRAY_TILE_INTERFACE_CAST_H__INCLUDED #define TILEDARRAY_TILE_INTERFACE_CAST_H__INCLUDED -#include "../meta.h" -#include "../type_traits.h" +#include "TiledArray/type_traits.h" +#include "TiledArray/util/invoke.h" namespace TiledArray { @@ -80,7 +80,7 @@ class Cast(std::forward(arg)); }; - return TiledArray::meta::invoke(exec, arg); + return TiledArray::detail::invoke(exec, arg); } template static auto invoker( @@ -93,7 +93,7 @@ class Cast>(std::forward(arg)); }; - return TiledArray::meta::invoke(exec, std::forward(arg)); + return TiledArray::detail::invoke(exec, std::forward(arg)); } public: @@ -151,7 +151,7 @@ class Cast>::type> auto invoke_cast(Arg&& arg) { Cast> cast; - return TiledArray::meta::invoke(cast, std::forward(arg)); + return TiledArray::detail::invoke(cast, std::forward(arg)); } } // namespace TiledArray diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index 33d021f2b0..07dd9d19fd 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -224,7 +224,7 @@ class BinaryWrapper { madness::future_to_ref_t r) { return BinaryWrapper_::operator()(l, r); }; - return meta::invoke(continuation, eval_left, eval_right); + return detail::invoke(continuation, eval_left, eval_right); } /// Evaluate lazy and non-lazy tiles @@ -249,7 +249,7 @@ class BinaryWrapper { R&& r) { return BinaryWrapper_::operator()(l, std::forward(r)); }; - return 
meta::invoke(continuation, eval_left, std::forward(right)); + return detail::invoke(continuation, eval_left, std::forward(right)); } /// Evaluate non-lazy and lazy tiles @@ -273,7 +273,7 @@ class BinaryWrapper { [this](L&& l, madness::future_to_ref_t r) { return BinaryWrapper_::operator()(std::forward(l), r); }; - return meta::invoke(continuation, std::forward(left), eval_right); + return detail::invoke(continuation, std::forward(left), eval_right); } /// Evaluate two lazy-array tiles @@ -294,7 +294,7 @@ class BinaryWrapper { auto eval_left = invoke_cast(std::forward(left)); auto eval_right = invoke_cast(std::forward(right)); - if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); + if (perm_) return detail::invoke(op_, eval_left, eval_right, perm_); auto op_left = [this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); @@ -304,11 +304,11 @@ class BinaryWrapper { }; // Override consumable if (is_consumable_tile>::value && left.is_consumable()) - return meta::invoke(op_left, eval_left, eval_right); + return detail::invoke(op_left, eval_left, eval_right); if (is_consumable_tile>::value && right.is_consumable()) - return meta::invoke(op_right, eval_left, eval_right); + return detail::invoke(op_right, eval_left, eval_right); - return meta::invoke(op_, eval_left, eval_right); + return detail::invoke(op_, eval_left, eval_right); } template < diff --git a/src/TiledArray/tile_op/unary_wrapper.h b/src/TiledArray/tile_op/unary_wrapper.h index 3712aca4f1..e1b89e02a7 100644 --- a/src/TiledArray/tile_op/unary_wrapper.h +++ b/src/TiledArray/tile_op/unary_wrapper.h @@ -152,8 +152,9 @@ class UnaryWrapper { /// `arg`. template >* = nullptr> auto operator()(A&& arg) const { - return (perm_ ? meta::invoke(op_, invoke_cast(std::forward(arg)), perm_) - : meta::invoke(op_, invoke_cast(std::forward(arg)))); + return (perm_ + ? 
detail::invoke(op_, invoke_cast(std::forward(arg)), perm_) + : detail::invoke(op_, invoke_cast(std::forward(arg)))); } /// Evaluate a lazy array tile @@ -176,10 +177,10 @@ class UnaryWrapper { // return op_.consume(std::forward(arg)); // }; auto op_consume = [this](eval_t& arg) { return op_.consume(arg); }; - return (perm_ ? meta::invoke(op_, std::move(cast_arg), perm_) + return (perm_ ? detail::invoke(op_, std::move(cast_arg), perm_) : (arg.is_consumable() - ? meta::invoke(op_consume, cast_arg) - : meta::invoke(op_, std::move(cast_arg)))); + ? detail::invoke(op_consume, cast_arg) + : detail::invoke(op_, std::move(cast_arg)))); } /// Consume a lazy tile @@ -196,8 +197,8 @@ class UnaryWrapper { // return op_.consume(std::forward(arg)); // }; auto op_consume = [this](eval_t& arg) { return op_.consume(arg); }; - return (perm_ ? meta::invoke(op_, std::move(cast_arg), perm_) - : meta::invoke(op_consume, cast_arg)); + return (perm_ ? detail::invoke(op_, std::move(cast_arg), perm_) + : detail::invoke(op_consume, cast_arg)); } template >* = nullptr> diff --git a/src/TiledArray/util/invoke.h b/src/TiledArray/util/invoke.h new file mode 100644 index 0000000000..ff8bbed191 --- /dev/null +++ b/src/TiledArray/util/invoke.h @@ -0,0 +1,70 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2017 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * meta.h + * April 11, 2017 + * + */ + +#ifndef TILEDARRAY_UTIL_INVOKE_H +#define TILEDARRAY_UTIL_INVOKE_H + +#include +#include +#include +#include + +namespace TiledArray { +namespace detail { + +/// ||'s bools +template +struct or_reduce { + static constexpr bool value = head || or_reduce::value; +}; + +template +struct or_reduce { + static constexpr bool value = b; +}; + +// is any argument a Future? +// - yes: async launch +// - no: direct launch +template +auto invoke(Function&& fn, Args&&... args) -> typename std::enable_if< + !or_reduce>::value...>::value, + decltype(fn(args...))>::type { + return fn(std::forward(args)...); +} + +template < + typename Function, typename... Args, + typename = typename std::enable_if>::value...>::value>::type> +auto invoke(Function&& fn, Args&&... args) { + return TiledArray::get_default_world().taskq.add(std::forward(fn), + std::forward(args)...); +} + +} // namespace detail +} // namespace TiledArray + +#endif // TILEDARRAY_UTIL_INVOKE_H From 7ef3db06ceec62ed6869d449f198585d3bc19702 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 25 Aug 2024 11:54:53 -0400 Subject: [PATCH 441/592] introduced comparability traits --- src/TiledArray/type_traits.h | 209 ++++++++++++++++++++++++++++++----- 1 file changed, 181 insertions(+), 28 deletions(-) diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index 1bddff446d..5c3d066e9c 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -108,9 +108,9 @@ class LazyArrayTile; struct Derived : T, Fallback {}; \ \ template \ - static No& test(decltype(U::Member)*); \ + static No &test(decltype(U::Member) *); \ template \ - static Yes& test(U*); \ + static Yes &test(U *); \ \ public: \ static constexpr bool value = \ @@ -141,9 +141,9 @@ class LazyArrayTile; struct Derived : T, Fallback {}; \ \ template \ - static No& test(typename U::Type*); \ + static No &test(typename 
U::Type *); \ template \ - static Yes& test(U*); \ + static Yes &test(U *); \ \ public: \ static constexpr bool value = \ @@ -177,11 +177,11 @@ class LazyArrayTile; template \ struct CheckConst; \ template \ - static Yes test_const(CheckConst*); \ + static Yes test_const(CheckConst *); \ template \ static No test_const(...); \ template \ - static Yes test_nonconst(Check*); \ + static Yes test_nonconst(Check *); \ template \ static No test_nonconst(...); \ \ @@ -215,7 +215,7 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) \ + static auto func(void *) \ -> decltype(std::add_pointer_t().Member( \ std::declval()...))>{}, \ Yes{}); \ @@ -248,7 +248,7 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) \ + static auto func(void *) \ -> decltype(std::add_pointer_t< \ decltype(Function(std::declval()...))>{}, \ Yes{}); \ @@ -278,7 +278,7 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) \ + static auto func(void *) \ -> decltype(std::add_pointer_t()...))>{}, \ Yes{}); \ @@ -451,7 +451,7 @@ template struct has_conversion_operator< From, To, typename std::enable_if< - is_type().operator To&())>::value>::type> + is_type().operator To &())>::value>::type> : std::true_type {}; #else template @@ -473,7 +473,7 @@ struct has_conversion_operator { /* operator exists */ template static decltype(test(&A::operator To)) test(decltype(&A::operator To), - void*) { + void *) { /* Operator exists. What about sig? 
*/ typedef decltype(test(&A::operator To)) return_type; return return_type(); @@ -846,7 +846,7 @@ struct is_strictly_ordered_helper { using Yes = char; using No = int; template - static auto test(void*) + static auto test(void *) -> decltype(std::add_pointer_t() < std::declval())>{}, Yes{}); @@ -857,6 +857,160 @@ struct is_strictly_ordered_helper { static constexpr const bool value = sizeof(test(0)) == sizeof(Yes); }; +///////// is_less_than_comparable ///////// + +template > +struct is_less_than_comparable : public std::false_type {}; + +template +struct is_less_than_comparable() < + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_less_than_comparable_v = + is_less_than_comparable::value; + +///////// are_less_than_comparable ///////// + +template > +struct are_less_than_comparable : public std::false_type {}; + +template +struct are_less_than_comparable< + T, U, + std::void_t() < + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_less_than_comparable_v = + are_less_than_comparable::value; + +///////// is_less_than_or_equal_comparable ///////// + +template > +struct is_less_than_or_equal_comparable : public std::false_type {}; + +template +struct is_less_than_or_equal_comparable< + T, std::void_t() <= + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_less_than_or_equal_comparable_v = + is_less_than_or_equal_comparable::value; + +///////// are_less_than_comparable ///////// + +template > +struct are_less_than_or_equal_comparable : public std::false_type {}; + +template +struct are_less_than_or_equal_comparable< + T, U, + std::void_t() <= + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_less_than_or_equal_comparable_v = + are_less_than_or_equal_comparable::value; + +///////// is_greater_than_comparable ///////// + +template > +struct is_greater_than_comparable : public std::false_type {}; + +template +struct 
is_greater_than_comparable< + T, std::void_t() > + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_greater_than_comparable_v = + is_greater_than_comparable::value; + +///////// are_greater_than_comparable ///////// + +template > +struct are_greater_than_comparable : public std::false_type {}; + +template +struct are_greater_than_comparable< + T, U, + std::void_t() > + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_greater_than_comparable_v = + are_greater_than_comparable::value; + +///////// is_greater_than_or_equal_comparable ///////// + +template > +struct is_greater_than_or_equal_comparable : public std::false_type {}; + +template +struct is_greater_than_or_equal_comparable< + T, std::void_t() >= + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_greater_than_or_equal_comparable_v = + is_greater_than_or_equal_comparable::value; + +///////// are_greater_than_comparable ///////// + +template > +struct are_greater_than_or_equal_comparable : public std::false_type {}; + +template +struct are_greater_than_or_equal_comparable< + T, U, + std::void_t() >= + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_greater_than_or_equal_comparable_v = + are_greater_than_or_equal_comparable::value; + +///////// is_equality_comparable ///////// + +template > +struct is_equality_comparable : public std::false_type {}; + +template +struct is_equality_comparable() == + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_equality_comparable_v = + is_equality_comparable::value; + +///////// are_equality_comparable ///////// + +template > +struct are_equality_comparable : public std::false_type {}; + +template +struct are_equality_comparable() == + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool are_equality_comparable_v = + are_equality_comparable::value; + /// \c 
is_strictly_ordered::value is true if strict order is defined for T, /// i.e. "T < T" is defined template @@ -916,7 +1070,7 @@ struct is_std_gettable : std::false_type {}; template struct is_std_gettable< - I, T, std::void_t(std::declval()))>> + I, T, std::void_t(std::declval()))>> : std::true_type {}; template @@ -927,7 +1081,7 @@ struct is_boost_gettable : std::false_type {}; template struct is_boost_gettable< - I, T, std::void_t(std::declval()))>> + I, T, std::void_t(std::declval()))>> : std::true_type {}; template @@ -938,7 +1092,7 @@ constexpr const bool is_gettable_v = is_std_gettable_v || is_boost_gettable_v; template -auto get(T&& t) { +auto get(T &&t) { using boost::get; using std::get; return get(std::forward(t)); @@ -1099,22 +1253,22 @@ struct is_iterator -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; @@ -1153,8 +1307,8 @@ template struct is_range : std::false_type {}; template -struct is_range()), - std::end(std::declval()))>> +struct is_range()), + std::end(std::declval()))>> : std::true_type {}; /// \c is_range_v is an alias for \c is_range::value @@ -1169,7 +1323,7 @@ template struct is_sized_range : std::false_type {}; template -struct is_sized_range()))>> +struct is_sized_range()))>> : is_range {}; /// `is_sized_range_v` is an alias for `is_sized_range::value` @@ -1184,9 +1338,8 @@ template struct is_contiguous_range : std::false_type {}; template -struct is_contiguous_range()))>> - : is_range {}; +struct 
is_contiguous_range< + T, std::void_t()))>> : is_range {}; /// `is_contiguous_range_v` is an alias for `is_contiguous_range::value` template @@ -1197,14 +1350,14 @@ static constexpr bool is_contiguous_range_v = is_contiguous_range::value; /// std::begin(T&) /// @warning will be replaced by C++20 ranges::iterator_t template -using iterator_t = decltype(std::begin(std::declval())); +using iterator_t = decltype(std::begin(std::declval())); /// @tparam T a range type /// @c value_t is the value type, i.e. the type to which @c std::begin(T&) /// dereferences to /// @warning will be replaced by C++20 ranges::value_t template -using value_t = remove_cvr_t()))>; +using value_t = remove_cvr_t()))>; /// @tparam T a type /// `is_integral_range::value` is true if @p T is a range type that @@ -1361,7 +1514,7 @@ static constexpr bool is_gpair_range_v = is_gpair_range::value; template >>> -decltype(auto) at(GeneralizedPair&& v, std::size_t idx) { +decltype(auto) at(GeneralizedPair &&v, std::size_t idx) { assert(idx == 0 || idx == 1); if constexpr (is_gettable_pair_v>) { #if __cplusplus <= 201703L From 3543fb578110506fed5b54bc43768bf0eb38cd7e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 06:39:22 -0400 Subject: [PATCH 442/592] introduced Range1::volume to mirror Range::volume --- src/TiledArray/range1.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index ef6d422dcc..095f5f06c6 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -74,6 +74,9 @@ struct Range1 { /// @return the extent of this range, i.e. second - first auto extent() const noexcept { return second - first; } + /// @return the volume of this range, i.e. 
second - first + auto volume() const noexcept { return second - first; } + /// swaps `*this` with @p other /// @p other a Range1 object void swap(Range1& other) noexcept { @@ -150,14 +153,14 @@ struct Range1 { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& first& second; + ar & first & second; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& first& second; + ar & first & second; } }; From 7d83b5342ee2426e56324a1027a8bfbb40d31b17 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 06:40:12 -0400 Subject: [PATCH 443/592] remove gratuitous inclusion of tiledarray.h --- src/TiledArray/conversions/vector_of_arrays.h | 2 -- src/TiledArray/math/solvers/cp/cp.h | 1 - src/TiledArray/math/solvers/cp/cp_reconstruct.h | 1 - tests/sparse_tile.h | 14 +++++++------- tests/tot_array_fixture.h | 7 ++++--- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/src/TiledArray/conversions/vector_of_arrays.h b/src/TiledArray/conversions/vector_of_arrays.h index 9de3bf8d09..8b3f5ea8a4 100644 --- a/src/TiledArray/conversions/vector_of_arrays.h +++ b/src/TiledArray/conversions/vector_of_arrays.h @@ -5,8 +5,6 @@ #ifndef TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ #define TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ -#include - namespace TiledArray { namespace detail { diff --git a/src/TiledArray/math/solvers/cp/cp.h b/src/TiledArray/math/solvers/cp/cp.h index 8c82485211..9065776211 100644 --- a/src/TiledArray/math/solvers/cp/cp.h +++ b/src/TiledArray/math/solvers/cp/cp.h @@ -28,7 +28,6 @@ #include #include -#include namespace TiledArray::math::cp { diff --git a/src/TiledArray/math/solvers/cp/cp_reconstruct.h b/src/TiledArray/math/solvers/cp/cp_reconstruct.h index b09165d335..283f96bb76 100644 --- a/src/TiledArray/math/solvers/cp/cp_reconstruct.h +++ b/src/TiledArray/math/solvers/cp/cp_reconstruct.h @@ -29,7 +29,6 @@ #include #include #include -#include namespace TiledArray::math::cp { diff --git 
a/tests/sparse_tile.h b/tests/sparse_tile.h index 888c39811f..70897d7ca1 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -24,8 +24,6 @@ #include #include -#include - #include // Array class @@ -37,6 +35,8 @@ #include #include +#include + // sparse 2-dimensional matrix type, with tag type thrown in to make expression // engine work harder template > @@ -192,7 +192,7 @@ class EigenSparseTile { for (typename matrix_type::InnerIterator it(mat, k); it; ++it) { datavec.push_back(Eigen::Triplet(it.row(), it.col(), it.value())); } - ar& datavec& this->range(); + ar & datavec& this->range(); } else { ar & false; } @@ -204,11 +204,11 @@ class EigenSparseTile { madness::is_input_archive_v>::type* = nullptr> void serialize(Archive& ar) { bool have_impl = false; - ar& have_impl; + ar & have_impl; if (have_impl) { std::vector> datavec; range_type range; - ar& datavec& range; + ar & datavec & range; auto extents = range.extent(); matrix_type mat(extents[0], extents[1]); mat.setFromTriplets(datavec.begin(), datavec.end()); @@ -700,7 +700,7 @@ struct ArchiveLoadImpl> { static inline void load(const Archive& ar, Eigen::Triplet& obj) { int row, col; T value; - ar& row& col& value; + ar & row & col & value; obj = Eigen::Triplet(row, col, value); } }; @@ -708,7 +708,7 @@ struct ArchiveLoadImpl> { template struct ArchiveStoreImpl> { static inline void store(const Archive& ar, const Eigen::Triplet& obj) { - ar& obj.row() & obj.col() & obj.value(); + ar & obj.row() & obj.col() & obj.value(); } }; } // namespace archive diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 5d0a0ce4dd..2c27824961 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -19,8 +19,10 @@ #ifndef TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED #define TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED +#include +#include #include -#include +#include #include "unit_test_config.h" #ifdef TILEDARRAY_HAS_BTAS #include @@ -621,8 +623,7 @@ Result general_product(TensorA 
const& A, TensorB const& B, } else { temp += temp_; } - } - else { + } else { TA_ASSERT(!(ix_A.empty() || ix_B.empty())); temp += A(ix_A) * B(ix_B); } From f8c337689ce3fad28831ed0aaae537a4560d55e1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 06:42:13 -0400 Subject: [PATCH 444/592] diagonal_array works with non-0-based ranges --- src/TiledArray/special/diagonal_array.h | 212 ++++++++++-------------- 1 file changed, 83 insertions(+), 129 deletions(-) diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h index dd62db1498..d60b23db94 100644 --- a/src/TiledArray/special/diagonal_array.h +++ b/src/TiledArray/special/diagonal_array.h @@ -31,6 +31,8 @@ #include #include +#include + #include namespace TiledArray { @@ -43,7 +45,7 @@ namespace detail { /// empty Range /// \param[in] rng an input (rank-d) Range /// \return the range of diagonal elements, as a rank-1 Range -inline Range diagonal_range(Range const &rng) { +inline Range1 diagonal_range(Range const &rng) { const auto rank = rng.rank(); TA_ASSERT(rng.rank() > 0); auto lo = rng.lobound_data(); @@ -56,92 +58,64 @@ inline Range diagonal_range(Range const &rng) { // If the max small elem is less than the min large elem then a diagonal // elem is in this tile; if (max_low < min_up) { - return Range({max_low}, {min_up}); + return Range1{max_low, min_up}; } else { - return Range(); + return Range1{}; } } -/// \brief computes shape data (i.e. 
Frobenius norms of the tiles) for a -/// constant diagonal tensor -/// \tparam T a numeric type -/// \param trange a TiledRange of the result -/// \param val value of the diagonal elements -/// \return a Tensor containing the Frobenius norms of -/// the tiles of a DistArray with \p val on the diagonal and -/// zeroes elsewhere -template -Tensor diagonal_shape(TiledRange const &trange, T val) { - Tensor shape(trange.tiles_range(), 0.0); - - auto ext = trange.elements_range().extent(); - auto diag_extent = *std::min_element(std::begin(ext), std::end(ext)); - - auto ndim = trange.rank(); - auto diag_elem = 0ul; - // the diagonal elements will never be larger than the length of the - // shortest dimension - while (diag_elem < diag_extent) { - // Get the tile index corresponding to the current diagonal_elem - auto tile_idx = - trange.element_to_tile(container::svector(ndim, diag_elem)); - auto tile_range = trange.make_tile_range(tile_idx); - - // Compute the range of diagonal elements in the tile - auto d_range = diagonal_range(tile_range); - - // Since each diag elem has the same value the norm of the tile is - // \sqrt{\sum_{diag} val^2} = \sqrt{ndiags * val^2} - float t_norm = std::sqrt(val * val * d_range.volume()); - shape(tile_idx) = t_norm; - - // Update diag_elem to the next elem not in this tile - diag_elem = d_range.upbound_data()[0]; - } - - return shape; -} - /// \brief computes shape data (i.e. 
Frobenius norms of the tiles) for a /// non-constant diagonal tensor /// \tparam RandomAccessIterator an iterator over /// the range of diagonal elements +/// \tparam Sentinel sentinel type for the range of diagonal elements /// \param[in] trange a TiledRange of the result /// \param[in] diagonals_begin the begin iterator of the range of the diagonals /// \param[in] diagonals_end the end iterator of the range of the diagonals; if /// not given, default initialized and thus will not be checked /// \return a Tensor containing the Frobenius norms of the tiles of /// a DistArray with \p val on the diagonal and zeroes elsewhere -template +template std::enable_if_t::value, Tensor> diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, - RandomAccessIterator diagonals_end = {}) { - const bool have_end = diagonals_end == RandomAccessIterator{}; + Sentinel diagonals_end = {}) { + bool have_end = false; + if constexpr (detail::is_equality_comparable_v) { + have_end = diagonals_end != Sentinel{}; + } Tensor shape(trange.tiles_range(), 0.0); const auto rank = trange.rank(); - auto ext = trange.elements_range().extent_data(); - auto diag_extent = *std::min_element(ext, ext + rank); + TA_ASSERT(rank > 0); + const auto *lobound = trange.elements_range().lobound_data(); + const auto diag_lobound = *std::max_element(lobound, lobound + rank); + const auto *upbound = trange.elements_range().upbound_data(); + const auto diag_upbound = *std::min_element(upbound, upbound + rank); - auto ndim = trange.rank(); - auto diag_elem = 0ul; + auto diag_elem = diag_lobound; // the diagonal elements will never be larger than the length of the // shortest dimension - while (diag_elem < diag_extent) { + while (diag_elem < diag_upbound) { // Get the tile index corresponding to the current diagonal_elem - auto tile_idx = trange.element_to_tile(std::vector(ndim, diag_elem)); + auto tile_idx = trange.element_to_tile(Index(rank, diag_elem)); auto tile_range = 
trange.make_tile_range(tile_idx); // Compute the range of diagonal elements in the tile auto d_range = diagonal_range(tile_range); - TA_ASSERT(d_range != Range{}); - TA_ASSERT(diag_elem == d_range.lobound_data()[0]); - const auto beg = diag_elem; - const auto end = d_range.upbound_data()[0]; + TA_ASSERT(d_range != Range1{}); + TA_ASSERT(diag_elem == d_range.lobound()); + const auto beg = d_range.lobound(); + const auto end = d_range.upbound(); if (have_end) { - TA_ASSERT(diagonals_begin + beg < diagonals_end); - TA_ASSERT(diagonals_begin + end <= diagonals_end); + if constexpr (detail::are_less_than_comparable_v) { + TA_ASSERT(diagonals_begin + beg < diagonals_end); + } + if constexpr (detail::are_less_than_or_equal_comparable_v< + RandomAccessIterator, Sentinel>) { + TA_ASSERT(diagonals_begin + end <= diagonals_end); + } } auto t_norm = std::accumulate(diagonals_begin + beg, diagonals_begin + end, @@ -149,7 +123,7 @@ diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, const auto abs_val = std::abs(val); return sum + abs_val * abs_val; }); - shape(tile_idx) = static_cast(t_norm); + shape(tile_idx) = std::sqrt(static_cast(t_norm)); // Update diag_elem to the next elem not in this tile diag_elem = end; @@ -158,36 +132,18 @@ diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, return shape; } -/// \brief Writes tiles of a constant diagonal array - -/// \tparam Array a DistArray type +/// \brief computes shape data (i.e. 
Frobenius norms of the tiles) for a +/// constant diagonal tensor /// \tparam T a numeric type -/// \param[in] A an Array object -/// \param[in] val the value of the diagonal elements of A -template -void write_diag_tiles_to_array_val(Array &A, T val) { - using Tile = typename Array::value_type; - - // Task to create each tile - A.init_tiles([val](const Range &rng) { - // Compute range of diagonal elements in the tile - auto diags = detail::diagonal_range(rng); - const auto rank = rng.rank(); - - Tile tile(rng, 0.0); - - if (diags.volume() > 0) { // If the tile has diagonal elems - - // Loop over the elements and write val into them - auto diag_lo = diags.lobound_data()[0]; - auto diag_hi = diags.upbound_data()[0]; - for (auto elem = diag_lo; elem < diag_hi; ++elem) { - tile(std::vector(rank, elem)) = val; - } - } - - return tile; - }); +/// \param trange a TiledRange of the result +/// \param val value of the diagonal elements +/// \return a Tensor containing the Frobenius norms of +/// the tiles of a DistArray with \p val on the diagonal and +/// zeroes elsewhere +template +Tensor diagonal_shape(TiledRange const &trange, T val) { + auto val_range = ranges::views::repeat(val); + return diagonal_shape(trange, val_range.begin(), val_range.end()); } /// \brief Writes tiles of a nonconstant diagonal array @@ -212,10 +168,11 @@ write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { if (diags.volume() > 0) { // If the tile has diagonal elems // Loop over the elements and write val into them - auto diag_lo = diags.lobound_data()[0]; - auto diag_hi = diags.upbound_data()[0]; - for (auto elem = diag_lo; elem < diag_hi; ++elem) { - tile(std::vector(rank, elem)) = *(diagonals_begin + elem); + auto diag_lo = diags.lobound(); + auto diag_hi = diags.upbound(); + auto elem_it = diagonals_begin + diag_lo; + for (auto elem = diag_lo; elem < diag_hi; ++elem, ++elem_it) { + tile(Index(rank, elem)) = *elem_it; } } @@ -225,36 +182,6 @@ 
write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { } // namespace detail -/// \brief Creates a constant diagonal DistArray - -/// Creates an array whose only nonzero values are the (hyper)diagonal elements -/// (i.e. (n,n,n, ..., n) ), and they are all have the same value -/// \tparam Policy the policy type of the resulting DistArray -/// \tparam T a numeric type -/// \param world The world for the array -/// \param[in] trange The trange for the array -/// \param[in] val The value of the diagonal elements -/// \return a constant diagonal DistArray -template -Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { - using Policy = typename Array::policy_type; - // Init the array - if constexpr (is_dense_v) { - Array A(world, trange); - detail::write_diag_tiles_to_array_val(A, val); - return A; - } else { - // Compute shape and init the Array - auto shape_norm = detail::diagonal_shape(trange, val); - using ShapeType = typename Policy::shape_type; - ShapeType shape(shape_norm, trange); - Array A(world, trange, shape); - detail::write_diag_tiles_to_array_val(A, val); - return A; - } - abort(); // unreachable -} - /// \brief Creates a non-constant diagonal DistArray /// Creates an array whose only nonzero values are the (hyper)diagonal elements @@ -262,30 +189,39 @@ Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { /// input range /// \tparam Array a DistArray type /// \tparam RandomAccessIterator an iterator over the range of diagonal elements +/// \tparam Sentinel sentinel type for the range of diagonal elements /// \param world The world for the array /// \param[in] trange The trange for the array /// \param[in] diagonals_begin the begin iterator of the range of the diagonals /// \param[in] diagonals_end the end iterator of the range of the diagonals; /// if not given, default initialized and thus will not be checked /// \return a diagonal DistArray -template +template std::enable_if_t::value, 
Array> diagonal_array(World &world, TiledRange const &trange, RandomAccessIterator diagonals_begin, - RandomAccessIterator diagonals_end = {}) { + Sentinel diagonals_end = {}) { using Policy = typename Array::policy_type; - if (diagonals_end != RandomAccessIterator{}) { - const auto rank = trange.rank(); - auto ext = trange.elements_range().extent_data(); - [[maybe_unused]] auto diag_extent = *std::min_element(ext, ext + rank); - TA_ASSERT(diagonals_begin + diag_extent <= diagonals_end); + if constexpr (detail::is_equality_comparable_v) { + if (diagonals_end != Sentinel{}) { + auto diagonals_range = detail::diagonal_range(trange.elements_range()); + if constexpr (detail::are_less_than_comparable_v) { + TA_ASSERT(diagonals_begin + diagonals_range.lobound() < diagonals_end); + } + if constexpr (detail::are_less_than_or_equal_comparable_v< + RandomAccessIterator, Sentinel>) { + TA_ASSERT(diagonals_begin + diagonals_range.upbound() <= diagonals_end); + } + } } // Init the array if constexpr (is_dense_v) { Array A(world, trange); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); + A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } else { // Compute shape and init the Array @@ -295,11 +231,29 @@ diagonal_array(World &world, TiledRange const &trange, ShapeType shape(shape_norm, trange); Array A(world, trange, shape); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); + A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } abort(); // unreachable } +/// \brief Creates a constant diagonal DistArray + +/// Creates an array whose only nonzero values are the (hyper)diagonal elements +/// (i.e. 
(n,n,n, ..., n) ), and they are all have the same value +/// \tparam Policy the policy type of the resulting DistArray +/// \tparam T a numeric type +/// \param world The world for the array +/// \param[in] trange The trange for the array +/// \param[in] val The value of the diagonal elements +/// \return a constant diagonal DistArray +template +Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { + auto val_range = ranges::views::repeat(val); + return diagonal_array(world, trange, val_range.begin(), + val_range.end()); +} + } // namespace TiledArray #endif // TILEDARRAY_SPECIALARRAYS_DIAGONAL_ARRAY_H__INCLUDED From 2f6a907d0fba638bd3ff2e32016883c67a21d8e6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 06:43:45 -0400 Subject: [PATCH 445/592] revamp Kronecker delta for use in retiling --- src/TiledArray/special/kronecker_delta.h | 168 +++++++++++++++-------- tests/expressions_mixed.cpp | 64 ++++----- 2 files changed, 141 insertions(+), 91 deletions(-) diff --git a/src/TiledArray/special/kronecker_delta.h b/src/TiledArray/special/kronecker_delta.h index ff1907a49e..35a8da6e57 100644 --- a/src/TiledArray/special/kronecker_delta.h +++ b/src/TiledArray/special/kronecker_delta.h @@ -39,13 +39,15 @@ namespace TiledArray { +// clang-format off /// *generalized* (asymmetric) Kronecker delta /// *generalized* (asymmetric) Kronecker delta is a product of `N` ordinary /// Kronecker deltas /// Definition: `KroneckerDeltaTile(b,k) = (b==k) ? 1 : 0` and -/// `KroneckerDeltaTile(b0,k0,b1,k1,b2,k2...bN,kN) = KroneckerDeltaTile(b0,k0) -/// KroneckerDeltaTile(b1,k1) ... KroneckerDeltaTile(bN,kN)` +/// `KroneckerDeltaTile(b1,b2,...bN,k1,k2,...kN) = KroneckerDeltaTile(b0,k0) KroneckerDeltaTile(b1,k1) ... KroneckerDeltaTile(bN,kN)`. +/// The implicit layout is hardwired to `b0,b1,b2,...,bN,k0,k1,k2,...,kN` since the intended use is for taking slices. 
+// clang-format on class KroneckerDeltaTile { public: // Concept typedefs @@ -107,10 +109,11 @@ class KroneckerDeltaTile { const auto N = range.rank() / 2; auto lobound = range.lobound_data(); auto upbound = range.upbound_data(); - for (auto i = 0; i != 2 * N && not empty; i += 2) - empty = (upbound[i] > lobound[i + 1] && upbound[i + 1] > lobound[i]) - ? true - : false; // assumes extents > 0 + for (auto i = 0; i != N && not empty; ++i) { + const auto lo = std::max(lobound[i], lobound[i + N]); + const auto up = std::min(upbound[i], upbound[i + N]); + empty = lo >= up; + } return empty; } @@ -182,73 +185,128 @@ Tensor binary(const KroneckerDeltaTile& arg1, const Tensor& arg2, Op&& op, abort(); } -// Contraction operation +// Contraction operations // GEMM operation with fused indices as defined by gemm_config: -// dense_result[i,j] = dense_arg1[i,k] * sparse_arg2[k,j] +// dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] template -Tensor gemm(const KroneckerDeltaTile& arg1, const Tensor& arg2, - const typename Tensor::numeric_type factor, - const math::GemmHelper& gemm_config) { +void gemm(Tensor& result, const KroneckerDeltaTile& arg1, + const Tensor& arg2, const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { // preconditions: - // 1. implemented only outer product - assert(gemm_config.result_rank() == - gemm_config.left_rank() + gemm_config.right_rank()); + // 1. implemented only kronecker transform (every mode of arg2 is contracted + // with the matching mode of arg1) + TA_ASSERT((gemm_config.result_rank() == gemm_config.right_rank() && + gemm_config.left_rank() == + gemm_config.result_rank() + gemm_config.right_rank())); auto arg1_range = arg1.range(); auto arg2_range = arg2.range(); - auto result_range = - gemm_config.make_result_range(arg1_range, arg2_range); - Tensor result(result_range, 0); + // if result is empty, initialize it + const auto& result_range = + result.empty() + ? 
gemm_config.make_result_range(arg1_range, arg2_range) + : result.range(); + if (result.empty()) result = Tensor(result_range, 0); auto result_data = result.data(); auto arg1_extents = arg1_range.extent_data(); auto arg2_data = arg2.data(); auto arg2_volume = arg2_range.volume(); - if (not arg1.empty()) { - const auto N = arg1.N(); - switch (N) { - case 1: { - auto i0_range = std::min(arg1_extents[0], arg1_extents[1]); - for (decltype(i0_range) i0 = 0; i0 != i0_range; ++i0) { - auto result_i0i0_ptr = - result_data + (i0 * arg1_extents[1] + i0) * arg2_volume; - std::copy(arg2_data, arg2_data + arg2_volume, result_i0i0_ptr); - } - } break; - case 2: { - auto i0_range = std::min(arg1_extents[0], arg1_extents[1]); - auto i1_range = std::min(arg1_extents[2], arg1_extents[3]); - auto ndim23 = arg1_extents[2] * arg1_extents[3]; - for (decltype(i0_range) i0 = 0; i0 != i0_range; ++i0) { - auto result_i0i0i1i1_ptr_offset = - result_data + (i0 * arg1_extents[1] + i0) * ndim23 * arg2_volume; - for (decltype(i1_range) i1 = 0; i1 != i1_range; ++i1) { - auto result_i0i0i1i1_ptr = - result_i0i0i1i1_ptr_offset + - (i1 * arg1_extents[3] + i1) * arg2_volume; - std::copy(arg2_data, arg2_data + arg2_volume, result_i0i0i1i1_ptr); - } - } - } break; - - default: - abort(); // not implemented - } - } - - return result; + TA_ASSERT(!arg1.empty()); + const auto N = arg1.N(); + auto max = [&](const auto* v1, const auto* v2) { + TA::Index result(N); + for (auto i = 0; i != N; ++i) result[i] = std::max(v1[i], v2[i]); + return result; + }; + auto min = [&](const auto* v1, const auto* v2) { + TA::Index result(N); + for (auto i = 0; i != N; ++i) result[i] = std::min(v1[i], v2[i]); + return result; + }; + const auto read_lobound = + max(result_range.lobound_data(), arg2_range.lobound_data()); + const auto read_upbound = + min(result_range.upbound_data(), arg2_range.upbound_data()); + result.block(read_lobound, read_upbound) = + arg2.block(read_lobound, read_upbound); } + // GEMM operation with 
fused indices as defined by gemm_config: -// dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] +// dense_result[b0,..bN] = kronecker_arg1[b1,...bN,k1,...kN] * +// dense_arg2[k1,...kN] template -void gemm(Tensor& result, const KroneckerDeltaTile& arg1, - const Tensor& arg2, const typename Tensor::numeric_type factor, - const math::GemmHelper& gemm_config) { - abort(); +Tensor gemm(const KroneckerDeltaTile& arg1, const Tensor& arg2, + const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { + Tensor result; + gemm(result, arg1, arg2, factor, gemm_config); + return result; +} + +namespace detail { + +/// \brief computes shape data (i.e. Frobenius norms of the tiles) for a +/// DistArray of KroneckerDeltaTile +/// \param trange a TiledRange of the result +/// \return a Tensor containing the Frobenius norms of +/// the tiles of a DistArray of KroneckerDeltaTile's +/// \note Unlike diagonal_shape() which works for hyperdiagonal tensor with +/// `N` modes (`t(i,i,...i) = 1`), this works for product of `N` +/// Kroneckers (`t(i1,...iN,i1,...iN) = 1`, with `N` = `trange.rank() / 2`). 
+inline Tensor kronecker_shape(TiledRange const& trange) { + // preconditions + TA_ASSERT(trange.rank() % 2 == 0); + + Tensor shape(trange.tiles_range(), 0.0); + const auto N = trange.rank() / 2; + + // for every bra-ket pair of modes compute list of + // {bra tile index, ket tile index, number of nonzeros} + using bkn_type = std::tuple; + std::vector> bkns(N); + for (auto d = 0; d != N; ++d) { + auto& bkn = bkns[d]; + auto& bra_tr1 = trange.dim(d); + auto& ket_tr1 = trange.dim(d + N); + auto eidx = std::max(bra_tr1.elements_range().lobound(), + ket_tr1.elements_range().lobound()); + const auto eidx_fence = std::min(bra_tr1.elements_range().upbound(), + ket_tr1.elements_range().upbound()); + while (eidx < eidx_fence) { + const auto bra_tile_idx = bra_tr1.element_to_tile(eidx); + const auto& bra_tile = bra_tr1.tile(bra_tile_idx); + auto ket_tile_idx = ket_tr1.element_to_tile(eidx); + const auto& ket_tile = ket_tr1.tile(ket_tile_idx); + // closest tile boundary + const auto next_eidx = std::min(bra_tile.upbound(), ket_tile.upbound()); + bkn.emplace_back(bra_tile_idx, ket_tile_idx, next_eidx - eidx); + eidx = next_eidx; + } + } + + // number of nonzero tiles per mode + TA::Index nnz_tiles(N); + for (auto d = 0; d != N; ++d) nnz_tiles[d] = bkns[d].size(); + TA::Range nztiles_range(nnz_tiles); + TA::Index tile_idx(2 * N); + for (auto&& nztile : nztiles_range) { + std::size_t nnz_elements = 1; + for (auto d = 0; d != N; ++d) { + tile_idx[d] = std::get<0>(bkns[d][nztile[d]]); + tile_idx[d + N] = std::get<1>(bkns[d][nztile[d]]); + nnz_elements *= std::get<2>(bkns[d][nztile[d]]); + } + shape(tile_idx) = std::sqrt(nnz_elements); + } + + return shape; } +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_TEST_SPARSE_TILE_H__INCLUDED diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 54d725cba6..9e16728461 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -38,8 +38,8 @@ struct tag {}; struct 
MixedExpressionsFixture : public TiledRangeFixture { typedef DistArray>, DensePolicy> TArrayDS1; typedef DistArray>, DensePolicy> TArrayDS2; - typedef DistArray ArrayKronDelta1; - typedef DistArray SpArrayKronDelta1; + typedef DistArray ArrayKronDelta; + typedef DistArray SpArrayKronDelta; MixedExpressionsFixture() : u(*GlobalFixture::world, trange2), @@ -47,27 +47,12 @@ struct MixedExpressionsFixture : public TiledRangeFixture { e2(*GlobalFixture::world, trange2e), e4(*GlobalFixture::world, trange4e), v(*GlobalFixture::world, trange2), - w(*GlobalFixture::world, trange2), - delta1(*GlobalFixture::world, trange2, DenseShape(), - std::make_shared( - *GlobalFixture::world, trange2.tiles_range().volume())), - delta1e(*GlobalFixture::world, trange2e, DenseShape(), - std::make_shared( - *GlobalFixture::world, trange2e.tiles_range().volume())), - spe2(*GlobalFixture::world, trange2e), - spdelta1(*GlobalFixture::world, trange2, - SparseShape(detail::diagonal_shape(trange2, 1), trange2), - std::make_shared( - *GlobalFixture::world, trange2.tiles_range().volume())) { + w(*GlobalFixture::world, trange2) { random_fill(u); random_fill(v); u2.fill(0); random_fill(e2); e4.fill(0); - init_kronecker_delta(delta1); - init_kronecker_delta(delta1e); - random_fill(spe2); - init_kronecker_delta(spdelta1); GlobalFixture::world->gop.fence(); } @@ -140,10 +125,6 @@ struct MixedExpressionsFixture : public TiledRangeFixture { TArrayDS1 v; TArrayDS1 v1; TArrayDS2 w; - ArrayKronDelta1 delta1; - ArrayKronDelta1 delta1e; - TSpArrayD spe2; - SpArrayKronDelta1 spdelta1; }; // MixedExpressionsFixture // Instantiate static variables for fixture @@ -193,25 +174,36 @@ BOOST_AUTO_TEST_CASE(mult_factories) { // BOOST_CHECK_NO_THROW(w("a,b") = u("a,b") * v("a,b")); } -BOOST_AUTO_TEST_CASE(outer_product_factories) { +BOOST_AUTO_TEST_CASE(kronecker) { #if !MULT_DENSE_SPARSE_TO_SPARSE // ok BOOST_CHECK_NO_THROW(u2("a,b,c,d") += u("a,b") * v("c,d")); #endif - // these can only work if nproc == 1 since 
KroneckerDelta does not travel, and - // SUMMA does not support replicated args - if (GlobalFixture::world->nproc() == 1) { - // ok - BOOST_CHECK_NO_THROW(u2("a,b,c,d") += delta1("a,b") * u("c,d")); - - // ok - TSpArrayD tmp; - BOOST_CHECK_NO_THROW(tmp("a,b,c,d") = spdelta1("a,b") * spe2("c,d")); - - // ok - BOOST_CHECK_NO_THROW(e4("a,c,b,d") += delta1e("a,b") * e2("c,d")); - } + // retile test + TSpArrayD x(*GlobalFixture::world, trange2); + random_fill(x); + + TA::TiledRange yrange{{5, 18}, {7, 20}}; + TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), + trange2.dim(1)}; + SpArrayKronDelta retiler( + *GlobalFixture::world, retiler_range, + SparseShape(detail::kronecker_shape(retiler_range), retiler_range), + std::make_shared( + *GlobalFixture::world, retiler_range.tiles_range().volume())); + init_kronecker_delta(retiler); + + TA::TSpArrayD y; + y("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); + // std::cout << "y = " << y << std::endl; + // why deadlock without this? 
+ y.world().gop.fence(); + + TA::TSpArrayD y_ref = TA::retile(x, yrange); + // std::cout << "y_ref = " << y_ref << std::endl; + + BOOST_CHECK((y("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); } BOOST_AUTO_TEST_SUITE_END() From 5fd506022adfb918c0cef4b33f6fc0766fac42f0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 09:33:20 -0400 Subject: [PATCH 446/592] TA::detail::retile_v1 implements a KroneckerDelta-based retiling --- src/TiledArray/conversions/retile.h | 79 ++++++++++++++++++++++++++--- tests/expressions_mixed.cpp | 13 +++-- 2 files changed, 80 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 26440166c4..014582cf97 100644 --- a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -22,8 +22,9 @@ #ifndef TILEDARRAY_RETILE_H #define TILEDARRAY_RETILE_H -#include "TiledArray/util/annotation.h" #include "TiledArray/special/diagonal_array.h" +#include "TiledArray/special/kronecker_delta.h" +#include "TiledArray/util/annotation.h" /// \name Retile function /// \brief Retiles a tensor with a provided TiledRange @@ -38,9 +39,11 @@ namespace TiledArray { -template -auto retile(const DistArray& tensor, - const TiledRange& new_trange) { +namespace detail { + +template +auto retile_v0(const DistArray& tensor, + const TiledRange& new_trange) { // Make sure ranks match auto rank = new_trange.rank(); auto tensor_rank = tensor.trange().rank(); @@ -67,11 +70,13 @@ auto retile(const DistArray& tensor, }; // Check the different dimensions and contract when needed - using tensor_type = DistArray; + using tensor_type = DistArray; auto start = detail::dummy_annotation(rank); tensor_type output_tensor; for (auto i = 0; i < rank; ++i) { - if (i == 0) { output_tensor(start) = tensor(start); } + if (i == 0) { + output_tensor(start) = tensor(start); + } if (new_trange.dim(i) != tensor.trange().dim(i)) { // Make identity for contraction TiledRange 
retiler{tensor.trange().dim(i), new_trange.dim(i)}; @@ -88,7 +93,67 @@ auto retile(const DistArray& tensor, return output_tensor; } -} // namespace TiledArray +template +auto retile_v1(const DistArray& tensor, + const TiledRange& new_trange) { + // Make sure ranks match + auto rank = new_trange.rank(); + auto tensor_rank = tensor.trange().rank(); + assert((rank == tensor_rank) && "TiledRanges are of different ranks"); + + // Makes the annotations for the contraction step + auto annotations = [&]() -> std::tuple { + std::ostringstream final, switcher; + final << "j0"; + switcher << "j0"; + for (unsigned int d = 1; d < rank; ++d) { + final << ",j" << d; + switcher << ",j" << d; + } + for (unsigned int d = 0; d < rank; ++d) { + switcher << ",i" << d; + } + return {final.str(), switcher.str()}; + }; + + // Check the different dimensions and contract when needed + using Array = DistArray; + container::svector retiler_ranges; + for (auto i = 0; i < rank; ++i) { + retiler_ranges.emplace_back(new_trange.dim(i)); + } + for (auto i = 0; i < rank; ++i) { + retiler_ranges.emplace_back(tensor.trange().dim(i)); + } + TA::TiledRange retiler_range(retiler_ranges); + TA::DistArray retiler( + tensor.world(), retiler_range, + SparseShape(kronecker_shape(retiler_range), retiler_range), + std::make_shared( + tensor.world(), retiler_range.tiles_range().volume())); + retiler.init_tiles([=](const TiledArray::Range& range) { + return KroneckerDeltaTile(range); + }); + + // Make indices for contraction + + // Retile + Array output; + auto start = detail::dummy_annotation(rank); + auto [finish, change] = annotations(); + output(finish) = retiler(change) * tensor(start); + + return output; +} + +} // namespace detail + +template +auto retile(const DistArray& tensor, + const TiledRange& new_trange) { + return detail::retile_v0(tensor, new_trange); +} +} // namespace TiledArray #endif // TILEDARRAY_RETILE_H diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 
9e16728461..40a0c16440 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -103,10 +103,12 @@ struct MixedExpressionsFixture : public TiledRangeFixture { return matrix; } - template - static void init_kronecker_delta(DistArray& array) { - array.init_tiles( - [=](const TiledArray::Range& range) { return Tile(range); }); + template + static void init_kronecker_delta( + DistArray& array) { + array.init_tiles([=](const TiledArray::Range& range) { + return KroneckerDeltaTile(range); + }); } ~MixedExpressionsFixture() { GlobalFixture::world->gop.fence(); } @@ -195,7 +197,8 @@ BOOST_AUTO_TEST_CASE(kronecker) { init_kronecker_delta(retiler); TA::TSpArrayD y; - y("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); + // y("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); + y = TA::detail::retile_v1(x, yrange); // std::cout << "y = " << y << std::endl; // why deadlock without this? y.world().gop.fence(); From 9b62211616f4f59a0b9a2f9db2c7ed976cf5ebe0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 10:03:18 -0400 Subject: [PATCH 447/592] Explain that retile supports general-sense retiling (i.e. element range can stay same or change, up or down) --- src/TiledArray/conversions/retile.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 014582cf97..55caa17c40 100644 --- a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -148,10 +148,17 @@ auto retile_v1(const DistArray& tensor, } // namespace detail +/// Creates a new DistArray with the same data as the input tensor, but with a +/// different trange. 
The primary use-case is to change tiling while keeping the +/// element range the same, but it can be used to select blocks of the data as +/// well as increasing the element range (with the new elements initialized to +/// zero) +/// \param array The DistArray whose data is to be retiled +/// \param new_trange The desired TiledRange of the output tensor template -auto retile(const DistArray& tensor, +auto retile(const DistArray& array, const TiledRange& new_trange) { - return detail::retile_v0(tensor, new_trange); + return detail::retile_v0(array, new_trange); } } // namespace TiledArray From a13e0ed3b3c9cb5ca136a2dd1f2eaf832137cd32 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 14:27:24 -0400 Subject: [PATCH 448/592] introduced Range1::{includes,overlaps_with} --- src/TiledArray/range1.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index 095f5f06c6..dbb4b05a67 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -90,6 +90,21 @@ struct Range1 { return std::make_pair(first, second); } + /// Checks if a given index is within this range + /// @return true if \p i is within this range + template + typename std::enable_if::value, bool>::type includes( + const I& i) const { + return first <= i && i < second; + } + + /// Checks if a given range overlaps with this range + + /// @return true if \p r overlaps with this range + bool overlaps_with(const Range1& rng) const { + return lobound() < rng.upbound() && upbound() > rng.lobound(); + } + /// \brief Range1 iterator type /// /// Iterates over Range1 From a9633fcce3e5f54298ddd3ac023263db0587c0c6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 14:30:34 -0400 Subject: [PATCH 449/592] Tensor::block(bounds) handles decltype(bounds)=Range separately --- src/TiledArray/tensor/tensor.h | 39 +++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git 
a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 38f0e65ff9..12479ef53c 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1242,7 +1242,8 @@ class Tensor { // clang-format on /// @{ template >> + typename = std::enable_if_t && + !std::is_same_v>> detail::TensorInterface block( const PairRange& bounds) const { return detail::TensorInterface( @@ -1250,7 +1251,8 @@ class Tensor { } template >> + typename = std::enable_if_t && + !std::is_same_v>> detail::TensorInterface block(const PairRange& bounds) { return detail::TensorInterface( BlockRange(this->range_, bounds), this->data()); @@ -1289,6 +1291,38 @@ class Tensor { } /// @} + // clang-format off + /// Constructs a view of the block defined by a TiledArray::Range . + + /// Examples of using this: + /// \code + /// std::vector lobounds = {0, 1, 2}; + /// std::vector upbounds = {4, 6, 8}; + /// + /// auto tview = t.block(TiledArray::Range(lobounds, upbounds)); + /// \endcode + /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v ) + /// \param bounds The block bounds + /// \return a {const,mutable} view of the block defined by its \p bounds + /// \throw TiledArray::Exception When the size of \p lower_bound is not + /// equal to that of \p upper_bound. 
+ /// \throw TiledArray::Exception When `get<0>(bounds[i]) >= get<1>(bounds[i])` + // clang-format on + /// @{ + detail::TensorInterface block( + const Range& bounds) const { + return detail::TensorInterface( + BlockRange(this->range_, bounds.lobound(), bounds.upbound()), + this->data()); + } + + detail::TensorInterface block(const Range& bounds) { + return detail::TensorInterface( + BlockRange(this->range_, bounds.lobound(), bounds.upbound()), + this->data()); + } + /// @} + /// Create a permuted copy of this tensor /// \tparam Perm A permutation tile @@ -2373,7 +2407,6 @@ class Tensor { /// \return The vector norm of this tensor scalar_type squared_norm() const { - if constexpr (detail::is_tensor_v) { // If uninitialized tensor of tensor return zero. // All elements of this->data() are empty tensors in this case, From 569b0bf8f56e070bc630796f7fa6448b8d5332fa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 14:35:03 -0400 Subject: [PATCH 450/592] Implemented TA::detail::retile_v2, the nearly-optimal version of element-level reranging (retiling, etc.) 
--- src/TiledArray/conversions/retile.h | 156 +++++++++++++++++++++++++++- tests/expressions_mixed.cpp | 40 ++++--- 2 files changed, 176 insertions(+), 20 deletions(-) diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 55caa17c40..6b35872476 100644 --- a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -146,6 +146,156 @@ auto retile_v1(const DistArray& tensor, return output; } +template +void write_tile_block(madness::uniqueidT target_array_id, + std::size_t target_tile_ord, + const Tile& target_tile_contribution) { + auto* world_ptr = World::world_from_id(target_array_id.get_world_id()); + auto target_array_ptr_opt = world_ptr->ptr_from_id< + typename DistArray::impl_type::storage_type>( + target_array_id); + TA_ASSERT(target_array_ptr_opt); + TA_ASSERT((*target_array_ptr_opt)->is_local(target_tile_ord)); + (*target_array_ptr_opt) + ->get_local(target_tile_ord) + .get() + .block(target_tile_contribution.range()) = target_tile_contribution; +} + +template +auto retile_v2(const DistArray& source_array, + const TiledRange& target_trange) { + auto& world = source_array.world(); + const auto rank = source_array.trange().rank(); + TA_ASSERT(rank == target_trange.rank()); + + // compute metadata + // - list of target tile indices and the corresponding Range1 for each 1-d + // source tile + using target_tiles_t = std::vector>; + using mode_target_tiles_t = std::vector; + using all_target_tiles_t = std::vector; + + all_target_tiles_t all_target_tiles(target_trange.rank()); + // for each mode ... + for (auto d = 0; d != target_trange.rank(); ++d) { + mode_target_tiles_t& mode_target_tiles = all_target_tiles[d]; + auto& target_tr1 = target_trange.dim(d); + auto& target_element_range = target_tr1.elements_range(); + // ... and each tile in that mode ... 
+ for (auto&& source_tile : source_array.trange().dim(d)) { + mode_target_tiles.emplace_back(); + auto& target_tiles = mode_target_tiles.back(); + auto source_tile_lo = source_tile.lobound(); + auto source_tile_up = source_tile.upbound(); + auto source_element_idx = source_tile_lo; + // ... find all target tiles what overlap with it + if (target_element_range.overlaps_with(source_tile)) { + while (source_element_idx < source_tile_up) { + if (target_element_range.includes(source_element_idx)) { + auto target_tile_idx = + target_tr1.element_to_tile(source_element_idx); + auto target_tile = target_tr1.tile(target_tile_idx); + auto target_lo = + std::max(source_element_idx, target_tile.lobound()); + auto target_up = std::min(source_tile_up, target_tile.upbound()); + target_tiles.emplace_back(target_tile_idx, + Range1(target_lo, target_up)); + source_element_idx = target_up; + } else if (source_element_idx < target_element_range.lobound()) { + source_element_idx = target_element_range.lobound(); + } else if (source_element_idx >= target_element_range.upbound()) + break; + } + } + } + } + + // estimate the shape, if sparse + // use max value for each nonzero tile, then will recompute after tiles are + // assigned + using shape_type = typename Policy::shape_type; + shape_type target_shape; + const auto& target_tiles_range = target_trange.tiles_range(); + if constexpr (!is_dense_v) { + // each rank computes contributions to the shape norms from its local tiles + Tensor target_shape_norms(target_tiles_range, 0); + auto& source_trange = source_array.trange(); + const auto e = source_array.end(); + for (auto it = source_array.begin(); it != e; ++it) { + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile 
combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_idx[d] = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]].first; + } + target_shape_norms(target_tile_idx) = std::numeric_limits::max(); + } + } + world.gop.max(target_shape_norms.data(), target_shape_norms.size()); + target_shape = SparseShape(target_shape_norms, target_trange); + } + + using Array = DistArray; + Array target_array(source_array.world(), target_trange, target_shape); + target_array.fill_local(0.0); + target_array.world().gop.fence(); + + // loop over local tile and sends its contributions to the targets + { + auto& source_trange = source_array.trange(); + const auto e = source_array.end(); + auto& target_tiles_range = target_trange.tiles_range(); + for (auto it = source_array.begin(); it != e; ++it) { + const auto& source_tile = *it; + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + container::svector target_tile_rngs1(rank); + for (auto d = 0; d != rank; ++d) { + std::tie(target_tile_idx[d], target_tile_rngs1[d]) = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; + } + TA_ASSERT(source_tile.future().probe()); + Tile target_tile_contribution( + source_tile.get().block(target_tile_rngs1)); + auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx); + auto target_proc = target_array.pmap()->owner(target_tile_idx_ord); + world.taskq.add(target_proc, 
&write_tile_block, + target_array.id(), target_tile_idx_ord, + target_tile_contribution); + } + } + } + // data is mutated in place, so must wait for all tasks to complete + target_array.world().gop.fence(); + // recompute norms/trim away zeros + target_array.truncate(); + + return target_array; +} + } // namespace detail /// Creates a new DistArray with the same data as the input tensor, but with a @@ -154,11 +304,11 @@ auto retile_v1(const DistArray& tensor, /// well as increasing the element range (with the new elements initialized to /// zero) /// \param array The DistArray whose data is to be retiled -/// \param new_trange The desired TiledRange of the output tensor +/// \param target_trange The desired TiledRange of the output tensor template auto retile(const DistArray& array, - const TiledRange& new_trange) { - return detail::retile_v0(array, new_trange); + const TiledRange& target_trange) { + return detail::retile_v0(array, target_trange); } } // namespace TiledArray diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 40a0c16440..b8de603356 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -186,27 +186,33 @@ BOOST_AUTO_TEST_CASE(kronecker) { TSpArrayD x(*GlobalFixture::world, trange2); random_fill(x); - TA::TiledRange yrange{{5, 18}, {7, 20}}; - TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), - trange2.dim(1)}; - SpArrayKronDelta retiler( - *GlobalFixture::world, retiler_range, - SparseShape(detail::kronecker_shape(retiler_range), retiler_range), - std::make_shared( - *GlobalFixture::world, retiler_range.tiles_range().volume())); - init_kronecker_delta(retiler); - - TA::TSpArrayD y; + // includes target tiles that receive contributions from multiple source + // tiles, tiny target tiles with single contribution, and tiles partially and + // completely outside the source range N.B. retile_v0 seems to struggle with + // completely empty tiles (e.g. 
add 47 to each 1-d range) + TA::TiledRange yrange{{5, 18, 20, 45}, {7, 20, 22, 45}}; + TA::TSpArrayD y1; + // TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), + // trange2.dim(1)}; + // SpArrayKronDelta retiler( + // *GlobalFixture::world, retiler_range, + // SparseShape(detail::kronecker_shape(retiler_range), retiler_range), + // std::make_shared( + // *GlobalFixture::world, retiler_range.tiles_range().volume())); + // init_kronecker_delta(retiler); // y("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); - y = TA::detail::retile_v1(x, yrange); - // std::cout << "y = " << y << std::endl; - // why deadlock without this? - y.world().gop.fence(); + y1 = TA::detail::retile_v1(x, yrange); + // std::cout << "y1 = " << y1 << std::endl; + // why deadlock without this? + y1.world().gop.fence(); - TA::TSpArrayD y_ref = TA::retile(x, yrange); + TA::TSpArrayD y_ref = TA::detail::retile_v0(x, yrange); // std::cout << "y_ref = " << y_ref << std::endl; + BOOST_CHECK((y1("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); - BOOST_CHECK((y("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); + auto y2 = TA::detail::retile_v2(x, yrange); + // std::cout << "y2 = " << y2 << std::endl; + BOOST_CHECK((y2("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); } BOOST_AUTO_TEST_SUITE_END() From 0b8174695df9e8622c4238b260ace9c8dbd83a2c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 15:04:52 -0400 Subject: [PATCH 451/592] Range-V3 is the default dependency --- src/CMakeLists.txt | 2 +- tests/CMakeLists.txt | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ffceb74017..c426d1ffbe 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -259,7 +259,7 @@ set_source_files_properties( # the list of libraries on which TiledArray depends on, will be cached later # when FetchContent umpire: set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers umpire) 
-set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE) +set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE range-v3::range-v3) if(CUDA_FOUND OR HIP_FOUND) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index afb1e1c6a6..823e13bec8 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -148,8 +148,6 @@ target_include_directories(${executable} PRIVATE # is too late to do this here; must set TA_ERROR=throw if want to run unit tests target_compile_definitions(${executable} PRIVATE TILEDARRAY_NO_USER_ERROR_MESSAGES=1 MADNESS_DISPLAY_EXCEPTION_BREAK_MESSAGE=0) -# always test range-v3 -target_link_libraries(${executable} PRIVATE range-v3::range-v3) # Add targets add_test(tiledarray/unit/build "${CMAKE_COMMAND}" --build ${PROJECT_BINARY_DIR} --target ${executable}) From 5caa0d1cda1aa2e2a327e80962fa96bf1d2cdc05 Mon Sep 17 00:00:00 2001 From: Samuel Powell Date: Mon, 26 Aug 2024 16:02:53 -0400 Subject: [PATCH 452/592] singleToDoublePrecPerfRatio hip device property does not exist before hip 6.0.0, and is depreciated in latest hip --- examples/device/ta_dense_device.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp index 4900072e8a..30333c7edc 100644 --- a/examples/device/ta_dense_device.cpp +++ b/examples/device/ta_dense_device.cpp @@ -315,9 +315,7 @@ int try_main(int argc, char **argv) { std::cout << "error(GetDeviceProperties) = " << error << std::endl; } std::cout << "Device #" << device_id << ": " << prop.name << std::endl - << " managedMemory = " << prop.managedMemory << std::endl - << " singleToDoublePrecisionPerfRatio = " - << prop.singleToDoublePrecisionPerfRatio << std::endl; + << " managedMemory = " << prop.managedMemory << std::endl; int result; error = TiledArray::device::deviceGetAttribute( &result, TiledArray::device::DevAttrUnifiedAddressing, device_id); 
From e1a0e6e4e4f5e78b16f446af83246e455f5311a4 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 17:25:07 -0400 Subject: [PATCH 453/592] is_dense trait moved to fwd.h --- src/TiledArray/fwd.h | 17 +++++++++++++++++ src/TiledArray/shape.h | 25 ------------------------- 2 files changed, 17 insertions(+), 25 deletions(-) diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 7f411eaeba..073e8bacd3 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -147,6 +147,23 @@ class SparseShape; template class DistArray; +/// Type trait to detect dense shape types +template +struct is_dense : public std::false_type {}; + +template <> +struct is_dense : public std::true_type {}; + +template <> +struct is_dense : public std::true_type {}; + +template +struct is_dense> + : public is_dense::shape_type> {}; + +template +constexpr const bool is_dense_v = is_dense::value; + // Dense Array Typedefs template using TArray = DistArray, DensePolicy>; diff --git a/src/TiledArray/shape.h b/src/TiledArray/shape.h index b630d7e019..9b8de8f6ef 100644 --- a/src/TiledArray/shape.h +++ b/src/TiledArray/shape.h @@ -23,29 +23,4 @@ #include #include -namespace TiledArray { - -template -class DistArray; -class DensePolicy; - -/// Type trait to detect dense shape types -template -struct is_dense : public std::false_type {}; - -template <> -struct is_dense : public std::true_type {}; - -template <> -struct is_dense : public std::true_type {}; - -template -struct is_dense > - : public is_dense::shape_type> {}; - -template -constexpr const bool is_dense_v = is_dense::value; - -} // namespace TiledArray - #endif // TILEDARRAY_SHAPE_H__INCLUDED From 2493817611b9fa31c1bbbebceb5a8f4b22b3dd71 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 17:26:39 -0400 Subject: [PATCH 454/592] moved init_tiles and lazy_deleter to ArrayImpl to be able to implement complex "constructors" in ArrayImpl --- src/TiledArray/array_impl.h | 165 
+++++++++++++++++++++++++++++++++++- src/TiledArray/dist_array.h | 139 +++--------------------------- 2 files changed, 175 insertions(+), 129 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index e5ad9d5db9..e9179eaefb 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -30,6 +30,7 @@ #include #include #include +#include namespace TiledArray { namespace detail { @@ -407,7 +408,8 @@ class ArrayIterator { /// \note It is the users responsibility to ensure the process maps on all /// nodes are identical. template -class ArrayImpl : public TensorImpl { +class ArrayImpl : public TensorImpl, + public std::enable_shared_from_this> { public: typedef ArrayImpl ArrayImpl_; ///< This object type typedef TensorImpl TensorImpl_; ///< The base class of this object @@ -440,6 +442,68 @@ class ArrayImpl : public TensorImpl { private: storage_type data_; ///< Tile container + public: + static madness::AtomicInt cleanup_counter_; + + /// Array deleter function + + /// This function schedules a task for lazy cleanup. Array objects are + /// deleted only after the object has been deleted in all processes. + /// \param pimpl The implementation pointer to be deleted. + static void lazy_deleter(const ArrayImpl_* const pimpl) { + if (pimpl) { + if (madness::initialized()) { + World& world = pimpl->world(); + const madness::uniqueidT id = pimpl->id(); + cleanup_counter_++; + + // wait for all DelayedSet's to vanish + world.await([&]() { return (pimpl->num_live_ds() == 0); }, true); + + try { + world.gop.lazy_sync(id, [pimpl]() { + delete pimpl; + ArrayImpl_::cleanup_counter_--; + }); + } catch (madness::MadnessException& e) { + fprintf(stderr, + "!! ERROR TiledArray: madness::MadnessException thrown in " + "DistArray::lazy_deleter().\n" + "%s\n" + "!! ERROR TiledArray: The exception has been absorbed.\n" + "!! 
ERROR TiledArray: rank=%i\n", + e.what(), world.rank()); + + cleanup_counter_--; + delete pimpl; + } catch (std::exception& e) { + fprintf(stderr, + "!! ERROR TiledArray: std::exception thrown in " + "DistArray::lazy_deleter().\n" + "%s\n" + "!! ERROR TiledArray: The exception has been absorbed.\n" + "!! ERROR TiledArray: rank=%i\n", + e.what(), world.rank()); + + cleanup_counter_--; + delete pimpl; + } catch (...) { + fprintf(stderr, + "!! ERROR TiledArray: An unknown exception was thrown in " + "DistArray::lazy_deleter().\n" + "!! ERROR TiledArray: The exception has been absorbed.\n" + "!! ERROR TiledArray: rank=%i\n", + world.rank()); + + cleanup_counter_--; + delete pimpl; + } + } else { + delete pimpl; + } + } + } + public: /// Constructor @@ -453,7 +517,32 @@ class ArrayImpl : public TensorImpl { ArrayImpl(World& world, const trange_type& trange, const shape_type& shape, const std::shared_ptr& pmap) : TensorImpl_(world, trange, shape, pmap), - data_(world, trange.tiles_range().volume(), pmap) {} + data_(world, trange.tiles_range().volume(), pmap) { + // Validate the process map + TA_ASSERT(pmap->size() == trange.tiles_range().volume() && + "TiledArray::DistArray::DistArray() -- The size of the process " + "map is not " + "equal to the number of tiles in the TiledRange object."); + TA_ASSERT(pmap->rank() == + typename pmap_interface::size_type(world.rank()) && + "TiledArray::DistArray::DistArray() -- The rank of the process " + "map is not equal to that " + "of the world object."); + TA_ASSERT(pmap->procs() == + typename pmap_interface::size_type(world.size()) && + "TiledArray::DistArray::DistArray() -- The number of processes " + "in the process map is not " + "equal to that of the world object."); + + // Validate the shape + TA_ASSERT( + !shape.empty() && + "TiledArray::DistArray::DistArray() -- The shape is not initialized."); + TA_ASSERT(shape.validate(trange.tiles_range()) && + "TiledArray::DistArray::DistArray() -- The range of the shape is " + "not 
equal to " + "the tiles range."); + } /// Virtual destructor virtual ~ArrayImpl() {} @@ -649,8 +738,80 @@ class ArrayImpl : public TensorImpl { return data_.num_live_df(); } + /// Initialize (local) tiles with a user provided functor + + /// This function is used to initialize the local, non-zero tiles of the array + /// via a function (or functor). The work is done in parallel, therefore \c op + /// must be a thread safe function/functor. The signature of the functor + /// should be: + /// \code + /// value_type op(const range_type&) + /// \endcode + /// For example, in the following code, the array tiles are initialized with + /// random numbers from 0 to 1: + /// \code + /// array.init_tiles([] (const TiledArray::Range& range) -> + /// TiledArray::Tensor + /// { + /// // Initialize the tile with the given range object + /// TiledArray::Tensor tile(range); + /// + /// // Initialize the random number generator + /// std::default_random_engine generator; + /// std::uniform_real_distribution distribution(0.0,1.0); + /// + /// // Fill the tile with random numbers + /// for(auto& value : tile) + /// value = distribution(generator); + /// + /// return tile; + /// }); + /// \endcode + /// \tparam Op The type of the functor/function + /// \param[in] op The operation used to generate tiles + /// \param[in] skip_set If false, will throw if any tiles are already set + /// \throw TiledArray::Exception if the PIMPL is not set. Strong throw + /// guarantee. + /// \throw TiledArray::Exception if a tile is already set and skip_set is + /// false. Weak throw guarantee. + template + void init_tiles(Op&& op, bool skip_set = false) { + // lifetime management of op depends on whether it is a lvalue ref (i.e. 
has + // an external owner) or an rvalue ref + // - if op is an lvalue ref: pass op to tasks + // - if op is an rvalue ref pass make_shared_function(op) to tasks + auto op_shared_handle = make_op_shared_handle(std::forward(op)); + + auto it = this->pmap()->begin(); + const auto end = this->pmap()->end(); + for (; it != end; ++it) { + const auto& index = *it; + if (!this->is_zero(index)) { + if (skip_set) { + auto& fut = this->get_local(index); + if (fut.probe()) continue; + } + if constexpr (Exec == HostExecutor::MADWorld) { + Future tile = this->world().taskq.add( + [this_sptr = this->shared_from_this(), + index = ordinal_type(index), op_shared_handle]() -> value_type { + return op_shared_handle( + this_sptr->trange().make_tile_range(index)); + }); + set(index, std::move(tile)); + } else { + static_assert(Exec == HostExecutor::Thread); + set(index, op_shared_handle(this->trange().make_tile_range(index))); + } + } + } + } + }; // class ArrayImpl +template +madness::AtomicInt ArrayImpl::cleanup_counter_; + #ifndef TILEDARRAY_HEADER_ONLY extern template class ArrayImpl, DensePolicy>; diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 167464ccfb..6185da8b75 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -163,67 +163,6 @@ class DistArray : public madness::archive::ParallelSerializableObject { false; ///< if true, the impl object is scheduled to be destroyed in the ///< next fence - static madness::AtomicInt cleanup_counter_; - - /// Array deleter function - - /// This function schedules a task for lazy cleanup. Array objects are - /// deleted only after the object has been deleted in all processes. - /// \param pimpl The implementation pointer to be deleted. 
- static void lazy_deleter(const impl_type* const pimpl) { - if (pimpl) { - if (madness::initialized()) { - World& world = pimpl->world(); - const madness::uniqueidT id = pimpl->id(); - cleanup_counter_++; - - // wait for all DelayedSet's to vanish - world.await([&]() { return (pimpl->num_live_ds() == 0); }, true); - - try { - world.gop.lazy_sync(id, [pimpl]() { - delete pimpl; - DistArray::cleanup_counter_--; - }); - } catch (madness::MadnessException& e) { - fprintf(stderr, - "!! ERROR TiledArray: madness::MadnessException thrown in " - "Array::lazy_deleter().\n" - "%s\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - e.what(), world.rank()); - - cleanup_counter_--; - delete pimpl; - } catch (std::exception& e) { - fprintf(stderr, - "!! ERROR TiledArray: std::exception thrown in " - "Array::lazy_deleter().\n" - "%s\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - e.what(), world.rank()); - - cleanup_counter_--; - delete pimpl; - } catch (...) { - fprintf(stderr, - "!! ERROR TiledArray: An unknown exception was thrown in " - "Array::lazy_deleter().\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - world.rank()); - - cleanup_counter_--; - delete pimpl; - } - } else { - delete pimpl; - } - } - } - /// Sparse array initialization /// \param world The world where the array will live. 
@@ -239,34 +178,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { if (!pmap) { // Construct a default process map pmap = Policy::default_pmap(world, trange.tiles_range().volume()); - } else { - // Validate the process map - TA_ASSERT(pmap->size() == trange.tiles_range().volume() && - "TiledArray::DistArray::DistArray() -- The size of the process " - "map is not " - "equal to the number of tiles in the TiledRange object."); - TA_ASSERT(pmap->rank() == - typename pmap_interface::size_type(world.rank()) && - "TiledArray::DistArray::DistArray() -- The rank of the process " - "map is not equal to that " - "of the world object."); - TA_ASSERT(pmap->procs() == - typename pmap_interface::size_type(world.size()) && - "TiledArray::DistArray::DistArray() -- The number of processes " - "in the process map is not " - "equal to that of the world object."); } - // Validate the shape - TA_ASSERT( - !shape.empty() && - "TiledArray::DistArray::DistArray() -- The shape is not initialized."); - TA_ASSERT(shape.validate(trange.tiles_range()) && - "TiledArray::DistArray::DistArray() -- The range of the shape is " - "not equal to " - "the tiles range."); - - return pimpl_type(new impl_type(world, trange, shape, pmap), lazy_deleter); + return pimpl_type(new impl_type(world, trange, shape, pmap), + impl_type::lazy_deleter); } public: @@ -647,10 +562,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw madness::MadnessException When timeout has been exceeded. static void wait_for_lazy_cleanup(World& world, const double = 60.0) { try { - world.await([&]() { return (cleanup_counter_ == 0); }, true); + world.await([&]() { return (impl_type::cleanup_counter_ == 0); }, true); } catch (...) 
{ printf("%i: Array lazy cleanup timeout with %i pending cleanup(s)\n", - world.rank(), int(cleanup_counter_)); + world.rank(), int(impl_type::cleanup_counter_)); throw; } } @@ -869,10 +784,9 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// first minimally contains the same number of elements as /// the tile. /// \throw TiledArray::Exception if the tile is already initialized. - template < - typename Integer, typename InIter, - typename = std::enable_if_t<(std::is_integral_v) && - detail::is_input_iterator::value>> + template )&&detail:: + is_input_iterator::value>> typename std::enable_if::value>::type set( const std::initializer_list& i, InIter first) { set>(i, first); @@ -965,9 +879,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw TiledArray::Exception if index \c i has the wrong rank. Strong /// throw guarantee. /// \throw TiledArray::Exception if tile \c i is already set. - template ) && - is_value_or_future_to_value_v>> + template < + typename Index, typename Value, + typename = std::enable_if_t< + (std::is_integral_v)&&is_value_or_future_to_value_v>> void set(const std::initializer_list& i, Value&& v) { set>(i, std::forward(v)); } @@ -1061,34 +976,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// false. Weak throw guarantee. template void init_tiles(Op&& op, bool skip_set = false) { - // lifetime management of op depends on whether it is a lvalue ref (i.e. 
has - // an external owner) or an rvalue ref - // - if op is an lvalue ref: pass op to tasks - // - if op is an rvalue ref pass make_shared_function(op) to tasks - auto op_shared_handle = make_op_shared_handle(std::forward(op)); - - auto it = impl_ref().pmap()->begin(); - const auto end = pimpl_->pmap()->end(); - for (; it != end; ++it) { - const auto& index = *it; - if (!pimpl_->is_zero(index)) { - if (skip_set) { - auto fut = find_local(index); - if (fut.probe()) continue; - } - if constexpr (Exec == HostExecutor::MADWorld) { - Future tile = pimpl_->world().taskq.add( - [pimpl = pimpl_, index = ordinal_type(index), - op_shared_handle]() -> value_type { - return op_shared_handle(pimpl->trange().make_tile_range(index)); - }); - set(index, std::move(tile)); - } else { - static_assert(Exec == HostExecutor::Thread); - set(index, op_shared_handle(trange().make_tile_range(index))); - } - } - } + impl_ref().template init_tiles(std::forward(op), skip_set); } /// Initialize elements of local, non-zero tiles with a user provided functor @@ -1827,9 +1715,6 @@ class DistArray : public madness::archive::ParallelSerializableObject { }; // class DistArray -template -madness::AtomicInt DistArray::cleanup_counter_; - #ifndef TILEDARRAY_HEADER_ONLY extern template class DistArray, DensePolicy>; From faeb65290c918fdc109eb0fa3b60602d1b55d5c1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 17:27:16 -0400 Subject: [PATCH 455/592] retile_v2 became a DistArray ctor --- src/TiledArray/array_impl.h | 161 ++++++++++++++++++++++++++++ src/TiledArray/conversions/retile.h | 146 +------------------------ src/TiledArray/dist_array.h | 11 ++ 3 files changed, 173 insertions(+), 145 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index e9179eaefb..df7138a9e7 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -834,6 +834,167 @@ extern template class ArrayImpl>, SparsePolicy>; #endif // TILEDARRAY_HEADER_ONLY 
+template +void write_tile_block(madness::uniqueidT target_array_id, + std::size_t target_tile_ord, + const Tile& target_tile_contribution) { + auto* world_ptr = World::world_from_id(target_array_id.get_world_id()); + auto target_array_ptr_opt = + world_ptr->ptr_from_id::storage_type>( + target_array_id); + TA_ASSERT(target_array_ptr_opt); + TA_ASSERT((*target_array_ptr_opt)->is_local(target_tile_ord)); + (*target_array_ptr_opt) + ->get_local(target_tile_ord) + .get() + .block(target_tile_contribution.range()) = target_tile_contribution; +} + +template +std::shared_ptr> make_with_new_trange( + const std::shared_ptr>& source_array_sptr, + const TiledRange& target_trange, + typename ArrayImpl::numeric_type new_value_fill = + typename ArrayImpl::numeric_type{0}) { + TA_ASSERT(source_array_sptr); + auto& source_array = *source_array_sptr; + auto& world = source_array.world(); + const auto rank = source_array.trange().rank(); + TA_ASSERT(rank == target_trange.rank()); + + // compute metadata + // - list of target tile indices and the corresponding Range1 for each 1-d + // source tile + using target_tiles_t = std::vector>; + using mode_target_tiles_t = std::vector; + using all_target_tiles_t = std::vector; + + all_target_tiles_t all_target_tiles(target_trange.rank()); + // for each mode ... + for (auto d = 0; d != target_trange.rank(); ++d) { + mode_target_tiles_t& mode_target_tiles = all_target_tiles[d]; + auto& target_tr1 = target_trange.dim(d); + auto& target_element_range = target_tr1.elements_range(); + // ... and each tile in that mode ... + for (auto&& source_tile : source_array.trange().dim(d)) { + mode_target_tiles.emplace_back(); + auto& target_tiles = mode_target_tiles.back(); + auto source_tile_lo = source_tile.lobound(); + auto source_tile_up = source_tile.upbound(); + auto source_element_idx = source_tile_lo; + // ... 
find all target tiles what overlap with it + if (target_element_range.overlaps_with(source_tile)) { + while (source_element_idx < source_tile_up) { + if (target_element_range.includes(source_element_idx)) { + auto target_tile_idx = + target_tr1.element_to_tile(source_element_idx); + auto target_tile = target_tr1.tile(target_tile_idx); + auto target_lo = + std::max(source_element_idx, target_tile.lobound()); + auto target_up = std::min(source_tile_up, target_tile.upbound()); + target_tiles.emplace_back(target_tile_idx, + Range1(target_lo, target_up)); + source_element_idx = target_up; + } else if (source_element_idx < target_element_range.lobound()) { + source_element_idx = target_element_range.lobound(); + } else if (source_element_idx >= target_element_range.upbound()) + break; + } + } + } + } + + // estimate the shape, if sparse + // use max value for each nonzero tile, then will recompute after tiles are + // assigned + using shape_type = typename Policy::shape_type; + shape_type target_shape; + const auto& target_tiles_range = target_trange.tiles_range(); + if constexpr (!is_dense_v) { + // each rank computes contributions to the shape norms from its local tiles + Tensor target_shape_norms(target_tiles_range, 0); + auto& source_trange = source_array.trange(); + const auto e = source_array.cend(); + for (auto it = source_array.cbegin(); it != e; ++it) { + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_idx[d] = + 
all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]].first; + } + target_shape_norms(target_tile_idx) = std::numeric_limits::max(); + } + } + world.gop.max(target_shape_norms.data(), target_shape_norms.size()); + target_shape = SparseShape(target_shape_norms, target_trange); + } + + using Array = ArrayImpl; + auto target_array_sptr = std::shared_ptr( + new Array( + source_array.world(), target_trange, target_shape, + Policy::default_pmap(world, target_trange.tiles_range().volume())), + Array::lazy_deleter); + auto& target_array = *target_array_sptr; + target_array.init_tiles([value = new_value_fill](const Range& range) { + return typename Array::value_type(range, value); + }); + target_array.world().gop.fence(); + + // loop over local tile and sends its contributions to the targets + { + auto& source_trange = source_array.trange(); + const auto e = source_array.cend(); + auto& target_tiles_range = target_trange.tiles_range(); + for (auto it = source_array.cbegin(); it != e; ++it) { + const auto& source_tile = *it; + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + container::svector target_tile_rngs1(rank); + for (auto d = 0; d != rank; ++d) { + std::tie(target_tile_idx[d], target_tile_rngs1[d]) = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; + } + TA_ASSERT(source_tile.future().probe()); + Tile target_tile_contribution( + source_tile.get().block(target_tile_rngs1)); + auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx); + auto target_proc = 
target_array.pmap()->owner(target_tile_idx_ord); + world.taskq.add(target_proc, &write_tile_block, + target_array.id(), target_tile_idx_ord, + target_tile_contribution); + } + } + } + // data is mutated in place, so must wait for all tasks to complete + target_array.world().gop.fence(); + // WARNING!! need to truncate in DistArray ctor + + return target_array_sptr; +} + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 6b35872476..4db6564ac6 100644 --- a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -146,154 +146,10 @@ auto retile_v1(const DistArray& tensor, return output; } -template -void write_tile_block(madness::uniqueidT target_array_id, - std::size_t target_tile_ord, - const Tile& target_tile_contribution) { - auto* world_ptr = World::world_from_id(target_array_id.get_world_id()); - auto target_array_ptr_opt = world_ptr->ptr_from_id< - typename DistArray::impl_type::storage_type>( - target_array_id); - TA_ASSERT(target_array_ptr_opt); - TA_ASSERT((*target_array_ptr_opt)->is_local(target_tile_ord)); - (*target_array_ptr_opt) - ->get_local(target_tile_ord) - .get() - .block(target_tile_contribution.range()) = target_tile_contribution; -} - template auto retile_v2(const DistArray& source_array, const TiledRange& target_trange) { - auto& world = source_array.world(); - const auto rank = source_array.trange().rank(); - TA_ASSERT(rank == target_trange.rank()); - - // compute metadata - // - list of target tile indices and the corresponding Range1 for each 1-d - // source tile - using target_tiles_t = std::vector>; - using mode_target_tiles_t = std::vector; - using all_target_tiles_t = std::vector; - - all_target_tiles_t all_target_tiles(target_trange.rank()); - // for each mode ... 
- for (auto d = 0; d != target_trange.rank(); ++d) { - mode_target_tiles_t& mode_target_tiles = all_target_tiles[d]; - auto& target_tr1 = target_trange.dim(d); - auto& target_element_range = target_tr1.elements_range(); - // ... and each tile in that mode ... - for (auto&& source_tile : source_array.trange().dim(d)) { - mode_target_tiles.emplace_back(); - auto& target_tiles = mode_target_tiles.back(); - auto source_tile_lo = source_tile.lobound(); - auto source_tile_up = source_tile.upbound(); - auto source_element_idx = source_tile_lo; - // ... find all target tiles what overlap with it - if (target_element_range.overlaps_with(source_tile)) { - while (source_element_idx < source_tile_up) { - if (target_element_range.includes(source_element_idx)) { - auto target_tile_idx = - target_tr1.element_to_tile(source_element_idx); - auto target_tile = target_tr1.tile(target_tile_idx); - auto target_lo = - std::max(source_element_idx, target_tile.lobound()); - auto target_up = std::min(source_tile_up, target_tile.upbound()); - target_tiles.emplace_back(target_tile_idx, - Range1(target_lo, target_up)); - source_element_idx = target_up; - } else if (source_element_idx < target_element_range.lobound()) { - source_element_idx = target_element_range.lobound(); - } else if (source_element_idx >= target_element_range.upbound()) - break; - } - } - } - } - - // estimate the shape, if sparse - // use max value for each nonzero tile, then will recompute after tiles are - // assigned - using shape_type = typename Policy::shape_type; - shape_type target_shape; - const auto& target_tiles_range = target_trange.tiles_range(); - if constexpr (!is_dense_v) { - // each rank computes contributions to the shape norms from its local tiles - Tensor target_shape_norms(target_tiles_range, 0); - auto& source_trange = source_array.trange(); - const auto e = source_array.end(); - for (auto it = source_array.begin(); it != e; ++it) { - auto source_tile_idx = it.index(); - - // make range for iterating 
over all possible target tile idx combinations - TA::Index target_tile_ord_extent_range(rank); - for (auto d = 0; d != rank; ++d) { - target_tile_ord_extent_range[d] = - all_target_tiles[d][source_tile_idx[d]].size(); - } - - // loop over every target tile combination - TA::Range target_tile_ord_extent(target_tile_ord_extent_range); - for (auto& target_tile_ord : target_tile_ord_extent) { - TA::Index target_tile_idx(rank); - for (auto d = 0; d != rank; ++d) { - target_tile_idx[d] = - all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]].first; - } - target_shape_norms(target_tile_idx) = std::numeric_limits::max(); - } - } - world.gop.max(target_shape_norms.data(), target_shape_norms.size()); - target_shape = SparseShape(target_shape_norms, target_trange); - } - - using Array = DistArray; - Array target_array(source_array.world(), target_trange, target_shape); - target_array.fill_local(0.0); - target_array.world().gop.fence(); - - // loop over local tile and sends its contributions to the targets - { - auto& source_trange = source_array.trange(); - const auto e = source_array.end(); - auto& target_tiles_range = target_trange.tiles_range(); - for (auto it = source_array.begin(); it != e; ++it) { - const auto& source_tile = *it; - auto source_tile_idx = it.index(); - - // make range for iterating over all possible target tile idx combinations - TA::Index target_tile_ord_extent_range(rank); - for (auto d = 0; d != rank; ++d) { - target_tile_ord_extent_range[d] = - all_target_tiles[d][source_tile_idx[d]].size(); - } - - // loop over every target tile combination - TA::Range target_tile_ord_extent(target_tile_ord_extent_range); - for (auto& target_tile_ord : target_tile_ord_extent) { - TA::Index target_tile_idx(rank); - container::svector target_tile_rngs1(rank); - for (auto d = 0; d != rank; ++d) { - std::tie(target_tile_idx[d], target_tile_rngs1[d]) = - all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; - } - TA_ASSERT(source_tile.future().probe()); - 
Tile target_tile_contribution( - source_tile.get().block(target_tile_rngs1)); - auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx); - auto target_proc = target_array.pmap()->owner(target_tile_idx_ord); - world.taskq.add(target_proc, &write_tile_block, - target_array.id(), target_tile_idx_ord, - target_tile_contribution); - } - } - } - // data is mutated in place, so must wait for all tasks to complete - target_array.world().gop.fence(); - // recompute norms/trim away zeros - target_array.truncate(); - - return target_array; + return DistArray(source_array, target_trange); } } // namespace detail diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 6185da8b75..c2645dd7ce 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -456,6 +456,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { : DistArray(array_from_il(get_default_world(), trange, il)) {} /// @} + /// "copy" constructor that replaces the TiledRange + + /// This constructor remaps the data of \p other according to \p new_trange , + /// with \p new_value_fill used to fill the new elements, if any + DistArray(const DistArray& other, const trange_type& new_trange, + numeric_type new_value_fill = numeric_type{0}) + : pimpl_( + make_with_new_trange(other.pimpl(), new_trange, new_value_fill)) { + this->truncate(); + } + /// converting copy constructor /// This constructor uses the meta data of `other` to initialize the meta From c275ec51a0e7f27e0b4fd992df4b7fea475d495e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 26 Aug 2024 17:28:43 -0400 Subject: [PATCH 456/592] TA::retile uses efficient (retile_v2) method, no more GEMM-based retiling by default (thanks, @calewis ) --- src/TiledArray/conversions/retile.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 4db6564ac6..9f0f4cab4a 100644 --- 
a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -164,7 +164,7 @@ auto retile_v2(const DistArray& source_array, template auto retile(const DistArray& array, const TiledRange& target_trange) { - return detail::retile_v0(array, target_trange); + return detail::retile_v2(array, target_trange); } } // namespace TiledArray From 7b2124df9f60fc195439657b3f139184c11231b9 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 06:06:55 -0400 Subject: [PATCH 457/592] einsum: fixed lifetime bug in reduce_modes --- src/TiledArray/einsum/tiledarray.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 7e6e4251b3..41797efafa 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -361,7 +361,8 @@ auto reduce_modes(TA::DistArray orig, size_t drank) { container::svector ix1s = rng.lobound(); { - auto dlo = delta_trange.make_tile_range(r).lobound(); + auto d = delta_trange.make_tile_range(r); + auto dlo = d.lobound(); std::copy(dlo.begin(), dlo.end(), std::back_inserter(ix1s)); } From 09819dec3d3b3231e9273e367c9594d4f3a94aad Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 06:17:59 -0400 Subject: [PATCH 458/592] expressions_mixed: kronecker test used "new" retile as default result + test negative lobound if using signed 1-index type --- tests/expressions_mixed.cpp | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index b8de603356..bf79d86fc1 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -188,31 +188,28 @@ BOOST_AUTO_TEST_CASE(kronecker) { // includes target tiles that receive contributions from multiple source // tiles, tiny target tiles with single contribution, and tiles partially and - // completely outside the source range N.B. 
retile_v0 seems to struggle with - // completely empty tiles (e.g. add 47 to each 1-d range) - TA::TiledRange yrange{{5, 18, 20, 45}, {7, 20, 22, 45}}; + // completely outside the source range +#ifdef TA_SIGNED_1INDEX_TYPE + TA::TiledRange yrange{{-1, 18, 20, 45, 47}, {-1, 20, 22, 45, 47}}; +#else + TA::TiledRange yrange{{5, 18, 20, 45, 47}, {7, 20, 22, 45, 47}}; +#endif TA::TSpArrayD y1; - // TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), - // trange2.dim(1)}; - // SpArrayKronDelta retiler( - // *GlobalFixture::world, retiler_range, - // SparseShape(detail::kronecker_shape(retiler_range), retiler_range), - // std::make_shared( - // *GlobalFixture::world, retiler_range.tiles_range().volume())); - // init_kronecker_delta(retiler); - // y("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); - y1 = TA::detail::retile_v1(x, yrange); + // identical to y1 = TA::detail::retile_v1(x, yrange); + TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), + trange2.dim(1)}; + SpArrayKronDelta retiler( + *GlobalFixture::world, retiler_range, + SparseShape(detail::kronecker_shape(retiler_range), retiler_range), + std::make_shared( + *GlobalFixture::world, retiler_range.tiles_range().volume())); + init_kronecker_delta(retiler); + y1("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); // std::cout << "y1 = " << y1 << std::endl; - // why deadlock without this? 
- y1.world().gop.fence(); - TA::TSpArrayD y_ref = TA::detail::retile_v0(x, yrange); + auto y_ref = TA::retile(x, yrange); // std::cout << "y_ref = " << y_ref << std::endl; BOOST_CHECK((y1("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); - - auto y2 = TA::detail::retile_v2(x, yrange); - // std::cout << "y2 = " << y2 << std::endl; - BOOST_CHECK((y2("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); } BOOST_AUTO_TEST_SUITE_END() From ff7db67f46e66859a127807c14e3cf9ba92bed1d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 07:51:54 -0400 Subject: [PATCH 459/592] bump VG CMake kit tag to pull in most recent {blas,lapack}pp to be able to build with post-5.6 rocm --- external/versions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/versions.cmake b/external/versions.cmake index e0680a6d48..a005bcdec5 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,7 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG d5c0a6f9ff6dc97cbb5132912733e1eb1cf73f1e) +set(TA_TRACKED_VGCMAKEKIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e) # N.B. 
may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) From cd7896180533fb9a382479e701d087f5f152cc05 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 11:02:27 -0400 Subject: [PATCH 460/592] introduced `make_uniform(Range1,tilesize)` --- src/TiledArray/tiled_range1.h | 28 ++++++++++++++++++---------- tests/tiled_range1.cpp | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 69e5a5eea3..e25c8a5357 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -244,26 +244,27 @@ class TiledRange1 { // clang-format off /// @brief makes a uniform (or, as uniform as possible) TiledRange1 - /// @param[in] range_size the range size + /// @param[in] range the Range to be tiled /// @param[in] target_tile_size the desired tile size - /// @return TiledRange1 obtained by tiling range `[0,range_size)` into - /// `ntiles = (range_size + target_tile_size - 1)/target_tile_size` - /// tiles; if `x = range_size % ntiles` is not zero, first `x` tiles + /// @return TiledRange1 obtained by tiling \p range into + /// `ntiles = (range.extent() + target_tile_size - 1)/target_tile_size` + /// tiles; if `x = range.extent() % ntiles` is not zero, first `x` tiles /// have size `target_tile_size` and last /// `ntiles - x` tiles have size `target_tile_size - 1`, else /// all tiles have size `target_tile_size` . 
// clang-format on - static TiledRange1 make_uniform(std::size_t range_size, + static TiledRange1 make_uniform(const Range1& range, std::size_t target_tile_size) { - if (range_size > 0) { + const auto range_extent = range.extent(); + if (range_extent > 0) { TA_ASSERT(target_tile_size > 0); std::size_t ntiles = - (range_size + target_tile_size - 1) / target_tile_size; - auto dv = std::div((long)(range_size + ntiles - 1), (long)ntiles); + (range_extent + target_tile_size - 1) / target_tile_size; + auto dv = std::div((long)(range_extent + ntiles - 1), (long)ntiles); auto avg_tile_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1; std::vector hashmarks; hashmarks.reserve(ntiles + 1); - std::size_t element = 0; + std::size_t element = range.lobound(); for (auto i = 0; i < num_avg_plus_one; ++i, element += avg_tile_size + 1) { hashmarks.push_back(element); @@ -272,12 +273,19 @@ class TiledRange1 { ++i, element += avg_tile_size) { hashmarks.push_back(element); } - hashmarks.push_back(range_size); + hashmarks.push_back(range.upbound()); return TiledRange1(hashmarks.begin(), hashmarks.end()); } else return TiledRange1{}; } + /// same as make_uniform(const Range1&, std::size_t) for a 0-based range + /// specified by its extent + static TiledRange1 make_uniform(std::size_t range_extent, + std::size_t target_tile_size) { + return make_uniform(Range1(0, range_extent), target_tile_size); + } + /// swapper /// \param other the range with which the contents of this range will be diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index eb94091e59..b4aef7f51c 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -294,6 +294,21 @@ BOOST_AUTO_TEST_CASE(concatenation) { } BOOST_AUTO_TEST_CASE(make_uniform) { + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{1, 1}, 0)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{3, 6}, 10)); + 
BOOST_CHECK(TiledRange1::make_uniform(Range1{3, 6}, 10) == + (TiledRange1{3, 6})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 60}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 60}, 10) == + (TiledRange1{10, 20, 30, 40, 50, 60})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 65}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 65}, 10) == + (TiledRange1{10, 20, 29, 38, 47, 56, 65})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 69}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 69}, 10) == + (TiledRange1{10, 20, 30, 40, 50, 60, 69})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 0)); BOOST_CHECK(TiledRange1::make_uniform(0, 0) == TiledRange1{}); BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 1)); From 09bb258c96e90631353dc2219ff67e0ba89c6f35 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 15:12:49 -0400 Subject: [PATCH 461/592] introduced Tile::total_size() to mirror Tensor::total_size --- src/TiledArray/tile.h | 14 +++++++++++++- src/TiledArray/type_traits.h | 2 ++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 1091362287..7d568f7200 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -215,11 +215,23 @@ class Tile { // Dimension information accessors ----------------------------------------- - /// Size accessors + /// Size accessor /// \return The number of elements in the tensor decltype(auto) size() const { return tensor().size(); } + /// Total size accessor + + /// \return The number of elements in the tensor, tallied across batches (if + /// any) + decltype(auto) total_size() const { + if constexpr (detail::has_member_function_total_size_anyreturn_v< + tensor_type>) { + return tensor().total_size(); + } else + return size(); + } + /// Range accessor /// \return An object describes the upper and lower bounds of the tensor data diff --git 
a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index 5c3d066e9c..80c6bd924f 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -322,6 +322,8 @@ GENERATE_HAS_MEMBER_TYPE(mapped_type) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(size) GENERATE_HAS_MEMBER_FUNCTION(size) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(total_size) +GENERATE_HAS_MEMBER_FUNCTION(total_size) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(data) GENERATE_HAS_MEMBER_FUNCTION(data) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(empty) From 2e907edccb949b21a461f6e55d67e7e1b78ec78d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 15:14:27 -0400 Subject: [PATCH 462/592] wrap launchHostFunc call in device_task_fn into DeviceSafeCall --- src/TiledArray/device/device_task_fn.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/device/device_task_fn.h b/src/TiledArray/device/device_task_fn.h index fada332c63..e376cc39e6 100644 --- a/src/TiledArray/device/device_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -121,7 +121,8 @@ struct deviceTaskFn : public TaskInterface { } else { // TODO should we use device callback or device events?? 
// insert device callback - TiledArray::device::launchHostFunc(*stream_, device_callback, task_); + DeviceSafeCall(TiledArray::device::launchHostFunc( + *stream_, device_callback, task_)); // processed sync, clear state stream_ = {}; } From a1a65c4daf76a7f0c36386b546b8b7fc457e60b1 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 27 Aug 2024 16:20:47 -0400 Subject: [PATCH 463/592] vector_of_array.h: added missing #include + reorg #includes in conversions.cpp --- src/TiledArray/conversions/vector_of_arrays.h | 2 ++ tests/conversions.cpp | 14 +++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/conversions/vector_of_arrays.h b/src/TiledArray/conversions/vector_of_arrays.h index 8b3f5ea8a4..29f4932ca5 100644 --- a/src/TiledArray/conversions/vector_of_arrays.h +++ b/src/TiledArray/conversions/vector_of_arrays.h @@ -5,6 +5,8 @@ #ifndef TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ #define TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ +#include + namespace TiledArray { namespace detail { diff --git a/tests/conversions.cpp b/tests/conversions.cpp index 107a383c00..9cab83bca7 100644 --- a/tests/conversions.cpp +++ b/tests/conversions.cpp @@ -23,13 +23,21 @@ * */ -#include "range_fixture.h" -#include "tiledarray.h" #include "unit_test_config.h" -#include "TiledArray/conversions/concat.h" #include "TiledArray/conversions/vector_of_arrays.h" +#include "TiledArray/conversions/concat.h" + +#include "TiledArray/conversions/dense_to_sparse.h" +#include "TiledArray/conversions/make_array.h" +#include "TiledArray/conversions/sparse_to_dense.h" +#include "TiledArray/conversions/to_new_tile_type.h" + +#include "TiledArray/expressions/tsr_expr.h" + +#include "range_fixture.h" + using namespace TiledArray; struct ConversionsFixture : public TiledRangeFixture { From 9bda8eaeda12f283eaa302ca26a5c84b04aadf03 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 28 Aug 2024 09:53:50 -0400 Subject: [PATCH 464/592] [skip ci] #define 
THRUST_DEVICE_SYSTEM if not defined to be able to use thrust headers from host sources --- src/TiledArray/device/thrust.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/TiledArray/device/thrust.h b/src/TiledArray/device/thrust.h index b98e425a46..2de7a5b8bb 100644 --- a/src/TiledArray/device/thrust.h +++ b/src/TiledArray/device/thrust.h @@ -32,6 +32,15 @@ #include #endif +// rocthrust headers rely on THRUST_DEVICE_SYSTEM being defined, which is only +// defined by the HIP-specific compilers to be usable with host compiler define +// it here explicitly +#ifdef TILEDARRAY_HAS_HIP +#ifndef THRUST_DEVICE_SYSTEM +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP +#endif +#endif + #include #include From 84be590e07ea9ab4f7ea88302e678f6e6746d712 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 31 Aug 2024 07:47:55 -0400 Subject: [PATCH 465/592] TiledRange1: can construct using a range of tile hashmarks --- src/TiledArray/tiled_range1.h | 29 ++++++++++++++++++++++++----- tests/tiled_range1.cpp | 15 +++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index e25c8a5357..4824dec26e 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -98,19 +98,38 @@ class TiledRange1 { /// Construct a 1D tiled range. - /// This will construct a 1D tiled range with tile boundaries ("hashmarks") - /// {\p t0 , \p t_rest... } + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). + /// The number of tile boundaries is n + 1, where n is the number of tiles. + /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... + /// Tiles are indexed starting with 0. + /// \tparam Integer An integral type + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest + /// \note validity of the {\p t0 , \p t_rest... 
} range is checked using + /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined + template >> + explicit TiledRange1(Range&& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); + } + + /// Construct a 1D tiled range. + + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... /// Tiles are indexed starting with 0. /// \tparam Integer An integral type - /// \param list The list of tile boundaries in order from smallest to largest + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest /// \note validity of the {\p t0 , \p t_rest... } range is checked using /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined template >> - explicit TiledRange1(const std::initializer_list& list) { - init_tiles_(list.begin(), list.end(), 0); + explicit TiledRange1(const std::initializer_list& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); } /// Copy assignment operator diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index b4aef7f51c..f01a9a208e 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -110,6 +110,21 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // check constructor using range of tile boundaries. 
+ { + if (Range1Fixture::ntiles == 5) { + TiledRange1 r(a); + BOOST_CHECK_EQUAL(r.tiles_range().first, tiles.first); + BOOST_CHECK_EQUAL(r.tiles_range().second, tiles.second); + BOOST_CHECK_EQUAL(r.elements_range().first, elements.first); + BOOST_CHECK_EQUAL(r.elements_range().second, elements.second); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(r.tile(i).first, a[i]); + BOOST_CHECK_EQUAL(r.tile(i).second, a[i + 1]); + } + } + } + // check construction with negative index values #ifdef TA_SIGNED_1INDEX_TYPE { From cf0e0ea0a351717c752546870b914917a1cbb33a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 31 Aug 2024 08:04:34 -0400 Subject: [PATCH 466/592] <-> Eigen::{Vector,Matrix,Tensor} conversions can handle DistArrays with non-zero base Ranges --- src/TiledArray/conversions/eigen.h | 209 +++++++----- tests/eigen.cpp | 491 ++++++++++++++++------------- tests/range_fixture.h | 47 ++- 3 files changed, 443 insertions(+), 304 deletions(-) diff --git a/src/TiledArray/conversions/eigen.h b/src/TiledArray/conversions/eigen.h index 816a8bfe24..3caeecc178 100644 --- a/src/TiledArray/conversions/eigen.h +++ b/src/TiledArray/conversions/eigen.h @@ -196,20 +196,26 @@ eigen_map(T& tensor) { /// Copy a block of an Eigen matrix into a tensor -/// A block of \c matrix will be copied into \c tensor. The block -/// dimensions will be determined by the dimensions of the tensor's range. +// clang-format off +/// A block of \c matrix will be copied into \c tensor. If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0]) x `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] - base_offsets[1])`, +/// else it is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. +/// /// \tparam T A tensor type, e.g. 
TiledArray::Tensor /// \tparam Derived The derived type of an Eigen matrix /// \param[in] matrix The object that will be assigned the content of \c tensor -/// \param[out] tensor The object that will be assigned the content of \c matrix +/// \param[out] tensor The object that will contain the block of \c matrix +/// \param[in] base_offsets The base offsets for the tensor range (should be lobound of the array that will contain tensor as a tile) /// \throw TiledArray::Exception When the dimensions of \c tensor are not equal /// to 1 or 2. /// \throw TiledArray::Exception When the range of \c tensor is outside the /// range of \c matrix . +// clang-format on template >* = nullptr> -inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, - T& tensor) { +inline void eigen_submatrix_to_tensor( + const Eigen::MatrixBase& matrix, T& tensor, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -223,60 +229,71 @@ inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, if (tensor.range().rank() == 2u) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; + const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = 
tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy matrix eigen_map(tensor, tensor_extent_0, tensor_extent_1) = matrix.block( - tensor_lower_0, tensor_lower_1, tensor_extent_0, tensor_extent_1); + tensor_lower_0 - base_offsets[0], tensor_lower_1 - base_offsets[1], + tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - const std::size_t tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; // Check that matrix is a vector. TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the row vector to tensor eigen_map(tensor, 1, tensor_extent_0) = - matrix.block(0, tensor_lower_0, 1, tensor_extent_0); + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the column vector to tensor eigen_map(tensor, tensor_extent_0, 1) = - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1); + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1); } } } /// Copy the content of a tensor into an Eigen matrix block -/// The content of tensor will be copied into a block of matrix. The block -/// dimensions will be determined by the dimensions of the tensor's range. -/// \tparam T A tensor type, e.g. 
TiledArray::Tensor -/// \tparam Derived The derived type of an Eigen matrix -/// \param[in] tensor The object that will be copied to \c matrix -/// \param[out] matrix The object that will be assigned the content of \c tensor -/// \throw TiledArray::Exception When the dimensions of \c tensor are not equal -/// to 1 or 2. -/// \throw TiledArray::Exception When the range of \c tensor is outside the -/// range of \c matrix . +/// The content of tensor will be copied into a block of matrix. +/// If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], +/// tensor.range().upbound()[0] - base_offsets[0]) x +/// `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] +/// - base_offsets[1])`, else it is `[tensor.range().lobound()[0] - +/// base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. \tparam T +/// A tensor type, e.g. TiledArray::Tensor \tparam Derived The derived type of +/// an Eigen matrix \param[in] tensor The object that will be copied to \c +/// matrix \param[out] matrix The object that will be assigned the content of \c +/// tensor \param[in] base_offsets The base offsets for the tensor range (should +/// be lobound of the array that will contain tensor as a tile) \throw +/// TiledArray::Exception When the dimensions of \c tensor are not equal to 1 +/// or 2. \throw TiledArray::Exception When the range of \c tensor is outside +/// the range of \c matrix . 
template >* = nullptr> -inline void tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase& matrix) { +inline void tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase& matrix, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -290,39 +307,44 @@ inline void tensor_to_eigen_submatrix(const T& tensor, if (tensor.range().rank() == 2) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; + const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy tensor into matrix - matrix.block(tensor_lower_0, tensor_lower_1, tensor_extent_0, + matrix.block(tensor_lower_0 - base_offsets[0], + tensor_lower_1 - base_offsets[1], tensor_extent_0, tensor_extent_1) = eigen_map(tensor, tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - const std::size_t 
tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into row vector - matrix.block(0, tensor_lower_0, 1, tensor_extent_0) = + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0) = eigen_map(tensor, 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into column vector - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1) = + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1) = eigen_map(tensor, tensor_extent_0, 1); } } @@ -344,7 +366,12 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, const typename A::ordinal_type i, madness::AtomicInt* counter) { typename A::value_type tensor(array->trange().make_tile_range(i)); - eigen_submatrix_to_tensor(*matrix, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array->trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + eigen_submatrix_to_tensor(*matrix, tensor, array_lobound); array->set(i, tensor); (*counter)++; } @@ -357,10 +384,11 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, /// \param tensor The tensor to be copied /// \param counter The task counter template -void counted_tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase* matrix, - madness::AtomicInt* counter) { - tensor_to_eigen_submatrix(tensor, *matrix); +void 
counted_tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase* matrix, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_submatrix(tensor, *matrix, base_offsets); (*counter)++; } @@ -524,6 +552,12 @@ array_to_eigen(const DistArray& array) { EigenMatrix matrix = EigenMatrix::Zero(array_extent[0], (rank == 2 ? array_extent[1] : 1)); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array.trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + // Spawn tasks to copy array tiles to the Eigen matrix madness::AtomicInt counter; counter = 0; @@ -533,7 +567,7 @@ array_to_eigen(const DistArray& array) { array.world().taskq.add( &detail::counted_tensor_to_eigen_submatrix< EigenMatrix, typename DistArray::value_type>, - array.find(i), &matrix, &counter); + array.find(i), &matrix, array_lobound, &counter); ++n; } } @@ -565,6 +599,7 @@ array_to_eigen(const DistArray& array) { /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -634,6 +669,7 @@ inline A row_major_buffer_to_array( /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -705,11 +741,13 @@ inline A column_major_buffer_to_array( /// match. 
// clang-format on template + typename Tensor_, std::size_t NumIndices_Sz = NumIndices_> inline void eigen_subtensor_to_tensor( const Eigen::Tensor& src, - Tensor_& dst) { + Tensor_& dst, + std::array base_offsets = {}) { TA_ASSERT(dst.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -718,6 +756,13 @@ inline void eigen_subtensor_to_tensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -725,8 +770,8 @@ inline void eigen_subtensor_to_tensor( }; const auto& dst_range = dst.range(); - auto src_block = - src.slice(to_array(dst_range.lobound()), to_array(dst_range.extent())); + auto src_block = src.slice(to_base0(to_array(dst_range.lobound())), + to_array(dst_range.extent())); auto dst_eigen_map = Eigen::TensorMap< Eigen::Tensor>( dst.data(), to_array(dst_range.extent())); @@ -758,11 +803,13 @@ inline void eigen_subtensor_to_tensor( /// of \c src and \c dst do not match. 
// clang-format on template + typename IndexType_, std::size_t NumIndices_Sz = NumIndices_> inline void tensor_to_eigen_subtensor( const Tensor_& src, - Eigen::Tensor& dst) { + Eigen::Tensor& dst, + std::array base_offsets = {}) { TA_ASSERT(src.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -771,6 +818,13 @@ inline void tensor_to_eigen_subtensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -778,8 +832,8 @@ inline void tensor_to_eigen_subtensor( }; const auto& src_range = src.range(); - auto dst_block = - dst.slice(to_array(src_range.lobound()), to_array(src_range.extent())); + auto dst_block = dst.slice(to_base0(to_array(src_range.lobound())), + to_array(src_range.extent())); auto src_eigen_map = Eigen::TensorMap< Eigen::Tensor>( src.data(), to_array(src_range.extent())); @@ -809,7 +863,13 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - eigen_subtensor_to_tensor(*src, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + dst->trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + dst->trange().rank(), + array_lobound.begin()); + eigen_subtensor_to_tensor(*src, tensor, array_lobound); dst->set(i, tensor); (*counter)++; } @@ -822,10 +882,11 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, /// \param dst The destination tensor /// \param counter The task counter template -void 
counted_tensor_to_eigen_subtensor(const TA_Tensor_& src, - Eigen_Tensor_* dst, - madness::AtomicInt* counter) { - tensor_to_eigen_subtensor(src, *dst); +void counted_tensor_to_eigen_subtensor( + const TA_Tensor_& src, Eigen_Tensor_* dst, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_subtensor(src, *dst, base_offsets); (*counter)++; } @@ -1004,6 +1065,12 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, result_type result(src.trange().elements_range().extent()); result.setZero(); + const auto* range_lobound_data = + src.trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + src.trange().rank(), + array_lobound.begin()); + // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; counter = 0; @@ -1012,7 +1079,7 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, if (!src.is_zero(i)) { src.world().taskq.add( &detail::counted_tensor_to_eigen_subtensor, - src.find(i), &result, &counter); + src.find(i), &result, array_lobound, &counter); ++n; } } diff --git a/tests/eigen.cpp b/tests/eigen.cpp index d577804417..11ca7088b1 100644 --- a/tests/eigen.cpp +++ b/tests/eigen.cpp @@ -29,9 +29,16 @@ struct EigenFixture : public TiledRangeFixture { : trange(dims.begin(), dims.begin() + 2), trange1(dims.begin(), dims.begin() + 1), trangeN(dims.begin(), dims.begin() + GlobalFixture::dim), + trange_base1(dims_base1.begin(), dims_base1.begin() + 2), + trange1_base1(dims_base1.begin(), dims_base1.begin() + 1), + trangeN_base1(dims_base1.begin(), + dims_base1.begin() + GlobalFixture::dim), array(*GlobalFixture::world, trange), array1(*GlobalFixture::world, trange1), arrayN(*GlobalFixture::world, trangeN), + array_base1(*GlobalFixture::world, trange_base1), + array1_base1(*GlobalFixture::world, trange1_base1), + arrayN_base1(*GlobalFixture::world, trangeN_base1), matrix(dims[0].elements_range().second, dims[1].elements_range().second), 
rmatrix(dims[0].elements_range().second, @@ -43,9 +50,15 @@ struct EigenFixture : public TiledRangeFixture { TiledRange trange; TiledRange trange1; TiledRange trangeN; + TiledRange trange_base1; // base-1 version of trange + TiledRange trange1_base1; // base-1 version of trange1 + TiledRange trangeN_base1; // base-1 version of trangeN TArrayI array; TArrayI array1; TArrayI arrayN; + TArrayI array_base1; // base-1 version of array + TArrayI array1_base1; // base-1 version of array1 + TArrayI arrayN_base1; // base-1 version of array1 Eigen::MatrixXi matrix; EigenMatrixXi rmatrix; Eigen::VectorXi vector; @@ -172,15 +185,23 @@ BOOST_AUTO_TEST_CASE(matrix_to_array) { (array = eigen_to_array(*GlobalFixture::world, trange, matrix))); // Check that the data in array is equal to that in matrix - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tile.get()[*tile_it], - matrix((*tile_it)[0], (*tile_it)[1])); + auto test = [&](const auto& array, auto base = 0) { + for (Range::const_iterator it = array.tiles_range().begin(); + it != array.tiles_range().end(); ++it) { + Future tile = array.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tile.get()[*tile_it], + matrix((*tile_it)[0] - base, (*tile_it)[1] - base)); + } } - } + }; + test(array, 0); + + // same with base-1 + BOOST_CHECK_NO_THROW((array_base1 = eigen_to_array( + *GlobalFixture::world, trange_base1, matrix))); + test(array_base1, 1); } BOOST_AUTO_TEST_CASE(vector_to_array) { @@ -193,14 +214,23 @@ BOOST_AUTO_TEST_CASE(vector_to_array) { trange1, vector))); // Check that the data in array matches the data in vector - for (Range::const_iterator it = array1.tiles_range().begin(); - it != 
array1.tiles_range().end(); ++it) { - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0])); + auto test = [&](const auto& array1, auto base = 0) { + for (Range::const_iterator it = array1.tiles_range().begin(); + it != array1.tiles_range().end(); ++it) { + Future tile = array1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0] - base)); + } } - } + }; + + test(array1, 0); + + // same with base-1 + BOOST_CHECK_NO_THROW((array1_base1 = eigen_to_array( + *GlobalFixture::world, trange1_base1, vector))); + test(array1_base1, 1); } BOOST_AUTO_TEST_CASE(array_to_matrix) { @@ -208,168 +238,180 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) { return array_to_eigen, DensePolicy, Eigen::RowMajor>(array); }; - if (GlobalFixture::world->size() == 1) { - // Fill the array with random data - GlobalFixture::world->srand(27); - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - TArrayI::value_type tile(array.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr = base == 1 ? 
array_base1 : array; + + if (GlobalFixture::world->size() == 1) { + // Fill the array with random data + GlobalFixture::world->srand(27); + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - array.set(*it, tile); - } - - // Convert the array to an Eigen matrices: column-major (matrix) and - // row-major (rmatrix) - BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array)); - BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array)); - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1)); - BOOST_CHECK_EQUAL(rmatrix.rows(), - array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(rmatrix.cols(), - array.trange().elements_range().extent(1)); - - // Check that the data in matrix matches the data in array - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); + // Convert the array to an Eigen matrices: column-major (matrix) and + // row-major (rmatrix) + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + 
BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1)); + BOOST_CHECK_EQUAL(rmatrix.rows(), + arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(rmatrix.cols(), + arr.trange().elements_range().extent(1)); + + // Check that the data in matrix matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array), TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr), TiledArray::Exception); + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = 
GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - array.set(*it, tile); - } - - // Distribute the data of array1 to all nodes - array.make_replicated(); - - BOOST_CHECK(array.pmap()->is_replicated()); - - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array)); - BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1)); - BOOST_CHECK_EQUAL(rmatrix.rows(), - array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(rmatrix.cols(), - array.trange().elements_range().extent(1)); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - BOOST_CHECK(array.is_local(*it)); - - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); + // Distribute the data of array1 to all nodes + arr.make_replicated(); + + BOOST_CHECK(arr.pmap()->is_replicated()); + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1)); + BOOST_CHECK_EQUAL(rmatrix.rows(), + arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(rmatrix.cols(), + arr.trange().elements_range().extent(1)); + + // Check that the data in vector matches the data in array + 
for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(array_to_vector) { - if (GlobalFixture::world->size() == 1) { - // Fill the array with random data - GlobalFixture::world->srand(27); - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr1 = base == 1 ? 
array1_base1 : array1; + + if (GlobalFixture::world->size() == 1) { + // Fill the array with random data + GlobalFixture::world->srand(27); + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); + + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array1), 
TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array1.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array1.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr1), TiledArray::Exception); + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr1.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr1.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - // Distribute the data of array1 to all nodes - array1.make_replicated(); + // Distribute the data of array1 to all nodes + arr1.make_replicated(); - BOOST_CHECK(array1.pmap()->is_replicated()); + BOOST_CHECK(arr1.pmap()->is_replicated()); - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for 
(Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - BOOST_CHECK(array1.is_local(*it)); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + BOOST_CHECK(arr1.is_local(*it)); - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(subtensor_to_tensor) { @@ -430,22 +472,26 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) { BOOST_CHECK(eq() == true); } - // Copy matrix to array - BOOST_CHECK_NO_THROW((array = eigen_tensor_to_array( - *GlobalFixture::world, trangeN, tensor))); - - // Check that the data in array is equal to that in matrix - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - std::array idx; - auto& t_idx = *tile_it; - std::copy(t_idx.begin(), t_idx.end(), idx.begin()); - BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + for (auto base : {0, 1}) { + auto& tr = base == 1 ? trangeN_base1 : trangeN; + auto& arr = base == 1 ? 
arrayN_base1 : arrayN; + // Copy matrix to array + BOOST_CHECK_NO_THROW((arr = eigen_tensor_to_array( + *GlobalFixture::world, tr, tensor))); + + // Check that the data in array is equal to that in matrix + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + auto& t_idx = *tile_it; + std::array idx; + for (auto d = 0; d != GlobalFixture::dim; ++d) idx[d] = t_idx[d] - base; + BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + } } - } + } // base } BOOST_AUTO_TEST_CASE(array_to_tensor) { @@ -462,57 +508,70 @@ BOOST_AUTO_TEST_CASE(array_to_tensor) { return result; }; - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = arrayN.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = arrayN.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(arrayN.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr = base == 1 ? 
arrayN_base1 : arrayN; + + auto to_base0 = [&](const auto& arr) { + std::array result; + for (int i = 0; i < GlobalFixture::dim; ++i) result[i] = arr[i] - base; + return result; + }; + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - arrayN.set(*it, tile); - } - if (GlobalFixture::world->size() > 1) { - // Check that array_to_eigen_tensor throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen_tensor(arrayN), - TiledArray::Exception); - } + if (GlobalFixture::world->size() > 1) { + // Check that array_to_eigen_tensor throws when there is more than one + // node + BOOST_CHECK_THROW(array_to_eigen_tensor(arr), + TiledArray::Exception); + } - // Distribute the data of arrayN to all nodes - if (GlobalFixture::world->size() > 1) { - arrayN.make_replicated(); - BOOST_CHECK(arrayN.pmap()->is_replicated()); - } + // Distribute the data of arrayN to all nodes + if (GlobalFixture::world->size() > 1) { + arr.make_replicated(); + BOOST_CHECK(arr.pmap()->is_replicated()); + } + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arr)); + BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL_COLLECTIONS( + tensor.dimensions().begin(), tensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + arr.trange().elements_range().extent().end()); + BOOST_CHECK_EQUAL_COLLECTIONS( + rtensor.dimensions().begin(), rtensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + 
arr.trange().elements_range().extent().end()); - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arrayN)); - BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arrayN)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL_COLLECTIONS( - tensor.dimensions().begin(), tensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - BOOST_CHECK_EQUAL_COLLECTIONS( - rtensor.dimensions().begin(), rtensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - - // Check that the data in vector matches the data in array - for (Range::const_iterator it = arrayN.tiles_range().begin(); - it != arrayN.tiles_range().end(); ++it) { - BOOST_CHECK(arrayN.is_local(*it)); - - Future tile = arrayN.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tensor(to_array(*tile_it)), tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rtensor(to_array(*tile_it)), tile.get()[*tile_it]); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rtensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + } } - } + } // base=0,1 } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/range_fixture.h b/tests/range_fixture.h index 3eb9afd611..6b0fcd1604 100644 --- a/tests/range_fixture.h +++ b/tests/range_fixture.h @@ -65,37 +65,46 @@ struct RangeFixture { }; struct Range1Fixture { + using index1_type = 
Range1::index1_type; static const size_t ntiles = 5; Range1Fixture() - : a(init_tiling()), - tiles(0, a.size() - 1), - elements(a.front(), a.back()), - tr1(a.begin(), a.end()) {} + : tr1_hashmarks(make_hashmarks()), + a(tr1_hashmarks), + tiles(0, tr1_hashmarks.size() - 1), + elements(tr1_hashmarks.front(), tr1_hashmarks.back()), + tr1(tr1_hashmarks), + tr1_base1(make_hashmarks(1)) {} ~Range1Fixture() {} template - static std::array init_tiling() { - std::array result; - result[0] = 0u; + static std::array make_hashmarks(index1_type offset = 0) { + std::array result; + result[0] = offset; for (std::size_t i = 1; i < D; ++i) result[i] = result[i - 1] + GlobalFixture::primes[i - 1]; return result; } - const std::array a; - const TiledRange1::range_type tiles; - const TiledRange1::range_type elements; - TiledRange1 tr1; + const std::array tr1_hashmarks; + const std::array + a; // copy of tr1_hashmarks, to make legacy tests build + const TiledRange1::range_type tiles; // = tr1.tiles_range() + const TiledRange1::range_type elements; // = tr1.elements_range() + TiledRange1 tr1; // base-0 TiledRange1 std::array tile; + TiledRange1 tr1_base1; // base-1 TiledRange1 }; struct TiledRangeFixtureBase : public Range1Fixture { TiledRangeFixtureBase() { std::fill(dims.begin(), dims.end(), tr1); std::fill(extents.begin(), extents.end(), tr1.extent()); + std::fill(dims_base1.begin(), dims_base1.end(), tr1_base1); } - std::array dims; + std::array dims; // base-0 TiledRange1's + std::array + dims_base1; // base-1 version of dims std::array extents; }; // struct TiledRangeFixtureBase @@ -106,17 +115,21 @@ struct TiledRangeFixture : public RangeFixture, public TiledRangeFixtureBase { TiledRangeFixture() : tiles_range(TiledRangeFixture::index(GlobalFixture::dim, 0), TiledRangeFixture::index(GlobalFixture::dim, 5)), - elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, 0), - TiledRangeFixture::tile_index(GlobalFixture::dim, a[5])), - tr(dims.begin(), dims.end()) {} + 
elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.front()), + TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.back())), + tr(dims.begin(), dims.end()), + tr_base1(dims_base1.begin(), dims_base1.end()) {} ~TiledRangeFixture() {} static tile_index fill_tile_index(TRangeN::range_type::index::value_type); const TRangeN::range_type tiles_range; - const TRangeN::range_type elements_range; - TRangeN tr; + const TRangeN::range_type elements_range; // elements range of tr + TRangeN tr; // base-0 TiledRangeN + TRangeN tr_base1; // base-1 version of tr }; #endif // TILEDARRAY_RANGE_FIXTURE_H__INCLUDED From 9129da9b5f5ff9104878819b1b6b0d9a81411e15 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:33:05 -0400 Subject: [PATCH 467/592] [skip ci] typo --- src/TiledArray/conversions/concat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index cc55f91e17..dd35e09456 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -64,7 +64,7 @@ DistArray concat( using std::begin; using std::end; - index b(r), e(r); // updated for concatted modes only + index b(r), e(r); // updated for concatenated modes only std::fill(begin(b), end(b), 0); for (auto i = 0ul; i != arrays.size(); ++i) { auto& tr = arrays[i].trange(); From 959adb1bcafc9bef05b8f1eb05f9b21e3437d47a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:34:27 -0400 Subject: [PATCH 468/592] remove duplicate vlock copy in concat --- src/TiledArray/conversions/concat.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index dd35e09456..e7b3e9da55 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -97,9 +97,6 @@ DistArray concat( result.make_tsrexpr(annot).block(tile_begin_end[i].first, 
tile_begin_end[i].second) = arrays[i].make_tsrexpr(annot); - result.make_tsrexpr(annot).block(tile_begin_end[i].first, - tile_begin_end[i].second) = - arrays[i].make_tsrexpr(annot); } } result.world().gop.fence(); From 4ed437c0ee1525a730725bf4906bccea3c390c69 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:35:10 -0400 Subject: [PATCH 469/592] introduced tile_ranges_match_trange(DistArray) for validating tile ranges against trange --- src/TiledArray/dist_array.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c2645dd7ce..3bc9fe3c62 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1779,6 +1779,22 @@ auto rank(const DistArray& a) { return a.trange().tiles_range().rank(); } +/// Checks if for every tile `i` its range matches the tile range produced by +/// `a.trange()` + +/// @return `a.get(i)->range() == a.trange().make_tile_range(i)` for every tile +/// `i` +template +bool tile_ranges_match_trange(const DistArray& a) { + auto end = a.end(); + for (auto it = a.begin(); it != end; ++it) { + if (it->is_local() && !a.is_zero(it.index())) + if ((*it).get().range() != a.trange().make_tile_range(it.index())) + return false; + } + return true; +} + /// /// \brief Get the total elements in the non-zero tiles of an array. 
/// For tensor-of-tensor tiles, the total is the sum of the number of From 7f687b306faad027608dc915b1d1840b789aeb64 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:35:47 -0400 Subject: [PATCH 470/592] SizeArray is a viewable range --- src/TiledArray/size_array.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index bd52139ce5..ef2ed1e121 100644 --- a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -26,6 +26,8 @@ #include #include +#include + namespace TiledArray { namespace detail { @@ -445,6 +447,20 @@ class SizeArray { }; // class SizeArray +} // namespace detail +} // namespace TiledArray + +namespace ranges { +template +inline constexpr bool enable_view> = true; +} // namespace ranges + +static_assert(ranges::range>); +static_assert( + ranges::viewable_range>); + +namespace TiledArray::detail { + template std::enable_if_t< is_sized_range_v> && @@ -473,7 +489,6 @@ inline std::ostream& operator<<(std::ostream& os, return os; } -} // namespace detail -} // namespace TiledArray +} // namespace TiledArray::detail #endif // TILEDARRAY_SIZE_ARRAY_H__INCLUDED From 65b8520945baebedfb90d2785bc42b3841d3b58c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:38:19 -0400 Subject: [PATCH 471/592] fixed assignment to block expression from an expression with nonzero base --- src/TiledArray/expressions/expr.h | 13 +++++++++++-- tests/expressions_fixture.h | 19 +++++++++++++++++++ tests/expressions_impl.h | 29 +++++++++++++++++++++++++++++ tests/range_fixture.h | 2 +- 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index c3fdd6423b..8d52990eef 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,6 +47,9 @@ #include +#include +#include + namespace TiledArray::expressions { template @@ -509,8 +512,14 @@ 
class Expr { if (tsr.array().trange().tiles_range().volume() != 0) { // N.B. must deep copy TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound())); - const container::svector shift = - tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(); + // N.B. this expression's range, + // dist_eval.trange().elements_range().lobound(), may not be zero! + const auto shift = + ranges::views::zip_with( + [](auto a, auto b) { return a - b; }, + tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(), + dist_eval.trange().elements_range().lobound()) | + ranges::to>(); std::shared_ptr shift_op = std::make_shared(shift_op_type(shift)); diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 94c09a7449..7a7be4c9af 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -57,6 +57,8 @@ struct ExpressionsFixture : public TiledRangeFixture { ExpressionsFixture() : s_tr_1(make_random_sparseshape(tr)), s_tr_2(make_random_sparseshape(tr)), + s_tr_base1_1(make_random_sparseshape(tr_base1)), + s_tr_base1_2(make_random_sparseshape(tr_base1)), s_tr1_1(make_random_sparseshape(trange1)), s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), @@ -65,6 +67,9 @@ struct ExpressionsFixture : public TiledRangeFixture { a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), + a_base1(*GlobalFixture::world, tr_base1, s_tr_base1_1), + b_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), + c_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), aC(*GlobalFixture::world, trangeC, s_trC), aC_f(*GlobalFixture::world, trangeC_f, s_trC_f), u(*GlobalFixture::world, trange1, s_tr1_1), @@ -72,12 +77,16 @@ struct ExpressionsFixture : public TiledRangeFixture { w(*GlobalFixture::world, trange2, s_tr2) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); random_fill(aC); 
GlobalFixture::world->gop.fence(); a.truncate(); b.truncate(); + a_base1.truncate(); + b_base1.truncate(); u.truncate(); v.truncate(); } @@ -89,6 +98,9 @@ struct ExpressionsFixture : public TiledRangeFixture { : a(*GlobalFixture::world, tr), b(*GlobalFixture::world, tr), c(*GlobalFixture::world, tr), + a_base1(*GlobalFixture::world, tr_base1), + b_base1(*GlobalFixture::world, tr_base1), + c_base1(*GlobalFixture::world, tr_base1), u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), w(*GlobalFixture::world, trange2), @@ -96,6 +108,8 @@ struct ExpressionsFixture : public TiledRangeFixture { aC_f(*GlobalFixture::world, trangeC_f) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); random_fill(aC); @@ -229,6 +243,8 @@ struct ExpressionsFixture : public TiledRangeFixture { SparseShape s_tr_1; SparseShape s_tr_2; + SparseShape s_tr_base1_1; + SparseShape s_tr_base1_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; @@ -237,6 +253,9 @@ struct ExpressionsFixture : public TiledRangeFixture { TArray a; TArray b; TArray c; + TArray a_base1; + TArray b_base1; + TArray c_base1; TArray u; TArray v; TArray w; diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 268b118568..ca8027c03d 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -32,6 +32,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { auto& a = F::a; auto& c = F::c; auto& aC = F::aC; + auto& a_base1 = F::a_base1; const auto& ca = a; const std::array lobound{{3, 3, 3}}; @@ -66,6 +67,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { BOOST_CHECK_NO_THROW(c("a,b,c") = ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5))); + BOOST_CHECK_NO_THROW(c("a,b,c") = a_base1("a,b,c").block(lobound, upbound)); + // make sure that c("abc") = a("abc") does a deep copy { BOOST_CHECK_NO_THROW(c("a,b,c") = a("a, b, c")); @@ -291,6 +294,7 @@ 
BOOST_FIXTURE_TEST_CASE_TEMPLATE(block, F, Fixtures, F) { auto& a = F::a; auto& b = F::b; auto& c = F::c; + auto& a_base1 = F::a_base1; BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -683,6 +687,31 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { } } +BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block_base1, F, Fixtures, F) { + auto& a = F::a; + auto& b = F::b; + auto& c = F::c; + auto& a_base1 = F::a_base1; + auto& c_base1 = F::c_base1; + auto& ntiles = F::ntiles; + + c.fill_local(0.0); + c_base1.fill_local(0.0); + + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = + a_base1("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW( + c_base1("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); +} + BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures, F) { auto& a = F::a; diff --git a/tests/range_fixture.h b/tests/range_fixture.h index 6b0fcd1604..5a554eab7c 100644 --- a/tests/range_fixture.h +++ b/tests/range_fixture.h @@ -66,7 +66,7 @@ struct RangeFixture { struct Range1Fixture { using index1_type = Range1::index1_type; - static const size_t ntiles = 5; + static const inline size_t ntiles = 5; Range1Fixture() : tr1_hashmarks(make_hashmarks()), From d9eb67738424ad4adfbdf24f3ce2c88a2d4d171a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 11:24:53 -0400 Subject: [PATCH 472/592] [skip ci] to_container.hpp -> range/conversion.hpp to_container.hpp is deprecated --- src/TiledArray/expressions/expr.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 8d52990eef..f6d2ff1376 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,7 +47,7 @@ #include -#include +#include #include namespace TiledArray::expressions { From 6e5df0f44e79c93ef79a1356f3ba1480188356ba Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 11:52:45 -0400 Subject: [PATCH 473/592] device::Env::initialize: use correct page sizes for Umpire allocations + do not allocate anything at the start --- src/TiledArray/external/device.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 44d9c77a68..38bcbbc745 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -503,8 +503,7 @@ class Env { /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least /// this size (bytes) [default=2^25] /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that - /// allocator returned by `this->pinned_allocator()` can allocate; - /// this allocator is not used by default [default=0] + /// allocator returned by `this->pinned_allocator()` can allocate [default=2^40] // clang-format on static void initialize(World& world = TiledArray::get_default_world(), const std::uint64_t page_size = (1ul << 25), @@ -563,8 +562,9 @@ class Env { // allocate all currently-free memory for UM pool auto um_dynamic_pool = rm.makeAllocator( - "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - pinned_alloc_limit); + "UMDynamicPool", rm.getAllocator("UM"), + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); // allocate zero memory for device pool auto dev_size_limited_alloc = @@ -573,8 +573,9 @@ class 
Env { mem_total_free.first); auto dev_dynamic_pool = rm.makeAllocator( - "DEVICEDynamicPool", dev_size_limited_alloc, 0, - pinned_alloc_limit); + "DEVICEDynamicPool", dev_size_limited_alloc, + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); // allocate pinned_alloc_limit in pinned memory auto pinned_size_limited_alloc = @@ -584,7 +585,9 @@ class Env { auto pinned_dynamic_pool = rm.makeAllocator( "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, - page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size, + /* alignment */ TILEDARRAY_ALIGN_SIZE); auto env = std::unique_ptr(new Env( world, num_visible_devices, compute_devices, num_streams_per_device, From 086e1e4a384d1a0792c4e2eb744247840b6cc272 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 11:52:45 -0400 Subject: [PATCH 474/592] device::Env::initialize: use correct page sizes for Umpire allocations + do not allocate anything at the start --- src/TiledArray/external/device.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 44d9c77a68..38bcbbc745 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -503,8 +503,7 @@ class Env { /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least /// this size (bytes) [default=2^25] /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that - /// allocator returned by `this->pinned_allocator()` can allocate; - /// this allocator is not used by default [default=0] + /// allocator returned by `this->pinned_allocator()` can allocate [default=2^40] // clang-format on static void initialize(World& world = 
TiledArray::get_default_world(), const std::uint64_t page_size = (1ul << 25), @@ -563,8 +562,9 @@ class Env { // allocate all currently-free memory for UM pool auto um_dynamic_pool = rm.makeAllocator( - "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - pinned_alloc_limit); + "UMDynamicPool", rm.getAllocator("UM"), + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); // allocate zero memory for device pool auto dev_size_limited_alloc = @@ -573,8 +573,9 @@ class Env { mem_total_free.first); auto dev_dynamic_pool = rm.makeAllocator( - "DEVICEDynamicPool", dev_size_limited_alloc, 0, - pinned_alloc_limit); + "DEVICEDynamicPool", dev_size_limited_alloc, + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); // allocate pinned_alloc_limit in pinned memory auto pinned_size_limited_alloc = @@ -584,7 +585,9 @@ class Env { auto pinned_dynamic_pool = rm.makeAllocator( "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, - page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size, + /* alignment */ TILEDARRAY_ALIGN_SIZE); auto env = std::unique_ptr(new Env( world, num_visible_devices, compute_devices, num_streams_per_device, From bf89f5919675aa390377eccaf2cf932c2926f286 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 16:13:56 -0400 Subject: [PATCH 475/592] [skip ci] svd dox fixup --- src/TiledArray/math/linalg/non-distributed/svd.h | 10 +++++----- src/TiledArray/math/linalg/scalapack/svd.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h index e0094ef906..3e3608240e 100644 --- a/src/TiledArray/math/linalg/non-distributed/svd.h +++ b/src/TiledArray/math/linalg/non-distributed/svd.h @@ -34,16 +34,16 @@ namespace 
TiledArray::math::linalg::non_distributed { /** - * @brief Compute the singular value decomposition (SVD) via ScaLAPACK + * @brief Compute the singular value decomposition (SVD) via LAPACK * * A(i,j) = S(k) U(i,k) conj(V(j,k)) * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) + * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) * * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * diff --git a/src/TiledArray/math/linalg/scalapack/svd.h b/src/TiledArray/math/linalg/scalapack/svd.h index dc68d374c5..aa9f459ba9 100644 --- a/src/TiledArray/math/linalg/scalapack/svd.h +++ b/src/TiledArray/math/linalg/scalapack/svd.h @@ -42,10 +42,10 @@ namespace TiledArray::math::linalg::scalapack { * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) + * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) * * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * From e78e231741882563167859cf13db4146f3e8df1d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 15:50:17 -0400 Subject: [PATCH 476/592] TiledRange1{int x} constructs an empty element range at [x,x) --- src/TiledArray/tiled_range1.h | 13 ++++++++----- tests/tiled_range1.cpp | 21 ++++++++++++++++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 4824dec26e..102ea1bcc8 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -338,10 +338,11 @@ class TiledRange1 { /// Validates tile_boundaries template static void valid_(RandIter first, RandIter last) { - // Verify at least 2 elements are present if the vector is not empty. 
- TA_ASSERT((std::distance(first, last) >= 2) && - "TiledRange1 construction failed: You need at least 2 " - "elements in the tile boundary list."); + // Need at least 1 tile hashmark to position the element range + // (zero hashmarks is handled by the default ctor) + TA_ASSERT((std::distance(first, last) >= 1) && + "TiledRange1 construction failed: You need at least 1 " + "element in the tile boundary list."); // Verify the requirement that a0 <= a1 <= a2 <= ... for (; first != (last - 1); ++first) { TA_ASSERT( @@ -364,7 +365,9 @@ class TiledRange1 { valid_(first, last); #endif // NDEBUG range_.first = start_tile_index; - range_.second = start_tile_index + last - first - 1; + using std::distance; + range_.second = + start_tile_index + static_cast(distance(first, last)) - 1; elements_range_.first = *first; elements_range_.second = *(last - 1); for (; first != (last - 1); ++first) diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index f01a9a208e..056f752e33 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -63,6 +63,25 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); } + // check construction with single tile boundary (hence zero tiles) + { + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(0)); + TiledRange1 r(0); + BOOST_CHECK_EQUAL(r, TiledRange1{}); + } + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(1)); + TiledRange1 r(1); + BOOST_CHECK_NE(r, TiledRange1{}); + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 0); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 1); + BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); + } + } + // check construction with a iterators and the range info. 
{ BOOST_REQUIRE_NO_THROW(TiledRange1 r(a.begin(), a.end())); @@ -200,7 +219,7 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), Exception); BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin()), Exception); - BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin() + 1), Exception); + BOOST_CHECK_NO_THROW(TiledRange1 r(a.begin(), a.begin() + 1)); boundaries.push_back(2); boundaries.push_back(0); BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), From 8185cc539d4f7450b0d680707740aa69894629fd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 23:35:03 -0400 Subject: [PATCH 477/592] make Range1 printable and shiftable --- src/TiledArray/range1.h | 34 +++++++++++++++++++++++++++++++++- tests/range1.cpp | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index dbb4b05a67..8b185936d4 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -32,7 +32,8 @@ namespace TiledArray { /// an integer range `[first,second)` /// @note previously represented by std::pair, hence the design struct Range1 { - typedef TA_1INDEX_TYPE index1_type; + using index1_type = TA_1INDEX_TYPE; + using signed_index1_type = std::make_signed_t; index1_type first = 0; index1_type second = 0; //< N.B. 
second >= first @@ -164,6 +165,31 @@ struct Range1 { /// @} + /// shifts this Range1 + + /// @param[in] shift the shift to apply + /// @return reference to this + Range1& inplace_shift(signed_index1_type shift) { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || upbound() <= 0 || + (shift <= (std::numeric_limits::max() - upbound()))); + TA_ASSERT(shift >= 0 || lobound() >= 0 || + (std::abs(shift) <= + (lobound() - std::numeric_limits::min()))); + first += shift; + second += shift; + return *this; + } + + /// creates a shifted Range1 + + /// @param[in] shift the shift value + /// @return a copy of this shifted by @p shift + [[nodiscard]] Range1 shift(signed_index1_type shift) const { + return Range1(*this).inplace_shift(shift); + } + template >>::type* = nullptr> @@ -190,6 +216,12 @@ inline void swap(Range1& r0, Range1& r1) { // no throw r0.swap(r1); } +/// Range1 ostream operator +inline std::ostream& operator<<(std::ostream& out, const Range1& rng) { + out << "[ " << rng.first << ", " << rng.second << " )"; + return out; +} + /// Test that two Range1 objects are congruent /// This function tests that the sizes of the two Range1 objects coincide. 
diff --git a/tests/range1.cpp b/tests/range1.cpp index ba49515cd7..f8d05ed4c0 100644 --- a/tests/range1.cpp +++ b/tests/range1.cpp @@ -137,6 +137,43 @@ BOOST_AUTO_TEST_CASE(comparison) { BOOST_CHECK(r1 != r4); } +BOOST_AUTO_TEST_CASE(shift) { + Range1 r0; + Range1 r0_plus_1; + BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1)); + BOOST_CHECK_EQUAL(r0_plus_1, Range1(1, 1)); + BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1)); + BOOST_CHECK_EQUAL(r0_plus_1, r0); + + using index1_type = Range1::index1_type; + BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits::max() - 1, + std::numeric_limits::max()} + .inplace_shift(1)), + Exception); + BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits::min(), + std::numeric_limits::min() + 1} + .inplace_shift(-1)), + Exception); + Range1 tmp; + BOOST_CHECK_TA_ASSERT( + tmp = (Range1{std::numeric_limits::max() - 1, + std::numeric_limits::max()} + .shift(1)), + Exception); + BOOST_CHECK_TA_ASSERT( + tmp = (Range1{std::numeric_limits::min(), + std::numeric_limits::min() + 1} + .shift(-1)), + Exception); + + Range1 r1{1, 3}; + Range1 r1_minus_1; + BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1)); + BOOST_CHECK_EQUAL(r1_minus_1, Range1(0, 2)); + BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1)); + BOOST_CHECK_EQUAL(r1_minus_1, r1); +} + BOOST_AUTO_TEST_CASE(serialization) { Range1 r{1, 10}; From f10d61b9bf9c80a2f024b1939b4941366463d564 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 17:02:24 -0400 Subject: [PATCH 478/592] make TiledRange1 shiftable --- src/TiledArray/tiled_range1.h | 48 +++++++++++++++++++++++++++++++++++ tests/tiled_range1.cpp | 30 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 102ea1bcc8..9ea5769203 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -50,6 +50,7 @@ class TiledRange1 { public: using range_type = Range1; using index1_type = range_type::index1_type; + using 
signed_index1_type = range_type::signed_index1_type; using const_iterator = std::vector::const_iterator; /// Default constructor creates an empty range (tile and element ranges are @@ -305,6 +306,53 @@ class TiledRange1 { return make_uniform(Range1(0, range_extent), target_tile_size); } + /// shifts this TiledRange1 + + /// @param[in] shift the shift to apply + /// @return reference to this + TiledRange1& inplace_shift(signed_index1_type shift) { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 || + (shift <= (std::numeric_limits::max() - + elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + elements_range_.inplace_shift(shift); + for (auto& tile : tiles_ranges_) { + tile.inplace_shift(shift); + } + elem2tile_.reset(); + return *this; + } + + /// creates a shifted TiledRange1 + + /// equivalent to (but more efficient than) `TiledRange1(*this).shift(shift)` + /// @param[in] shift the shift value + [[nodiscard]] TiledRange1 shift(signed_index1_type shift) const { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 || + (shift <= (std::numeric_limits::max() - + elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + std::vector hashmarks; + hashmarks.reserve(tile_extent() + 1); + if (tiles_ranges_.empty()) + hashmarks.emplace_back(elements_range_.lobound() + shift); + else { + for (auto& t : tiles_ranges_) { + hashmarks.push_back(t.first + shift); + } + hashmarks.push_back(elements_range_.upbound() + shift); + } + return TiledRange1(hashmarks.begin(), hashmarks.end()); + } + /// swapper /// \param other the range with which the contents of this range will be diff --git 
a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 056f752e33..2fe958bd2d 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -360,4 +360,34 @@ BOOST_AUTO_TEST_CASE(make_uniform) { (TiledRange1{0, 10, 20, 30, 40, 50, 59})); } +BOOST_AUTO_TEST_CASE(shift) { + TiledRange1 r0; + TiledRange1 r0_plus_1; + BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1)); + BOOST_CHECK_EQUAL(r0_plus_1, TiledRange1(1)); + BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1)); + BOOST_CHECK_EQUAL(r0_plus_1, r0); + + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::max()}.inplace_shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::min()}.inplace_shift(-1), + Exception); + TiledRange1 tmp; + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::max()}.shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::min()}.shift(-1), + Exception); + + TiledRange1 r1{1, 3, 7, 9}; + TiledRange1 r1_minus_1; + BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1)); + BOOST_CHECK_EQUAL(r1_minus_1, TiledRange1(0, 2, 6, 8)); + BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1)); + BOOST_CHECK_EQUAL(r1_minus_1, r1); +} + BOOST_AUTO_TEST_SUITE_END() From ddce13e607f83504ab2878f8c57b89f725196b94 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 23:37:24 -0400 Subject: [PATCH 479/592] TiledRange1 printer reimplemented in terms of Range1 printer --- src/TiledArray/tiled_range1.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 9ea5769203..46c4b37adc 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -485,10 +485,8 @@ inline bool operator!=(const TiledRange1& r1, const TiledRange1& r2) { /// TiledRange1 ostream operator inline std::ostream& operator<<(std::ostream& out, const TiledRange1& rng) { - out << "( tiles = [ " << rng.tiles_range().first << ", " - << 
rng.tiles_range().second << " ), elements = [ " - << rng.elements_range().first << ", " << rng.elements_range().second - << " ) )"; + out << "( tiles = " << rng.tiles_range() + << ", elements = " << rng.elements_range() << " )"; return out; } From e00554832f49fa06c51363e49c25e35c46b8ebd6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:00:28 -0400 Subject: [PATCH 480/592] [skip ci] dox++ --- src/TiledArray/range.h | 8 ++++---- src/TiledArray/tile.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 25e4852118..c3ce5aa7f7 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -949,7 +949,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range @@ -987,7 +987,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range @@ -998,7 +998,7 @@ class Range { return inplace_shift>(bound_shift); } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range @@ -1011,7 +1011,7 @@ class Range { return result; } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 7d568f7200..b8c62d95b8 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -39,19 +39,19 @@ namespace TiledArray { /// object to be used in TiledArray expressions, 
users must also define the /// following functions: /// \li \c add -/// \li \c add_to +/// \li \c add_to (in-place add) /// \li \c subt -/// \li \c subt_to +/// \li \c subt_to (in-place subt) /// \li \c mult -/// \li \c mult_to +/// \li \c mult_to (in-place mult) /// \li \c scale -/// \li \c scale_to +/// \li \c scale_to (in-place scale) /// \li \c gemm /// \li \c neg /// \li \c permute /// \li \c empty /// \li \c shift -/// \li \c shift_to +/// \li \c shift_to (in-place shift) /// \li \c trace /// \li \c sum /// \li \c product From ffd81511bc06704aaddc6ac5698fab1220cb48e6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:00:04 -0400 Subject: [PATCH 481/592] Range::shift is const --- src/TiledArray/range.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index c3ce5aa7f7..1363d6b992 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1005,7 +1005,7 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const Index& bound_shift) { + Range_ shift(const Index& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; @@ -1018,7 +1018,7 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const std::initializer_list& bound_shift) { + Range_ shift(const std::initializer_list& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; From cb9f08503787195dda91ff830f099a9524938498 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:22:31 -0400 Subject: [PATCH 482/592] Range::shift is nodiscard --- src/TiledArray/range.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 1363d6b992..cdebd7ddfc 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1005,7 +1005,7 @@ class Range { /// \return A shifted copy of this range template >> - 
Range_ shift(const Index& bound_shift) const { + [[nodiscard]] Range_ shift(const Index& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; @@ -1018,7 +1018,8 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const std::initializer_list& bound_shift) const { + [[nodiscard]] Range_ shift( + const std::initializer_list& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; From 57907cc6e0df335a129b35072d9828b68855c141 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:23:18 -0400 Subject: [PATCH 483/592] TiledRange is shiftable --- src/TiledArray/tiled_range.h | 55 ++++++++++++++++++++++++++++++++++++ tests/tiled_range.cpp | 11 ++++++++ 2 files changed, 66 insertions(+) diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index 27e559da1c..bfcd4c86fc 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -324,6 +324,61 @@ class TiledRange { std::swap(ranges_, other.ranges_); } + /// Shifts the lower and upper bounds of this range + + /// \tparam Index An integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const Index& bound_shift) { + elements_range_.inplace_shift(bound_shift); + using std::begin; + auto bound_shift_it = begin(bound_shift); + for (std::size_t d = 0; d != rank(); ++d, ++bound_shift_it) { + ranges_[d].inplace_shift(*bound_shift_it); + } + return *this; + } + + /// Shifts the lower and upper bound of this range + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const std::initializer_list& bound_shift) { + return inplace_shift>(bound_shift); + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An 
integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift(const Index& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift( + const std::initializer_list& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + template >>::type* = nullptr> diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 76702831a3..577b395927 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -155,6 +155,17 @@ BOOST_AUTO_TEST_CASE(permutation) { r1); // check that the permutation was assigned correctly. } +BOOST_AUTO_TEST_CASE(shift) { + TiledRange tr1 = tr; + const auto shift = std::vector(GlobalFixture::dim, 1); + BOOST_CHECK_NO_THROW(tr1.inplace_shift(shift)); + BOOST_CHECK_EQUAL(tr1.tiles_range(), tr.tiles_range()); + BOOST_CHECK_EQUAL(tr1.elements_range(), tr.elements_range().shift(shift)); + TiledRange tr1_copy; + BOOST_CHECK_NO_THROW(tr1_copy = tr.shift(shift)); + BOOST_CHECK_EQUAL(tr1, tr1_copy); +} + BOOST_AUTO_TEST_CASE(make_tiles_range) { tile_index start(GlobalFixture::dim); tile_index finish(GlobalFixture::dim); From 6e8624ac59d66204cf9e06143ce7c8ad1ffe7617 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 11 Sep 2024 15:53:25 -0400 Subject: [PATCH 484/592] introduced BlkTsrExpr::{{set_,}trange_lobound,preserve_lobound}() that allow to use block tensor expressions even with DistArrays that have non-zero lobound --- src/TiledArray/expressions/blk_tsr_engine.h | 54 +++++-- src/TiledArray/expressions/blk_tsr_expr.h | 36 +++++ src/TiledArray/expressions/expr.h | 11 ++ 
src/TiledArray/expressions/fwd.h | 13 +- src/TiledArray/expressions/tsr_expr.h | 148 ++++++++++++++++++-- tests/expressions_impl.h | 52 +++++++ 6 files changed, 293 insertions(+), 21 deletions(-) diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index e85aac7925..9b6e750bb5 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -158,22 +158,29 @@ class BlkTsrEngineBase : public LeafEngine { using LeafEngine_::array_; container::svector - lower_bound_; ///< Lower bound of the tile block + lower_bound_; ///< Tile coordinates of the lower bound of the tile block + ///< in the host array container::svector - upper_bound_; ///< Upper bound of the tile block + upper_bound_; ///< Tile coordinates of the upper bound of the tile block + ///< in the host array + std::optional + trange_lobound_; ///< Lobound of the result trange, modulo permutation + ///< (i.e. referring to the modes of the host array) public: template BlkTsrEngineBase(const BlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} template BlkTsrEngineBase(const ScalBlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} /// Non-permuting tiled range factory function @@ -199,9 +206,12 @@ class BlkTsrEngineBase : public LeafEngine { if (lower_d != upper_d) { auto i = lower_d; const auto base_d = trange[d].tile(i).first; - trange1_data.emplace_back(0ul); + const auto trange1_lobound = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + trange1_data.emplace_back(trange1_lobound); for (; i < upper_d; ++i) - trange1_data.emplace_back(trange[d].tile(i).second - base_d); + trange1_data.emplace_back(trange[d].tile(i).extent() + + trange1_data.back()); // Add the trange1 to the tiled range data trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); trange1_data.resize(0ul); @@ -241,9 +251,12 @@ class BlkTsrEngineBase : public LeafEngine { // Copy, shift, and permute the tiling of the block auto i = lower_i; const auto base_d = trange[inv_perm_d].tile(i).first; - trange1_data.emplace_back(0ul); + const auto trange1_lobound = + trange_lobound_ ? (*trange_lobound_)[inv_perm_d] : 0ul; + trange1_data.emplace_back(trange1_lobound); for (; i < upper_i; ++i) - trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); + trange1_data.emplace_back(trange[inv_perm_d].tile(i).extent() + + trange1_data.back()); // Add the trange1 to the tiled range data trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); @@ -341,6 +354,7 @@ class BlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; using ExprEngine_::implicit_permute_inner_; using ExprEngine_::implicit_permute_outer_; @@ -391,8 +405,12 @@ class BlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); } else { range_shift.emplace_back(0l); } @@ -427,8 +445,11 @@ class BlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + // element lobound of the target of this expression + const auto target_base_d = trange_lobound_ ? (*trange_lobound_)[d] : 0; + range_shift[perm_d] = target_base_d - base_d; } } @@ -496,6 +517,7 @@ class ScalBlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; using ExprEngine_::implicit_permute_inner_; using ExprEngine_::implicit_permute_outer_; @@ -549,8 +571,12 @@ class ScalBlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? (*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); } else range_shift.emplace_back(0); } @@ -584,8 +610,12 @@ class ScalBlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + range_shift[perm_d] = target_base_d - base_d; } } diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h index 5d6612d5cc..661e2ff666 100644 --- a/src/TiledArray/expressions/blk_tsr_expr.h +++ b/src/TiledArray/expressions/blk_tsr_expr.h @@ -32,6 +32,8 @@ #include #include "blk_tsr_engine.h" +#include + namespace TiledArray { namespace expressions { @@ -118,6 +120,10 @@ class BlkTsrExprBase : public Expr { lower_bound_; ///< Lower bound of the tile block container::svector upper_bound_; ///< Upper bound of the tile block + /// If non-null, element lobound of the expression trange (else zeros will be + /// used) Fusing permutation does not affect this (i.e. this refers to the + /// modes of the host array). + std::optional trange_lobound_; void check_valid() const { TA_ASSERT(array_); @@ -285,6 +291,36 @@ class BlkTsrExprBase : public Expr { /// \return The block upper bound const auto& upper_bound() const { return upper_bound_; } + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(const Index1& trange_lobound) { + trange_lobound_.emplace(std::begin(trange_lobound), + std::end(trange_lobound)); + return static_cast(*this); + } + + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(std::initializer_list trange_lobound) { + return this->set_trange_lobound>( + trange_lobound); + } + + /// Sets result trange lobound such that the tile lobounds are not changed + Derived& preserve_lobound() { + return set_trange_lobound( + array_.trange().make_tile_range(lower_bound()).lobound()); + } + + /// @return optional to result trange lobound; if null, the result trange + /// lobound is zero + const auto& trange_lobound() const { return trange_lobound_; } + }; // class BlkTsrExprBase /// Block expression diff --git 
a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index f6d2ff1376..8e3f925310 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,6 +47,7 @@ #include +#include #include #include @@ -464,6 +465,16 @@ class Expr { // set even though this is a requirement. #endif // NDEBUG + // Assignment to block expression uses trange of the array it is bounded to + // Assert that the user did not try to override the trange by accident using + // set_trange_lobound or at least that it matches tsr.array's trange + TA_ASSERT(!tsr.trange_lobound().has_value() || + (ranges::equal(tsr.trange_lobound().value(), + tsr.array() + .trange() + .make_tile_range(tsr.lower_bound()) + .lobound()))); + // Get the target world. World& world = tsr.array().world(); diff --git a/src/TiledArray/expressions/fwd.h b/src/TiledArray/expressions/fwd.h index 7960baf648..1d234b6dc5 100644 --- a/src/TiledArray/expressions/fwd.h +++ b/src/TiledArray/expressions/fwd.h @@ -28,7 +28,6 @@ #include - namespace TiledArray::expressions { template @@ -43,6 +42,10 @@ class BlkTsrExpr; template class ScalBlkTsrExpr; +/// used to indicate that block tensor expression should preserve the underlying +/// tensor's trange lobound +struct preserve_lobound_t {}; + template struct is_aliased : std::true_type {}; @@ -68,6 +71,14 @@ class ScalTsrExpr; template class ScalTsrEngine; +} // namespace TiledArray::expressions + +namespace TiledArray { + +/// used to tag block tensor expression methods that preserve the underlying +/// tensor's trange lobound +inline constexpr expressions::preserve_lobound_t preserve_lobound; + } // namespace TiledArray #endif // TILEDARRAY_EXPRESSIONS_FWD_H__INCLUDED diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index 8430a3c852..68e036f4c4 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -197,7 +197,7 @@ class TsrExpr : public Expr> { return 
TsrExpr(array(), annotation_); } - /// immutable Block expression factory + /// makes an immutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -213,7 +213,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -229,7 +248,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -241,7 +279,22 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// immutable Block expression factory + /// 
makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -252,7 +305,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -268,7 +335,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -284,7 
+370,25 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -296,7 +400,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -307,6 +425,20 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + 
.preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index ca8027c03d..e7c781ccc6 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -619,6 +619,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { for (int repeat = 0; repeat != nrepeats; ++repeat) BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = 2 * a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -698,18 +699,69 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block_base1, F, Fixtures, F) { c.fill_local(0.0); c_base1.fill_local(0.0); + // block expressions by default have trange lobound (=base) set to 0 ... + // this is done to allow block expressions involving multiple arrays with + // different lobounds all work correctly BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); BOOST_REQUIRE(tile_ranges_match_trange(c)); BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = a("a,b,c").block({3, 3, 3}, {5, 5, 5})); BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); BOOST_REQUIRE_NO_THROW(c("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a_base1("a,b,c")); BOOST_REQUIRE(tile_ranges_match_trange(c)); BOOST_REQUIRE_NO_THROW( c_base1("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a("a,b,c")); BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + + // however user can override the trange lobound using set_trange_lobound + { + decltype(F::c) a_block; + // default trange lobound is 0 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + 
BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + (Range::index_type{0, 0, 0})); + + // this preserves tile's lobounds, so that tile {0,0,0} in a_block has + // identical range to that of tile {3, 3, 3} in a_base1 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = a_base1("a,b,c").block( + {3, 3, 3}, {5, 5, 5}, preserve_lobound)); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + a_base1.trange().make_tile_range({3, 3, 3}).lobound()); + // this explicitly makes the trange lobound of a_block to be {1,1,1} + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({1, 1, 1})); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + Range::index_type({1, 1, 1})); + // trange of source block is ignored when it is assigned to a block of an + // existing array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c").block({0, 0, 0}, {2, 2, 2}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + // overriding trange of result block is not allowed ... + BOOST_REQUIRE_THROW( + a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({0, 0, 0}) = a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0}), + Exception); + // ... 
unless makes it same as trange lobound of the underlying array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({1, 1, 1}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + } } BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures, From 5cc3ce3e2c76f12749be0f3e64c44d1a40f56484 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 05:53:48 -0400 Subject: [PATCH 485/592] dox fixup [skip ci] --- src/TiledArray/tiled_range1.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 46c4b37adc..8cc830046b 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -523,9 +523,8 @@ inline TiledRange1 concat(const TiledRange1& r1, const TiledRange1& r2) { /// Test that two TiledRange1 objects are congruent /// This function tests that the tile sizes of the two ranges coincide. 
-/// \tparam Range The range type -/// \param r1 an TiledRange1 object -/// \param r2 an TiledRange1 object +/// \param r1 a TiledRange1 object +/// \param r2 a TiledRange1 object inline bool is_congruent(const TiledRange1& r1, const TiledRange1& r2) { return r1.tile_extent() == r2.tile_extent() && std::equal(r1.begin(), r1.end(), r2.begin(), From 2527e8057d360fe57ab37459dac384234a4efc64 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 06:06:37 -0400 Subject: [PATCH 486/592] introduced TiledRange::is_congruent --- src/TiledArray/tiled_range.h | 13 +++++++++++++ tests/tiled_range.cpp | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index bfcd4c86fc..fb73512560 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -423,6 +423,19 @@ inline bool operator==(const TiledRange& r1, const TiledRange& r2) { std::equal(r1.data().begin(), r1.data().end(), r2.data().begin()); } +/// Test that two TiledRange objects are congruent + +/// Two tranges are congruent if one is a translation of another (i.e. 
their +/// ranks and extents of all tiles) agree \param r1 a TiledRange object \param +/// r2 a TiledRange object +inline bool is_congruent(const TiledRange& r1, const TiledRange& r2) { + return r1.rank() == r2.rank() && + std::equal(r1.begin(), r1.end(), r2.begin(), + [](const auto& tr1_1, const auto& tr1_2) { + return is_congruent(tr1_1, tr1_2); + }); +} + inline bool operator!=(const TiledRange& r1, const TiledRange& r2) { return !operator==(r1, r2); } diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 577b395927..eb557b761f 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -119,6 +119,7 @@ BOOST_AUTO_TEST_CASE(comparison) { TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r2{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1); // self-comparison BOOST_CHECK(r1 == r2); // check equality operator BOOST_CHECK(!(r1 != r2)); // check not-equal operator BOOST_CHECK( @@ -126,6 +127,18 @@ BOOST_AUTO_TEST_CASE(comparison) { BOOST_CHECK(r1 != r3); } +BOOST_AUTO_TEST_CASE(congruency) { + TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; + TiledRange r2{{1, 3, 5, 7, 9, 11}, {2, 4, 6, 8, 10, 12}}; + TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1 && is_congruent(r1, r1)); // congruent with self + BOOST_CHECK(r1 != r2 && + is_congruent(r1, r2)); // r1 and r2 are not equal but congruent + BOOST_CHECK( + r1 != r3 && + !is_congruent(r1, r3)); // r1 and r3 are not equal and not congruent +} + BOOST_AUTO_TEST_CASE(assignment) { TiledRange r1; From 1471c8b94f94aec6abecf96419527e3887f4689a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 06:07:29 -0400 Subject: [PATCH 487/592] like contraction, reduction expression and binary expression can ignore absolute positions if ignore_tile_position() is on --- src/TiledArray/expressions/binary_engine.h | 22 +++++++++++++++++----- 
src/TiledArray/expressions/expr.h | 17 +++++++++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 33318b57a6..486c5421a1 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -235,18 +235,30 @@ class BinaryEngine : public ExprEngine { left_.init_struct(left_indices_); right_.init_struct(right_indices_); #ifndef NDEBUG - if (left_.trange() != right_.trange()) { + if (ignore_tile_position()) { + if (!is_congruent(left_.trange(), right_.trange())) { + if (TiledArray::get_default_world().rank() == 0) { + TA_USER_ERROR_MESSAGE( + "The TiledRanges of the left- and right-hand arguments the " + "binary " + "expression are not congruent:" + << "\n left = " << left_.trange() + << "\n right = " << right_.trange()); + } + TA_EXCEPTION( + "The TiledRange objects of a binary expression are not congruent."); + } + } else if (left_.trange() != right_.trange()) { if (TiledArray::get_default_world().rank() == 0) { TA_USER_ERROR_MESSAGE( - "The TiledRanges of the left- and right-hand arguments of the " - "binary operation are not equal:" + "The TiledRanges of the left- and right-hand arguments the binary " + "expression are not equal:" << "\n left = " << left_.trange() << "\n right = " << right_.trange()); } TA_EXCEPTION( - "The TiledRanges of the left- and right-hand arguments " - "of the binary operation are not equal."); + "The TiledRange objects of a binary expression are not equal."); } #endif // NDEBUG ExprEngine_::init_struct(target_indices); diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 8e3f925310..3b1e9f43be 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -664,7 +664,20 @@ class Expr { right_dist_eval.eval(); #ifndef NDEBUG - if (left_dist_eval.trange() != right_dist_eval.trange()) { + if (ignore_tile_position()) { + if 
(!is_congruent(left_dist_eval.trange(), right_dist_eval.trange())) { + if (TiledArray::get_default_world().rank() == 0) { + TA_USER_ERROR_MESSAGE( + "The TiledRanges of the left- and right-hand arguments the " + "binary " + "reduction are not congruent:" + << "\n left = " << left_dist_eval.trange() + << "\n right = " << right_dist_eval.trange()); + } + TA_EXCEPTION( + "The TiledRange objects of a binary reduction are not congruent."); + } + } else if (left_dist_eval.trange() != right_dist_eval.trange()) { if (TiledArray::get_default_world().rank() == 0) { TA_USER_ERROR_MESSAGE( "The TiledRanges of the left- and right-hand arguments the binary " @@ -674,7 +687,7 @@ class Expr { } TA_EXCEPTION( - "The TiledRange objects of a binary expression are not equal."); + "The TiledRange objects of a binary reduction are not equal."); } #endif // NDEBUG From d4142d7c7b1eff6e766ad851a95799aebfe00a1b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 14 Sep 2024 07:22:55 -0400 Subject: [PATCH 488/592] introduced TiledRange1::{lo,up}bound which feel to have unambiguous meaning --- src/TiledArray/tiled_range1.h | 12 ++++++++++++ tests/tiled_range1.cpp | 24 +++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 8cc830046b..5fbe87c64d 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -180,6 +180,18 @@ class TiledRange1 { /// \return the number of elements in the range index1_type extent() const { return TiledArray::extent(elements_range_); } + // clang-format off + /// Elements range lobound accessor + /// \return lower bound of the elements range (i.e., the smallest index in the elements range, `a` in `[a,b)`) + // clang-format on + index1_type lobound() const { return elements_range_.lobound(); } + + // clang-format off + /// Elements range upbound accessor + /// \return upper bound of the elements range (i.e., the smallest index greater than 
any in the elements range, `b` in `[a,b)`) + // clang-format on + index1_type upbound() const { return elements_range_.upbound(); } + /// Computes hashmarks /// \return the hashmarks of the tiled range, consisting of the following /// values: diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 2fe958bd2d..12b94578b5 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -32,6 +32,10 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_CHECK_EQUAL(tr1.tiles_range().second, tiles.second); BOOST_CHECK_EQUAL(tr1.elements_range().first, elements.first); BOOST_CHECK_EQUAL(tr1.elements_range().second, elements.second); + BOOST_CHECK_EQUAL(tr1.tile_extent(), tiles.second - tiles.first); + BOOST_CHECK_EQUAL(tr1.extent(), elements.second - elements.first); + BOOST_CHECK_EQUAL(tr1.lobound(), elements.first); + BOOST_CHECK_EQUAL(tr1.upbound(), elements.second); // Check individual tiles for (std::size_t i = 0; i < a.size() - 1; ++i) { @@ -43,12 +47,30 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_AUTO_TEST_CASE(range_info) { BOOST_CHECK_EQUAL(tr1.tiles_range().first, 0ul); BOOST_CHECK_EQUAL(tr1.tiles_range().second, a.size() - 1); - BOOST_CHECK_EQUAL(tr1.elements_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1.elements_range().first, a.front()); BOOST_CHECK_EQUAL(tr1.elements_range().second, a.back()); + BOOST_CHECK_EQUAL(tr1.tile_extent(), a.size() - 1); + BOOST_CHECK_EQUAL(tr1.extent(), a.back() - a.front()); + BOOST_CHECK_EQUAL(tr1.lobound(), a.front()); + BOOST_CHECK_EQUAL(tr1.upbound(), a.back()); for (std::size_t i = 0; i < a.size() - 1; ++i) { BOOST_CHECK_EQUAL(tr1.tile(i).first, a[i]); BOOST_CHECK_EQUAL(tr1.tile(i).second, a[i + 1]); } + + auto a_base1 = make_hashmarks(1); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().second, a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().first, a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().second, a_base1.back()); + 
BOOST_CHECK_EQUAL(tr1_base1.tile_extent(), a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.extent(), a_base1.back() - a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.lobound(), a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.upbound(), a_base1.back()); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(tr1_base1.tile(i).first, a_base1[i]); + BOOST_CHECK_EQUAL(tr1_base1.tile(i).second, a_base1[i + 1]); + } } BOOST_AUTO_TEST_CASE(constructor) { From 2f80dc0c9e268fc614addcd25615e6586ea79a91 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 14 Sep 2024 07:24:17 -0400 Subject: [PATCH 489/592] more TsrExpr::block variants tagged by preserve_lobound_t --- src/TiledArray/expressions/tsr_expr.h | 67 ++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index 68e036f4c4..e17ee2ddfa 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -523,6 +523,24 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index1 An integral type /// \tparam Index2 An integral type /// \param lower_bound The lower_bound of the block @@ -539,8 +557,27 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + 
BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see - /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of /// the block template : public Expr> { /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block template : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// Block expression + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object From 7820909905ebded48ada01283def6cba62dacee0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 11:15:38 -0400 Subject: [PATCH 490/592] TA::expressions::preserve_lobound_t -> TA::preserve_lobound_t --- src/TiledArray/expressions/fwd.h | 12 ------------ src/TiledArray/fwd.h | 8 ++++++++ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/expressions/fwd.h b/src/TiledArray/expressions/fwd.h index 1d234b6dc5..e56dea8b83 100644 --- a/src/TiledArray/expressions/fwd.h +++ 
b/src/TiledArray/expressions/fwd.h @@ -42,10 +42,6 @@ class BlkTsrExpr; template class ScalBlkTsrExpr; -/// used to indicate that block tensor expression should preserve the underlying -/// tensor's trange lobound -struct preserve_lobound_t {}; - template struct is_aliased : std::true_type {}; @@ -73,12 +69,4 @@ class ScalTsrEngine; } // namespace TiledArray::expressions -namespace TiledArray { - -/// used to tag block tensor expression methods that preserve the underlying -/// tensor's trange lobound -inline constexpr expressions::preserve_lobound_t preserve_lobound; - -} // namespace TiledArray - #endif // TILEDARRAY_EXPRESSIONS_FWD_H__INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 073e8bacd3..97d91a9a00 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -207,6 +207,14 @@ struct to; } // namespace conversions +/// used to indicate that block tensor expression should preserve the underlying +/// tensor's trange lobound +struct preserve_lobound_t {}; + +/// used to tag block tensor expression methods that preserve the underlying +/// tensor's trange lobound +inline constexpr preserve_lobound_t preserve_lobound; + } // namespace TiledArray #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA From 4d4c06bf160cb8a0b74f37196d7852fd6b4fd574 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 11:16:34 -0400 Subject: [PATCH 491/592] btas <-> ta tensor conversions work for non-0-lobound --- src/TiledArray/conversions/btas.h | 248 ++++++++++++++++++++++++------ tests/CMakeLists.txt | 1 + tests/btas.cpp | 21 ++- 3 files changed, 220 insertions(+), 50 deletions(-) diff --git a/src/TiledArray/conversions/btas.h b/src/TiledArray/conversions/btas.h index 28e5790e8f..ab07e97b53 100644 --- a/src/TiledArray/conversions/btas.h +++ b/src/TiledArray/conversions/btas.h @@ -36,6 +36,9 @@ #include #include +#include +#include + namespace TiledArray { // clang-format off @@ -49,11 +52,12 @@ namespace TiledArray { /// \tparam Storage_ The storage 
type of the source btas::Tensor object /// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, /// optionally wrapped into TiledArray::Tile) -/// \param[in] src The source object; its subblock defined by the {lower,upper} -/// bounds \c {dst.lobound(),dst.upbound()} will be copied to \c dst +/// \param[in] src The source object; its subblock +/// `{dst.lobound(),dst.upbound()}` +/// will be copied to \c dst /// \param[out] dst The object that will contain the contents of the /// corresponding subblock of src -/// \throw TiledArray::Exception When the dimensions of \c src and \c dst do not +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not /// match. // clang-format on template @@ -73,6 +77,57 @@ inline void btas_subtensor_to_tensor( dst_view = src_view; } +// clang-format off +/// Copy a block of a btas::Tensor into a TiledArray::Tensor + +/// A block of btas::Tensor \c src will be copied into TiledArray::Tensor \c +/// dst. The block dimensions will be determined by the dimensions of the range +/// of \c dst . +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the source btas::Tensor object +/// \tparam Storage_ The storage type of the source btas::Tensor object +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \param[in] src The source object; its subblock +/// `{dst.lobound() + offset,dst.upbound() + offset}` +/// will be copied to \c dst +/// \param[out] dst The object that will contain the contents of the +/// corresponding subblock of src +/// \param[out] offset the offset to be applied to the coordinates of `dst.range()` to determine the block in \p src to be copied; this is needed if the DistArray that will contain \p dst will have a range whose lobound is different from `src.lobound()` +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not +/// match. 
+// clang-format on +template < + typename T, typename Range_, typename Storage_, typename Tensor_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void btas_subtensor_to_tensor( + const btas::Tensor& src, Tensor_& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto src_blk_range = + TiledArray::BlockRange(detail::make_ta_range(src_range), + ranges::views::zip(dst_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(dst_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_blk_range); + auto dst_view = TiledArray::make_map(data(dst), dst_range); + + dst_view = src_view; +} + // clang-format off /// Copy a TiledArray::Tensor into a block of a btas::Tensor @@ -86,8 +141,8 @@ inline void btas_subtensor_to_tensor( /// \tparam Storage_ The storage type of the destination btas::Tensor object /// \param[in] src The source object whose contents will be copied into /// a subblock of \c dst -/// \param[out] dst The destination object; its subblock defined by the -/// {lower,upper} bounds \c {src.lobound(),src.upbound()} will be +/// \param[out] dst The destination object; its subblock +/// `{src.lobound(),src.upbound()}` will be /// overwritten with the content of \c src /// \throw TiledArray::Exception When the dimensions /// of \c src and \c dst do not match. @@ -109,6 +164,57 @@ inline void tensor_to_btas_subtensor(const Tensor_& src, dst_view = src_view; } +// clang-format off +/// Copy a TiledArray::Tensor into a block of a btas::Tensor + +/// TiledArray::Tensor \c src will be copied into a block of btas::Tensor +/// \c dst. 
The block dimensions will be determined by the dimensions of the range +/// of \c src . +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the destination btas::Tensor object +/// \tparam Storage_ The storage type of the destination btas::Tensor object +/// \param[in] src The source object whose contents will be copied into +/// a subblock of \c dst +/// \param[out] dst The destination object; its subblock +/// `{src.lobound()+offset,src.upbound()+offset}` will be +/// overwritten with the content of \c src +/// \param[out] offset the offset to be applied to the coordinates of `src.range()` to determine the block in \p dst to be copied; this is needed if the DistArray that contains \p src has a range whose lobound is different from `dst.lobound()` +/// \throw TiledArray::Exception When the dimensions +/// of \c src and \c dst do not match. 
+// clang-format on +template < + typename Tensor_, typename T, typename Range_, typename Storage_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void tensor_to_btas_subtensor(const Tensor_& src, + btas::Tensor& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto dst_blk_range = + TiledArray::BlockRange(detail::make_ta_range(dst_range), + ranges::views::zip(src_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(src_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_range); + auto dst_view = TiledArray::make_map(data(dst), dst_blk_range); + + dst_view = src_view; +} + namespace detail { /// Task function for converting btas::Tensor subblock to a @@ -127,7 +233,13 @@ void counted_btas_subtensor_to_tensor(const BTAS_Tensor_* src, DistArray_* dst, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - btas_subtensor_to_tensor(*src, tensor); + auto offset = ranges::views::zip(ranges::views::all(src->range().lobound()), + dst->trange().elements_range().lobound()) | + ranges::views::transform([](const auto& s_d) { + auto&& [s, d] = s_d; + return s - d; + }); + btas_subtensor_to_tensor(*src, tensor, offset); dst->set(i, tensor); (*counter)++; } @@ -137,12 +249,24 @@ void counted_btas_subtensor_to_tensor(const BTAS_Tensor_* src, DistArray_* dst, /// \tparam TA_Tensor_ a TiledArray::Tensor type /// \tparam BTAS_Tensor_ a btas::Tensor type /// \param src The source tensor -/// \param dst The destination tensor -/// \param counter The task counter -template -void 
counted_tensor_to_btas_subtensor(const TA_Tensor_& src, BTAS_Tensor_* dst, +/// \param src_array_lobound the lobound of the DistArrany that contains src, +/// used to compute the offset to be applied to the coordinates of `src.range()` +/// to determine the block in \p dst to be copied into \param dst The +/// destination tensor \param counter The task counter +template < + typename TA_Tensor_, typename BTAS_Tensor_, typename IntegerRange, + typename = std::enable_if_t>> +void counted_tensor_to_btas_subtensor(const TA_Tensor_& src, + IntegerRange src_array_lobound, + BTAS_Tensor_* dst, madness::AtomicInt* counter) { - tensor_to_btas_subtensor(src, *dst); + auto offset = ranges::views::zip(ranges::views::all(dst->range().lobound()), + src_array_lobound) | + ranges::views::transform([](const auto& d_s) { + auto&& [d, s] = d_s; + return d - s; + }); + tensor_to_btas_subtensor(src, *dst, offset); (*counter)++; } @@ -267,41 +391,14 @@ DistArray_ btas_tensor_to_array( return array; } -/// Convert a TiledArray::DistArray object into a btas::Tensor object +namespace detail { -/// This function will copy the contents of \c src into a \c btas::Tensor -/// object. The copy operation is done in parallel, and this function will block -/// until all elements of \c src have been copied into the result array tiles. -/// The size of \c src.world().size() must be equal to 1 or \c src must be a -/// replicated TiledArray::DistArray. Usage: -/// \code -/// TiledArray::TArrayD -/// array(world, trange); -/// // Set tiles of array ... -/// -/// auto t = array_to_btas_tensor(array); -/// \endcode -/// \tparam Tile the tile type of \c src -/// \tparam Policy the policy type of \c src -/// \tparam Range_ the range type of the result (either, btas::RangeNd or -/// TiledArray::Range) -/// \tparam Storage_ the storage type of the result -/// \param[in] src The TiledArray::DistArray object whose contents -/// will be copied to the result. 
-/// \return A \c btas::Tensor object that is a copy of \c src -/// \throw TiledArray::Exception When world size is greater than -/// 1 and \c src is not replicated -/// \param[in] target_rank the rank on which to create the BTAS tensor -/// containing the data of \c src ; if \c target_rank=-1 then -/// create the BTAS tensor on every rank (this requires -/// that \c src.is_replicated()==true ) -/// \return BTAS tensor object containing the data of \c src , if my rank equals -/// \c target_rank or \c target_rank==-1 , -/// default-initialized BTAS tensor otherwise. +/// \sa TiledArray::array_to_btas_tensor() template > -btas::Tensor array_to_btas_tensor( - const TiledArray::DistArray& src, int target_rank = -1) { +btas::Tensor +array_to_btas_tensor_impl(const TiledArray::DistArray& src, + const Range_& result_range, int target_rank) { // Test preconditions if (target_rank == -1 && src.world().size() > 1 && !src.pmap()->is_replicated()) @@ -314,13 +411,11 @@ btas::Tensor array_to_btas_tensor( using result_type = btas::Tensor::element_type, Range_, Storage_>; - using result_range_type = typename result_type::range_type; // Construct the result if (target_rank == -1 || src.world().rank() == target_rank) { // if array is sparse must initialize to zero - result_type result( - result_range_type(src.trange().elements_range().extent()), 0.0); + result_type result(result_range, 0.0); // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; @@ -329,8 +424,12 @@ btas::Tensor array_to_btas_tensor( for (std::size_t i = 0; i < src.size(); ++i) { if (!src.is_zero(i)) { src.world().taskq.add( - &detail::counted_tensor_to_btas_subtensor, - src.find(i), &result, &counter); + &detail::counted_tensor_to_btas_subtensor< + Tile, result_type, + std::decay_t< + decltype(src.trange().elements_range().lobound())>>, + src.find(i), src.trange().elements_range().lobound(), &result, + &counter); ++n; } } @@ -343,6 +442,59 @@ btas::Tensor array_to_btas_tensor( return 
result_type{}; } +} // namespace detail + +/// Convert a TiledArray::DistArray object into a btas::Tensor object + +/// This function will copy the contents of \c src into a \c btas::Tensor +/// object. The copy operation is done in parallel, and this function will block +/// until all elements of \c src have been copied into the result array tiles. +/// The size of \c src.world().size() must be equal to 1 or \c src must be a +/// replicated TiledArray::DistArray. Usage: +/// \code +/// TiledArray::TArrayD +/// array(world, trange); +/// // Set tiles of array ... +/// +/// auto t = array_to_btas_tensor(array); +/// \endcode +/// \tparam Tile the tile type of \c src +/// \tparam Policy the policy type of \c src +/// \tparam Range_ the range type of the result (either, btas::RangeNd or +/// TiledArray::Range) +/// \tparam Storage_ the storage type of the result +/// \param[in] src The TiledArray::DistArray object whose contents +/// will be copied to the result. +/// \param[in] target_rank the rank on which to create the BTAS tensor +/// containing the data of \c src ; if \c target_rank=-1 then +/// create the BTAS tensor on every rank (this requires +/// that \c src.is_replicated()==true ) +/// \return BTAS tensor object containing the data of \c src , if my rank equals +/// \c target_rank or \c target_rank==-1 , +/// default-initialized BTAS tensor otherwise. +/// \warning The range of \c src is +/// not preserved, i.e. the lobound of the result is zero. Use the +/// variant of this function tagged with preserve_lobound_t to +/// preserve the range. 
+/// \throw TiledArray::Exception When world size is greater than +/// 1 and \c src is not replicated +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, int target_rank = -1) { + return detail::array_to_btas_tensor_impl( + src, Range_(src.trange().elements_range().extent()), target_rank); +} + +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, preserve_lobound_t, + int target_rank = -1) { + return detail::array_to_btas_tensor_impl(src, src.trange().elements_range(), + target_rank); +} + } // namespace TiledArray #endif // TILEDARRAY_CONVERSIONS_BTAS_H__INCLUDED diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 823e13bec8..85d30d7728 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ set(ta_test_src_files ta_test.cpp einsum.cpp linalg.cpp cp.cpp + btas.cpp ) if(CUDA_FOUND OR HIP_FOUND) diff --git a/tests/btas.cpp b/tests/btas.cpp index a31329a80d..9c15540e9a 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -324,8 +324,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { // make tiled range using trange1_t = TiledArray::TiledRange1; - TiledArray::TiledRange trange( - {trange1_t(0, 10, 20), trange1_t(0, 11, 22), trange1_t(0, 12, 24)}); + TiledArray::TiledRange trange({trange1_t(0, 10, 20), + trange1_t(0, 11, 22).inplace_shift(1), + trange1_t(0, 12, 24).inplace_shift(2)}); // convert to a replicated DistArray using T = typename bTensor::value_type; @@ -371,6 +372,22 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { BOOST_CHECK(src_copy == btas::Tensor{}); } } + + // convert the replicated DistArray back to a btas::Tensor while preserving + // the DistArray range + { + btas::Tensor src_copy; + BOOST_REQUIRE_NO_THROW( + src_copy = array_to_btas_tensor(dst, TiledArray::preserve_lobound)); + BOOST_CHECK(ranges::equal(src_copy.range().lobound(), + dst.trange().elements_range().lobound())); + for (const 
auto& i : src.range()) { + auto i_copy = i; + i_copy[1] += 1; + i_copy[2] += 2; + BOOST_CHECK_EQUAL(src(i), src_copy(i_copy)); + } + } } BOOST_AUTO_TEST_CASE_TEMPLATE(sparse_array_conversion, bTensor, tensor_types) { From 057df5aad0f3e28e5637a293af29d3918bd8e863 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 23:40:47 -0400 Subject: [PATCH 492/592] introduced member versions of TiledRange1::make_uniform --- src/TiledArray/tiled_range1.h | 15 +++++++++++++++ tests/tiled_range1.cpp | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 5fbe87c64d..e78e647c10 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -318,6 +318,21 @@ class TiledRange1 { return make_uniform(Range1(0, range_extent), target_tile_size); } + /// same as make_uniform(const Range1&, std::size_t), using the element_range + /// of this TiledRange1 + TiledRange1 make_uniform(std::size_t target_tile_size) const { + return make_uniform(this->elements_range(), target_tile_size); + } + + /// make as uniformly-tiled range as possible out of this TiledRange1, with + /// the same number of tiles as this + TiledRange1 make_uniform() const { + return make_uniform( + this->elements_range(), + (this->elements_range().extent() + this->tile_extent() - 1) / + this->tile_extent()); + } + /// shifts this TiledRange1 /// @param[in] shift the shift to apply diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 12b94578b5..39bd7fa7c4 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -380,6 +380,14 @@ BOOST_AUTO_TEST_CASE(make_uniform) { BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(59, 10)); BOOST_CHECK(TiledRange1::make_uniform(59, 10) == (TiledRange1{0, 10, 20, 30, 40, 50, 59})); + + // member versions + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30))); + BOOST_CHECK((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30) == + 
TiledRange1{0, 25, 50})); + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 40, 50}.make_uniform())); + BOOST_CHECK( + (TiledRange1{0, 40, 50}.make_uniform() == TiledRange1{0, 25, 50})); } BOOST_AUTO_TEST_CASE(shift) { From ede81f34c632123289c0e72181e26e4512b70632 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 16 Sep 2024 12:27:16 -0400 Subject: [PATCH 493/592] [ci skip][wip] `TA::retile` support for `DistArray` with tensor-of-tensor tiles. --- src/TiledArray/array_impl.h | 7 ++- src/TiledArray/dist_array.h | 2 +- src/TiledArray/tensor/tensor_interface.h | 22 +++++++-- tests/retile.cpp | 61 ++++++++++++++++++++++++ 4 files changed, 84 insertions(+), 8 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index df7138a9e7..92680722cf 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -425,6 +425,9 @@ class ArrayImpl : public TensorImpl, typedef typename TensorImpl_::pmap_interface pmap_interface; ///< process map interface type typedef Tile value_type; ///< Tile or data type + typedef typename Tile::value_type + element_type; ///< The value type of a tile. It is the numeric_type for + ///< tensor-of-scalars tiles. 
typedef typename eval_trait::type eval_type; ///< The tile evaluation type typedef typename numeric_type::type @@ -854,8 +857,8 @@ template std::shared_ptr> make_with_new_trange( const std::shared_ptr>& source_array_sptr, const TiledRange& target_trange, - typename ArrayImpl::numeric_type new_value_fill = - typename ArrayImpl::numeric_type{0}) { + typename ArrayImpl::element_type new_value_fill = + typename ArrayImpl::element_type{}) { TA_ASSERT(source_array_sptr); auto& source_array = *source_array_sptr; auto& world = source_array.world(); diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index c2645dd7ce..6baee2abe0 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -461,7 +461,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// This constructor remaps the data of \p other according to \p new_trange , /// with \p new_value_fill used to fill the new elements, if any DistArray(const DistArray& other, const trange_type& new_trange, - numeric_type new_value_fill = numeric_type{0}) + element_type new_value_fill = element_type{}) : pimpl_( make_with_new_trange(other.pimpl(), new_trange, new_value_fill)) { this->truncate(); diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index 7a23307036..a9e67318d0 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -110,6 +110,9 @@ class TensorInterface { template using numeric_t = typename TiledArray::detail::numeric_type::type; + template + using value_t = typename std::remove_reference_t::value_type; + template friend class TensorInterface; @@ -188,16 +191,25 @@ class TensorInterface { TA_ASSERT(data); } - template ::value>::type* = nullptr> + template ::value>::type* = nullptr> TensorInterface_& operator=(const T1& other) { if constexpr (std::is_same_v>) { TA_ASSERT(data_ != other.data()); } - detail::inplace_tensor_op([](numeric_type& MADNESS_RESTRICT 
result, - const numeric_t arg) { result = arg; }, - *this, other); + if constexpr (detail::is_tensor_v) { + range_ = BlockRange(other.range(), other.range().lobound(), + other.range().upbound()); + data_ = new value_type[other.total_size()]; + auto cpy = other.clone(); + for (auto i = 0; i < other.total_size(); ++i) + std::swap(data_[i], cpy.data()[i]); + } else { + detail::inplace_tensor_op([](numeric_type& MADNESS_RESTRICT result, + const numeric_t arg) { result = arg; }, + *this, other); + } return *this; } diff --git a/tests/retile.cpp b/tests/retile.cpp index 2d9884e8af..8d72dc6903 100644 --- a/tests/retile.cpp +++ b/tests/retile.cpp @@ -28,4 +28,65 @@ BOOST_AUTO_TEST_CASE(retile_tensor) { BOOST_CHECK_EQUAL(result_sparse.trange(), trange); } +BOOST_AUTO_TEST_CASE(retile_more) { + using Numeric = int; + using T = TA::Tensor; + using ToT = TA::Tensor; + using ArrayT = TA::DistArray; + using ArrayToT = TA::DistArray; + + auto& world = TA::get_default_world(); + + auto const tr_source = TA::TiledRange({{0, 2, 4, 8}, {0, 3, 5}}); + auto const tr_target = TA::TiledRange({{0, 4, 6, 8}, {0, 2, 4, 5}}); + + auto rand_num = [](auto&&) { + return TA::detail::MakeRandom::generate_value(); + }; + + auto rand_tensor = [rand_num](auto const& rng) -> T { + return T(rng, rand_num); + }; + + auto rand_tensor_of_tensor = [rand_tensor](auto const& inner_rng) { + return [rand_tensor, inner_rng](auto const& rng) -> ToT { + return ToT(rng, rand_tensor(inner_rng)); + }; + }; + + auto set_random_tensor_of_tensor_tile = [rand_tensor_of_tensor]( + auto const& inner_rng) { + return + [gen = rand_tensor_of_tensor(inner_rng)](auto& tile, auto const& rng) { + tile = gen(rng); + return tile.norm(); + }; + }; + + auto const inner_rng = TA::Range({3, 3}); + auto arr_source = TA::make_array( + world, tr_source, set_random_tensor_of_tensor_tile(inner_rng)); + arr_source.truncate(); + + auto arr_target = TA::retile(arr_source, tr_target); + + arr_source.make_replicated(); + world.gop.fence(); + 
arr_target.make_replicated(); + world.gop.fence(); + + auto const& elem_rng = tr_source.elements_range(); + BOOST_REQUIRE(elem_rng.volume() == tr_target.elements_range().volume()); + + auto get_elem = [](ArrayToT const& arr, auto const& eix) { + auto tix = arr.trange().element_to_tile(eix); + auto&& tile = arr.find(tix).get(false); + return tile(eix); + }; + + for (auto&& eix : elem_rng) { + BOOST_REQUIRE(get_elem(arr_source, eix) == get_elem(arr_target, eix)); + } +} + BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file From 34092359a6e5c02c0435fe36109be7a585d9625e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:40:43 -0400 Subject: [PATCH 494/592] TiledRange1::make_uniform(rng) with empty range preserves its lobound --- src/TiledArray/tiled_range1.h | 2 +- tests/tiled_range1.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index e78e647c10..aa75916442 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -308,7 +308,7 @@ class TiledRange1 { hashmarks.push_back(range.upbound()); return TiledRange1(hashmarks.begin(), hashmarks.end()); } else - return TiledRange1{}; + return TiledRange1{range.lobound()}; } /// same as make_uniform(const Range1&, std::size_t) for a 0-based range diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 39bd7fa7c4..947142f6dc 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -350,8 +350,10 @@ BOOST_AUTO_TEST_CASE(concatenation) { } BOOST_AUTO_TEST_CASE(make_uniform) { + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{0, 0}, 0)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{0, 0}, 0) == TiledRange1{}); BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{1, 1}, 0)); - BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{}); + BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{1}); 
BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{3, 6}, 10)); BOOST_CHECK(TiledRange1::make_uniform(Range1{3, 6}, 10) == (TiledRange1{3, 6})); From 456b7905f0a9e1cf2c413dad607e9ca740239aff Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:41:22 -0400 Subject: [PATCH 495/592] heig: work around the n=0 corner case --- src/TiledArray/math/linalg/rank-local.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index d23f3b4e3f..6db050ee5c 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -121,6 +121,7 @@ void heig(Matrix& A, std::vector>& W) { integer lda = A.rows(); W.resize(n); auto* w = W.data(); + if (n == 0) return; if constexpr (TiledArray::detail::is_complex_v) TA_LAPACK(heev, jobz, uplo, n, a, lda, w); else @@ -140,6 +141,7 @@ void heig(Matrix& A, Matrix& B, integer ldb = B.rows(); W.resize(n); auto* w = W.data(); + if (n == 0) return; if constexpr (TiledArray::detail::is_complex_v) TA_LAPACK(hegv, itype, jobz, uplo, n, a, lda, b, ldb, w); else From d6223831afcb5ddb9e9962732f0768c43610000a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:52:20 -0400 Subject: [PATCH 496/592] bump MAD tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/547 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 96e7259ed5..db11ed24df 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,7 +43,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . 
If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 96ac90e8f193ccfaf16f346b4652927d2d362e75 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index a005bcdec5..87804775f9 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52) +set(TA_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From a0c04508a2c32e5c56c528bc917c5acf8fd9fc17 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 00:10:14 -0400 Subject: [PATCH 497/592] BinaryExpr: account for ignore_tile_position when checking preconditions --- src/TiledArray/dist_eval/binary_eval.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index 62bbdb64ce..87cce91656 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -107,7 +107,10 @@ class BinaryEvalImpl 
: public DistEvalImpl, right_ntiles_discarded_(0) #endif { - TA_ASSERT(left.trange() == right.trange()); + TA_ASSERT(ignore_tile_position() + ? left.trange().elements_range().extent() == + right.trange().elements_range().extent() + : left.trange() == right.trange()); } virtual ~BinaryEvalImpl() {} From 342dd255d9d3952062958b1ca186a41debfeb05a Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Wed, 18 Sep 2024 15:35:37 -0400 Subject: [PATCH 498/592] Completes `TA::retile` support for `DistArray` with tensor-of-tensor tiles. --- src/TiledArray/tensor/kernels.h | 13 ++++- src/TiledArray/tensor/tensor_interface.h | 15 ++---- tests/retile.cpp | 65 ++++++++++++++---------- 3 files changed, 51 insertions(+), 42 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 699496d77e..5d40ce5c14 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -599,8 +599,17 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { [&op, stride]( typename TR::pointer MADNESS_RESTRICT const result_data, typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) - inplace_tensor_op(op, result_data[i], tensors_data[i]...); + for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { + if constexpr (std::is_invocable_v< + std::remove_reference_t, + typename std::remove_reference_t::value_type&, + typename std::remove_reference_t< + Ts>::value_type const&...>) { + std::forward(op)(result_data[i], tensors_data[i]...); + } else { + inplace_tensor_op(op, result_data[i], tensors_data[i]...); + } + } }; for (std::decay_t ord = 0ul; ord < volume; ord += stride) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index a9e67318d0..6ba8f0430e 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -198,18 +198,9 @@ class TensorInterface { TA_ASSERT(data_ != other.data()); } - if constexpr (detail::is_tensor_v) { - range_ = BlockRange(other.range(), other.range().lobound(), - other.range().upbound()); - data_ = new value_type[other.total_size()]; - auto cpy = other.clone(); - for (auto i = 0; i < other.total_size(); ++i) - std::swap(data_[i], cpy.data()[i]); - } else { - detail::inplace_tensor_op([](numeric_type& MADNESS_RESTRICT result, - const numeric_t arg) { result = arg; }, - *this, other); - } + detail::inplace_tensor_op( + [](value_type& MADNESS_RESTRICT result, auto&& arg) { result = arg; }, + *this, other); return *this; } diff --git a/tests/retile.cpp b/tests/retile.cpp index 8d72dc6903..0f4100d4c8 100644 --- a/tests/retile.cpp +++ b/tests/retile.cpp @@ -39,51 +39,60 @@ BOOST_AUTO_TEST_CASE(retile_more) { auto const tr_source = TA::TiledRange({{0, 2, 4, 8}, {0, 3, 5}}); auto const tr_target = TA::TiledRange({{0, 4, 6, 8}, {0, 2, 4, 5}}); + auto const& elem_rng = tr_source.elements_range(); + + BOOST_REQUIRE(elem_rng.volume() == tr_target.elements_range().volume()); - auto rand_num = [](auto&&) { - return 
TA::detail::MakeRandom::generate_value(); + auto const inner_rng = TA::Range({3, 3}); + + auto rand_tensor = [](auto const& rng) -> T { + return T(rng, [](auto&&) { + return TA::detail::MakeRandom::generate_value(); + }); }; - auto rand_tensor = [rand_num](auto const& rng) -> T { - return T(rng, rand_num); + auto set_random_tensor_tile = [rand_tensor](auto& tile, auto const& rng) { + tile = rand_tensor(rng); + return tile.norm(); }; - auto rand_tensor_of_tensor = [rand_tensor](auto const& inner_rng) { - return [rand_tensor, inner_rng](auto const& rng) -> ToT { - return ToT(rng, rand_tensor(inner_rng)); - }; + auto rand_tensor_of_tensor = [rand_tensor, + inner_rng](auto const& rng) -> ToT { + return ToT(rng, [rand_tensor, inner_rng](auto&&) { + return rand_tensor(inner_rng); + }); }; auto set_random_tensor_of_tensor_tile = [rand_tensor_of_tensor]( - auto const& inner_rng) { - return - [gen = rand_tensor_of_tensor(inner_rng)](auto& tile, auto const& rng) { - tile = gen(rng); - return tile.norm(); - }; + auto& tile, auto const& rng) { + tile = rand_tensor_of_tensor(rng); + return tile.norm(); }; - auto const inner_rng = TA::Range({3, 3}); - auto arr_source = TA::make_array( - world, tr_source, set_random_tensor_of_tensor_tile(inner_rng)); - arr_source.truncate(); + auto get_elem = [](auto const& arr, auto const& eix) { + auto tix = arr.trange().element_to_tile(eix); + auto&& tile = arr.find(tix).get(false); + return tile(eix); + }; + + auto arr_source0 = + TA::make_array(world, tr_source, set_random_tensor_tile); + auto arr_target0 = TA::retile(arr_source0, tr_target); + for (auto&& eix : elem_rng) { + BOOST_REQUIRE(get_elem(arr_source0, eix) == get_elem(arr_target0, eix)); + } + + auto arr_source = TA::make_array(world, tr_source, + set_random_tensor_of_tensor_tile); auto arr_target = TA::retile(arr_source, tr_target); arr_source.make_replicated(); - world.gop.fence(); arr_target.make_replicated(); + arr_source.truncate(); + arr_target.truncate(); world.gop.fence(); 
- auto const& elem_rng = tr_source.elements_range(); - BOOST_REQUIRE(elem_rng.volume() == tr_target.elements_range().volume()); - - auto get_elem = [](ArrayToT const& arr, auto const& eix) { - auto tix = arr.trange().element_to_tile(eix); - auto&& tile = arr.find(tix).get(false); - return tile(eix); - }; - for (auto&& eix : elem_rng) { BOOST_REQUIRE(get_elem(arr_source, eix) == get_elem(arr_target, eix)); } From 2ba8f8baae8bacdd269a521235f0114a98c40c11 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 17:09:24 -0400 Subject: [PATCH 499/592] [python] simplify make_trange by using TiledRange1::make_uniform --- python/src/TiledArray/python/trange.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/src/TiledArray/python/trange.h b/python/src/TiledArray/python/trange.h index 488421291d..8c008c1fa9 100644 --- a/python/src/TiledArray/python/trange.h +++ b/python/src/TiledArray/python/trange.h @@ -45,7 +45,6 @@ auto list(const TiledRange &trange) { return v; } -// template<> inline TiledRange make_trange(std::vector > trange) { std::vector trange1; for (auto tr : trange) { @@ -58,11 +57,7 @@ inline TiledRange make_trange(std::vector > trange) { inline TiledRange make_trange(std::vector shape, size_t block) { std::vector trange1; for (size_t i = 0; i < shape.size(); ++i) { - std::vector tr1; - for (size_t j = 0; j <= (shape[i] + block - 1); j += block) { - tr1.push_back(std::min(j, shape[i])); - } - trange1.push_back(TiledRange1(tr1.begin(), tr1.end())); + trange1.emplace_back(TiledRange1::make_uniform(shape[i], block)); } return TiledRange(trange1.begin(), trange1.end()); } From ae1cf06592aca8171904be8605cf0a95973fa31d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 19:21:45 -0400 Subject: [PATCH 500/592] [ci] greatly reduce the gitlab matrix, replace rel/deb builds with relwithdebinfo --- .gitlab-ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml 
b/.gitlab-ci.yml index b57a210430..02c3edc266 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,22 +57,22 @@ ubuntu: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:20.04" ] CXX: [ g++ ] - BUILD_TYPE : [ "Release" ] + BUILD_TYPE : [ "RelWithDebInfo" ] BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++, clang++-13 ] - BUILD_TYPE : [ "Release", "Debug" ] + BUILD_TYPE : [ "RelWithDebInfo" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++ ] - BUILD_TYPE : [ "Release", "Debug" ] + BUILD_TYPE : [ "RelWithDebInfo" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] From f294db31bea86d08b8d875d218f24c65221dca76 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 22 Sep 2024 06:32:39 -0400 Subject: [PATCH 501/592] TA::host_allocator is serializable, so that btas::Tensor can be used as a tile again --- src/CMakeLists.txt | 2 - src/TiledArray/device/allocators.h | 138 ---------------------------- src/TiledArray/device/um_storage.cu | 2 +- src/TiledArray/device/um_storage.h | 2 +- src/TiledArray/external/device.h | 15 ++- src/TiledArray/external/umpire.h | 83 ++++++++++++++++- src/TiledArray/fwd.h | 32 ++++--- src/TiledArray/host/allocator.h | 78 ---------------- src/TiledArray/host/env.h | 10 ++ src/TiledArray/tensor/tensor.h | 3 +- 10 files changed, 127 insertions(+), 238 deletions(-) delete mode 100644 src/TiledArray/device/allocators.h delete mode 100644 
src/TiledArray/host/allocator.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c426d1ffbe..3d6b94ea9a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -134,7 +134,6 @@ TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h TiledArray/host/env.h -TiledArray/host/allocator.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h TiledArray/math/outer.h @@ -223,7 +222,6 @@ if(CUDA_FOUND OR HIP_FOUND) TiledArray/device/kernel/thrust/reduce_kernel.h TiledArray/device/platform.h TiledArray/device/thrust.h - TiledArray/device/allocators.h TiledArray/device/um_storage.h) if(CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES diff --git a/src/TiledArray/device/allocators.h b/src/TiledArray/device/allocators.h deleted file mode 100644 index 2bda79e768..0000000000 --- a/src/TiledArray/device/allocators.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED -#define TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_DEVICE - -#include -#include - -#include - -#include -#include - -namespace TiledArray { - -template -class umpire_based_allocator - : public umpire_based_allocator_impl { - public: - using base_type = umpire_based_allocator_impl; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} - - template - umpire_based_allocator( - const umpire_based_allocator& - rhs) noexcept - : base_type( - static_cast&>( - rhs)) {} - - template - friend bool operator==( - const umpire_based_allocator& - lhs, - const umpire_based_allocator& - rhs) noexcept; -}; // class umpire_based_allocator - -template -bool operator==( - const umpire_based_allocator& lhs, - const umpire_based_allocator& - rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=( - const umpire_based_allocator& lhs, - const umpire_based_allocator& - rhs) noexcept { - return !(lhs == rhs); -} - -namespace detail { - -struct get_um_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->um_allocator(); - } -}; - -struct get_pinned_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->pinned_allocator(); - } -}; - -} // namespace detail - -} // namespace TiledArray - -namespace madness { -namespace archive { - -template -struct ArchiveLoadImpl> { - static inline void load( - const Archive& ar, - TiledArray::umpire_based_allocator& allocator) { - allocator = TiledArray::umpire_based_allocator{}; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void 
store( - const Archive& ar, - const TiledArray::umpire_based_allocator< - T, StaticLock, UmpireAllocatorAccessor>& allocator) {} -}; - -} // namespace archive -} // namespace madness - -#endif // TILEDARRAY_HAS_DEVICE - -#endif // TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED diff --git a/src/TiledArray/device/um_storage.cu b/src/TiledArray/device/um_storage.cu index cc3a1aae55..8879c246f8 100644 --- a/src/TiledArray/device/um_storage.cu +++ b/src/TiledArray/device/um_storage.cu @@ -22,7 +22,7 @@ */ -#include +#include #include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index d151a3c316..d91c032312 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -24,7 +24,7 @@ #ifndef TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED #define TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED -#include +#include #ifdef TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 38bcbbc745..597643b225 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -798,9 +798,22 @@ class Env { static std::unique_ptr instance_{nullptr}; return instance_; } -}; +}; // class Env namespace detail { + +struct get_um_allocator { + umpire::Allocator& operator()() { + return deviceEnv::instance()->um_allocator(); + } +}; + +struct get_pinned_allocator { + umpire::Allocator& operator()() { + return deviceEnv::instance()->pinned_allocator(); + } +}; + // in a madness device task point to its local optional stream to use by // madness_task_stream_opt; set to nullptr after task callable finished inline std::optional*& madness_task_stream_opt_ptr_accessor() { diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index e8d0d48632..ac23a60260 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -156,6 +156,54 @@ bool operator!=( return !(lhs == rhs); } +template +class 
umpire_based_allocator + : public umpire_based_allocator_impl { + public: + using base_type = umpire_based_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; + + umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + + template + umpire_based_allocator( + const umpire_based_allocator& + rhs) noexcept + : base_type( + static_cast&>( + rhs)) {} + + template + friend bool operator==( + const umpire_based_allocator& + lhs, + const umpire_based_allocator& + rhs) noexcept; +}; // class umpire_based_allocator + +template +bool operator==( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { + return !(lhs == rhs); +} + /// see /// https://stackoverflow.com/questions/21028299/is-this-behavior-of-vectorresizesize-type-n-under-c11-and-boost-container/21028912#21028912 template @@ -202,7 +250,7 @@ struct ArchiveLoadImpl& allocator) { std::string allocator_name; - ar& allocator_name; + ar & allocator_name; allocator = TiledArray::umpire_based_allocator_impl( umpire::ResourceManager::getInstance().getAllocator(allocator_name)); } @@ -214,7 +262,7 @@ struct ArchiveStoreImpl< static inline void store( const Archive& ar, const TiledArray::umpire_based_allocator_impl& allocator) { - ar& allocator.umpire_allocator()->getName(); + ar & allocator.umpire_allocator()->getName(); } }; @@ -224,7 +272,7 @@ struct ArchiveLoadImpl> { TiledArray::default_init_allocator& allocator) { if constexpr (!std::allocator_traits::is_always_equal::value) { A base_allocator; - ar& base_allocator; + ar & base_allocator; allocator = TiledArray::default_init_allocator(base_allocator); } } @@ -244,4 +292,33 
@@ struct ArchiveStoreImpl> { } // namespace archive } // namespace madness +namespace madness { +namespace archive { + +template +struct ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::umpire_based_allocator& allocator) { + allocator = TiledArray::umpire_based_allocator{}; + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::umpire_based_allocator< + T, StaticLock, UmpireAllocatorAccessor>& allocator) {} +}; + +} // namespace archive +} // namespace madness + #endif // TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 97d91a9a00..6127db32f3 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -36,12 +36,27 @@ class aligned_allocator; // fwddecl host_allocator namespace TiledArray { -template -class host_allocator_impl; -template +namespace detail { +struct get_host_allocator; +struct NullLock; +template +class MutexLock; +} // namespace detail + +template +class umpire_based_allocator; + +template > class default_init_allocator; + +class hostEnv; + +/// pooled thread-safe host memory allocator template -using host_allocator = default_init_allocator>; +using host_allocator = + default_init_allocator, + detail::get_host_allocator>>; } // namespace TiledArray namespace madness { @@ -87,18 +102,9 @@ class Env; } using deviceEnv = device::Env; -template -class umpire_based_allocator; - -template > -class default_init_allocator; - namespace detail { struct get_um_allocator; struct get_pinned_allocator; -struct NullLock; -template -class MutexLock; } // namespace detail /// pooled thread-safe unified memory (UM) allocator for device computing diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h deleted file mode 100644 index a22613fb38..0000000000 --- a/src/TiledArray/host/allocator.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * This file is a part of TiledArray. 
- * Copyright (C) 2021 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED - -#include - -#include -#include - -#include - -#include -#include - -namespace TiledArray { - -/// pooled, thread-safe allocator for host memory -template -class host_allocator_impl - : public umpire_based_allocator_impl> { - public: - using base_type = umpire_based_allocator_impl>; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - host_allocator_impl() noexcept - : base_type(&hostEnv::instance()->host_allocator()) {} - - template - host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast>&>(rhs)) {} - - template - friend bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept; -}; // class host_allocator_impl - -template -bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return !(lhs == rhs); 
-} - -} // namespace TiledArray - -#endif // TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 1b3c4f277f..be1de5369c 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -148,6 +148,16 @@ class hostEnv { } }; +namespace detail { + +struct get_host_allocator { + umpire::Allocator& operator()() { + return hostEnv::instance()->host_allocator(); + } +}; + +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_HOST_ENV_H__INCLUDED diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 12479ef53c..171dac2eea 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -22,7 +22,8 @@ #include "TiledArray/config.h" -#include "TiledArray/host/allocator.h" +#include "TiledArray/external/umpire.h" +#include "TiledArray/host/env.h" #include "TiledArray/math/blas.h" #include "TiledArray/math/gemm_helper.h" From f613831844410bde0be87c9833448511eb2eb4fd Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 22 Sep 2024 06:34:00 -0400 Subject: [PATCH 502/592] introduced TA::Tile::at_ordinal + strengthen disambiguation checks for potential at_ordinal uses --- src/TiledArray/tensor/tensor.h | 20 ++++++++++--- src/TiledArray/tile.h | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 171dac2eea..bf729e59d9 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -705,7 +705,7 @@ class Tensor { const_reference operator[](const Ordinal ord) const { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) 
and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -726,7 +726,7 @@ class Tensor { reference operator[](const Ordinal ord) { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -848,7 +848,7 @@ class Tensor { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -869,7 +869,7 @@ class Tensor { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -960,6 +960,12 @@ class Tensor { const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) 
and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -982,6 +988,12 @@ class Tensor { reference operator()(const Index&... i) { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index b8c62d95b8..39fca37d9e 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -250,6 +250,11 @@ class Tile { std::enable_if_t::value>* = nullptr> const_reference operator[](const Ordinal ord) const { TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; } @@ -264,6 +269,41 @@ class Tile { template ::value>* = nullptr> reference operator[](const Ordinal ord) { + TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) 
and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Const element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Const reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + const_reference at_ordinal(const Ordinal ord) const { + TA_ASSERT(pimpl_); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + reference at_ordinal(const Ordinal ord) { TA_ASSERT(pimpl_); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; @@ -401,6 +441,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } @@ -417,6 +463,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... 
i) { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } From 64723263b72a13eb0b494cfe1ed535fdd29f4554 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 07:39:35 -0400 Subject: [PATCH 503/592] hostEnv -> host::Env + fixup to make f294db31bea86d08b8d875d218f24c65221dca76 build --- src/CMakeLists.txt | 7 ++--- src/TiledArray/external/device.h | 40 +++++++++++++++++---------- src/TiledArray/fwd.h | 5 +++- src/TiledArray/host/env.cpp | 36 ++++++++++++++++++++++++ src/TiledArray/host/env.h | 47 ++++++++++++++++---------------- 5 files changed, 92 insertions(+), 43 deletions(-) create mode 100644 src/TiledArray/host/env.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d6b94ea9a..80f2a49710 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -133,6 +133,7 @@ TiledArray/expressions/index_list.h TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h +TiledArray/host/env.cpp TiledArray/host/env.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h @@ -206,11 +207,7 @@ TiledArray/util/vector.h if(HIP_FOUND OR CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/device.h - TiledArray/external/librett.h) -endif() - -if(CUDA_FOUND OR HIP_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/librett.h TiledArray/device/blas.cpp TiledArray/device/blas.h TiledArray/device/btas.h diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 597643b225..4f9d365e0a 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -41,8 +41,6 @@ 
#include #endif -#include - #include #include #include @@ -51,6 +49,20 @@ #include #include +#include + +namespace TiledArray::detail { + +struct get_um_allocator { + inline umpire::Allocator& operator()(); +}; + +struct get_pinned_allocator { + inline umpire::Allocator& operator()(); +}; + +} // namespace TiledArray::detail + #if defined(TILEDARRAY_HAS_CUDA) inline void __DeviceSafeCall(cudaError err, const char* file, const int line) { @@ -802,18 +814,6 @@ class Env { namespace detail { -struct get_um_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->um_allocator(); - } -}; - -struct get_pinned_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->pinned_allocator(); - } -}; - // in a madness device task point to its local optional stream to use by // madness_task_stream_opt; set to nullptr after task callable finished inline std::optional*& madness_task_stream_opt_ptr_accessor() { @@ -905,6 +905,18 @@ device::Stream stream_for(const Range& range) { } // namespace device +namespace detail { + +inline umpire::Allocator& get_um_allocator::operator()() { + return deviceEnv::instance()->um_allocator(); +} + +inline umpire::Allocator& get_pinned_allocator::operator()() { + return deviceEnv::instance()->pinned_allocator(); +} + +} // namespace detail + #endif // TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 6127db32f3..652b835fab 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -49,7 +49,10 @@ class umpire_based_allocator; template > class default_init_allocator; -class hostEnv; +namespace host { +class Env; +} +using hostEnv = host::Env; /// pooled thread-safe host memory allocator template diff --git a/src/TiledArray/host/env.cpp b/src/TiledArray/host/env.cpp new file mode 100644 index 0000000000..16d3a71a50 --- /dev/null +++ b/src/TiledArray/host/env.cpp @@ -0,0 +1,36 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2021 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#include + +namespace TiledArray { + +namespace detail { + +umpire::Allocator& get_host_allocator::operator()() { + return TiledArray::host::Env::instance()->host_allocator(); +} + +} // namespace detail + +} // namespace TiledArray diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index be1de5369c..b469704a72 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -41,24 +41,34 @@ namespace TiledArray { +namespace detail { + +struct get_host_allocator { + umpire::Allocator& operator()(); +}; + +} // namespace detail + +namespace host { + /** - * hostEnv maintains the (host-side, as opposed to device-side) environment, + * Env maintains the (host-side, as opposed to device-side) environment, * such as memory allocators * * \note this is a Singleton */ -class hostEnv { +class Env { public: - ~hostEnv() = default; + ~Env() = default; - hostEnv(const hostEnv&) = delete; - hostEnv(hostEnv&&) = delete; - hostEnv& operator=(const hostEnv&) = delete; - hostEnv& operator=(hostEnv&&) = delete; + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; /// access the singleton instance; if not initialized will be - /// initialized 
via hostEnv::initialize() with the default params - static std::unique_ptr& instance() { + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { if (!instance_accessor()) { initialize(); } @@ -103,8 +113,7 @@ class hostEnv { "QuickPool_SizeLimited_HOST", host_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto host_env = - std::unique_ptr(new hostEnv(world, host_dynamic_pool)); + auto host_env = std::unique_ptr(new Env(world, host_dynamic_pool)); instance_accessor() = std::move(host_env); } } @@ -131,7 +140,7 @@ class hostEnv { } protected: - hostEnv(World& world, umpire::Allocator host_alloc) + Env(World& world, umpire::Allocator host_alloc) : world_(&world), host_allocator_(host_alloc) {} private: @@ -142,21 +151,13 @@ class hostEnv { // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator host_allocator_; - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; return instance_; } }; -namespace detail { - -struct get_host_allocator { - umpire::Allocator& operator()() { - return hostEnv::instance()->host_allocator(); - } -}; - -} // namespace detail +} // namespace host } // namespace TiledArray From 57eb4e14098b35481f028e8a85cf3d0c51e10930 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 15:43:49 -0400 Subject: [PATCH 504/592] [ci] do not use gcc toolchain on macos, instead try linux + enable ccache --- .github/workflows/ci.yml | 55 +++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2339070e54..8e71db9403 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,16 +12,26 @@ jobs: strategy: fail-fast: false matrix: - os : [ macos-latest ] - cxx : [ 
clang++, /opt/homebrew/bin/g++-11 ] + os : [ macos-latest, ubuntu-22.04 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] - prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ] + include: + - os: ubuntu-22.04 + cc: /usr/bin/gcc-12 + cxx: /usr/bin/g++-12 + - os: macos-latest + cc: clang + cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }} ${{ matrix.task_backend }}" runs-on: ${{ matrix.os }} env: CXX : ${{ matrix.cxx }} + CCACHE_DIR : ${{github.workspace}}/build/.ccache + CCACHE_COMPRESS : true + CCACHE_COMPRESSLEVEL : 6 + OMPI_MCA_btl_vader_single_copy_mechanism : none + PARSEC_MCA_runtime_bind_threads : 0 BUILD_CONFIG : > -DMADNESS_TASK_BACKEND=${{ matrix.task_backend }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -33,18 +43,40 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: 'latest-stable' - - name: Host system info shell: bash run: cmake -P ${{github.workspace}}/ci/host_system_info.cmake - - name: Install ${{matrix.prerequisites}} + + - name: Install prerequisite MacOS packages + if: ${{ matrix.os == 'macos-latest' }} + run: | + brew install ninja boost eigen open-mpi bison scalapack ccache + echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV + + - name: Install prerequisites Ubuntu packages + if: ${{ matrix.os == 'ubuntu-22.04' }} run: | - brew install ${{matrix.prerequisites}} - echo "/usr/local/opt/bison/bin" >> $GITHUB_PATH + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null + sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" + sudo apt-get update + sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-openmpi-dev cmake doxygen + echo 
"MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV + + - name: Prepare ccache timestamp + id: ccache_cache_timestamp + shell: cmake -P {0} + run: | + string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) + message("::set-output name=timestamp::${current_date}") + + - name: Setup ccache cache files + uses: actions/cache@v1.1.0 + with: + path: ${{github.workspace}}/build/.ccache + key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }} + restore-keys: | + ${{ matrix.config.name }}-ccache- - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash @@ -56,8 +88,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake --build . --target tiledarray - cmake --build . --target examples + ccache -p && ccache -z && cmake --build . --target tiledarray && cmake --build . --target examples && ccache -s - name: Test working-directory: ${{github.workspace}}/build From 7343ae68308807ebe55e0c83413be75b1fe36ce5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:02:46 -0400 Subject: [PATCH 505/592] [ci] try symlinking libscalapack-openmpi.so to libscalapack.so to help out FindReferenceSCALAPACK --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e71db9403..143c88f8ea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,7 +60,8 @@ jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-openmpi-dev cmake doxygen + sudo apt-get -y install ninja-build g++-12 liblapack-dev 
libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen + sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - name: Prepare ccache timestamp @@ -82,7 +83,7 @@ jobs: shell: bash run: | set -x; - cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) + cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml) - name: Build working-directory: ${{github.workspace}}/build From df09400150a095500421a04397b41f54567da86a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:48:02 -0400 Subject: [PATCH 506/592] [unit] [cuda] another disambiguation via at_ordinal --- tests/expressions_device_um.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/expressions_device_um.cpp b/tests/expressions_device_um.cpp index e624756561..d49b425372 100644 --- a/tests/expressions_device_um.cpp +++ b/tests/expressions_device_um.cpp @@ -85,7 +85,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { template static Tile make_rand_tile(const typename TA::Range& r) { Tile tile(r); - for (std::size_t i = 0ul; i < tile.size(); ++i) set_random(tile[i]); + for (std::size_t i = 0ul; i < tile.size(); ++i) + set_random(tile.at_ordinal(i)); return tile; } From 0680b70b453751178c7f582259c8f9e623525376 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:48:25 -0400 Subject: [PATCH 507/592] [ci] disable Gitlab jobs except CUDA --- .gitlab-ci.yml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02c3edc266..8b675a692c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,22 +57,10 @@ ubuntu: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:20.04" ] 
- CXX: [ g++ ] - BUILD_TYPE : [ "RelWithDebInfo" ] - BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] - BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] - # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL - RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04" ] - CXX: [ g++, clang++-13 ] - BUILD_TYPE : [ "RelWithDebInfo" ] - ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - RUNNER_TAGS: [ saas-linux-small-amd64 ] - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++ ] BUILD_TYPE : [ "RelWithDebInfo" ] + TA_PYTHON : [ "TA_PYTHON=OFF" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] From 016cc8155878b70b0216c517b8a12dcca98fd196 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:42:56 -0400 Subject: [PATCH 508/592] introduce {Tensor,Tile}::c{begin,end} --- src/TiledArray/tensor/tensor.h | 28 ++++++++++++++++++++++++++-- src/TiledArray/tile.h | 20 ++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index bf729e59d9..bd72af487c 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1003,7 +1003,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the first data element + /// \return A const iterator to the first data element const_iterator begin() const { return (this->data() ? this->data() : NULL); } /// Iterator factory @@ -1013,7 +1013,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the last data element + /// \return A const iterator to the last data element const_iterator end() const { return (this->data() ? this->data() + this->size() : NULL); } @@ -1023,6 +1023,30 @@ class Tensor { /// \return An iterator to the last data element iterator end() { return (this->data() ? 
this->data() + this->size() : NULL); } + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() const { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() const { + return (this->data() ? this->data() + this->size() : NULL); + } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() { + return (this->data() ? this->data() + this->size() : NULL); + } + /// Read-only access to the data /// \return A const pointer to the tensor data diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 39fca37d9e..90f7366bbc 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -201,6 +201,26 @@ class Tile { /// \return A const iterator to the last data element decltype(auto) end() const { return std::end(tensor()); } + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() const { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() { return std::cend(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() const { return std::cend(tensor()); } + // Data accessor ------------------------------------------------------- /// Data direct access From a125ad7f1de1b0f9bc54454d35fe46970290cec6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:44:21 -0400 Subject: [PATCH 509/592] TensorInterface is a range --- src/TiledArray/tensor/tensor_interface.h | 
76 ++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index 7a23307036..7a2e350a2f 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -273,6 +273,82 @@ class TensorInterface { return data_[range_.ordinal(idx...)]; } + /// \brief Tensor interface iterator type + /// + /// Iterates over elements of a tensor interface whose range is iterable + template + class Iterator : public boost::iterator_facade< + Iterator, + std::conditional_t, + const typename TI::value_type, + typename TI::value_type>, + boost::forward_traversal_tag> { + public: + using range_iterator = typename TI::range_type::const_iterator; + + Iterator(range_iterator idx_it, TI& ti) : idx_it(idx_it), ti(ti) {} + + private: + range_iterator idx_it; + TI& ti; + + friend class boost::iterator_core_access; + + /// \brief increments this iterator + void increment() { ++idx_it; } + + /// \brief Iterator comparer + /// \return true, if \c `*this==*other` + bool equal(Iterator const& other) const { + return this->idx_it == other.idx_it; + } + + /// \brief dereferences this iterator + /// \return const reference to the current index + auto& dereference() const { + return ti.at_ordinal(ti.range().ordinal(*idx_it)); + } + }; + friend class Iterator; + friend class Iterator; + + typedef Iterator iterator; ///< Iterator type + typedef Iterator const_iterator; ///< Iterator type + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator begin() const { + return const_iterator(range().begin(), *this); + } + + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator end() const { return const_iterator(range().end(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator begin() 
{ return iterator(range().begin(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator end() { return iterator(range().end(), *this); } + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator cbegin() const { + return const_iterator(range().begin(), *this); + } + + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator cend() const { return const_iterator(range().end(), *this); } + /// Check for empty view /// \return \c false From 81a3af5c3643e8d6a9a6f7acc010499d8f50a939 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:44:41 -0400 Subject: [PATCH 510/592] dox fixup --- src/TiledArray/range1.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index 8b185936d4..a29e0d607c 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -163,8 +163,6 @@ struct Range1 { /// \return An iterator that points to the beginning of the local element set const_iterator cend() const { return end(); } - /// @} - /// shifts this Range1 /// @param[in] shift the shift to apply From 87b024b908f46dafd65cc30d1332c385957bc659 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:46:12 -0400 Subject: [PATCH 511/592] btas::Tensor can be copied into from Tensor and TensorInterface ... 
this allows TA::retile on DistArrays of btas::Tensors --- src/TiledArray/external/btas.h | 7 +++++++ tests/btas.cpp | 15 +++++++++++++++ tests/expressions_btas.cpp | 2 ++ 3 files changed, 24 insertions(+) diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index fe84e6f0c6..c22afd3813 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -62,6 +62,13 @@ class boxrange_iteration_order { static constexpr int value = row_major; }; +template +class is_tensor> : public std::true_type {}; + +template +class is_tensor> + : public std::true_type {}; + } // namespace btas namespace TiledArray { diff --git a/tests/btas.cpp b/tests/btas.cpp index 9c15540e9a..4e972cfc28 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -256,6 +256,21 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { BOOST_REQUIRE_NO_THROW(Tensor t1 = t0); Tensor t1 = t0; BOOST_CHECK(t1.empty()); + + // can copy TA::Tensor to btas::Tensor + TA::Tensor ta_tensor(r); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor)); + Tensor t2(ta_tensor); + for (auto i : r) { + BOOST_CHECK_EQUAL(ta_tensor(i), t2(i)); + } + + // can copy TA::TensorInterface to btas::Tensor + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(r.lobound(), r.upbound()))); + Tensor t3(ta_tensor.block(r.lobound(), r.upbound())); + for (auto i : r) { + BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + } } BOOST_AUTO_TEST_CASE_TEMPLATE(copy, Array, array_types) { diff --git a/tests/expressions_btas.cpp b/tests/expressions_btas.cpp index 83ff4b1ed0..7b1ae422ce 100644 --- a/tests/expressions_btas.cpp +++ b/tests/expressions_btas.cpp @@ -23,6 +23,8 @@ * */ +#include + #ifdef TILEDARRAY_HAS_BTAS #include "expressions_fixture.h" From f95e0dbe609c8090d2e5ab0203030b19e3ea8fb6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 08:37:10 -0400 Subject: [PATCH 512/592] [unit] btas_suite/tensor_ctor: initialize ta_tensor properly --- tests/btas.cpp | 3 ++- 1 file changed, 2 insertions(+), 
1 deletion(-) diff --git a/tests/btas.cpp b/tests/btas.cpp index 4e972cfc28..ebaf2f02a4 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -258,7 +258,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { BOOST_CHECK(t1.empty()); // can copy TA::Tensor to btas::Tensor - TA::Tensor ta_tensor(r); + TA::Tensor ta_tensor; + ta_tensor = make_rand_tile(r); BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor)); Tensor t2(ta_tensor); for (auto i : r) { From 6a926a9fc0f6168d142717a347b72afb614fd1be Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 08:55:11 -0400 Subject: [PATCH 513/592] fixup TensorInterface::Iterator::deference() --- src/TiledArray/tensor/tensor_interface.h | 12 +++++------- tests/btas.cpp | 13 +++++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index 7a2e350a2f..46663aad2f 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -259,8 +259,8 @@ class TensorInterface { /// \param idx The index pack template reference operator()(const Index&... idx) { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; } /// Element accessor @@ -269,8 +269,8 @@ class TensorInterface { /// \param idx The index pack template const_reference operator()(const Index&... 
idx) const { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; } /// \brief Tensor interface iterator type @@ -305,9 +305,7 @@ class TensorInterface { /// \brief dereferences this iterator /// \return const reference to the current index - auto& dereference() const { - return ti.at_ordinal(ti.range().ordinal(*idx_it)); - } + auto& dereference() const { return ti(*idx_it); } }; friend class Iterator; friend class Iterator; diff --git a/tests/btas.cpp b/tests/btas.cpp index ebaf2f02a4..c396110a2f 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -267,10 +267,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { } // can copy TA::TensorInterface to btas::Tensor - BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(r.lobound(), r.upbound()))); - Tensor t3(ta_tensor.block(r.lobound(), r.upbound())); - for (auto i : r) { - BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + { + const auto l = {3, 3, 3}; + const auto u = r.upbound(); + BOOST_REQUIRE(r.includes(l)); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(l, u))); + Tensor t3(ta_tensor.block(l, u)); + for (auto i : t3.range()) { + BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + } } } From 3c2f7e579668062debe9fd9105dbd8cfaf33f857 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 09:05:19 -0400 Subject: [PATCH 514/592] pull in https://github.com/ValeevGroup/BTAS/pull/179 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index db11ed24df..0e573bb050 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -41,7 +41,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4b3757cc2b5862f93589afc1e37523e543779c7a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 87804775f9..3363908bf3 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) -set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) +set(TA_TRACKED_BTAS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 1f186cc7bd6988fe8083a4faad5b579f87e71c44 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 31 Aug 2024 07:47:55 -0400 Subject: [PATCH 515/592] TiledRange1: can construct using a range of tile hashmarks --- src/TiledArray/tiled_range1.h | 29 
++++++++++++++++++++++++----- tests/tiled_range1.cpp | 15 +++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index e25c8a5357..4824dec26e 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -98,19 +98,38 @@ class TiledRange1 { /// Construct a 1D tiled range. - /// This will construct a 1D tiled range with tile boundaries ("hashmarks") - /// {\p t0 , \p t_rest... } + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). + /// The number of tile boundaries is n + 1, where n is the number of tiles. + /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... + /// Tiles are indexed starting with 0. + /// \tparam Integer An integral type + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest + /// \note validity of the {\p t0 , \p t_rest... } range is checked using + /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined + template >> + explicit TiledRange1(Range&& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); + } + + /// Construct a 1D tiled range. + + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... /// Tiles are indexed starting with 0. /// \tparam Integer An integral type - /// \param list The list of tile boundaries in order from smallest to largest + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest /// \note validity of the {\p t0 , \p t_rest... 
} range is checked using /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined template >> - explicit TiledRange1(const std::initializer_list& list) { - init_tiles_(list.begin(), list.end(), 0); + explicit TiledRange1(const std::initializer_list& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); } /// Copy assignment operator diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index b4aef7f51c..f01a9a208e 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -110,6 +110,21 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // check constructor using range of tile boundaries. + { + if (Range1Fixture::ntiles == 5) { + TiledRange1 r(a); + BOOST_CHECK_EQUAL(r.tiles_range().first, tiles.first); + BOOST_CHECK_EQUAL(r.tiles_range().second, tiles.second); + BOOST_CHECK_EQUAL(r.elements_range().first, elements.first); + BOOST_CHECK_EQUAL(r.elements_range().second, elements.second); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(r.tile(i).first, a[i]); + BOOST_CHECK_EQUAL(r.tile(i).second, a[i + 1]); + } + } + } + // check construction with negative index values #ifdef TA_SIGNED_1INDEX_TYPE { From 91b7f26f46c8b5ba5020f9905d73a40c89538f89 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 31 Aug 2024 08:04:34 -0400 Subject: [PATCH 516/592] <-> Eigen::{Vector,Matrix,Tensor} conversions can handle DistArrays with non-zero base Ranges --- src/TiledArray/conversions/eigen.h | 209 +++++++----- tests/eigen.cpp | 491 ++++++++++++++++------------- tests/range_fixture.h | 47 ++- 3 files changed, 443 insertions(+), 304 deletions(-) diff --git a/src/TiledArray/conversions/eigen.h b/src/TiledArray/conversions/eigen.h index 816a8bfe24..3caeecc178 100644 --- a/src/TiledArray/conversions/eigen.h +++ b/src/TiledArray/conversions/eigen.h @@ -196,20 +196,26 @@ eigen_map(T& tensor) { /// Copy a block of an Eigen matrix into a tensor -/// A block of \c matrix will be copied into \c tensor. 
The block -/// dimensions will be determined by the dimensions of the tensor's range. +// clang-format off +/// A block of \c matrix will be copied into \c tensor. If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0]) x `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] - base_offsets[1])`, +/// else it is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. +/// /// \tparam T A tensor type, e.g. TiledArray::Tensor /// \tparam Derived The derived type of an Eigen matrix /// \param[in] matrix The object that will be assigned the content of \c tensor -/// \param[out] tensor The object that will be assigned the content of \c matrix +/// \param[out] tensor The object that will contain the block of \c matrix +/// \param[in] base_offsets The base offsets for the tensor range (should be lobound of the array that will contain tensor as a tile) /// \throw TiledArray::Exception When the dimensions of \c tensor are not equal /// to 1 or 2. /// \throw TiledArray::Exception When the range of \c tensor is outside the /// range of \c matrix . 
+// clang-format on template >* = nullptr> -inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, - T& tensor) { +inline void eigen_submatrix_to_tensor( + const Eigen::MatrixBase& matrix, T& tensor, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -223,60 +229,71 @@ inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, if (tensor.range().rank() == 2u) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; + const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy matrix eigen_map(tensor, tensor_extent_0, tensor_extent_1) = matrix.block( - tensor_lower_0, tensor_lower_1, tensor_extent_0, tensor_extent_1); + tensor_lower_0 - base_offsets[0], tensor_lower_1 - base_offsets[1], + tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = 
tensor_upper[0]; - const std::size_t tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; // Check that matrix is a vector. TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the row vector to tensor eigen_map(tensor, 1, tensor_extent_0) = - matrix.block(0, tensor_lower_0, 1, tensor_extent_0); + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the column vector to tensor eigen_map(tensor, tensor_extent_0, 1) = - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1); + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1); } } } /// Copy the content of a tensor into an Eigen matrix block -/// The content of tensor will be copied into a block of matrix. The block -/// dimensions will be determined by the dimensions of the tensor's range. -/// \tparam T A tensor type, e.g. TiledArray::Tensor -/// \tparam Derived The derived type of an Eigen matrix -/// \param[in] tensor The object that will be copied to \c matrix -/// \param[out] matrix The object that will be assigned the content of \c tensor -/// \throw TiledArray::Exception When the dimensions of \c tensor are not equal -/// to 1 or 2. -/// \throw TiledArray::Exception When the range of \c tensor is outside the -/// range of \c matrix . +/// The content of tensor will be copied into a block of matrix. 
+/// If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], +/// tensor.range().upbound()[0] - base_offsets[0]) x +/// `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] +/// - base_offsets[1])`, else it is `[tensor.range().lobound()[0] - +/// base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. \tparam T +/// A tensor type, e.g. TiledArray::Tensor \tparam Derived The derived type of +/// an Eigen matrix \param[in] tensor The object that will be copied to \c +/// matrix \param[out] matrix The object that will be assigned the content of \c +/// tensor \param[in] base_offsets The base offsets for the tensor range (should +/// be lobound of the array that will contain tensor as a tile) \throw +/// TiledArray::Exception When the dimensions of \c tensor are not equal to 1 +/// or 2. \throw TiledArray::Exception When the range of \c tensor is outside +/// the range of \c matrix . template >* = nullptr> -inline void tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase& matrix) { +inline void tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase& matrix, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -290,39 +307,44 @@ inline void tensor_to_eigen_submatrix(const T& tensor, if (tensor.range().rank() == 2) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; 
+ const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy tensor into matrix - matrix.block(tensor_lower_0, tensor_lower_1, tensor_extent_0, + matrix.block(tensor_lower_0 - base_offsets[0], + tensor_lower_1 - base_offsets[1], tensor_extent_0, tensor_extent_1) = eigen_map(tensor, tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - const std::size_t tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into row vector - matrix.block(0, tensor_lower_0, 1, tensor_extent_0) = + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0) = eigen_map(tensor, 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into column vector - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1) = + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1) = eigen_map(tensor, tensor_extent_0, 1); } } @@ 
-344,7 +366,12 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, const typename A::ordinal_type i, madness::AtomicInt* counter) { typename A::value_type tensor(array->trange().make_tile_range(i)); - eigen_submatrix_to_tensor(*matrix, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array->trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + eigen_submatrix_to_tensor(*matrix, tensor, array_lobound); array->set(i, tensor); (*counter)++; } @@ -357,10 +384,11 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, /// \param tensor The tensor to be copied /// \param counter The task counter template -void counted_tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase* matrix, - madness::AtomicInt* counter) { - tensor_to_eigen_submatrix(tensor, *matrix); +void counted_tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase* matrix, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_submatrix(tensor, *matrix, base_offsets); (*counter)++; } @@ -524,6 +552,12 @@ array_to_eigen(const DistArray& array) { EigenMatrix matrix = EigenMatrix::Zero(array_extent[0], (rank == 2 ? 
array_extent[1] : 1)); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array.trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + // Spawn tasks to copy array tiles to the Eigen matrix madness::AtomicInt counter; counter = 0; @@ -533,7 +567,7 @@ array_to_eigen(const DistArray& array) { array.world().taskq.add( &detail::counted_tensor_to_eigen_submatrix< EigenMatrix, typename DistArray::value_type>, - array.find(i), &matrix, &counter); + array.find(i), &matrix, array_lobound, &counter); ++n; } } @@ -565,6 +599,7 @@ array_to_eigen(const DistArray& array) { /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -634,6 +669,7 @@ inline A row_major_buffer_to_array( /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -705,11 +741,13 @@ inline A column_major_buffer_to_array( /// match. 
// clang-format on template + typename Tensor_, std::size_t NumIndices_Sz = NumIndices_> inline void eigen_subtensor_to_tensor( const Eigen::Tensor& src, - Tensor_& dst) { + Tensor_& dst, + std::array base_offsets = {}) { TA_ASSERT(dst.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -718,6 +756,13 @@ inline void eigen_subtensor_to_tensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -725,8 +770,8 @@ inline void eigen_subtensor_to_tensor( }; const auto& dst_range = dst.range(); - auto src_block = - src.slice(to_array(dst_range.lobound()), to_array(dst_range.extent())); + auto src_block = src.slice(to_base0(to_array(dst_range.lobound())), + to_array(dst_range.extent())); auto dst_eigen_map = Eigen::TensorMap< Eigen::Tensor>( dst.data(), to_array(dst_range.extent())); @@ -758,11 +803,13 @@ inline void eigen_subtensor_to_tensor( /// of \c src and \c dst do not match. 
// clang-format on template + typename IndexType_, std::size_t NumIndices_Sz = NumIndices_> inline void tensor_to_eigen_subtensor( const Tensor_& src, - Eigen::Tensor& dst) { + Eigen::Tensor& dst, + std::array base_offsets = {}) { TA_ASSERT(src.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -771,6 +818,13 @@ inline void tensor_to_eigen_subtensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -778,8 +832,8 @@ inline void tensor_to_eigen_subtensor( }; const auto& src_range = src.range(); - auto dst_block = - dst.slice(to_array(src_range.lobound()), to_array(src_range.extent())); + auto dst_block = dst.slice(to_base0(to_array(src_range.lobound())), + to_array(src_range.extent())); auto src_eigen_map = Eigen::TensorMap< Eigen::Tensor>( src.data(), to_array(src_range.extent())); @@ -809,7 +863,13 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - eigen_subtensor_to_tensor(*src, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + dst->trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + dst->trange().rank(), + array_lobound.begin()); + eigen_subtensor_to_tensor(*src, tensor, array_lobound); dst->set(i, tensor); (*counter)++; } @@ -822,10 +882,11 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, /// \param dst The destination tensor /// \param counter The task counter template -void 
counted_tensor_to_eigen_subtensor(const TA_Tensor_& src, - Eigen_Tensor_* dst, - madness::AtomicInt* counter) { - tensor_to_eigen_subtensor(src, *dst); +void counted_tensor_to_eigen_subtensor( + const TA_Tensor_& src, Eigen_Tensor_* dst, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_subtensor(src, *dst, base_offsets); (*counter)++; } @@ -1004,6 +1065,12 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, result_type result(src.trange().elements_range().extent()); result.setZero(); + const auto* range_lobound_data = + src.trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + src.trange().rank(), + array_lobound.begin()); + // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; counter = 0; @@ -1012,7 +1079,7 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, if (!src.is_zero(i)) { src.world().taskq.add( &detail::counted_tensor_to_eigen_subtensor, - src.find(i), &result, &counter); + src.find(i), &result, array_lobound, &counter); ++n; } } diff --git a/tests/eigen.cpp b/tests/eigen.cpp index d577804417..11ca7088b1 100644 --- a/tests/eigen.cpp +++ b/tests/eigen.cpp @@ -29,9 +29,16 @@ struct EigenFixture : public TiledRangeFixture { : trange(dims.begin(), dims.begin() + 2), trange1(dims.begin(), dims.begin() + 1), trangeN(dims.begin(), dims.begin() + GlobalFixture::dim), + trange_base1(dims_base1.begin(), dims_base1.begin() + 2), + trange1_base1(dims_base1.begin(), dims_base1.begin() + 1), + trangeN_base1(dims_base1.begin(), + dims_base1.begin() + GlobalFixture::dim), array(*GlobalFixture::world, trange), array1(*GlobalFixture::world, trange1), arrayN(*GlobalFixture::world, trangeN), + array_base1(*GlobalFixture::world, trange_base1), + array1_base1(*GlobalFixture::world, trange1_base1), + arrayN_base1(*GlobalFixture::world, trangeN_base1), matrix(dims[0].elements_range().second, dims[1].elements_range().second), 
rmatrix(dims[0].elements_range().second, @@ -43,9 +50,15 @@ struct EigenFixture : public TiledRangeFixture { TiledRange trange; TiledRange trange1; TiledRange trangeN; + TiledRange trange_base1; // base-1 version of trange + TiledRange trange1_base1; // base-1 version of trange1 + TiledRange trangeN_base1; // base-1 version of trangeN TArrayI array; TArrayI array1; TArrayI arrayN; + TArrayI array_base1; // base-1 version of array + TArrayI array1_base1; // base-1 version of array1 + TArrayI arrayN_base1; // base-1 version of array1 Eigen::MatrixXi matrix; EigenMatrixXi rmatrix; Eigen::VectorXi vector; @@ -172,15 +185,23 @@ BOOST_AUTO_TEST_CASE(matrix_to_array) { (array = eigen_to_array(*GlobalFixture::world, trange, matrix))); // Check that the data in array is equal to that in matrix - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tile.get()[*tile_it], - matrix((*tile_it)[0], (*tile_it)[1])); + auto test = [&](const auto& array, auto base = 0) { + for (Range::const_iterator it = array.tiles_range().begin(); + it != array.tiles_range().end(); ++it) { + Future tile = array.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tile.get()[*tile_it], + matrix((*tile_it)[0] - base, (*tile_it)[1] - base)); + } } - } + }; + test(array, 0); + + // same with base-1 + BOOST_CHECK_NO_THROW((array_base1 = eigen_to_array( + *GlobalFixture::world, trange_base1, matrix))); + test(array_base1, 1); } BOOST_AUTO_TEST_CASE(vector_to_array) { @@ -193,14 +214,23 @@ BOOST_AUTO_TEST_CASE(vector_to_array) { trange1, vector))); // Check that the data in array matches the data in vector - for (Range::const_iterator it = array1.tiles_range().begin(); - it != 
array1.tiles_range().end(); ++it) { - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0])); + auto test = [&](const auto& array1, auto base = 0) { + for (Range::const_iterator it = array1.tiles_range().begin(); + it != array1.tiles_range().end(); ++it) { + Future tile = array1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0] - base)); + } } - } + }; + + test(array1, 0); + + // same with base-1 + BOOST_CHECK_NO_THROW((array1_base1 = eigen_to_array( + *GlobalFixture::world, trange1_base1, vector))); + test(array1_base1, 1); } BOOST_AUTO_TEST_CASE(array_to_matrix) { @@ -208,168 +238,180 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) { return array_to_eigen, DensePolicy, Eigen::RowMajor>(array); }; - if (GlobalFixture::world->size() == 1) { - // Fill the array with random data - GlobalFixture::world->srand(27); - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - TArrayI::value_type tile(array.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr = base == 1 ? 
array_base1 : array; + + if (GlobalFixture::world->size() == 1) { + // Fill the array with random data + GlobalFixture::world->srand(27); + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - array.set(*it, tile); - } - - // Convert the array to an Eigen matrices: column-major (matrix) and - // row-major (rmatrix) - BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array)); - BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array)); - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1)); - BOOST_CHECK_EQUAL(rmatrix.rows(), - array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(rmatrix.cols(), - array.trange().elements_range().extent(1)); - - // Check that the data in matrix matches the data in array - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); + // Convert the array to an Eigen matrices: column-major (matrix) and + // row-major (rmatrix) + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + 
BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1)); + BOOST_CHECK_EQUAL(rmatrix.rows(), + arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(rmatrix.cols(), + arr.trange().elements_range().extent(1)); + + // Check that the data in matrix matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array), TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr), TiledArray::Exception); + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = 
GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - array.set(*it, tile); - } - - // Distribute the data of array1 to all nodes - array.make_replicated(); - - BOOST_CHECK(array.pmap()->is_replicated()); - - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array)); - BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1)); - BOOST_CHECK_EQUAL(rmatrix.rows(), - array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(rmatrix.cols(), - array.trange().elements_range().extent(1)); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - BOOST_CHECK(array.is_local(*it)); - - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); + // Distribute the data of array1 to all nodes + arr.make_replicated(); + + BOOST_CHECK(arr.pmap()->is_replicated()); + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1)); + BOOST_CHECK_EQUAL(rmatrix.rows(), + arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(rmatrix.cols(), + arr.trange().elements_range().extent(1)); + + // Check that the data in vector matches the data in array + 
for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(array_to_vector) { - if (GlobalFixture::world->size() == 1) { - // Fill the array with random data - GlobalFixture::world->srand(27); - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr1 = base == 1 ? 
array1_base1 : array1; + + if (GlobalFixture::world->size() == 1) { + // Fill the array with random data + GlobalFixture::world->srand(27); + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); + + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array1), 
TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array1.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array1.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr1), TiledArray::Exception); + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr1.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr1.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - // Distribute the data of array1 to all nodes - array1.make_replicated(); + // Distribute the data of array1 to all nodes + arr1.make_replicated(); - BOOST_CHECK(array1.pmap()->is_replicated()); + BOOST_CHECK(arr1.pmap()->is_replicated()); - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for 
(Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - BOOST_CHECK(array1.is_local(*it)); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + BOOST_CHECK(arr1.is_local(*it)); - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(subtensor_to_tensor) { @@ -430,22 +472,26 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) { BOOST_CHECK(eq() == true); } - // Copy matrix to array - BOOST_CHECK_NO_THROW((array = eigen_tensor_to_array( - *GlobalFixture::world, trangeN, tensor))); - - // Check that the data in array is equal to that in matrix - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - std::array idx; - auto& t_idx = *tile_it; - std::copy(t_idx.begin(), t_idx.end(), idx.begin()); - BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + for (auto base : {0, 1}) { + auto& tr = base == 1 ? trangeN_base1 : trangeN; + auto& arr = base == 1 ? 
arrayN_base1 : arrayN; + // Copy matrix to array + BOOST_CHECK_NO_THROW((arr = eigen_tensor_to_array( + *GlobalFixture::world, tr, tensor))); + + // Check that the data in array is equal to that in matrix + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + auto& t_idx = *tile_it; + std::array idx; + for (auto d = 0; d != GlobalFixture::dim; ++d) idx[d] = t_idx[d] - base; + BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + } } - } + } // base } BOOST_AUTO_TEST_CASE(array_to_tensor) { @@ -462,57 +508,70 @@ BOOST_AUTO_TEST_CASE(array_to_tensor) { return result; }; - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = arrayN.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = arrayN.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(arrayN.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr = base == 1 ? 
arrayN_base1 : arrayN; + + auto to_base0 = [&](const auto& arr) { + std::array result; + for (int i = 0; i < GlobalFixture::dim; ++i) result[i] = arr[i] - base; + return result; + }; + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - arrayN.set(*it, tile); - } - if (GlobalFixture::world->size() > 1) { - // Check that array_to_eigen_tensor throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen_tensor(arrayN), - TiledArray::Exception); - } + if (GlobalFixture::world->size() > 1) { + // Check that array_to_eigen_tensor throws when there is more than one + // node + BOOST_CHECK_THROW(array_to_eigen_tensor(arr), + TiledArray::Exception); + } - // Distribute the data of arrayN to all nodes - if (GlobalFixture::world->size() > 1) { - arrayN.make_replicated(); - BOOST_CHECK(arrayN.pmap()->is_replicated()); - } + // Distribute the data of arrayN to all nodes + if (GlobalFixture::world->size() > 1) { + arr.make_replicated(); + BOOST_CHECK(arr.pmap()->is_replicated()); + } + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arr)); + BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL_COLLECTIONS( + tensor.dimensions().begin(), tensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + arr.trange().elements_range().extent().end()); + BOOST_CHECK_EQUAL_COLLECTIONS( + rtensor.dimensions().begin(), rtensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + 
arr.trange().elements_range().extent().end()); - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arrayN)); - BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arrayN)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL_COLLECTIONS( - tensor.dimensions().begin(), tensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - BOOST_CHECK_EQUAL_COLLECTIONS( - rtensor.dimensions().begin(), rtensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - - // Check that the data in vector matches the data in array - for (Range::const_iterator it = arrayN.tiles_range().begin(); - it != arrayN.tiles_range().end(); ++it) { - BOOST_CHECK(arrayN.is_local(*it)); - - Future tile = arrayN.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tensor(to_array(*tile_it)), tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rtensor(to_array(*tile_it)), tile.get()[*tile_it]); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rtensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + } } - } + } // base=0,1 } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/range_fixture.h b/tests/range_fixture.h index 3eb9afd611..6b0fcd1604 100644 --- a/tests/range_fixture.h +++ b/tests/range_fixture.h @@ -65,37 +65,46 @@ struct RangeFixture { }; struct Range1Fixture { + using index1_type = 
Range1::index1_type; static const size_t ntiles = 5; Range1Fixture() - : a(init_tiling()), - tiles(0, a.size() - 1), - elements(a.front(), a.back()), - tr1(a.begin(), a.end()) {} + : tr1_hashmarks(make_hashmarks()), + a(tr1_hashmarks), + tiles(0, tr1_hashmarks.size() - 1), + elements(tr1_hashmarks.front(), tr1_hashmarks.back()), + tr1(tr1_hashmarks), + tr1_base1(make_hashmarks(1)) {} ~Range1Fixture() {} template - static std::array init_tiling() { - std::array result; - result[0] = 0u; + static std::array make_hashmarks(index1_type offset = 0) { + std::array result; + result[0] = offset; for (std::size_t i = 1; i < D; ++i) result[i] = result[i - 1] + GlobalFixture::primes[i - 1]; return result; } - const std::array a; - const TiledRange1::range_type tiles; - const TiledRange1::range_type elements; - TiledRange1 tr1; + const std::array tr1_hashmarks; + const std::array + a; // copy of tr1_hashmarks, to make legacy tests build + const TiledRange1::range_type tiles; // = tr1.tiles_range() + const TiledRange1::range_type elements; // = tr1.elements_range() + TiledRange1 tr1; // base-0 TiledRange1 std::array tile; + TiledRange1 tr1_base1; // base-1 TiledRange1 }; struct TiledRangeFixtureBase : public Range1Fixture { TiledRangeFixtureBase() { std::fill(dims.begin(), dims.end(), tr1); std::fill(extents.begin(), extents.end(), tr1.extent()); + std::fill(dims_base1.begin(), dims_base1.end(), tr1_base1); } - std::array dims; + std::array dims; // base-0 TiledRange1's + std::array + dims_base1; // base-1 version of dims std::array extents; }; // struct TiledRangeFixtureBase @@ -106,17 +115,21 @@ struct TiledRangeFixture : public RangeFixture, public TiledRangeFixtureBase { TiledRangeFixture() : tiles_range(TiledRangeFixture::index(GlobalFixture::dim, 0), TiledRangeFixture::index(GlobalFixture::dim, 5)), - elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, 0), - TiledRangeFixture::tile_index(GlobalFixture::dim, a[5])), - tr(dims.begin(), dims.end()) {} + 
elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.front()), + TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.back())), + tr(dims.begin(), dims.end()), + tr_base1(dims_base1.begin(), dims_base1.end()) {} ~TiledRangeFixture() {} static tile_index fill_tile_index(TRangeN::range_type::index::value_type); const TRangeN::range_type tiles_range; - const TRangeN::range_type elements_range; - TRangeN tr; + const TRangeN::range_type elements_range; // elements range of tr + TRangeN tr; // base-0 TiledRangeN + TRangeN tr_base1; // base-1 version of tr }; #endif // TILEDARRAY_RANGE_FIXTURE_H__INCLUDED From d4be91c42da3dc6a235e129564cb59b89971781b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:33:05 -0400 Subject: [PATCH 517/592] [skip ci] typo --- src/TiledArray/conversions/concat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index cc55f91e17..dd35e09456 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -64,7 +64,7 @@ DistArray concat( using std::begin; using std::end; - index b(r), e(r); // updated for concatted modes only + index b(r), e(r); // updated for concatenated modes only std::fill(begin(b), end(b), 0); for (auto i = 0ul; i != arrays.size(); ++i) { auto& tr = arrays[i].trange(); From f2b319b4721a35b4e45da705b5a65da06ed53358 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:34:27 -0400 Subject: [PATCH 518/592] remove duplicate vlock copy in concat --- src/TiledArray/conversions/concat.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index dd35e09456..e7b3e9da55 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -97,9 +97,6 @@ DistArray concat( result.make_tsrexpr(annot).block(tile_begin_end[i].first, 
tile_begin_end[i].second) = arrays[i].make_tsrexpr(annot); - result.make_tsrexpr(annot).block(tile_begin_end[i].first, - tile_begin_end[i].second) = - arrays[i].make_tsrexpr(annot); } } result.world().gop.fence(); From bd5f35beaf015dd1527d1fe04b8da3facf63187e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:35:10 -0400 Subject: [PATCH 519/592] introduced tile_ranges_match_trange(DistArray) for validating tile ranges against trange --- src/TiledArray/dist_array.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 6baee2abe0..8ed2c8b043 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -1779,6 +1779,22 @@ auto rank(const DistArray& a) { return a.trange().tiles_range().rank(); } +/// Checks if for every tile `i` its range matches the tile range produced by +/// `a.trange()` + +/// @return `a.get(i)->range() == a.trange().make_tile_range(i)` for every tile +/// `i` +template +bool tile_ranges_match_trange(const DistArray& a) { + auto end = a.end(); + for (auto it = a.begin(); it != end; ++it) { + if (it->is_local() && !a.is_zero(it.index())) + if ((*it).get().range() != a.trange().make_tile_range(it.index())) + return false; + } + return true; +} + /// /// \brief Get the total elements in the non-zero tiles of an array. 
/// For tensor-of-tensor tiles, the total is the sum of the number of From 59b0a37f93e7d406742bb5cae773e4719407e7fa Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:35:47 -0400 Subject: [PATCH 520/592] SizeArray is a viewable range --- src/TiledArray/size_array.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index bd52139ce5..ef2ed1e121 100644 --- a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -26,6 +26,8 @@ #include #include +#include + namespace TiledArray { namespace detail { @@ -445,6 +447,20 @@ class SizeArray { }; // class SizeArray +} // namespace detail +} // namespace TiledArray + +namespace ranges { +template +inline constexpr bool enable_view> = true; +} // namespace ranges + +static_assert(ranges::range>); +static_assert( + ranges::viewable_range>); + +namespace TiledArray::detail { + template std::enable_if_t< is_sized_range_v> && @@ -473,7 +489,6 @@ inline std::ostream& operator<<(std::ostream& os, return os; } -} // namespace detail -} // namespace TiledArray +} // namespace TiledArray::detail #endif // TILEDARRAY_SIZE_ARRAY_H__INCLUDED From aed712869d210cdb135c1c37fe55f17153df8518 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 3 Sep 2024 23:38:19 -0400 Subject: [PATCH 521/592] fixed assignment to block expression from an expression with nonzero base --- src/TiledArray/expressions/expr.h | 13 +++++++++++-- tests/expressions_fixture.h | 19 +++++++++++++++++++ tests/expressions_impl.h | 29 +++++++++++++++++++++++++++++ tests/range_fixture.h | 2 +- 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index c3fdd6423b..8d52990eef 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,6 +47,9 @@ #include +#include +#include + namespace TiledArray::expressions { template @@ -509,8 +512,14 @@ 
class Expr { if (tsr.array().trange().tiles_range().volume() != 0) { // N.B. must deep copy TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound())); - const container::svector shift = - tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(); + // N.B. this expression's range, + // dist_eval.trange().elements_range().lobound(), may not be zero! + const auto shift = + ranges::views::zip_with( + [](auto a, auto b) { return a - b; }, + tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(), + dist_eval.trange().elements_range().lobound()) | + ranges::to>(); std::shared_ptr shift_op = std::make_shared(shift_op_type(shift)); diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 94c09a7449..7a7be4c9af 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -57,6 +57,8 @@ struct ExpressionsFixture : public TiledRangeFixture { ExpressionsFixture() : s_tr_1(make_random_sparseshape(tr)), s_tr_2(make_random_sparseshape(tr)), + s_tr_base1_1(make_random_sparseshape(tr_base1)), + s_tr_base1_2(make_random_sparseshape(tr_base1)), s_tr1_1(make_random_sparseshape(trange1)), s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), @@ -65,6 +67,9 @@ struct ExpressionsFixture : public TiledRangeFixture { a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), + a_base1(*GlobalFixture::world, tr_base1, s_tr_base1_1), + b_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), + c_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), aC(*GlobalFixture::world, trangeC, s_trC), aC_f(*GlobalFixture::world, trangeC_f, s_trC_f), u(*GlobalFixture::world, trange1, s_tr1_1), @@ -72,12 +77,16 @@ struct ExpressionsFixture : public TiledRangeFixture { w(*GlobalFixture::world, trange2, s_tr2) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); random_fill(aC); 
GlobalFixture::world->gop.fence(); a.truncate(); b.truncate(); + a_base1.truncate(); + b_base1.truncate(); u.truncate(); v.truncate(); } @@ -89,6 +98,9 @@ struct ExpressionsFixture : public TiledRangeFixture { : a(*GlobalFixture::world, tr), b(*GlobalFixture::world, tr), c(*GlobalFixture::world, tr), + a_base1(*GlobalFixture::world, tr_base1), + b_base1(*GlobalFixture::world, tr_base1), + c_base1(*GlobalFixture::world, tr_base1), u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), w(*GlobalFixture::world, trange2), @@ -96,6 +108,8 @@ struct ExpressionsFixture : public TiledRangeFixture { aC_f(*GlobalFixture::world, trangeC_f) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); random_fill(aC); @@ -229,6 +243,8 @@ struct ExpressionsFixture : public TiledRangeFixture { SparseShape s_tr_1; SparseShape s_tr_2; + SparseShape s_tr_base1_1; + SparseShape s_tr_base1_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; @@ -237,6 +253,9 @@ struct ExpressionsFixture : public TiledRangeFixture { TArray a; TArray b; TArray c; + TArray a_base1; + TArray b_base1; + TArray c_base1; TArray u; TArray v; TArray w; diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 268b118568..ca8027c03d 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -32,6 +32,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { auto& a = F::a; auto& c = F::c; auto& aC = F::aC; + auto& a_base1 = F::a_base1; const auto& ca = a; const std::array lobound{{3, 3, 3}}; @@ -66,6 +67,8 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { BOOST_CHECK_NO_THROW(c("a,b,c") = ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5))); + BOOST_CHECK_NO_THROW(c("a,b,c") = a_base1("a,b,c").block(lobound, upbound)); + // make sure that c("abc") = a("abc") does a deep copy { BOOST_CHECK_NO_THROW(c("a,b,c") = a("a, b, c")); @@ -291,6 +294,7 @@ 
BOOST_FIXTURE_TEST_CASE_TEMPLATE(block, F, Fixtures, F) { auto& a = F::a; auto& b = F::b; auto& c = F::c; + auto& a_base1 = F::a_base1; BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -683,6 +687,31 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { } } +BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block_base1, F, Fixtures, F) { + auto& a = F::a; + auto& b = F::b; + auto& c = F::c; + auto& a_base1 = F::a_base1; + auto& c_base1 = F::c_base1; + auto& ntiles = F::ntiles; + + c.fill_local(0.0); + c_base1.fill_local(0.0); + + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = + a_base1("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW( + c_base1("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); +} + BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures, F) { auto& a = F::a; diff --git a/tests/range_fixture.h b/tests/range_fixture.h index 6b0fcd1604..5a554eab7c 100644 --- a/tests/range_fixture.h +++ b/tests/range_fixture.h @@ -66,7 +66,7 @@ struct RangeFixture { struct Range1Fixture { using index1_type = Range1::index1_type; - static const size_t ntiles = 5; + static const inline size_t ntiles = 5; Range1Fixture() : tr1_hashmarks(make_hashmarks()), From 1662f8b933f1232be83e9c392cd5966a280d2f1d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 11:24:53 -0400 Subject: [PATCH 522/592] [skip ci] to_container.hpp -> range/conversion.hpp to_container.hpp is deprecated --- src/TiledArray/expressions/expr.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 8d52990eef..f6d2ff1376 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,7 +47,7 @@ #include -#include +#include #include namespace TiledArray::expressions { From 5b26108980c5159bbb272a87739a53391089e65e Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 4 Sep 2024 16:13:56 -0400 Subject: [PATCH 523/592] [skip ci] svd dox fixup --- src/TiledArray/math/linalg/non-distributed/svd.h | 10 +++++----- src/TiledArray/math/linalg/scalapack/svd.h | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h index e0094ef906..3e3608240e 100644 --- a/src/TiledArray/math/linalg/non-distributed/svd.h +++ b/src/TiledArray/math/linalg/non-distributed/svd.h @@ -34,16 +34,16 @@ namespace TiledArray::math::linalg::non_distributed { /** - * @brief Compute the singular value decomposition (SVD) via ScaLAPACK + * @brief Compute the singular value decomposition (SVD) via LAPACK * * A(i,j) = S(k) U(i,k) conj(V(j,k)) * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) + * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) * * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * diff --git a/src/TiledArray/math/linalg/scalapack/svd.h b/src/TiledArray/math/linalg/scalapack/svd.h index dc68d374c5..aa9f459ba9 100644 --- a/src/TiledArray/math/linalg/scalapack/svd.h +++ b/src/TiledArray/math/linalg/scalapack/svd.h @@ -42,10 +42,10 @@ namespace TiledArray::math::linalg::scalapack { * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) 
+ * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) * * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * From 9c84513a48a16fbef5829d9f6a9ba91114300702 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 15:50:17 -0400 Subject: [PATCH 524/592] TiledRange1{int x} constructs an empty element range at [x,x) --- src/TiledArray/tiled_range1.h | 13 ++++++++----- tests/tiled_range1.cpp | 21 ++++++++++++++++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 4824dec26e..102ea1bcc8 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -338,10 +338,11 @@ class TiledRange1 { /// Validates tile_boundaries template static void valid_(RandIter first, RandIter last) { - // Verify at least 2 elements are present if the vector is not empty. - TA_ASSERT((std::distance(first, last) >= 2) && - "TiledRange1 construction failed: You need at least 2 " - "elements in the tile boundary list."); + // Need at least 1 tile hashmark to position the element range + // (zero hashmarks is handled by the default ctor) + TA_ASSERT((std::distance(first, last) >= 1) && + "TiledRange1 construction failed: You need at least 1 " + "element in the tile boundary list."); // Verify the requirement that a0 <= a1 <= a2 <= ... 
for (; first != (last - 1); ++first) { TA_ASSERT( @@ -364,7 +365,9 @@ class TiledRange1 { valid_(first, last); #endif // NDEBUG range_.first = start_tile_index; - range_.second = start_tile_index + last - first - 1; + using std::distance; + range_.second = + start_tile_index + static_cast(distance(first, last)) - 1; elements_range_.first = *first; elements_range_.second = *(last - 1); for (; first != (last - 1); ++first) diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index f01a9a208e..056f752e33 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -63,6 +63,25 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); } + // check construction with single tile boundary (hence zero tiles) + { + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(0)); + TiledRange1 r(0); + BOOST_CHECK_EQUAL(r, TiledRange1{}); + } + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(1)); + TiledRange1 r(1); + BOOST_CHECK_NE(r, TiledRange1{}); + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 0); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 1); + BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); + } + } + // check construction with a iterators and the range info. 
{ BOOST_REQUIRE_NO_THROW(TiledRange1 r(a.begin(), a.end())); @@ -200,7 +219,7 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), Exception); BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin()), Exception); - BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin() + 1), Exception); + BOOST_CHECK_NO_THROW(TiledRange1 r(a.begin(), a.begin() + 1)); boundaries.push_back(2); boundaries.push_back(0); BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), From 64ba684c61696f9f945f96ef7c97190ad97854d3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 23:35:03 -0400 Subject: [PATCH 525/592] make Range1 printable and shiftable --- src/TiledArray/range1.h | 34 +++++++++++++++++++++++++++++++++- tests/range1.cpp | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index dbb4b05a67..8b185936d4 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -32,7 +32,8 @@ namespace TiledArray { /// an integer range `[first,second)` /// @note previously represented by std::pair, hence the design struct Range1 { - typedef TA_1INDEX_TYPE index1_type; + using index1_type = TA_1INDEX_TYPE; + using signed_index1_type = std::make_signed_t; index1_type first = 0; index1_type second = 0; //< N.B. 
second >= first @@ -164,6 +165,31 @@ struct Range1 { /// @} + /// shifts this Range1 + + /// @param[in] shift the shift to apply + /// @return reference to this + Range1& inplace_shift(signed_index1_type shift) { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || upbound() <= 0 || + (shift <= (std::numeric_limits::max() - upbound()))); + TA_ASSERT(shift >= 0 || lobound() >= 0 || + (std::abs(shift) <= + (lobound() - std::numeric_limits::min()))); + first += shift; + second += shift; + return *this; + } + + /// creates a shifted Range1 + + /// @param[in] shift the shift value + /// @return a copy of this shifted by @p shift + [[nodiscard]] Range1 shift(signed_index1_type shift) const { + return Range1(*this).inplace_shift(shift); + } + template >>::type* = nullptr> @@ -190,6 +216,12 @@ inline void swap(Range1& r0, Range1& r1) { // no throw r0.swap(r1); } +/// Range1 ostream operator +inline std::ostream& operator<<(std::ostream& out, const Range1& rng) { + out << "[ " << rng.first << ", " << rng.second << " )"; + return out; +} + /// Test that two Range1 objects are congruent /// This function tests that the sizes of the two Range1 objects coincide. 
diff --git a/tests/range1.cpp b/tests/range1.cpp index ba49515cd7..f8d05ed4c0 100644 --- a/tests/range1.cpp +++ b/tests/range1.cpp @@ -137,6 +137,43 @@ BOOST_AUTO_TEST_CASE(comparison) { BOOST_CHECK(r1 != r4); } +BOOST_AUTO_TEST_CASE(shift) { + Range1 r0; + Range1 r0_plus_1; + BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1)); + BOOST_CHECK_EQUAL(r0_plus_1, Range1(1, 1)); + BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1)); + BOOST_CHECK_EQUAL(r0_plus_1, r0); + + using index1_type = Range1::index1_type; + BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits::max() - 1, + std::numeric_limits::max()} + .inplace_shift(1)), + Exception); + BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits::min(), + std::numeric_limits::min() + 1} + .inplace_shift(-1)), + Exception); + Range1 tmp; + BOOST_CHECK_TA_ASSERT( + tmp = (Range1{std::numeric_limits::max() - 1, + std::numeric_limits::max()} + .shift(1)), + Exception); + BOOST_CHECK_TA_ASSERT( + tmp = (Range1{std::numeric_limits::min(), + std::numeric_limits::min() + 1} + .shift(-1)), + Exception); + + Range1 r1{1, 3}; + Range1 r1_minus_1; + BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1)); + BOOST_CHECK_EQUAL(r1_minus_1, Range1(0, 2)); + BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1)); + BOOST_CHECK_EQUAL(r1_minus_1, r1); +} + BOOST_AUTO_TEST_CASE(serialization) { Range1 r{1, 10}; From 6a8f75fe5f0032339850c51c59ab753c3d1d54b2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 17:02:24 -0400 Subject: [PATCH 526/592] make TiledRange1 shiftable --- src/TiledArray/tiled_range1.h | 48 +++++++++++++++++++++++++++++++++++ tests/tiled_range1.cpp | 30 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 102ea1bcc8..9ea5769203 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -50,6 +50,7 @@ class TiledRange1 { public: using range_type = Range1; using index1_type = range_type::index1_type; + using 
signed_index1_type = range_type::signed_index1_type; using const_iterator = std::vector::const_iterator; /// Default constructor creates an empty range (tile and element ranges are @@ -305,6 +306,53 @@ class TiledRange1 { return make_uniform(Range1(0, range_extent), target_tile_size); } + /// shifts this TiledRange1 + + /// @param[in] shift the shift to apply + /// @return reference to this + TiledRange1& inplace_shift(signed_index1_type shift) { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 || + (shift <= (std::numeric_limits::max() - + elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + elements_range_.inplace_shift(shift); + for (auto& tile : tiles_ranges_) { + tile.inplace_shift(shift); + } + elem2tile_.reset(); + return *this; + } + + /// creates a shifted TiledRange1 + + /// equivalent to (but more efficient than) `TiledRange1(*this).inplace_shift(shift)` + /// @param[in] shift the shift value + [[nodiscard]] TiledRange1 shift(signed_index1_type shift) const { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 || + (shift <= (std::numeric_limits::max() - + elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + std::vector hashmarks; + hashmarks.reserve(tile_extent() + 1); + if (tiles_ranges_.empty()) + hashmarks.emplace_back(elements_range_.lobound() + shift); + else { + for (auto& t : tiles_ranges_) { + hashmarks.push_back(t.first + shift); + } + hashmarks.push_back(elements_range_.upbound() + shift); + } + return TiledRange1(hashmarks.begin(), hashmarks.end()); + } + /// swapper /// \param other the range with which the contents of this range will be diff --git 
a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 056f752e33..2fe958bd2d 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -360,4 +360,34 @@ BOOST_AUTO_TEST_CASE(make_uniform) { (TiledRange1{0, 10, 20, 30, 40, 50, 59})); } +BOOST_AUTO_TEST_CASE(shift) { + TiledRange1 r0; + TiledRange1 r0_plus_1; + BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1)); + BOOST_CHECK_EQUAL(r0_plus_1, TiledRange1(1)); + BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1)); + BOOST_CHECK_EQUAL(r0_plus_1, r0); + + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::max()}.inplace_shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::min()}.inplace_shift(-1), + Exception); + TiledRange1 tmp; + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::max()}.shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::min()}.shift(-1), + Exception); + + TiledRange1 r1{1, 3, 7, 9}; + TiledRange1 r1_minus_1; + BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1)); + BOOST_CHECK_EQUAL(r1_minus_1, TiledRange1(0, 2, 6, 8)); + BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1)); + BOOST_CHECK_EQUAL(r1_minus_1, r1); +} + BOOST_AUTO_TEST_SUITE_END() From c0ebdd3b1fe64122fbbb87f26e1ca42faf4e3cac Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 8 Sep 2024 23:37:24 -0400 Subject: [PATCH 527/592] TiledRange1 printer reimplemented in terms of Range1 printer --- src/TiledArray/tiled_range1.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 9ea5769203..46c4b37adc 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -485,10 +485,8 @@ inline bool operator!=(const TiledRange1& r1, const TiledRange1& r2) { /// TiledRange1 ostream operator inline std::ostream& operator<<(std::ostream& out, const TiledRange1& rng) { - out << "( tiles = [ " << rng.tiles_range().first << ", " - << 
rng.tiles_range().second << " ), elements = [ " - << rng.elements_range().first << ", " << rng.elements_range().second - << " ) )"; + out << "( tiles = " << rng.tiles_range() + << ", elements = " << rng.elements_range() << " )"; return out; } From b0803257c30261dfa98f251a11847e0ccb622890 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:00:28 -0400 Subject: [PATCH 528/592] [skip ci] dox++ --- src/TiledArray/range.h | 8 ++++---- src/TiledArray/tile.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 25e4852118..c3ce5aa7f7 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -949,7 +949,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range @@ -987,7 +987,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range @@ -998,7 +998,7 @@ class Range { return inplace_shift>(bound_shift); } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range @@ -1011,7 +1011,7 @@ class Range { return result; } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 7d568f7200..b8c62d95b8 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -39,19 +39,19 @@ namespace TiledArray { /// object to be used in TiledArray expressions, 
users must also define the /// following functions: /// \li \c add -/// \li \c add_to +/// \li \c add_to (in-place add) /// \li \c subt -/// \li \c subt_to +/// \li \c subt_to (in-place subt) /// \li \c mult -/// \li \c mult_to +/// \li \c mult_to (in-place mult) /// \li \c scale -/// \li \c scale_to +/// \li \c scale_to (in-place scale) /// \li \c gemm /// \li \c neg /// \li \c permute /// \li \c empty /// \li \c shift -/// \li \c shift_to +/// \li \c shift_to (in-place shift) /// \li \c trace /// \li \c sum /// \li \c product From c22bf70377013233e54cb8285e4e791f6ea8846f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:00:04 -0400 Subject: [PATCH 529/592] Range::shift is const --- src/TiledArray/range.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index c3ce5aa7f7..1363d6b992 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1005,7 +1005,7 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const Index& bound_shift) { + Range_ shift(const Index& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; @@ -1018,7 +1018,7 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const std::initializer_list& bound_shift) { + Range_ shift(const std::initializer_list& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; From 78931bc6526d2da4f98a143304d85475c9c3e95c Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:22:31 -0400 Subject: [PATCH 530/592] Range::shift is nodiscard --- src/TiledArray/range.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 1363d6b992..cdebd7ddfc 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -1005,7 +1005,7 @@ class Range { /// \return A shifted copy of this range template >> - 
Range_ shift(const Index& bound_shift) const { + [[nodiscard]] Range_ shift(const Index& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; @@ -1018,7 +1018,8 @@ class Range { /// \return A shifted copy of this range template >> - Range_ shift(const std::initializer_list& bound_shift) const { + [[nodiscard]] Range_ shift( + const std::initializer_list& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; From a847f70f750c77c705c39f50fb08e11652122ae8 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 9 Sep 2024 00:23:18 -0400 Subject: [PATCH 531/592] TiledRange is shiftable --- src/TiledArray/tiled_range.h | 55 ++++++++++++++++++++++++++++++++++++ tests/tiled_range.cpp | 11 ++++++++ 2 files changed, 66 insertions(+) diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index 27e559da1c..bfcd4c86fc 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -324,6 +324,61 @@ class TiledRange { std::swap(ranges_, other.ranges_); } + /// Shifts the lower and upper bounds of this range + + /// \tparam Index An integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const Index& bound_shift) { + elements_range_.inplace_shift(bound_shift); + using std::begin; + auto bound_shift_it = begin(bound_shift); + for (std::size_t d = 0; d != rank(); ++d, ++bound_shift_it) { + ranges_[d].inplace_shift(*bound_shift_it); + } + return *this; + } + + /// Shifts the lower and upper bound of this range + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const std::initializer_list& bound_shift) { + return inplace_shift>(bound_shift); + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An 
integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift(const Index& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift( + const std::initializer_list& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + template >>::type* = nullptr> diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 76702831a3..577b395927 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -155,6 +155,17 @@ BOOST_AUTO_TEST_CASE(permutation) { r1); // check that the permutation was assigned correctly. } +BOOST_AUTO_TEST_CASE(shift) { + TiledRange tr1 = tr; + const auto shift = std::vector(GlobalFixture::dim, 1); + BOOST_CHECK_NO_THROW(tr1.inplace_shift(shift)); + BOOST_CHECK_EQUAL(tr1.tiles_range(), tr.tiles_range()); + BOOST_CHECK_EQUAL(tr1.elements_range(), tr.elements_range().shift(shift)); + TiledRange tr1_copy; + BOOST_CHECK_NO_THROW(tr1_copy = tr.shift(shift)); + BOOST_CHECK_EQUAL(tr1, tr1_copy); +} + BOOST_AUTO_TEST_CASE(make_tiles_range) { tile_index start(GlobalFixture::dim); tile_index finish(GlobalFixture::dim); From 9ef3a8a47a7b470a5c1759d8ba0d62476bb6e0ab Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 11 Sep 2024 15:53:25 -0400 Subject: [PATCH 532/592] introduced BlkTsrExpr::{{set_,}trange_lobound,preserve_lobound}() that allow to use block tensor expressions even with DistArrays that have non-zero lobound --- src/TiledArray/expressions/blk_tsr_engine.h | 54 +++++-- src/TiledArray/expressions/blk_tsr_expr.h | 36 +++++ src/TiledArray/expressions/expr.h | 11 ++ 
src/TiledArray/expressions/fwd.h | 13 +- src/TiledArray/expressions/tsr_expr.h | 148 ++++++++++++++++++-- tests/expressions_impl.h | 52 +++++++ 6 files changed, 293 insertions(+), 21 deletions(-) diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index e85aac7925..9b6e750bb5 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -158,22 +158,29 @@ class BlkTsrEngineBase : public LeafEngine { using LeafEngine_::array_; container::svector - lower_bound_; ///< Lower bound of the tile block + lower_bound_; ///< Tile coordinates of the lower bound of the tile block + ///< in the host array container::svector - upper_bound_; ///< Upper bound of the tile block + upper_bound_; ///< Tile coordinates of the upper bound of the tile block + ///< in the host array + std::optional + trange_lobound_; ///< Lobound of the result trange, modulo permutation + ///< (i.e. referring to the modes of the host array) public: template BlkTsrEngineBase(const BlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} template BlkTsrEngineBase(const ScalBlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} /// Non-permuting tiled range factory function @@ -199,9 +206,12 @@ class BlkTsrEngineBase : public LeafEngine { if (lower_d != upper_d) { auto i = lower_d; const auto base_d = trange[d].tile(i).first; - trange1_data.emplace_back(0ul); + const auto trange1_lobound = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + trange1_data.emplace_back(trange1_lobound); for (; i < upper_d; ++i) - trange1_data.emplace_back(trange[d].tile(i).second - base_d); + trange1_data.emplace_back(trange[d].tile(i).extent() + + trange1_data.back()); // Add the trange1 to the tiled range data trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); trange1_data.resize(0ul); @@ -241,9 +251,12 @@ class BlkTsrEngineBase : public LeafEngine { // Copy, shift, and permute the tiling of the block auto i = lower_i; const auto base_d = trange[inv_perm_d].tile(i).first; - trange1_data.emplace_back(0ul); + const auto trange1_lobound = + trange_lobound_ ? (*trange_lobound_)[inv_perm_d] : 0ul; + trange1_data.emplace_back(trange1_lobound); for (; i < upper_i; ++i) - trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); + trange1_data.emplace_back(trange[inv_perm_d].tile(i).extent() + + trange1_data.back()); // Add the trange1 to the tiled range data trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); @@ -341,6 +354,7 @@ class BlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; using ExprEngine_::implicit_permute_inner_; using ExprEngine_::implicit_permute_outer_; @@ -391,8 +405,12 @@ class BlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); } else { range_shift.emplace_back(0l); } @@ -427,8 +445,11 @@ class BlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + // element lobound of the target of this expression + const auto target_base_d = trange_lobound_ ? (*trange_lobound_)[d] : 0; + range_shift[perm_d] = target_base_d - base_d; } } @@ -496,6 +517,7 @@ class ScalBlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; using ExprEngine_::implicit_permute_inner_; using ExprEngine_::implicit_permute_outer_; @@ -549,8 +571,12 @@ class ScalBlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? (*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); } else range_shift.emplace_back(0); } @@ -584,8 +610,12 @@ class ScalBlkTsrEngine const auto lower_d = lower[d]; const auto upper_d = upper[d]; if (lower_d != upper_d) { + // element lobound of the block in the host const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + range_shift[perm_d] = target_base_d - base_d; } } diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h index 5d6612d5cc..661e2ff666 100644 --- a/src/TiledArray/expressions/blk_tsr_expr.h +++ b/src/TiledArray/expressions/blk_tsr_expr.h @@ -32,6 +32,8 @@ #include #include "blk_tsr_engine.h" +#include + namespace TiledArray { namespace expressions { @@ -118,6 +120,10 @@ class BlkTsrExprBase : public Expr { lower_bound_; ///< Lower bound of the tile block container::svector upper_bound_; ///< Upper bound of the tile block + /// If non-null, element lobound of the expression trange (else zeros will be + /// used) Fusing permutation does not affect this (i.e. this refers to the + /// modes of the host array). + std::optional trange_lobound_; void check_valid() const { TA_ASSERT(array_); @@ -285,6 +291,36 @@ class BlkTsrExprBase : public Expr { /// \return The block upper bound const auto& upper_bound() const { return upper_bound_; } + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(const Index1& trange_lobound) { + trange_lobound_.emplace(std::begin(trange_lobound), + std::end(trange_lobound)); + return static_cast(*this); + } + + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(std::initializer_list trange_lobound) { + return this->set_trange_lobound>( + trange_lobound); + } + + /// Sets result trange lobound such that the tile lobounds are not changed + Derived& preserve_lobound() { + return set_trange_lobound( + array_.trange().make_tile_range(lower_bound()).lobound()); + } + + /// @return optional to result trange lobound; if null, the result trange + /// lobound is zero + const auto& trange_lobound() const { return trange_lobound_; } + }; // class BlkTsrExprBase /// Block expression diff --git 
a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index f6d2ff1376..8e3f925310 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -47,6 +47,7 @@ #include +#include #include #include @@ -464,6 +465,16 @@ class Expr { // set even though this is a requirement. #endif // NDEBUG + // Assignment to block expression uses trange of the array it is bounded to + // Assert that the user did not try to override the trange by accident using + // set_trange_lobound or at least that it matches tsr.array's trange + TA_ASSERT(!tsr.trange_lobound().has_value() || + (ranges::equal(tsr.trange_lobound().value(), + tsr.array() + .trange() + .make_tile_range(tsr.lower_bound()) + .lobound()))); + // Get the target world. World& world = tsr.array().world(); diff --git a/src/TiledArray/expressions/fwd.h b/src/TiledArray/expressions/fwd.h index 7960baf648..1d234b6dc5 100644 --- a/src/TiledArray/expressions/fwd.h +++ b/src/TiledArray/expressions/fwd.h @@ -28,7 +28,6 @@ #include - namespace TiledArray::expressions { template @@ -43,6 +42,10 @@ class BlkTsrExpr; template class ScalBlkTsrExpr; +/// used to indicate that block tensor expression should preserve the underlying +/// tensor's trange lobound +struct preserve_lobound_t {}; + template struct is_aliased : std::true_type {}; @@ -68,6 +71,14 @@ class ScalTsrExpr; template class ScalTsrEngine; +} // namespace TiledArray::expressions + +namespace TiledArray { + +/// used to tag block tensor expression methods that preserve the underlying +/// tensor's trange lobound +inline constexpr expressions::preserve_lobound_t preserve_lobound; + } // namespace TiledArray #endif // TILEDARRAY_EXPRESSIONS_FWD_H__INCLUDED diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index 8430a3c852..68e036f4c4 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -197,7 +197,7 @@ class TsrExpr : public Expr> { return 
TsrExpr(array(), annotation_); } - /// immutable Block expression factory + /// makes an immutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -213,7 +213,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -229,7 +248,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -241,7 +279,22 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// immutable Block expression factory + /// 
makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -252,7 +305,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -268,7 +335,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -284,7 
+370,25 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -296,7 +400,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -307,6 +425,20 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + 
.preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index ca8027c03d..e7c781ccc6 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -619,6 +619,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { for (int repeat = 0; repeat != nrepeats; ++repeat) BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = 2 * a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -698,18 +699,69 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block_base1, F, Fixtures, F) { c.fill_local(0.0); c_base1.fill_local(0.0); + // block expressions by default have trange lobound (=base) set to 0 ... + // this is done to allow block expressions involving multiple arrays with + // different lobounds all work correctly BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); BOOST_REQUIRE(tile_ranges_match_trange(c)); BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = a("a,b,c").block({3, 3, 3}, {5, 5, 5})); BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); BOOST_REQUIRE_NO_THROW(c("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a_base1("a,b,c")); BOOST_REQUIRE(tile_ranges_match_trange(c)); BOOST_REQUIRE_NO_THROW( c_base1("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a("a,b,c")); BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + + // however user can override the trange lobound using set_trange_lobound + { + decltype(F::c) a_block; + // default trange lobound is 0 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + 
BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + (Range::index_type{0, 0, 0})); + + // this preserves tile's lobounds, so that tile {0,0,0} in a_block has + // identical range to that of tile {3, 3, 3} in a_base1 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = a_base1("a,b,c").block( + {3, 3, 3}, {5, 5, 5}, preserve_lobound)); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + a_base1.trange().make_tile_range({3, 3, 3}).lobound()); + // this explicitly makes the trange lobound of a_block to be {1,1,1} + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({1, 1, 1})); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + Range::index_type({1, 1, 1})); + // trange of source block is ignored when it is assigned to a block of an + // existing array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c").block({0, 0, 0}, {2, 2, 2}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + // overriding trange of result block is not allowed ... + BOOST_REQUIRE_THROW( + a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({0, 0, 0}) = a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0}), + Exception); + // ... 
unless it makes it the same as the trange lobound of the underlying array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({1, 1, 1}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + } } BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures, From 5391346bd645c71da4782ffd91e14ee2cc1de937 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 05:53:48 -0400 Subject: [PATCH 533/592] dox fixup [skip ci] --- src/TiledArray/tiled_range1.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 46c4b37adc..8cc830046b 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -523,9 +523,8 @@ inline TiledRange1 concat(const TiledRange1& r1, const TiledRange1& r2) { /// Test that two TiledRange1 objects are congruent /// This function tests that the tile sizes of the two ranges coincide.
-/// \tparam Range The range type -/// \param r1 an TiledRange1 object -/// \param r2 an TiledRange1 object +/// \param r1 a TiledRange1 object +/// \param r2 a TiledRange1 object inline bool is_congruent(const TiledRange1& r1, const TiledRange1& r2) { return r1.tile_extent() == r2.tile_extent() && std::equal(r1.begin(), r1.end(), r2.begin(), From 079392967e9c8e37e36c2a5c272e4e942400eade Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 06:06:37 -0400 Subject: [PATCH 534/592] introduced TiledRange::is_congruent --- src/TiledArray/tiled_range.h | 13 +++++++++++++ tests/tiled_range.cpp | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index bfcd4c86fc..fb73512560 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -423,6 +423,19 @@ inline bool operator==(const TiledRange& r1, const TiledRange& r2) { std::equal(r1.data().begin(), r1.data().end(), r2.data().begin()); } +/// Test that two TiledRange objects are congruent + +/// Two tranges are congruent if one is a translation of another (i.e. 
their +/// ranks and extents of all tiles agree) \param r1 a TiledRange object \param +/// r2 a TiledRange object +inline bool is_congruent(const TiledRange& r1, const TiledRange& r2) { + return r1.rank() == r2.rank() && + std::equal(r1.begin(), r1.end(), r2.begin(), + [](const auto& tr1_1, const auto& tr1_2) { + return is_congruent(tr1_1, tr1_2); + }); +} + inline bool operator!=(const TiledRange& r1, const TiledRange& r2) { return !operator==(r1, r2); } diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 577b395927..eb557b761f 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -119,6 +119,7 @@ BOOST_AUTO_TEST_CASE(comparison) { TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r2{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1); // self-comparison BOOST_CHECK(r1 == r2); // check equality operator BOOST_CHECK(!(r1 != r2)); // check not-equal operator BOOST_CHECK( @@ -126,6 +127,18 @@ BOOST_AUTO_TEST_CASE(comparison) { BOOST_CHECK(r1 != r3); } +BOOST_AUTO_TEST_CASE(congruency) { + TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; + TiledRange r2{{1, 3, 5, 7, 9, 11}, {2, 4, 6, 8, 10, 12}}; + TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1 && is_congruent(r1, r1)); // congruent with self + BOOST_CHECK(r1 != r2 && + is_congruent(r1, r2)); // r1 and r2 are not equal but congruent + BOOST_CHECK( + r1 != r3 && + !is_congruent(r1, r3)); // r1 and r3 are not equal and not congruent +} + BOOST_AUTO_TEST_CASE(assignment) { TiledRange r1; From d49c7bc12a8cfc00a1b28ee2a75e1b7ce9d2028b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 13 Sep 2024 06:07:29 -0400 Subject: [PATCH 535/592] like contraction, reduction expression and binary expression can ignore absolute positions if ignore_tile_position() is on --- src/TiledArray/expressions/binary_engine.h | 22 +++++++++++++++++-----
src/TiledArray/expressions/expr.h | 17 +++++++++++++++-- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 33318b57a6..486c5421a1 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -235,18 +235,30 @@ class BinaryEngine : public ExprEngine { left_.init_struct(left_indices_); right_.init_struct(right_indices_); #ifndef NDEBUG - if (left_.trange() != right_.trange()) { + if (ignore_tile_position()) { + if (!is_congruent(left_.trange(), right_.trange())) { + if (TiledArray::get_default_world().rank() == 0) { + TA_USER_ERROR_MESSAGE( + "The TiledRanges of the left- and right-hand arguments the " + "binary " + "expression are not congruent:" + << "\n left = " << left_.trange() + << "\n right = " << right_.trange()); + } + TA_EXCEPTION( + "The TiledRange objects of a binary expression are not congruent."); + } + } else if (left_.trange() != right_.trange()) { if (TiledArray::get_default_world().rank() == 0) { TA_USER_ERROR_MESSAGE( - "The TiledRanges of the left- and right-hand arguments of the " - "binary operation are not equal:" + "The TiledRanges of the left- and right-hand arguments the binary " + "expression are not equal:" << "\n left = " << left_.trange() << "\n right = " << right_.trange()); } TA_EXCEPTION( - "The TiledRanges of the left- and right-hand arguments " - "of the binary operation are not equal."); + "The TiledRange objects of a binary expression are not equal."); } #endif // NDEBUG ExprEngine_::init_struct(target_indices); diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index 8e3f925310..3b1e9f43be 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -664,7 +664,20 @@ class Expr { right_dist_eval.eval(); #ifndef NDEBUG - if (left_dist_eval.trange() != right_dist_eval.trange()) { + if (ignore_tile_position()) { + if 
(!is_congruent(left_dist_eval.trange(), right_dist_eval.trange())) { + if (TiledArray::get_default_world().rank() == 0) { + TA_USER_ERROR_MESSAGE( + "The TiledRanges of the left- and right-hand arguments the " + "binary " + "reduction are not congruent:" + << "\n left = " << left_dist_eval.trange() + << "\n right = " << right_dist_eval.trange()); + } + TA_EXCEPTION( + "The TiledRange objects of a binary reduction are not congruent."); + } + } else if (left_dist_eval.trange() != right_dist_eval.trange()) { if (TiledArray::get_default_world().rank() == 0) { TA_USER_ERROR_MESSAGE( "The TiledRanges of the left- and right-hand arguments the binary " @@ -674,7 +687,7 @@ class Expr { } TA_EXCEPTION( - "The TiledRange objects of a binary expression are not equal."); + "The TiledRange objects of a binary reduction are not equal."); } #endif // NDEBUG From 9e85c7d642968ab0b40f92c75704171c10c11306 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 14 Sep 2024 07:22:55 -0400 Subject: [PATCH 536/592] introduced TiledRange1::{lo,up}bound which feel to have unambiguous meaning --- src/TiledArray/tiled_range1.h | 12 ++++++++++++ tests/tiled_range1.cpp | 24 +++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 8cc830046b..5fbe87c64d 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -180,6 +180,18 @@ class TiledRange1 { /// \return the number of elements in the range index1_type extent() const { return TiledArray::extent(elements_range_); } + // clang-format off + /// Elements range lobound accessor + /// \return lower bound of the elements range (i.e., the smallest index in the elements range, `a` in `[a,b)`) + // clang-format on + index1_type lobound() const { return elements_range_.lobound(); } + + // clang-format off + /// Elements range upbound accessor + /// \return upper bound of the elements range (i.e., the smallest index greater than 
any in the elements range, `b` in `[a,b)`) + // clang-format on + index1_type upbound() const { return elements_range_.upbound(); } + /// Computes hashmarks /// \return the hashmarks of the tiled range, consisting of the following /// values: diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 2fe958bd2d..12b94578b5 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -32,6 +32,10 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_CHECK_EQUAL(tr1.tiles_range().second, tiles.second); BOOST_CHECK_EQUAL(tr1.elements_range().first, elements.first); BOOST_CHECK_EQUAL(tr1.elements_range().second, elements.second); + BOOST_CHECK_EQUAL(tr1.tile_extent(), tiles.second - tiles.first); + BOOST_CHECK_EQUAL(tr1.extent(), elements.second - elements.first); + BOOST_CHECK_EQUAL(tr1.lobound(), elements.first); + BOOST_CHECK_EQUAL(tr1.upbound(), elements.second); // Check individual tiles for (std::size_t i = 0; i < a.size() - 1; ++i) { @@ -43,12 +47,30 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_AUTO_TEST_CASE(range_info) { BOOST_CHECK_EQUAL(tr1.tiles_range().first, 0ul); BOOST_CHECK_EQUAL(tr1.tiles_range().second, a.size() - 1); - BOOST_CHECK_EQUAL(tr1.elements_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1.elements_range().first, a.front()); BOOST_CHECK_EQUAL(tr1.elements_range().second, a.back()); + BOOST_CHECK_EQUAL(tr1.tile_extent(), a.size() - 1); + BOOST_CHECK_EQUAL(tr1.extent(), a.back() - a.front()); + BOOST_CHECK_EQUAL(tr1.lobound(), a.front()); + BOOST_CHECK_EQUAL(tr1.upbound(), a.back()); for (std::size_t i = 0; i < a.size() - 1; ++i) { BOOST_CHECK_EQUAL(tr1.tile(i).first, a[i]); BOOST_CHECK_EQUAL(tr1.tile(i).second, a[i + 1]); } + + auto a_base1 = make_hashmarks(1); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().second, a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().first, a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().second, a_base1.back()); + 
BOOST_CHECK_EQUAL(tr1_base1.tile_extent(), a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.extent(), a_base1.back() - a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.lobound(), a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.upbound(), a_base1.back()); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(tr1_base1.tile(i).first, a_base1[i]); + BOOST_CHECK_EQUAL(tr1_base1.tile(i).second, a_base1[i + 1]); + } } BOOST_AUTO_TEST_CASE(constructor) { From f9b3f255b4b260633bb64bfe2bfed6a670c4cf8a Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 14 Sep 2024 07:24:17 -0400 Subject: [PATCH 537/592] more TsrExpr::block variants tagged by preserve_lobound_t --- src/TiledArray/expressions/tsr_expr.h | 67 ++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index 68e036f4c4..e17ee2ddfa 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -523,6 +523,24 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index1 An integral type /// \tparam Index2 An integral type /// \param lower_bound The lower_bound of the block @@ -539,8 +557,27 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + 
BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see - /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of /// the block template : public Expr> { /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block template : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// Block expression + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object From ea9c8347470f1933a681abe2640a8938a0c66d89 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 11:15:38 -0400 Subject: [PATCH 538/592] TA::expressions::preserve_lobound_t -> TA::preserve_lobound_t --- src/TiledArray/expressions/fwd.h | 12 ------------ src/TiledArray/fwd.h | 8 ++++++++ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/TiledArray/expressions/fwd.h b/src/TiledArray/expressions/fwd.h index 1d234b6dc5..e56dea8b83 100644 --- a/src/TiledArray/expressions/fwd.h +++ 
b/src/TiledArray/expressions/fwd.h @@ -42,10 +42,6 @@ class BlkTsrExpr; template class ScalBlkTsrExpr; -/// used to indicate that block tensor expression should preserve the underlying -/// tensor's trange lobound -struct preserve_lobound_t {}; - template struct is_aliased : std::true_type {}; @@ -73,12 +69,4 @@ class ScalTsrEngine; } // namespace TiledArray::expressions -namespace TiledArray { - -/// used to tag block tensor expression methods that preserve the underlying -/// tensor's trange lobound -inline constexpr expressions::preserve_lobound_t preserve_lobound; - -} // namespace TiledArray - #endif // TILEDARRAY_EXPRESSIONS_FWD_H__INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 073e8bacd3..97d91a9a00 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -207,6 +207,14 @@ struct to; } // namespace conversions +/// used to indicate that block tensor expression should preserve the underlying +/// tensor's trange lobound +struct preserve_lobound_t {}; + +/// used to tag block tensor expression methods that preserve the underlying +/// tensor's trange lobound +inline constexpr preserve_lobound_t preserve_lobound; + } // namespace TiledArray #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA From 8e580397704378ec63be8db51257bc5f6c0b5ac3 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 11:16:34 -0400 Subject: [PATCH 539/592] btas <-> ta tensor conversions work for non-0-lobound --- src/TiledArray/conversions/btas.h | 248 ++++++++++++++++++++++++------ tests/CMakeLists.txt | 1 + tests/btas.cpp | 21 ++- 3 files changed, 220 insertions(+), 50 deletions(-) diff --git a/src/TiledArray/conversions/btas.h b/src/TiledArray/conversions/btas.h index 28e5790e8f..ab07e97b53 100644 --- a/src/TiledArray/conversions/btas.h +++ b/src/TiledArray/conversions/btas.h @@ -36,6 +36,9 @@ #include #include +#include +#include + namespace TiledArray { // clang-format off @@ -49,11 +52,12 @@ namespace TiledArray { /// \tparam Storage_ The storage 
type of the source btas::Tensor object /// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, /// optionally wrapped into TiledArray::Tile) -/// \param[in] src The source object; its subblock defined by the {lower,upper} -/// bounds \c {dst.lobound(),dst.upbound()} will be copied to \c dst +/// \param[in] src The source object; its subblock +/// `{dst.lobound(),dst.upbound()}` +/// will be copied to \c dst /// \param[out] dst The object that will contain the contents of the /// corresponding subblock of src -/// \throw TiledArray::Exception When the dimensions of \c src and \c dst do not +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not /// match. // clang-format on template @@ -73,6 +77,57 @@ inline void btas_subtensor_to_tensor( dst_view = src_view; } +// clang-format off +/// Copy a block of a btas::Tensor into a TiledArray::Tensor + +/// A block of btas::Tensor \c src will be copied into TiledArray::Tensor \c +/// dst. The block dimensions will be determined by the dimensions of the range +/// of \c dst . +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the source btas::Tensor object +/// \tparam Storage_ The storage type of the source btas::Tensor object +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \param[in] src The source object; its subblock +/// `{dst.lobound() + offset,dst.upbound() + offset}` +/// will be copied to \c dst +/// \param[out] dst The object that will contain the contents of the +/// corresponding subblock of src +/// \param[out] offset the offset to be applied to the coordinates of `dst.range()` to determine the block in \p src to be copied; this is needed if the DistArray that will contain \p dst will have a range whose lobound is different from `src.lobound()` +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not +/// match. 
+// clang-format on +template < + typename T, typename Range_, typename Storage_, typename Tensor_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void btas_subtensor_to_tensor( + const btas::Tensor& src, Tensor_& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto src_blk_range = + TiledArray::BlockRange(detail::make_ta_range(src_range), + ranges::views::zip(dst_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(dst_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_blk_range); + auto dst_view = TiledArray::make_map(data(dst), dst_range); + + dst_view = src_view; +} + // clang-format off /// Copy a TiledArray::Tensor into a block of a btas::Tensor @@ -86,8 +141,8 @@ inline void btas_subtensor_to_tensor( /// \tparam Storage_ The storage type of the destination btas::Tensor object /// \param[in] src The source object whose contents will be copied into /// a subblock of \c dst -/// \param[out] dst The destination object; its subblock defined by the -/// {lower,upper} bounds \c {src.lobound(),src.upbound()} will be +/// \param[out] dst The destination object; its subblock +/// `{src.lobound(),src.upbound()}` will be /// overwritten with the content of \c src /// \throw TiledArray::Exception When the dimensions /// of \c src and \c dst do not match. @@ -109,6 +164,57 @@ inline void tensor_to_btas_subtensor(const Tensor_& src, dst_view = src_view; } +// clang-format off +/// Copy a TiledArray::Tensor into a block of a btas::Tensor + +/// TiledArray::Tensor \c src will be copied into a block of btas::Tensor +/// \c dst. 
The block dimensions will be determined by the dimensions of the range +/// of \c src . +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the destination btas::Tensor object +/// \tparam Storage_ The storage type of the destination btas::Tensor object +/// \param[in] src The source object whose contents will be copied into +/// a subblock of \c dst +/// \param[out] dst The destination object; its subblock +/// `{src.lobound()+offset,src.upbound()+offset}` will be +/// overwritten with the content of \c src +/// \param[out] offset the offset to be applied to the coordinates of `src.range()` to determine the block in \p dst to be copied; this is needed if the DistArray that contains \p src has a range whose lobound is different from `dst.lobound()` +/// \throw TiledArray::Exception When the dimensions +/// of \c src and \c dst do not match. 
+// clang-format on +template < + typename Tensor_, typename T, typename Range_, typename Storage_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void tensor_to_btas_subtensor(const Tensor_& src, + btas::Tensor& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto dst_blk_range = + TiledArray::BlockRange(detail::make_ta_range(dst_range), + ranges::views::zip(src_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(src_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_range); + auto dst_view = TiledArray::make_map(data(dst), dst_blk_range); + + dst_view = src_view; +} + namespace detail { /// Task function for converting btas::Tensor subblock to a @@ -127,7 +233,13 @@ void counted_btas_subtensor_to_tensor(const BTAS_Tensor_* src, DistArray_* dst, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - btas_subtensor_to_tensor(*src, tensor); + auto offset = ranges::views::zip(ranges::views::all(src->range().lobound()), + dst->trange().elements_range().lobound()) | + ranges::views::transform([](const auto& s_d) { + auto&& [s, d] = s_d; + return s - d; + }); + btas_subtensor_to_tensor(*src, tensor, offset); dst->set(i, tensor); (*counter)++; } @@ -137,12 +249,24 @@ void counted_btas_subtensor_to_tensor(const BTAS_Tensor_* src, DistArray_* dst, /// \tparam TA_Tensor_ a TiledArray::Tensor type /// \tparam BTAS_Tensor_ a btas::Tensor type /// \param src The source tensor -/// \param dst The destination tensor -/// \param counter The task counter -template -void 
counted_tensor_to_btas_subtensor(const TA_Tensor_& src, BTAS_Tensor_* dst, +/// \param src_array_lobound the lobound of the DistArrany that contains src, +/// used to compute the offset to be applied to the coordinates of `src.range()` +/// to determine the block in \p dst to be copied into \param dst The +/// destination tensor \param counter The task counter +template < + typename TA_Tensor_, typename BTAS_Tensor_, typename IntegerRange, + typename = std::enable_if_t>> +void counted_tensor_to_btas_subtensor(const TA_Tensor_& src, + IntegerRange src_array_lobound, + BTAS_Tensor_* dst, madness::AtomicInt* counter) { - tensor_to_btas_subtensor(src, *dst); + auto offset = ranges::views::zip(ranges::views::all(dst->range().lobound()), + src_array_lobound) | + ranges::views::transform([](const auto& d_s) { + auto&& [d, s] = d_s; + return d - s; + }); + tensor_to_btas_subtensor(src, *dst, offset); (*counter)++; } @@ -267,41 +391,14 @@ DistArray_ btas_tensor_to_array( return array; } -/// Convert a TiledArray::DistArray object into a btas::Tensor object +namespace detail { -/// This function will copy the contents of \c src into a \c btas::Tensor -/// object. The copy operation is done in parallel, and this function will block -/// until all elements of \c src have been copied into the result array tiles. -/// The size of \c src.world().size() must be equal to 1 or \c src must be a -/// replicated TiledArray::DistArray. Usage: -/// \code -/// TiledArray::TArrayD -/// array(world, trange); -/// // Set tiles of array ... -/// -/// auto t = array_to_btas_tensor(array); -/// \endcode -/// \tparam Tile the tile type of \c src -/// \tparam Policy the policy type of \c src -/// \tparam Range_ the range type of the result (either, btas::RangeNd or -/// TiledArray::Range) -/// \tparam Storage_ the storage type of the result -/// \param[in] src The TiledArray::DistArray object whose contents -/// will be copied to the result. 
-/// \return A \c btas::Tensor object that is a copy of \c src -/// \throw TiledArray::Exception When world size is greater than -/// 1 and \c src is not replicated -/// \param[in] target_rank the rank on which to create the BTAS tensor -/// containing the data of \c src ; if \c target_rank=-1 then -/// create the BTAS tensor on every rank (this requires -/// that \c src.is_replicated()==true ) -/// \return BTAS tensor object containing the data of \c src , if my rank equals -/// \c target_rank or \c target_rank==-1 , -/// default-initialized BTAS tensor otherwise. +/// \sa TiledArray::array_to_btas_tensor() template > -btas::Tensor array_to_btas_tensor( - const TiledArray::DistArray& src, int target_rank = -1) { +btas::Tensor +array_to_btas_tensor_impl(const TiledArray::DistArray& src, + const Range_& result_range, int target_rank) { // Test preconditions if (target_rank == -1 && src.world().size() > 1 && !src.pmap()->is_replicated()) @@ -314,13 +411,11 @@ btas::Tensor array_to_btas_tensor( using result_type = btas::Tensor::element_type, Range_, Storage_>; - using result_range_type = typename result_type::range_type; // Construct the result if (target_rank == -1 || src.world().rank() == target_rank) { // if array is sparse must initialize to zero - result_type result( - result_range_type(src.trange().elements_range().extent()), 0.0); + result_type result(result_range, 0.0); // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; @@ -329,8 +424,12 @@ btas::Tensor array_to_btas_tensor( for (std::size_t i = 0; i < src.size(); ++i) { if (!src.is_zero(i)) { src.world().taskq.add( - &detail::counted_tensor_to_btas_subtensor, - src.find(i), &result, &counter); + &detail::counted_tensor_to_btas_subtensor< + Tile, result_type, + std::decay_t< + decltype(src.trange().elements_range().lobound())>>, + src.find(i), src.trange().elements_range().lobound(), &result, + &counter); ++n; } } @@ -343,6 +442,59 @@ btas::Tensor array_to_btas_tensor( return 
result_type{}; } +} // namespace detail + +/// Convert a TiledArray::DistArray object into a btas::Tensor object + +/// This function will copy the contents of \c src into a \c btas::Tensor +/// object. The copy operation is done in parallel, and this function will block +/// until all elements of \c src have been copied into the result array tiles. +/// The size of \c src.world().size() must be equal to 1 or \c src must be a +/// replicated TiledArray::DistArray. Usage: +/// \code +/// TiledArray::TArrayD +/// array(world, trange); +/// // Set tiles of array ... +/// +/// auto t = array_to_btas_tensor(array); +/// \endcode +/// \tparam Tile the tile type of \c src +/// \tparam Policy the policy type of \c src +/// \tparam Range_ the range type of the result (either, btas::RangeNd or +/// TiledArray::Range) +/// \tparam Storage_ the storage type of the result +/// \param[in] src The TiledArray::DistArray object whose contents +/// will be copied to the result. +/// \param[in] target_rank the rank on which to create the BTAS tensor +/// containing the data of \c src ; if \c target_rank=-1 then +/// create the BTAS tensor on every rank (this requires +/// that \c src.is_replicated()==true ) +/// \return BTAS tensor object containing the data of \c src , if my rank equals +/// \c target_rank or \c target_rank==-1 , +/// default-initialized BTAS tensor otherwise. +/// \warning The range of \c src is +/// not preserved, i.e. the lobound of the result is zero. Use the +/// variant of this function tagged with preserve_lobound_t to +/// preserve the range. 
+/// \throw TiledArray::Exception When world size is greater than +/// 1 and \c src is not replicated +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, int target_rank = -1) { + return detail::array_to_btas_tensor_impl( + src, Range_(src.trange().elements_range().extent()), target_rank); +} + +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, preserve_lobound_t, + int target_rank = -1) { + return detail::array_to_btas_tensor_impl(src, src.trange().elements_range(), + target_rank); +} + } // namespace TiledArray #endif // TILEDARRAY_CONVERSIONS_BTAS_H__INCLUDED diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 823e13bec8..85d30d7728 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ set(ta_test_src_files ta_test.cpp einsum.cpp linalg.cpp cp.cpp + btas.cpp ) if(CUDA_FOUND OR HIP_FOUND) diff --git a/tests/btas.cpp b/tests/btas.cpp index a31329a80d..9c15540e9a 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -324,8 +324,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { // make tiled range using trange1_t = TiledArray::TiledRange1; - TiledArray::TiledRange trange( - {trange1_t(0, 10, 20), trange1_t(0, 11, 22), trange1_t(0, 12, 24)}); + TiledArray::TiledRange trange({trange1_t(0, 10, 20), + trange1_t(0, 11, 22).inplace_shift(1), + trange1_t(0, 12, 24).inplace_shift(2)}); // convert to a replicated DistArray using T = typename bTensor::value_type; @@ -371,6 +372,22 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { BOOST_CHECK(src_copy == btas::Tensor{}); } } + + // convert the replicated DistArray back to a btas::Tensor while preserving + // the DistArray range + { + btas::Tensor src_copy; + BOOST_REQUIRE_NO_THROW( + src_copy = array_to_btas_tensor(dst, TiledArray::preserve_lobound)); + BOOST_CHECK(ranges::equal(src_copy.range().lobound(), + dst.trange().elements_range().lobound())); + for (const 
auto& i : src.range()) { + auto i_copy = i; + i_copy[1] += 1; + i_copy[2] += 2; + BOOST_CHECK_EQUAL(src(i), src_copy(i_copy)); + } + } } BOOST_AUTO_TEST_CASE_TEMPLATE(sparse_array_conversion, bTensor, tensor_types) { From 548d5ea7ed51d5817cbe873b6f18b219b7739600 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 15 Sep 2024 23:40:47 -0400 Subject: [PATCH 540/592] introduced member versions of TiledRange1::make_uniform --- src/TiledArray/tiled_range1.h | 15 +++++++++++++++ tests/tiled_range1.cpp | 8 ++++++++ 2 files changed, 23 insertions(+) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 5fbe87c64d..e78e647c10 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -318,6 +318,21 @@ class TiledRange1 { return make_uniform(Range1(0, range_extent), target_tile_size); } + /// same as make_uniform(const Range1&, std::size_t), using the element_range + /// of this TiledRange1 + TiledRange1 make_uniform(std::size_t target_tile_size) const { + return make_uniform(this->elements_range(), target_tile_size); + } + + /// make as uniformly-tiled range as possible out of this TiledRange1, with + /// the same number of tiles as this + TiledRange1 make_uniform() const { + return make_uniform( + this->elements_range(), + (this->elements_range().extent() + this->tile_extent() - 1) / + this->tile_extent()); + } + /// shifts this TiledRange1 /// @param[in] shift the shift to apply diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 12b94578b5..39bd7fa7c4 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -380,6 +380,14 @@ BOOST_AUTO_TEST_CASE(make_uniform) { BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(59, 10)); BOOST_CHECK(TiledRange1::make_uniform(59, 10) == (TiledRange1{0, 10, 20, 30, 40, 50, 59})); + + // member versions + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30))); + BOOST_CHECK((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30) == + 
TiledRange1{0, 25, 50})); + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 40, 50}.make_uniform())); + BOOST_CHECK( + (TiledRange1{0, 40, 50}.make_uniform() == TiledRange1{0, 25, 50})); } BOOST_AUTO_TEST_CASE(shift) { From 49573616b020aa643dbce3571b55fb74eaa79419 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:40:43 -0400 Subject: [PATCH 541/592] TiledRange1::make_uniform(rng) with empty range preserves its lobound --- src/TiledArray/tiled_range1.h | 2 +- tests/tiled_range1.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index e78e647c10..aa75916442 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -308,7 +308,7 @@ class TiledRange1 { hashmarks.push_back(range.upbound()); return TiledRange1(hashmarks.begin(), hashmarks.end()); } else - return TiledRange1{}; + return TiledRange1{range.lobound()}; } /// same as make_uniform(const Range1&, std::size_t) for a 0-based range diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 39bd7fa7c4..947142f6dc 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -350,8 +350,10 @@ BOOST_AUTO_TEST_CASE(concatenation) { } BOOST_AUTO_TEST_CASE(make_uniform) { + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{0, 0}, 0)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{0, 0}, 0) == TiledRange1{}); BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{1, 1}, 0)); - BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{}); + BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{1}); BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{3, 6}, 10)); BOOST_CHECK(TiledRange1::make_uniform(Range1{3, 6}, 10) == (TiledRange1{3, 6})); From bb1e83a5d9c390bae089c16f8bf35fedeb1b3023 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:41:22 -0400 Subject: [PATCH 542/592] heig: work around the n=0 corner case --- 
src/TiledArray/math/linalg/rank-local.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index d23f3b4e3f..6db050ee5c 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -121,6 +121,7 @@ void heig(Matrix& A, std::vector>& W) { integer lda = A.rows(); W.resize(n); auto* w = W.data(); + if (n == 0) return; if constexpr (TiledArray::detail::is_complex_v) TA_LAPACK(heev, jobz, uplo, n, a, lda, w); else @@ -140,6 +141,7 @@ void heig(Matrix& A, Matrix& B, integer ldb = B.rows(); W.resize(n); auto* w = W.data(); + if (n == 0) return; if constexpr (TiledArray::detail::is_complex_v) TA_LAPACK(hegv, itype, jobz, uplo, n, a, lda, b, ldb, w); else From 6429585e8646926edd5b2cf86bb35d1d0c0f9a08 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 17 Sep 2024 11:52:20 -0400 Subject: [PATCH 543/592] bump MAD tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/547 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 96e7259ed5..db11ed24df 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,7 +43,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 96ac90e8f193ccfaf16f346b4652927d2d362e75 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . 
Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index a005bcdec5..87804775f9 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 3d0ae2fad1b97e347ca6dd98b9f1b9e74e629f52) +set(TA_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From 28b1966723664c6fdd8023843319a8d68b4fabc7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 00:10:14 -0400 Subject: [PATCH 544/592] BinaryExpr: account for ignore_tile_position when checking preconditions --- src/TiledArray/dist_eval/binary_eval.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index 62bbdb64ce..87cce91656 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -107,7 +107,10 @@ class BinaryEvalImpl : public DistEvalImpl, right_ntiles_discarded_(0) #endif { - TA_ASSERT(left.trange() == right.trange()); + TA_ASSERT(ignore_tile_position() + ? 
left.trange().elements_range().extent() == + right.trange().elements_range().extent() + : left.trange() == right.trange()); } virtual ~BinaryEvalImpl() {} From 9a3594a3d51dccbf79b202dea500a89eacc4c8ee Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 17:09:24 -0400 Subject: [PATCH 545/592] [python] simplify make_trange by using TiledRange1::make_uniform --- python/src/TiledArray/python/trange.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/python/src/TiledArray/python/trange.h b/python/src/TiledArray/python/trange.h index 488421291d..8c008c1fa9 100644 --- a/python/src/TiledArray/python/trange.h +++ b/python/src/TiledArray/python/trange.h @@ -45,7 +45,6 @@ auto list(const TiledRange &trange) { return v; } -// template<> inline TiledRange make_trange(std::vector > trange) { std::vector trange1; for (auto tr : trange) { @@ -58,11 +57,7 @@ inline TiledRange make_trange(std::vector > trange) { inline TiledRange make_trange(std::vector shape, size_t block) { std::vector trange1; for (size_t i = 0; i < shape.size(); ++i) { - std::vector tr1; - for (size_t j = 0; j <= (shape[i] + block - 1); j += block) { - tr1.push_back(std::min(j, shape[i])); - } - trange1.push_back(TiledRange1(tr1.begin(), tr1.end())); + trange1.emplace_back(TiledRange1::make_uniform(shape[i], block)); } return TiledRange(trange1.begin(), trange1.end()); } From 6e7e2508edfe7fe9c3bf7e505ec93bc941b2a503 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 18 Sep 2024 19:21:45 -0400 Subject: [PATCH 546/592] [ci] greatly reduce the gitlab matrix, replace rel/deb builds with relwithdebinfo --- .gitlab-ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b57a210430..02c3edc266 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,22 +57,22 @@ ubuntu: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:20.04" ] CXX: [ g++ ] - 
BUILD_TYPE : [ "Release" ] + BUILD_TYPE : [ "RelWithDebInfo" ] BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++, clang++-13 ] - BUILD_TYPE : [ "Release", "Debug" ] + BUILD_TYPE : [ "RelWithDebInfo" ] ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++ ] - BUILD_TYPE : [ "Release", "Debug" ] + BUILD_TYPE : [ "RelWithDebInfo" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] From 18bc8f48547e7c2727b492e48b50e9a0acbb6851 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 22 Sep 2024 06:32:39 -0400 Subject: [PATCH 547/592] TA::host_allocator is serializable, so that btas::Tensor can be used as a tile again --- src/CMakeLists.txt | 2 - src/TiledArray/device/allocators.h | 138 ---------------------------- src/TiledArray/device/um_storage.cu | 2 +- src/TiledArray/device/um_storage.h | 2 +- src/TiledArray/external/device.h | 15 ++- src/TiledArray/external/umpire.h | 83 ++++++++++++++++- src/TiledArray/fwd.h | 32 ++++--- src/TiledArray/host/allocator.h | 78 ---------------- src/TiledArray/host/env.h | 10 ++ src/TiledArray/tensor/tensor.h | 3 +- 10 files changed, 127 insertions(+), 238 deletions(-) delete mode 100644 src/TiledArray/device/allocators.h delete mode 100644 src/TiledArray/host/allocator.h diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c426d1ffbe..3d6b94ea9a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -134,7 +134,6 @@ TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h 
TiledArray/host/env.h -TiledArray/host/allocator.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h TiledArray/math/outer.h @@ -223,7 +222,6 @@ if(CUDA_FOUND OR HIP_FOUND) TiledArray/device/kernel/thrust/reduce_kernel.h TiledArray/device/platform.h TiledArray/device/thrust.h - TiledArray/device/allocators.h TiledArray/device/um_storage.h) if(CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES diff --git a/src/TiledArray/device/allocators.h b/src/TiledArray/device/allocators.h deleted file mode 100644 index 2bda79e768..0000000000 --- a/src/TiledArray/device/allocators.h +++ /dev/null @@ -1,138 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED -#define TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_DEVICE - -#include -#include - -#include - -#include -#include - -namespace TiledArray { - -template -class umpire_based_allocator - : public umpire_based_allocator_impl { - public: - using base_type = umpire_based_allocator_impl; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} - - template - umpire_based_allocator( - const umpire_based_allocator& - rhs) noexcept - : base_type( - static_cast&>( - rhs)) {} - - template - friend bool operator==( - const umpire_based_allocator& - lhs, - const umpire_based_allocator& - rhs) noexcept; -}; // class umpire_based_allocator - -template -bool operator==( - const umpire_based_allocator& lhs, - const umpire_based_allocator& - rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=( - const umpire_based_allocator& lhs, - const umpire_based_allocator& - rhs) noexcept { - return !(lhs == rhs); -} - -namespace detail { - -struct get_um_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->um_allocator(); - } -}; - -struct get_pinned_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->pinned_allocator(); - } -}; - -} // namespace detail - -} // namespace TiledArray - -namespace madness { -namespace archive { - -template -struct ArchiveLoadImpl> { - static inline void load( - const Archive& ar, - TiledArray::umpire_based_allocator& allocator) { - allocator = TiledArray::umpire_based_allocator{}; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void 
store( - const Archive& ar, - const TiledArray::umpire_based_allocator< - T, StaticLock, UmpireAllocatorAccessor>& allocator) {} -}; - -} // namespace archive -} // namespace madness - -#endif // TILEDARRAY_HAS_DEVICE - -#endif // TILEDARRAY_DEVICE_ALLOCATORS_H___INCLUDED diff --git a/src/TiledArray/device/um_storage.cu b/src/TiledArray/device/um_storage.cu index cc3a1aae55..8879c246f8 100644 --- a/src/TiledArray/device/um_storage.cu +++ b/src/TiledArray/device/um_storage.cu @@ -22,7 +22,7 @@ */ -#include +#include #include #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h index d151a3c316..d91c032312 100644 --- a/src/TiledArray/device/um_storage.h +++ b/src/TiledArray/device/um_storage.h @@ -24,7 +24,7 @@ #ifndef TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED #define TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED -#include +#include #ifdef TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 38bcbbc745..597643b225 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -798,9 +798,22 @@ class Env { static std::unique_ptr instance_{nullptr}; return instance_; } -}; +}; // class Env namespace detail { + +struct get_um_allocator { + umpire::Allocator& operator()() { + return deviceEnv::instance()->um_allocator(); + } +}; + +struct get_pinned_allocator { + umpire::Allocator& operator()() { + return deviceEnv::instance()->pinned_allocator(); + } +}; + // in a madness device task point to its local optional stream to use by // madness_task_stream_opt; set to nullptr after task callable finished inline std::optional*& madness_task_stream_opt_ptr_accessor() { diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index e8d0d48632..ac23a60260 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -156,6 +156,54 @@ bool operator!=( return !(lhs == rhs); } +template +class 
umpire_based_allocator + : public umpire_based_allocator_impl { + public: + using base_type = umpire_based_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; + + umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + + template + umpire_based_allocator( + const umpire_based_allocator& + rhs) noexcept + : base_type( + static_cast&>( + rhs)) {} + + template + friend bool operator==( + const umpire_based_allocator& + lhs, + const umpire_based_allocator& + rhs) noexcept; +}; // class umpire_based_allocator + +template +bool operator==( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { + return !(lhs == rhs); +} + /// see /// https://stackoverflow.com/questions/21028299/is-this-behavior-of-vectorresizesize-type-n-under-c11-and-boost-container/21028912#21028912 template @@ -202,7 +250,7 @@ struct ArchiveLoadImpl& allocator) { std::string allocator_name; - ar& allocator_name; + ar & allocator_name; allocator = TiledArray::umpire_based_allocator_impl( umpire::ResourceManager::getInstance().getAllocator(allocator_name)); } @@ -214,7 +262,7 @@ struct ArchiveStoreImpl< static inline void store( const Archive& ar, const TiledArray::umpire_based_allocator_impl& allocator) { - ar& allocator.umpire_allocator()->getName(); + ar & allocator.umpire_allocator()->getName(); } }; @@ -224,7 +272,7 @@ struct ArchiveLoadImpl> { TiledArray::default_init_allocator& allocator) { if constexpr (!std::allocator_traits::is_always_equal::value) { A base_allocator; - ar& base_allocator; + ar & base_allocator; allocator = TiledArray::default_init_allocator(base_allocator); } } @@ -244,4 +292,33 
@@ struct ArchiveStoreImpl> { } // namespace archive } // namespace madness +namespace madness { +namespace archive { + +template +struct ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::umpire_based_allocator& allocator) { + allocator = TiledArray::umpire_based_allocator{}; + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::umpire_based_allocator< + T, StaticLock, UmpireAllocatorAccessor>& allocator) {} +}; + +} // namespace archive +} // namespace madness + #endif // TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 97d91a9a00..6127db32f3 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -36,12 +36,27 @@ class aligned_allocator; // fwddecl host_allocator namespace TiledArray { -template -class host_allocator_impl; -template +namespace detail { +struct get_host_allocator; +struct NullLock; +template +class MutexLock; +} // namespace detail + +template +class umpire_based_allocator; + +template > class default_init_allocator; + +class hostEnv; + +/// pooled thread-safe host memory allocator template -using host_allocator = default_init_allocator>; +using host_allocator = + default_init_allocator, + detail::get_host_allocator>>; } // namespace TiledArray namespace madness { @@ -87,18 +102,9 @@ class Env; } using deviceEnv = device::Env; -template -class umpire_based_allocator; - -template > -class default_init_allocator; - namespace detail { struct get_um_allocator; struct get_pinned_allocator; -struct NullLock; -template -class MutexLock; } // namespace detail /// pooled thread-safe unified memory (UM) allocator for device computing diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h deleted file mode 100644 index a22613fb38..0000000000 --- a/src/TiledArray/host/allocator.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * This file is a part of TiledArray. 
- * Copyright (C) 2021 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED - -#include - -#include -#include - -#include - -#include -#include - -namespace TiledArray { - -/// pooled, thread-safe allocator for host memory -template -class host_allocator_impl - : public umpire_based_allocator_impl> { - public: - using base_type = umpire_based_allocator_impl>; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - host_allocator_impl() noexcept - : base_type(&hostEnv::instance()->host_allocator()) {} - - template - host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast>&>(rhs)) {} - - template - friend bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept; -}; // class host_allocator_impl - -template -bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return !(lhs == rhs); 
-} - -} // namespace TiledArray - -#endif // TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 1b3c4f277f..be1de5369c 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -148,6 +148,16 @@ class hostEnv { } }; +namespace detail { + +struct get_host_allocator { + umpire::Allocator& operator()() { + return hostEnv::instance()->host_allocator(); + } +}; + +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_HOST_ENV_H__INCLUDED diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 12479ef53c..171dac2eea 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -22,7 +22,8 @@ #include "TiledArray/config.h" -#include "TiledArray/host/allocator.h" +#include "TiledArray/external/umpire.h" +#include "TiledArray/host/env.h" #include "TiledArray/math/blas.h" #include "TiledArray/math/gemm_helper.h" From 5d35fb414a48856625f781f152ae79e1aabf3343 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 22 Sep 2024 06:34:00 -0400 Subject: [PATCH 548/592] introduced TA::Tile::at_ordinal + strengthen disambiguation checks for potential at_ordinal uses --- src/TiledArray/tensor/tensor.h | 20 ++++++++++--- src/TiledArray/tile.h | 52 ++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 171dac2eea..bf729e59d9 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -705,7 +705,7 @@ class Tensor { const_reference operator[](const Ordinal ord) const { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) 
and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -726,7 +726,7 @@ class Tensor { reference operator[](const Ordinal ord) { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -848,7 +848,7 @@ class Tensor { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -869,7 +869,7 @@ class Tensor { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -960,6 +960,12 @@ class Tensor { const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) 
and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -982,6 +988,12 @@ class Tensor { reference operator()(const Index&... i) { TA_ASSERT(!this->empty()); TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index b8c62d95b8..39fca37d9e 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -250,6 +250,11 @@ class Tile { std::enable_if_t::value>* = nullptr> const_reference operator[](const Ordinal ord) const { TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; } @@ -264,6 +269,41 @@ class Tile { template ::value>* = nullptr> reference operator[](const Ordinal ord) { + TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) 
and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Const element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Const reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + const_reference at_ordinal(const Ordinal ord) const { + TA_ASSERT(pimpl_); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + reference at_ordinal(const Ordinal ord) { TA_ASSERT(pimpl_); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; @@ -401,6 +441,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } @@ -417,6 +463,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... 
i) { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } From 793b7c4057350f0309f63b47ba82ff933310cdf7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 07:39:35 -0400 Subject: [PATCH 549/592] hostEnv -> host::Env + fixup to make f294db31bea86d08b8d875d218f24c65221dca76 build --- src/CMakeLists.txt | 7 ++--- src/TiledArray/external/device.h | 40 +++++++++++++++++---------- src/TiledArray/fwd.h | 5 +++- src/TiledArray/host/env.cpp | 36 ++++++++++++++++++++++++ src/TiledArray/host/env.h | 47 ++++++++++++++++---------------- 5 files changed, 92 insertions(+), 43 deletions(-) create mode 100644 src/TiledArray/host/env.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d6b94ea9a..80f2a49710 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -133,6 +133,7 @@ TiledArray/expressions/index_list.h TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h +TiledArray/host/env.cpp TiledArray/host/env.h TiledArray/math/blas.h TiledArray/math/gemm_helper.h @@ -206,11 +207,7 @@ TiledArray/util/vector.h if(HIP_FOUND OR CUDA_FOUND) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/device.h - TiledArray/external/librett.h) -endif() - -if(CUDA_FOUND OR HIP_FOUND) - list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/librett.h TiledArray/device/blas.cpp TiledArray/device/blas.h TiledArray/device/btas.h diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 597643b225..4f9d365e0a 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -41,8 +41,6 @@ 
#include #endif -#include - #include #include #include @@ -51,6 +49,20 @@ #include #include +#include + +namespace TiledArray::detail { + +struct get_um_allocator { + inline umpire::Allocator& operator()(); +}; + +struct get_pinned_allocator { + inline umpire::Allocator& operator()(); +}; + +} // namespace TiledArray::detail + #if defined(TILEDARRAY_HAS_CUDA) inline void __DeviceSafeCall(cudaError err, const char* file, const int line) { @@ -802,18 +814,6 @@ class Env { namespace detail { -struct get_um_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->um_allocator(); - } -}; - -struct get_pinned_allocator { - umpire::Allocator& operator()() { - return deviceEnv::instance()->pinned_allocator(); - } -}; - // in a madness device task point to its local optional stream to use by // madness_task_stream_opt; set to nullptr after task callable finished inline std::optional*& madness_task_stream_opt_ptr_accessor() { @@ -905,6 +905,18 @@ device::Stream stream_for(const Range& range) { } // namespace device +namespace detail { + +inline umpire::Allocator& get_um_allocator::operator()() { + return deviceEnv::instance()->um_allocator(); +} + +inline umpire::Allocator& get_pinned_allocator::operator()() { + return deviceEnv::instance()->pinned_allocator(); +} + +} // namespace detail + #endif // TILEDARRAY_HAS_DEVICE #ifdef TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 6127db32f3..652b835fab 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -49,7 +49,10 @@ class umpire_based_allocator; template > class default_init_allocator; -class hostEnv; +namespace host { +class Env; +} +using hostEnv = host::Env; /// pooled thread-safe host memory allocator template diff --git a/src/TiledArray/host/env.cpp b/src/TiledArray/host/env.cpp new file mode 100644 index 0000000000..16d3a71a50 --- /dev/null +++ b/src/TiledArray/host/env.cpp @@ -0,0 +1,36 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2021 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#include + +namespace TiledArray { + +namespace detail { + +umpire::Allocator& get_host_allocator::operator()() { + return TiledArray::host::Env::instance()->host_allocator(); +} + +} // namespace detail + +} // namespace TiledArray diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index be1de5369c..b469704a72 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -41,24 +41,34 @@ namespace TiledArray { +namespace detail { + +struct get_host_allocator { + umpire::Allocator& operator()(); +}; + +} // namespace detail + +namespace host { + /** - * hostEnv maintains the (host-side, as opposed to device-side) environment, + * Env maintains the (host-side, as opposed to device-side) environment, * such as memory allocators * * \note this is a Singleton */ -class hostEnv { +class Env { public: - ~hostEnv() = default; + ~Env() = default; - hostEnv(const hostEnv&) = delete; - hostEnv(hostEnv&&) = delete; - hostEnv& operator=(const hostEnv&) = delete; - hostEnv& operator=(hostEnv&&) = delete; + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; /// access the singleton instance; if not initialized will be - /// initialized 
via hostEnv::initialize() with the default params - static std::unique_ptr& instance() { + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { if (!instance_accessor()) { initialize(); } @@ -103,8 +113,7 @@ class hostEnv { "QuickPool_SizeLimited_HOST", host_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto host_env = - std::unique_ptr(new hostEnv(world, host_dynamic_pool)); + auto host_env = std::unique_ptr(new Env(world, host_dynamic_pool)); instance_accessor() = std::move(host_env); } } @@ -131,7 +140,7 @@ class hostEnv { } protected: - hostEnv(World& world, umpire::Allocator host_alloc) + Env(World& world, umpire::Allocator host_alloc) : world_(&world), host_allocator_(host_alloc) {} private: @@ -142,21 +151,13 @@ class hostEnv { // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator host_allocator_; - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; return instance_; } }; -namespace detail { - -struct get_host_allocator { - umpire::Allocator& operator()() { - return hostEnv::instance()->host_allocator(); - } -}; - -} // namespace detail +} // namespace host } // namespace TiledArray From 28323a087ecd632cfabf4faa2637177eccdb2217 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 15:43:49 -0400 Subject: [PATCH 550/592] [ci] do not use gcc toolchain on macos, instead try linux + enable ccache --- .github/workflows/ci.yml | 55 +++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2339070e54..8e71db9403 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,16 +12,26 @@ jobs: strategy: fail-fast: false matrix: - os : [ macos-latest ] - cxx : [ 
clang++, /opt/homebrew/bin/g++-11 ] + os : [ macos-latest, ubuntu-22.04 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] - prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ] + include: + - os: ubuntu-22.04 + cc: /usr/bin/gcc-12 + cxx: /usr/bin/g++-12 + - os: macos-latest + cc: clang + cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }} ${{ matrix.task_backend }}" runs-on: ${{ matrix.os }} env: CXX : ${{ matrix.cxx }} + CCACHE_DIR : ${{github.workspace}}/build/.ccache + CCACHE_COMPRESS : true + CCACHE_COMPRESSLEVEL : 6 + OMPI_MCA_btl_vader_single_copy_mechanism : none + PARSEC_MCA_runtime_bind_threads : 0 BUILD_CONFIG : > -DMADNESS_TASK_BACKEND=${{ matrix.task_backend }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -33,18 +43,40 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: 'latest-stable' - - name: Host system info shell: bash run: cmake -P ${{github.workspace}}/ci/host_system_info.cmake - - name: Install ${{matrix.prerequisites}} + + - name: Install prerequisite MacOS packages + if: ${{ matrix.os == 'macos-latest' }} + run: | + brew install ninja boost eigen open-mpi bison scalapack ccache + echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV + + - name: Install prerequisites Ubuntu packages + if: ${{ matrix.os == 'ubuntu-22.04' }} run: | - brew install ${{matrix.prerequisites}} - echo "/usr/local/opt/bison/bin" >> $GITHUB_PATH + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null + sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" + sudo apt-get update + sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-openmpi-dev cmake doxygen + echo 
"MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV + + - name: Prepare ccache timestamp + id: ccache_cache_timestamp + shell: cmake -P {0} + run: | + string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) + message("::set-output name=timestamp::${current_date}") + + - name: Setup ccache cache files + uses: actions/cache@v1.1.0 + with: + path: ${{github.workspace}}/build/.ccache + key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }} + restore-keys: | + ${{ matrix.config.name }}-ccache- - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash @@ -56,8 +88,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - cmake --build . --target tiledarray - cmake --build . --target examples + ccache -p && ccache -z && cmake --build . --target tiledarray && cmake --build . --target examples && ccache -s - name: Test working-directory: ${{github.workspace}}/build From 91130e1f40f86d021655789054264038a8194d4f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:02:46 -0400 Subject: [PATCH 551/592] [ci] try symlinking libscalapack-openmpi.so to libscalapack.so to help out FindReferenceSCALAPACK --- .github/workflows/ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8e71db9403..143c88f8ea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -60,7 +60,8 @@ jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-openmpi-dev cmake doxygen + sudo apt-get -y install ninja-build g++-12 liblapack-dev 
libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen + sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - name: Prepare ccache timestamp @@ -82,7 +83,7 @@ jobs: shell: bash run: | set -x; - cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) + cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml) - name: Build working-directory: ${{github.workspace}}/build From cd2f48f6780337852937cce73d77a6194cb99899 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:48:02 -0400 Subject: [PATCH 552/592] [unit] [cuda] another disambiguation via at_ordinal --- tests/expressions_device_um.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/expressions_device_um.cpp b/tests/expressions_device_um.cpp index e624756561..d49b425372 100644 --- a/tests/expressions_device_um.cpp +++ b/tests/expressions_device_um.cpp @@ -85,7 +85,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { template static Tile make_rand_tile(const typename TA::Range& r) { Tile tile(r); - for (std::size_t i = 0ul; i < tile.size(); ++i) set_random(tile[i]); + for (std::size_t i = 0ul; i < tile.size(); ++i) + set_random(tile.at_ordinal(i)); return tile; } From 2cc06291ce94d72d142167a91ef81fe8ce9dc8df Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 16:48:25 -0400 Subject: [PATCH 553/592] [ci] disable Gitlab jobs except CUDA --- .gitlab-ci.yml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 02c3edc266..8b675a692c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -57,22 +57,10 @@ ubuntu: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:20.04" ] 
- CXX: [ g++ ] - BUILD_TYPE : [ "RelWithDebInfo" ] - BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] - BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] - # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL - RUNNER_TAGS: [ saas-linux-small-amd64 ] - - IMAGE : [ "ubuntu:22.04" ] - CXX: [ g++, clang++-13 ] - BUILD_TYPE : [ "RelWithDebInfo" ] - ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - RUNNER_TAGS: [ saas-linux-small-amd64 ] - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++ ] BUILD_TYPE : [ "RelWithDebInfo" ] + TA_PYTHON : [ "TA_PYTHON=OFF" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] From 8bea07822f5df7a01a1b10345823ed042a0c7e17 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:42:56 -0400 Subject: [PATCH 554/592] introduce {Tensor,Tile}::c{begin,end} --- src/TiledArray/tensor/tensor.h | 28 ++++++++++++++++++++++++++-- src/TiledArray/tile.h | 20 ++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index bf729e59d9..bd72af487c 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1003,7 +1003,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the first data element + /// \return A const iterator to the first data element const_iterator begin() const { return (this->data() ? this->data() : NULL); } /// Iterator factory @@ -1013,7 +1013,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the last data element + /// \return A const iterator to the last data element const_iterator end() const { return (this->data() ? this->data() + this->size() : NULL); } @@ -1023,6 +1023,30 @@ class Tensor { /// \return An iterator to the last data element iterator end() { return (this->data() ? 
this->data() + this->size() : NULL); } + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() const { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() const { + return (this->data() ? this->data() + this->size() : NULL); + } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() { + return (this->data() ? this->data() + this->size() : NULL); + } + /// Read-only access to the data /// \return A const pointer to the tensor data diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 39fca37d9e..90f7366bbc 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -201,6 +201,26 @@ class Tile { /// \return A const iterator to the last data element decltype(auto) end() const { return std::end(tensor()); } + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() const { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() { return std::cend(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() const { return std::cend(tensor()); } + // Data accessor ------------------------------------------------------- /// Data direct access From d5d6b6f94e459b60290eeba7dbc704451e5ae377 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:44:21 -0400 Subject: [PATCH 555/592] TensorInterface is a range --- src/TiledArray/tensor/tensor_interface.h | 
76 ++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index 6ba8f0430e..c5e103dad9 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -276,6 +276,82 @@ class TensorInterface { return data_[range_.ordinal(idx...)]; } + /// \brief Tensor interface iterator type + /// + /// Iterates over elements of a tensor interface whose range is iterable + template + class Iterator : public boost::iterator_facade< + Iterator, + std::conditional_t, + const typename TI::value_type, + typename TI::value_type>, + boost::forward_traversal_tag> { + public: + using range_iterator = typename TI::range_type::const_iterator; + + Iterator(range_iterator idx_it, TI& ti) : idx_it(idx_it), ti(ti) {} + + private: + range_iterator idx_it; + TI& ti; + + friend class boost::iterator_core_access; + + /// \brief increments this iterator + void increment() { ++idx_it; } + + /// \brief Iterator comparer + /// \return true, if \c `*this==*other` + bool equal(Iterator const& other) const { + return this->idx_it == other.idx_it; + } + + /// \brief dereferences this iterator + /// \return const reference to the current index + auto& dereference() const { + return ti.at_ordinal(ti.range().ordinal(*idx_it)); + } + }; + friend class Iterator; + friend class Iterator; + + typedef Iterator iterator; ///< Iterator type + typedef Iterator const_iterator; ///< Iterator type + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator begin() const { + return const_iterator(range().begin(), *this); + } + + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator end() const { return const_iterator(range().end(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator begin() 
{ return iterator(range().begin(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator end() { return iterator(range().end(), *this); } + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator cbegin() const { + return const_iterator(range().begin(), *this); + } + + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator cend() const { return const_iterator(range().end(), *this); } + /// Check for empty view /// \return \c false From 469ae8ae6e079406db2ffd6bb09a7d41fa57665d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:44:41 -0400 Subject: [PATCH 556/592] dox fixup --- src/TiledArray/range1.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index 8b185936d4..a29e0d607c 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -163,8 +163,6 @@ struct Range1 { /// \return An iterator that points to the beginning of the local element set const_iterator cend() const { return end(); } - /// @} - /// shifts this Range1 /// @param[in] shift the shift to apply From 93b96a32ee5e512ab87f17876ec4ea1470b4c40d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 23 Sep 2024 20:46:12 -0400 Subject: [PATCH 557/592] btas::Tensor can be copied into from Tensor and TensorInterface ... 
this allows TA::retile on DistArrays of btas::Tensors --- src/TiledArray/external/btas.h | 7 +++++++ tests/btas.cpp | 15 +++++++++++++++ tests/expressions_btas.cpp | 2 ++ 3 files changed, 24 insertions(+) diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index fe84e6f0c6..c22afd3813 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -62,6 +62,13 @@ class boxrange_iteration_order { static constexpr int value = row_major; }; +template +class is_tensor> : public std::true_type {}; + +template +class is_tensor> + : public std::true_type {}; + } // namespace btas namespace TiledArray { diff --git a/tests/btas.cpp b/tests/btas.cpp index 9c15540e9a..4e972cfc28 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -256,6 +256,21 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { BOOST_REQUIRE_NO_THROW(Tensor t1 = t0); Tensor t1 = t0; BOOST_CHECK(t1.empty()); + + // can copy TA::Tensor to btas::Tensor + TA::Tensor ta_tensor(r); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor)); + Tensor t2(ta_tensor); + for (auto i : r) { + BOOST_CHECK_EQUAL(ta_tensor(i), t2(i)); + } + + // can copy TA::TensorInterface to btas::Tensor + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(r.lobound(), r.upbound()))); + Tensor t3(ta_tensor.block(r.lobound(), r.upbound())); + for (auto i : r) { + BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + } } BOOST_AUTO_TEST_CASE_TEMPLATE(copy, Array, array_types) { diff --git a/tests/expressions_btas.cpp b/tests/expressions_btas.cpp index 83ff4b1ed0..7b1ae422ce 100644 --- a/tests/expressions_btas.cpp +++ b/tests/expressions_btas.cpp @@ -23,6 +23,8 @@ * */ +#include + #ifdef TILEDARRAY_HAS_BTAS #include "expressions_fixture.h" From 3fdbaa694ae4c95a9d9b0ba6c328bc1f73deb432 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 08:37:10 -0400 Subject: [PATCH 558/592] [unit] btas_suite/tensor_ctor: initialize ta_tensor properly --- tests/btas.cpp | 3 ++- 1 file changed, 2 insertions(+), 
1 deletion(-) diff --git a/tests/btas.cpp b/tests/btas.cpp index 4e972cfc28..ebaf2f02a4 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -258,7 +258,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { BOOST_CHECK(t1.empty()); // can copy TA::Tensor to btas::Tensor - TA::Tensor ta_tensor(r); + TA::Tensor ta_tensor; + ta_tensor = make_rand_tile(r); BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor)); Tensor t2(ta_tensor); for (auto i : r) { From ac83135ce2a236f4e73fe9d3e09e73560c999e80 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 08:55:11 -0400 Subject: [PATCH 559/592] fixup TensorInterface::Iterator::deference() --- src/TiledArray/tensor/tensor_interface.h | 12 +++++------- tests/btas.cpp | 13 +++++++++---- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index c5e103dad9..5aaf9f511c 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -262,8 +262,8 @@ class TensorInterface { /// \param idx The index pack template reference operator()(const Index&... idx) { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; } /// Element accessor @@ -272,8 +272,8 @@ class TensorInterface { /// \param idx The index pack template const_reference operator()(const Index&... 
idx) const { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; } /// \brief Tensor interface iterator type @@ -308,9 +308,7 @@ class TensorInterface { /// \brief dereferences this iterator /// \return const reference to the current index - auto& dereference() const { - return ti.at_ordinal(ti.range().ordinal(*idx_it)); - } + auto& dereference() const { return ti(*idx_it); } }; friend class Iterator; friend class Iterator; diff --git a/tests/btas.cpp b/tests/btas.cpp index ebaf2f02a4..c396110a2f 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -267,10 +267,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { } // can copy TA::TensorInterface to btas::Tensor - BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(r.lobound(), r.upbound()))); - Tensor t3(ta_tensor.block(r.lobound(), r.upbound())); - for (auto i : r) { - BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + { + const auto l = {3, 3, 3}; + const auto u = r.upbound(); + BOOST_REQUIRE(r.includes(l)); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(l, u))); + Tensor t3(ta_tensor.block(l, u)); + for (auto i : t3.range()) { + BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + } } } From 280983e013184d142f018f91affbab0def1876ef Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 24 Sep 2024 09:05:19 -0400 Subject: [PATCH 560/592] pull in https://github.com/ValeevGroup/BTAS/pull/179 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index db11ed24df..0e573bb050 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -41,7 +41,7 @@ Both methods are supported. 
However, for most users we _strongly_ recommend to b - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4b3757cc2b5862f93589afc1e37523e543779c7a . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. - [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 87804775f9..3363908bf3 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) -set(TA_TRACKED_BTAS_PREVIOUS_TAG b7b2ea7513b087e35c6f1b26184a3904ac1e6b14) +set(TA_TRACKED_BTAS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From b1210c497b70b27c102c2aa4dcebccc38a2c916b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 15:34:12 -0400 Subject: [PATCH 561/592] ccache needs to be discovered before other prereqs --- CMakeLists.txt | 19 ++++++++++--------- 1 file 
changed, 10 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 101b1b0d16..8763e0da18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -299,6 +299,15 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src) ########################## add_custom_target(External-tiledarray) +# ccache is an optional dep but must be found first so that the rest of dependencies can use it +find_program(CCACHE ccache) +if(CCACHE) + mark_as_advanced(CCACHE) + message (STATUS "Found ccache: ${CCACHE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") +endif(CCACHE) + # required deps: # 1. derive runtime (CUDA/HIP/...) first since others may depend on it if(ENABLE_CUDA) @@ -336,15 +345,7 @@ if(ENABLE_SCALAPACK) include(external/scalapackpp.cmake) endif() -# optional deps: -# 1. ccache -find_program(CCACHE ccache) -if(CCACHE) - mark_as_advanced(CCACHE) - message (STATUS "Found ccache: ${CCACHE}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") - set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") -endif(CCACHE) +# other optional deps: # 2. TTG # N.B. 
make sure TA configures MADNESS correctly #if (TA_TTG) From 144c55b9d5ec72cbe7d3041dbb9cca86eea80c1f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 15:35:29 -0400 Subject: [PATCH 562/592] [ci] use hendrikmuhs/ccache-action@v1.2 for proper use of ccache --- .github/workflows/ci.yml | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 143c88f8ea..b085f9c8f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,20 +64,10 @@ jobs: sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - - name: Prepare ccache timestamp - id: ccache_cache_timestamp - shell: cmake -P {0} - run: | - string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) - message("::set-output name=timestamp::${current_date}") - - - name: Setup ccache cache files - uses: actions/cache@v1.1.0 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 with: - path: ${{github.workspace}}/build/.ccache - key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }} - restore-keys: | - ${{ matrix.config.name }}-ccache- + key: ccache-${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }} - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash From a263802aec44da64bf0d8d720511d05fbb8299a6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 16:14:46 -0400 Subject: [PATCH 563/592] use ccache for CUDA --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8763e0da18..a130211293 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,6 +306,7 @@ if(CCACHE) message (STATUS "Found ccache: ${CCACHE}") set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler 
launcher to use for compiling C") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling CUDA") endif(CCACHE) # required deps: From 4523aa68b563949a86ce76b4401759bcb9efd99b Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 16:15:11 -0400 Subject: [PATCH 564/592] pass compiler launchers to LibreTT & Umpire --- external/librett.cmake | 7 +++++++ external/umpire.cmake | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/external/librett.cmake b/external/librett.cmake index afebabb486..5eca3314ce 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -98,6 +98,13 @@ else() "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND LIBRETT_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() + if (BUILD_SHARED_LIBS) set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) diff --git a/external/umpire.cmake b/external/umpire.cmake index 37152e98d2..c6abe2dfd0 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -152,6 +152,13 @@ else() ) endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND UMPIRE_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() + if (BUILD_SHARED_LIBS) set(UMPIRE_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) From 23e0eaaadda0e42ecb5e513743c0b9d65f9b5f0d Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 16:28:08 -0400 Subject: [PATCH 565/592] [ci] build ta_test as part of "Build" step --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b085f9c8f1..4c6a097d9a 100644 --- 
a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - ccache -p && ccache -z && cmake --build . --target tiledarray && cmake --build . --target examples && ccache -s + ccache -p && ccache -z && cmake --build . --target tiledarray ta_test examples && ccache -s - name: Test working-directory: ${{github.workspace}}/build @@ -87,5 +87,4 @@ jobs: #run: ctest -C $${{matrix.build_type}} run: | source ${{github.workspace}}/ci/openmpi.env - cmake --build . --target ta_test cmake --build . --target check-tiledarray From af4c88e08f4d32ba5412ddd349cfbbbdafd73a00 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 16:56:45 -0400 Subject: [PATCH 566/592] [cmake] pull in https://github.com/ValeevGroup/BTAS/pull/179 to speed up btas::Tensor construction from TensorInterface --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 0e573bb050..1b8a5de202 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -41,7 +41,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4b3757cc2b5862f93589afc1e37523e543779c7a . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. 
- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. diff --git a/external/versions.cmake b/external/versions.cmake index 3363908bf3..6c87fa5a72 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -16,8 +16,8 @@ set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) +set(TA_TRACKED_BTAS_TAG 1cfcb12647c768ccd83b098c64cda723e1275e49) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) From 90d2ef72b6e199b981a3d0a33d45b4ef10b54be2 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 25 Sep 2024 18:28:39 -0400 Subject: [PATCH 567/592] [ci] control location of ccache cache + monitor ccache stats --- .gitlab-ci.yml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8b675a692c..33a8d0c9bf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,6 +25,14 @@ before_script: # TODO optimize ta_test build memory consumption - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1} - echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL" + # configure ccache + - export CCACHE_DIR=/root/.ccache + - export CCACHE_COMPRESS=true + - export CCACHE_COMPRESSLEVEL=6 + # print out the ccache configuration + - ccache -p + # zero out the ccache statistics + - ccache -z ubuntu: stage: build @@ -64,3 +72,8 @@ ubuntu: ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] + 
+ +after_script: + # print out the ccache statistics + - ccache -s From 5a43925eb506efb07fb5c0b3e7ba402d2d10f7d6 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 27 Sep 2024 16:57:48 -0400 Subject: [PATCH 568/592] pull in Umpire https://github.com/LLNL/Umpire/pull/913 which makes https://github.com/ValeevGroup/tiledarray/commit/2e4572af6dae9c2ed92a3ace8807925f9acf99a3 obsolete --- INSTALL.md | 2 +- external/umpire.cmake | 2 -- external/umpire.finalize_io.patch | 47 ------------------------------- external/versions.cmake | 4 +-- 4 files changed, 3 insertions(+), 52 deletions(-) delete mode 100644 external/umpire.finalize_io.patch diff --git a/INSTALL.md b/INSTALL.md index 1b8a5de202..f2891672e2 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -69,7 +69,7 @@ Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1). 
+ - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 8c85866107f78a58403e20a2ae8e1f24c9852287). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite diff --git a/external/umpire.cmake b/external/umpire.cmake index c6abe2dfd0..ee2fa490e1 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -177,8 +177,6 @@ else() DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} GIT_REPOSITORY ${UMPIRE_URL} GIT_TAG ${UMPIRE_TAG} - #--Patch step----------------- - PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/external/umpire.finalize_io.patch #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: diff --git a/external/umpire.finalize_io.patch b/external/umpire.finalize_io.patch deleted file mode 100644 index fa78727d7f..0000000000 --- a/external/umpire.finalize_io.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/src/umpire/util/io.cpp b/src/umpire/util/io.cpp -index 806fb9e3..551c5e82 100644 ---- a/src/umpire/util/io.cpp -+++ b/src/umpire/util/io.cpp -@@ -52,10 +52,23 @@ std::ostream& error() - - namespace util { - -+namespace detail { -+OutputBuffer& s_log_buffer_accessor() -+{ -+ static OutputBuffer buffer; -+ return buffer; -+} -+OutputBuffer& s_error_buffer_accessor() -+{ -+ static OutputBuffer buffer; -+ return buffer; -+} -+} -+ - void initialize_io(const bool enable_log) - { -- static util::OutputBuffer s_log_buffer; -- static util::OutputBuffer s_error_buffer; -+ OutputBuffer& s_log_buffer = detail::s_log_buffer_accessor(); -+ OutputBuffer& s_error_buffer = 
detail::s_error_buffer_accessor(); - - s_log_buffer.setConsoleStream(nullptr); - s_error_buffer.setConsoleStream(&std::cerr); -@@ -121,6 +134,16 @@ void initialize_io(const bool enable_log) - MPI::logMpiInfo(); - } - -+void finalize_io() -+{ -+ detail::s_log_buffer_accessor().sync(); -+ detail::s_log_buffer_accessor().setConsoleStream(nullptr); -+ detail::s_log_buffer_accessor().setFileStream(nullptr); -+ detail::s_error_buffer_accessor().sync(); -+ detail::s_error_buffer_accessor().setConsoleStream(nullptr); -+ detail::s_error_buffer_accessor().setFileStream(nullptr); -+} -+ - void flush_files() - { - log().flush(); diff --git a/external/versions.cmake b/external/versions.cmake index 6c87fa5a72..909a969c28 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -22,8 +22,8 @@ set(TA_TRACKED_BTAS_PREVIOUS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG v2024.02.1) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) +set(TA_TRACKED_UMPIRE_TAG 8c85866107f78a58403e20a2ae8e1f24c9852287) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2024.02.1) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) From 7ad066d0d93ad5736dda4efa729cf047298cf79b Mon Sep 17 00:00:00 2001 From: Jonathon Misiewicz Date: Mon, 7 Oct 2024 16:40:18 -0400 Subject: [PATCH 569/592] Update CMakeLists.txt Silences a CMake warning. 
--- src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 80f2a49710..a16c05d0b2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -225,7 +225,7 @@ if(HIP_FOUND OR CUDA_FOUND) TiledArray/external/cuda.h TiledArray/device/cpu_cuda_vector.h) endif(CUDA_FOUND) -endif(CUDA_FOUND OR HIP_FOUND) +endif(HIP_FOUND OR CUDA_FOUND) set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp From 482ab5ea08227860116bcb0ae54901c9be2c9c37 Mon Sep 17 00:00:00 2001 From: Jonathon Misiewicz Date: Tue, 8 Oct 2024 15:54:19 -0400 Subject: [PATCH 570/592] Update umpire.cmake --- external/umpire.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/external/umpire.cmake b/external/umpire.cmake index ee2fa490e1..5b7a4f4078 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -223,6 +223,8 @@ else() "$;$;$;$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" + INTERFACE_COMPILE_DEFINITIONS + FMT_HEADER_ONLY=1 ) install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) From a414cab8bf34040fa9c05072aa1dbb526109ea34 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 8 Oct 2024 20:05:27 -0400 Subject: [PATCH 571/592] bump MAD tag to pull in https://github.com/m-a-d-n-e-s-s/madness/pull/550 --- INSTALL.md | 2 +- external/versions.cmake | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index f2891672e2..ed0ba5046c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -43,7 +43,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. - [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. 
-- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 93a9a5cec2a8fa87fba3afe8056607e6062a9058 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. diff --git a/external/versions.cmake b/external/versions.cmake index 909a969c28..d9d47a3bf2 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,8 +11,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 96ac90e8f193ccfaf16f346b4652927d2d362e75) +set(TA_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) From d856c6a4f4de592e21c1904dba93f5c22ad7b633 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 8 Oct 2024 20:01:41 -0400 Subject: [PATCH 572/592] DistArray: init_{tiles,elements} and fill* are parametrized by fence template parameter that controls whether operation uses local, global, or no fence (default, same as before) --- src/TiledArray/array_impl.h | 45 ++++++++++++++++++++++++++++++++----- src/TiledArray/dist_array.h | 43 +++++++++++++++++++++++------------ src/TiledArray/fwd.h | 8 +++++++ 3 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index df7138a9e7..9dbf5640c4 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -198,6 
+198,17 @@ std::ostream& operator<<(std::ostream& os, const TileConstReference& a) { return os; } +/// Callback used to update counter (typically, task counter) +template +struct IncrementCounter : public madness::CallbackInterface { + AtomicInt& counter; + IncrementCounter(AtomicInt& counter) : counter(counter) {} + void notify() override { + ++counter; + delete this; + } +}; + } // namespace detail } // namespace TiledArray @@ -770,20 +781,24 @@ class ArrayImpl : public TensorImpl, /// \tparam Op The type of the functor/function /// \param[in] op The operation used to generate tiles /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not set. Strong throw /// guarantee. /// \throw TiledArray::Exception if a tile is already set and skip_set is /// false. Weak throw guarantee. - template - void init_tiles(Op&& op, bool skip_set = false) { + template + std::int64_t init_tiles(Op&& op, bool skip_set = false) { // lifetime management of op depends on whether it is a lvalue ref (i.e.
has // an external owner) or an rvalue ref // - if op is an lvalue ref: pass op to tasks // - if op is an rvalue ref pass make_shared_function(op) to tasks auto op_shared_handle = make_op_shared_handle(std::forward(op)); + std::int64_t ntiles_initialized{0}; auto it = this->pmap()->begin(); const auto end = this->pmap()->end(); + std::atomic ntask_completed{0}; for (; it != end; ++it) { const auto& index = *it; if (!this->is_zero(index)) { @@ -792,19 +807,39 @@ class ArrayImpl : public TensorImpl, if (fut.probe()) continue; } if constexpr (Exec == HostExecutor::MADWorld) { - Future tile = this->world().taskq.add( - [this_sptr = this->shared_from_this(), - index = ordinal_type(index), op_shared_handle]() -> value_type { + Future tile = + this->world().taskq.add([this_sptr = this->shared_from_this(), + index = ordinal_type(index), + op_shared_handle, this]() -> value_type { return op_shared_handle( this_sptr->trange().make_tile_range(index)); }); + ++ntiles_initialized; + if constexpr (fence == Fence::Local) { + tile.register_callback( + new IncrementCounter( + ntask_completed)); + } set(index, std::move(tile)); } else { static_assert(Exec == HostExecutor::Thread); set(index, op_shared_handle(this->trange().make_tile_range(index))); + ++ntiles_initialized; } } } + + if constexpr (fence == Fence::Local) { + if constexpr (Exec == HostExecutor::MADWorld) { + if (ntiles_initialized > 0) + this->world().await([&ntask_completed, ntiles_initialized]() { + return ntask_completed == ntiles_initialized; + }); + } + } else if constexpr (fence == Fence::Global) { + this->world().gop.fence(); + } + return ntiles_initialized; } }; // class ArrayImpl diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 3bc9fe3c62..1aa90ce351 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -906,23 +906,29 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. 
/// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill_local(const element_type& value = element_type(), - bool skip_set = false) { - init_tiles( + template + std::int64_t fill_local(const element_type& value = element_type(), + bool skip_set = false) { + return init_tiles( [value](const range_type& range) { return value_type(range, value); }, skip_set); } /// Fill all local tiles with the specified value + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \param[in] value What each local tile should be filled with. /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is uninitialized. Strong throw /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill(const element_type& value = numeric_type(), bool skip_set = false) { - fill_local(value, skip_set); + template + std::int64_t fill(const element_type& value = numeric_type(), + bool skip_set = false) { + return fill_local(value, skip_set); } /// Fill all local tiles with random values @@ -934,18 +940,21 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// generate random values of type T this function will be disabled via SFINAE /// and attempting to use it will lead to a compile-time error. /// + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam T The type of random value to generate. Defaults to /// element_type. /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarantee. 
/// \throw TiledArray::Exception if skip_set is false and a local tile is /// already initialized. Weak throw guarantee. template > - void fill_random(bool skip_set = false) { - init_elements( + std::int64_t fill_random(bool skip_set = false) { + return init_elements( [](const auto&) { return detail::MakeRandom::generate_value(); }); } @@ -978,6 +987,8 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// return tile; /// }); /// \endcode + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam Op The type of the functor/function /// \param[in] op The operation used to generate tiles /// \param[in] skip_set If false, will throw if any tiles are already set @@ -985,9 +996,11 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if a tile is already set and skip_set is /// false. Weak throw guarantee. - template - void init_tiles(Op&& op, bool skip_set = false) { - impl_ref().template init_tiles(std::forward(op), skip_set); + template + std::int64_t init_tiles(Op&& op, bool skip_set = false) { + return impl_ref().template init_tiles(std::forward(op), + skip_set); } /// Initialize elements of local, non-zero tiles with a user provided functor @@ -1009,15 +1022,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \tparam Op Type of the function/functor which will generate the elements. /// \param[in] op The operation used to generate elements /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarnatee. /// \throw TiledArray::Exception if skip_set is false and a local, non-zero /// tile is already initialized. Weak throw /// guarantee. 
- template - void init_elements(Op&& op, bool skip_set = false) { + template + std::int64_t init_elements(Op&& op, bool skip_set = false) { auto op_shared_handle = make_op_shared_handle(std::forward(op)); - init_tiles( + return init_tiles( [op = std::move(op_shared_handle)]( const TiledArray::Range& range) -> value_type { // Initialize the tile with the given range object diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 652b835fab..e33aea5c18 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -203,6 +203,14 @@ using Array enum class HostExecutor { Thread, MADWorld, Default = MADWorld }; +/// fence types +enum class Fence { + Global, //!< global fence (`world.gop.fence()`) + Local, //!< local fence (all local work done, equivalent to + //!< `world.taskq.fence() in absence of active messages) + No //!< no fence +}; + namespace conversions { /// user defined conversions From 6d661ab2a103ad88c633f82c6fad1c9fef102872 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Tue, 8 Oct 2024 20:02:30 -0400 Subject: [PATCH 573/592] diagonal_array: instead of taskq.fence, use more robust fence mechanism of init_tiles --- src/TiledArray/special/diagonal_array.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h index d60b23db94..eac0c65e92 100644 --- a/src/TiledArray/special/diagonal_array.h +++ b/src/TiledArray/special/diagonal_array.h @@ -157,7 +157,8 @@ std::enable_if_t::value, void> write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { using Tile = typename Array::value_type; - A.init_tiles( + // N.B. 
Fence::Local ensures lifetime of the diagonals range + A.template init_tiles( // Task to create each tile [diagonals_begin](const Range &rng) { // Compute range of diagonal elements in the tile @@ -221,7 +222,6 @@ diagonal_array(World &world, TiledRange const &trange, if constexpr (is_dense_v) { Array A(world, trange); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); - A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } else { // Compute shape and init the Array @@ -231,7 +231,6 @@ diagonal_array(World &world, TiledRange const &trange, ShapeType shape(shape_norm, trange); Array A(world, trange, shape); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); - A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } abort(); // unreachable From c955339a3c1f6139a39fb081f13414c0b05a11f7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 9 Oct 2024 19:34:56 -0400 Subject: [PATCH 574/592] foreach and make_array use callbacks instead of atomic counters for local completion checks --- src/TiledArray/conversions/foreach.h | 23 +++++++++------- src/TiledArray/conversions/make_array.h | 36 ++++++++++++++++++------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/src/TiledArray/conversions/foreach.h b/src/TiledArray/conversions/foreach.h index 20f2d36ec3..2c77c91a0f 100644 --- a/src/TiledArray/conversions/foreach.h +++ b/src/TiledArray/conversions/foreach.h @@ -283,11 +283,10 @@ inline std:: arg.trange().tiles_range(), 0); // Construct the task function used to construct the result tiles. - madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto op_shared_handle = make_op_shared_handle(std::forward(op)); - const auto task = [op_shared_handle, &counter, &tile_norms]( + const auto task = [op_shared_handle, &tile_norms]( const ordinal_type ord, const_if_t& arg_tile, const ArgTiles&... 
arg_tiles) -> result_value_type { @@ -295,7 +294,6 @@ inline std:: auto result_tile = op_caller(std::move(op_shared_handle), tile_norms.at_ordinal(ord), arg_tile, arg_tiles...); - ++counter; return result_tile; }; @@ -310,7 +308,9 @@ inline std:: continue; auto result_tile = world.taskq.add(task, ord, arg.find_local(ord), args.find(ord)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, use the (scaled) // norms of the first arg @@ -324,7 +324,9 @@ inline std:: auto result_tile = world.taskq.add(task, ord, detail::get_sparse_tile(ord, arg), detail::get_sparse_tile(ord, args)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, find max // (scaled) norms of all args @@ -339,9 +341,10 @@ inline std:: } // Wait for tile norm data to be collected. 
- if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_created == ntask_completed; + }); // Construct the new array result_array_type result( diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index 6f5ada0bba..1295e6f8e4 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -26,6 +26,7 @@ #ifndef TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED #define TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED +#include "TiledArray/array_impl.h" #include "TiledArray/external/madness.h" #include "TiledArray/shape.h" #include "TiledArray/type_traits.h" @@ -79,6 +80,10 @@ inline Array make_array( // Make an empty result array Array result(world, trange); + // Construct the task function used to construct the result tiles. + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; + // Iterate over local tiles of arg for (const auto index : *result.pmap()) { // Spawn a task to evaluate the tile @@ -89,11 +94,20 @@ inline Array make_array( return tile; }, trange.make_tile_range(index)); - + ++ntask_created; + tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); // Store result tile - result.set(index, tile); + result.set(index, std::move(tile)); } + // Wait for tile tasks to complete + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); + return result; } @@ -150,26 +164,28 @@ inline Array make_array( trange.tiles_range(), 0); // Construct the task function used to construct the result tiles. 
- madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto task = [&](const ordinal_type index) -> value_type { value_type tile; tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); - ++counter; return tile; }; for (const auto index : *pmap) { auto result_tile = world.taskq.add(task, index); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); tiles.emplace_back(index, std::move(result_tile)); } // Wait for tile norm data to be collected. - if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); // Construct the new array Array result(world, trange, From 1986ccf0c6220dd2765da0342e94a2b27aecebb0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 12 Oct 2024 03:28:01 -0400 Subject: [PATCH 575/592] [python] amends PyTA for d856c6a4f4de592e21c1904dba93f5c22ad7b633 --- python/src/TiledArray/python/array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/src/TiledArray/python/array.h b/python/src/TiledArray/python/array.h index 782846df4c..e3cc1c79b7 100644 --- a/python/src/TiledArray/python/array.h +++ b/python/src/TiledArray/python/array.h @@ -208,7 +208,7 @@ void make_array_class(py::object m, const char *name) { py::return_value_policy::reference) .def_property_readonly("trange", &array::trange) .def_property_readonly("shape", &array::shape) - .def("fill", &Array::fill, py::arg("value"), + .def("fill", &Array::template fill<>, py::arg("value"), py::arg("skip_set") = false) .def("init", &array::init_tiles) // Array object needs be alive while iterator is used */ From c12bc4544a872886420390dab35f3f0dc5d698a6 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Tue, 15 Oct 2024 16:56:04 -0400 
Subject: [PATCH 576/592] Bug fixes in iterator logic in `UserPmap`. --- src/TiledArray/pmap/user_pmap.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TiledArray/pmap/user_pmap.h b/src/TiledArray/pmap/user_pmap.h index 50966f5744..31f5c51e53 100644 --- a/src/TiledArray/pmap/user_pmap.h +++ b/src/TiledArray/pmap/user_pmap.h @@ -52,7 +52,7 @@ class UserPmap : public Pmap { UserPmap(World& world, size_type size, Index2Rank&& i2r) : Pmap(world, size), index2rank_(std::forward(i2r)) {} - /// Constructs map that does not know the number of local elements + /// Constructs map that knows the number of local elements /// \tparam Index2Rank a callable type with `size_type(size_t)` signature /// \param world A reference to the world @@ -88,10 +88,10 @@ class UserPmap : public Pmap { virtual bool known_local_size() const { return known_local_size_; } virtual const_iterator begin() const { - return Iterator(*this, 0, this->size_, 0, false); + return Iterator(*this, 0, this->size_, 0, /* checking = */ true); } virtual const_iterator end() const { - return Iterator(*this, 0, this->size_, this->size_, false); + return Iterator(*this, 0, this->size_, this->size_, /* checking = */ true); } private: From 3b4baf7afed4ac7357290f3d90f5d4e3edbc09e0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 21 Oct 2024 10:22:53 -0400 Subject: [PATCH 577/592] sprintf -> snprintf --- src/TiledArray/util/bug.cpp | 2 +- tests/dist_array.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index 5e58ba667c..ff37f14343 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -166,7 +166,7 @@ void Debugger::set_prefix(const char *p) { void Debugger::set_prefix(int i) { char p[128]; - sprintf(p, "%3d: ", i); + snprintf(p, sizeof(p), "%3d: ", i); set_prefix(p); } diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 998b0d8f9f..64f69e69db 100644 --- 
a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -60,7 +60,7 @@ namespace { std::string to_parallel_archive_file_name(const char* prefix_name, int rank) { char buf[256]; MADNESS_ASSERT(strlen(prefix_name) + 7 <= sizeof(buf)); - sprintf(buf, "%s.%5.5d", prefix_name, rank); + snprintf(buf, sizeof(buf), "%s.%5.5d", prefix_name, rank); return buf; } } // namespace @@ -716,7 +716,7 @@ BOOST_AUTO_TEST_CASE(parallel_serialization) { mktemp(archive_file_prefix_name); madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name, nio); - oar& a; + oar & a; oar.close(); madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name, @@ -740,7 +740,7 @@ BOOST_AUTO_TEST_CASE(parallel_sparse_serialization) { mktemp(archive_file_prefix_name); madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name, nio); - oar& b; + oar & b; oar.close(); madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name, @@ -783,7 +783,7 @@ BOOST_AUTO_TEST_CASE(issue_225) { madness::archive::BinaryFstreamInputArchive iar(archive_file_name); decltype(S) S_read; decltype(St) St_read; - iar& S_read& St_read; + iar & S_read & St_read; BOOST_CHECK_EQUAL(S_read.trange(), S.trange()); BOOST_REQUIRE(S_read.shape() == S.shape()); From 0ee85a9808f9214c68f5b849104baf1b875aa784 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 23 Oct 2024 11:45:40 -0400 Subject: [PATCH 578/592] nvToolsExt -> nvtx3 --- cmake/tiledarray-config.cmake.in | 2 +- external/cuda.cmake | 2 +- src/CMakeLists.txt | 2 +- src/TiledArray/external/device.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/tiledarray-config.cmake.in b/cmake/tiledarray-config.cmake.in index c6d0a49822..7afccb273b 100644 --- a/cmake/tiledarray-config.cmake.in +++ b/cmake/tiledarray-config.cmake.in @@ -49,7 +49,7 @@ set(TILEDARRAY_HAS_CUDA "@CUDA_FOUND@") if(TILEDARRAY_HAS_CUDA) cmake_minimum_required(VERSION 3.17) if (NOT TARGET CUDA::cublas) - 
find_dependency(CUDAToolkit REQUIRED COMPONENTS cublas nvToolsExt) + find_dependency(CUDAToolkit REQUIRED COMPONENTS cublas nvtx3) endif(NOT TARGET CUDA::cublas) set(CMAKE_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") # workaround from https://gitlab.kitware.com/cmake/cmake/issues/18614#note_485631 diff --git a/external/cuda.cmake b/external/cuda.cmake index aa1e51e53e..74bd953e65 100644 --- a/external/cuda.cmake +++ b/external/cuda.cmake @@ -26,7 +26,7 @@ set(TILEDARRAY_HAS_CUDA 1 CACHE BOOL "Whether TiledArray has CUDA support") # NB CUDAToolkit does NOT have COMPONENTS find_package(CUDAToolkit REQUIRED) -foreach (library cublas;nvToolsExt) +foreach (library cublas;nvtx3) if (NOT TARGET CUDA::${library}) message(FATAL_ERROR "CUDA::${library} not found") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a16c05d0b2..30ff478384 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -277,7 +277,7 @@ if(CUDA_FOUND OR HIP_FOUND) endforeach() # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvToolsExt) + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvtx3) endif(CUDA_FOUND) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index 4f9d365e0a..a4885bc175 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -36,7 +36,7 @@ #elif defined(TILEDARRAY_HAS_CUDA) #include #include -#include +#include #include #include #endif From bf984deedfe064c311b473f8d05514b37c413fb5 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Mon, 28 Oct 2024 13:24:26 -0400 Subject: [PATCH 579/592] availability of CUDA/HIP does not mean they should be used --- cmake/tiledarray-config.cmake.in | 2 +- examples/device/CMakeLists.txt | 2 +- src/CMakeLists.txt | 22 +++++++++++----------- tests/CMakeLists.txt | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cmake/tiledarray-config.cmake.in 
b/cmake/tiledarray-config.cmake.in index 7afccb273b..abff1952ea 100644 --- a/cmake/tiledarray-config.cmake.in +++ b/cmake/tiledarray-config.cmake.in @@ -45,7 +45,7 @@ endif() # if TA is a CUDA-dependent library it needs CUDA to link properly ... unfortunately CMake is not able to do this correctly # see https://gitlab.kitware.com/cmake/cmake/issues/18614 # so try workarounds -set(TILEDARRAY_HAS_CUDA "@CUDA_FOUND@") +set(TILEDARRAY_HAS_CUDA "@TILEDARRAY_HAS_CUDA@") if(TILEDARRAY_HAS_CUDA) cmake_minimum_required(VERSION 3.17) if (NOT TARGET CUDA::cublas) diff --git a/examples/device/CMakeLists.txt b/examples/device/CMakeLists.txt index e2376c4eae..bab6aa8e05 100644 --- a/examples/device/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -23,7 +23,7 @@ # -if(CUDA_FOUND OR HIP_FOUND) +if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_HIP) foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 30ff478384..776b85f4a1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -204,7 +204,7 @@ TiledArray/util/time.h TiledArray/util/vector.h ) -if(HIP_FOUND OR CUDA_FOUND) +if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/device.h TiledArray/external/librett.h @@ -220,12 +220,12 @@ if(HIP_FOUND OR CUDA_FOUND) TiledArray/device/platform.h TiledArray/device/thrust.h TiledArray/device/um_storage.h) - if(CUDA_FOUND) + if(TILEDARRAY_HAS_CUDA) list(APPEND TILEDARRAY_HEADER_FILES TiledArray/external/cuda.h TiledArray/device/cpu_cuda_vector.h) - endif(CUDA_FOUND) -endif(HIP_FOUND OR CUDA_FOUND) + endif(TILEDARRAY_HAS_CUDA) +endif(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp @@ -256,13 +256,13 @@ set_source_files_properties( # when FetchContent umpire: set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers umpire) set(_TILEDARRAY_DEPENDENCIES MADworld 
TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE range-v3::range-v3) -if(CUDA_FOUND OR HIP_FOUND) +if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP) set(TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/btas_um_tensor.cpp ) - if(CUDA_FOUND) + if(TILEDARRAY_HAS_CUDA) list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/cpu_cuda_vector.cu @@ -279,9 +279,9 @@ if(CUDA_FOUND OR HIP_FOUND) # the list of libraries on which TiledArray depends on list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvtx3) - endif(CUDA_FOUND) + endif(TILEDARRAY_HAS_CUDA) - if (HIP_FOUND) + if (TILEDARRAY_HAS_HIP) list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES TiledArray/device/kernel/thrust/mult_kernel.hip TiledArray/device/kernel/thrust/reduce_kernel.hip @@ -298,7 +298,7 @@ if(CUDA_FOUND OR HIP_FOUND) list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT) list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_DEVICE_SOURCE_FILES}") -endif(CUDA_FOUND OR HIP_FOUND) +endif(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP) if( TARGET TiledArray_SCALAPACK ) list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_SCALAPACK) @@ -345,10 +345,10 @@ add_library(tiledarray ${TILEDARRAY_SOURCE_FILES} ${TILEDARRAY_HEADER_FILES}) target_compile_options(${targetname} PUBLIC ${CMAKE_CXX_FLAG_LIST}) target_compile_features(${targetname} PUBLIC "cxx_std_${CMAKE_CXX_STANDARD}") - if (CUDA_FOUND) + if (TILEDARRAY_HAS_CUDA) target_include_directories(${targetname} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_compile_features(tiledarray PUBLIC "cuda_std_${CMAKE_CUDA_STANDARD}") - endif (CUDA_FOUND) + endif (TILEDARRAY_HAS_CUDA) if (LAPACK_INCLUDE_DIRS) target_include_directories(${targetname} PUBLIC ${LAPACK_INCLUDE_DIRS}) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 85d30d7728..a30770fb18 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -104,7 +104,7 @@ set(ta_test_src_files ta_test.cpp btas.cpp ) -if(CUDA_FOUND OR HIP_FOUND) +if(TILEDARRAY_HAS_CUDA OR 
TILEDARRAY_HAS_HIP) list(APPEND ta_test_src_files librett.cpp expressions_device_um.cpp tensor_um.cpp) endif() @@ -118,7 +118,7 @@ if (CMAKE_CXX_STANDARD GREATER_EQUAL 20 AND DEFINED Boost_VERSION) endif() # if tiledarray library was compiled without exceptions, use TA header-only (see below) -if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT CUDA_FOUND AND FALSE) +if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT TILEDARRAY_HAS_CUDA AND FALSE) add_ta_executable(${executable} "${ta_test_src_files}" "MADworld;${TILEDARRAY_PRIVATE_LINK_LIBRARIES}") target_compile_definitions(${executable} PRIVATE TILEDARRAY_HEADER_ONLY=1) if (LAPACK_INCLUDE_DIRS) From 93697b94bd47e75f281f63e00dec7827e6752374 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 28 Oct 2024 20:56:13 -0400 Subject: [PATCH 580/592] Make sure namespace device is always closed Even if TILEDARRAY_HAS_DEVICE is not defined Signed-off-by: Joseph Schuchart --- src/TiledArray/external/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h index a4885bc175..76d769b472 100644 --- a/src/TiledArray/external/device.h +++ b/src/TiledArray/external/device.h @@ -915,10 +915,10 @@ inline umpire::Allocator& get_pinned_allocator::operator()() { return deviceEnv::instance()->pinned_allocator(); } -} // namespace detail - #endif // TILEDARRAY_HAS_DEVICE +} // namespace detail + #ifdef TILEDARRAY_HAS_CUDA namespace nvidia { From 4339f892a37e8dcefe8cfc6ddfb85d23efdebd18 Mon Sep 17 00:00:00 2001 From: Ajay Date: Tue, 5 Nov 2024 12:50:22 -0500 Subject: [PATCH 581/592] Hush compiler warning and fix typos --- src/TiledArray/math/linalg/basic.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index c00a363286..2045c8a82c 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ 
-123,6 +123,8 @@ inline DistArray concat(const DistArray& a, case Concat::Both: return TiledArray::concat({a, b}, std::vector{true, true}); + default: + TA_EXCEPTION("Invalid Concat value"); } } @@ -257,7 +259,7 @@ inline auto norm2( TiledArray::math::linalg::detail::prefer_distributed(MATRIX)) \ return TiledArray::math::linalg::ttg::FN; \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #elif !TILEDARRAY_HAS_TTG && TILEDARRAY_HAS_SCALAPACK #define TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG(FN, MATRIX) \ @@ -274,7 +276,7 @@ inline auto norm2( if (get_linalg_backend() == LinearAlgebraBackend::TTG) \ TA_EXCEPTION("TTG linear algebra backend is not available"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #endif // !TILEDARRAY_HAS_TTG && !TILEDARRAY_HAS_SCALAPACK #endif // defined(TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG) @@ -301,7 +303,7 @@ inline auto norm2( TA_EXCEPTION(TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG_STRINGIFY( \ FN) " is not provided by the TTG backend"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #elif !TILEDARRAY_HAS_TTG && TILEDARRAY_HAS_SCALAPACK #define TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG(FN, MATRIX) \ @@ -318,7 +320,7 @@ inline auto norm2( if (get_linalg_backend() == LinearAlgebraBackend::TTG) \ TA_EXCEPTION("TTG linear algebra backend is not available"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + 
TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #endif // !TILEDARRAY_HAS_TTG && !TILEDARRAY_HAS_SCALAPACK #endif // defined(TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG) From c6e9490b01de8ab1aab94c4c7cf74347ec581e34 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 4 Nov 2024 10:00:09 -0500 Subject: [PATCH 582/592] Tensor-of-tensor tiles can have some elements (i.e. tensors) that are zero. --- src/TiledArray/einsum/tiledarray.h | 9 ++++-- src/TiledArray/expressions/cont_engine.h | 3 +- src/TiledArray/tensor/kernels.h | 2 ++ src/TiledArray/tensor/tensor.h | 36 ++++++++++++++++++------ src/TiledArray/tile_op/contract_reduce.h | 8 +++--- 5 files changed, 42 insertions(+), 16 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 41797efafa..40076ed0ce 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -485,8 +485,13 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // Step IV: C2(ijpq) -> C(ipjq) auto sum_tot_2_tos = [](auto const &tot) { - typename std::remove_reference_t::value_type result( - tot.range(), [tot](auto &&ix) { return tot(ix).sum(); }); + using tot_t = std::remove_reference_t; + typename tot_t::value_type result( + tot.range(), [tot](auto &&ix) { + if (!tot(ix).empty()) + return tot(ix).sum(); + else return typename tot_t::numeric_type{}; + }); return result; }; diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 58d7b9ad57..f0a94c7e05 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -513,7 +513,8 @@ class ContEngine : public BinaryEngine { const left_tile_element_type& left, const right_tile_element_type& right) { contrreduce_op(result, left, right); - result = contrreduce_op(result); // permutations of result are applied as "postprocessing" + if (!TA::empty(result)) + result = 
contrreduce_op(result); // permutations of result are applied as "postprocessing" }; } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index 5d40ce5c14..a2530f2f5d 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -996,6 +996,8 @@ auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, auto result = identity; for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { + if (tensor1.data()[ord].range().volume() == 0 + || ((tensors.data()[ord].range().volume() == 0) || ...)) continue; auto temp = tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], tensors.data()[ord]...); join_op(result, temp); diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index bd72af487c..bd6fb8f3e5 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -431,7 +431,8 @@ class Tensor { auto volume = total_size(); for (decltype(volume) i = 0; i < volume; ++i) { auto& el = *(data() + i); - el = p(el, inner_perm); + if (!el.empty()) + el = p(el, inner_perm); } } } @@ -588,9 +589,13 @@ class Tensor { Tensor clone() const { Tensor result; if (data_) { - result = detail::tensor_op( - [](const numeric_type value) -> numeric_type { return value; }, - *this); + if constexpr (detail::is_tensor_of_tensor_v) { + result = Tensor(*this, [](value_type const& el) { return el.clone(); }); + } else { + result = detail::tensor_op( + [](const numeric_type value) -> numeric_type { return value; }, + *this); + } } else if (range_) { // corner case: data_ = null implies range_.volume() // == 0; TA_ASSERT(range_.volume() == 0); @@ -1538,6 +1543,7 @@ class Tensor { detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor if constexpr (!is_tot) { + if (empty()) return *this; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation 
return Tensor(*this, op, outer(std::forward(perm))); @@ -1574,6 +1580,7 @@ class Tensor { template >::type* = nullptr> Tensor scale(const Scalar factor) const { + if (range().volume() == 0) return *this; return unary([factor](const value_type& a) -> decltype(auto) { using namespace TiledArray::detail; return a * factor; @@ -1626,6 +1633,10 @@ class Tensor { return binary( right, [](const value_type& l, const value_t& r) -> decltype(auto) { + if constexpr (detail::is_tensor_v) { + if (l.empty() && r.empty()) + return value_type{}; + } return l + r; }); } @@ -1740,6 +1751,7 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { + if (right.empty()) return *this; if (empty()) { *this = Tensor{right.range(), value_type{}}; } @@ -1923,11 +1935,17 @@ class Tensor { typename std::enable_if>::type* = nullptr> decltype(auto) mult(const Right& right) const { - return binary( - right, - [](const value_type& l, const value_t& r) -> decltype(auto) { - return l * r; - }); + + auto mult_op =[](const value_type& l, const value_t& r) -> decltype(auto) { + return l * r; + }; + + if (empty() || right.empty()) { + using res_t = decltype(std::declval().binary(std::declval(), mult_op)); + return res_t{}; + } + + return binary(right, mult_op); } /// Multiply this by \c right to create a new, permuted tensor diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 94c7107343..f0654f1431 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -326,17 +326,17 @@ class ContractReduce : public ContractReduceBase { /// \param[in] right The right-hand tile to be contracted void operator()(result_type& result, const first_argument_type& left, const second_argument_type& right) const { + using TiledArray::empty; + using TiledArray::gemm; + if (empty(left) || empty(right)) return; + if constexpr (!ContractReduceBase_::plain_tensors) { TA_ASSERT(this->elem_muladd_op()); 
// not yet implemented - using TiledArray::empty; - using TiledArray::gemm; gemm(result, left, right, ContractReduceBase_::gemm_helper(), this->elem_muladd_op()); } else { // plain tensors TA_ASSERT(!this->elem_muladd_op()); - using TiledArray::empty; - using TiledArray::gemm; if (empty(result)) result = gemm(left, right, ContractReduceBase_::factor(), ContractReduceBase_::gemm_helper()); From bc69ec5ac765d42e62acbbda3538b614fb426562 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sat, 9 Nov 2024 17:39:31 -0500 Subject: [PATCH 583/592] [unit] retile_suite/retile_more skip zero tiles --- tests/retile.cpp | 54 ++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/retile.cpp b/tests/retile.cpp index 0f4100d4c8..6ac15a48c4 100644 --- a/tests/retile.cpp +++ b/tests/retile.cpp @@ -6,26 +6,24 @@ BOOST_AUTO_TEST_SUITE(retile_suite) BOOST_AUTO_TEST_CASE(retile_tensor) { - TA::detail::matrix_il some_values = { - {0.1, 0.2, 0.3, 0.4, 0.5}, - {0.6, 0.7, 0.8, 0.9, 1.0}, - {1.1, 1.2, 1.3, 1.4, 1.5}, - {1.6, 1.7, 1.8, 1.9, 2.0}, - {2.1, 2.2, 2.3, 2.4, 2.5} - }; - - auto range0 = TA::TiledRange1(0, 3, 5); - auto range1 = TA::TiledRange1(0, 4, 5); - auto trange = TA::TiledRange({range0, range1}); - - TA::TArrayD default_dense(*GlobalFixture::world, some_values); - TA::TSpArrayD default_sparse(*GlobalFixture::world, some_values); - - auto result_dense = retile(default_dense, trange); - auto result_sparse = retile(default_sparse, trange); - - BOOST_CHECK_EQUAL(result_dense.trange(), trange); - BOOST_CHECK_EQUAL(result_sparse.trange(), trange); + TA::detail::matrix_il some_values = {{0.1, 0.2, 0.3, 0.4, 0.5}, + {0.6, 0.7, 0.8, 0.9, 1.0}, + {1.1, 1.2, 1.3, 1.4, 1.5}, + {1.6, 1.7, 1.8, 1.9, 2.0}, + {2.1, 2.2, 2.3, 2.4, 2.5}}; + + auto range0 = TA::TiledRange1(0, 3, 5); + auto range1 = TA::TiledRange1(0, 4, 5); + auto trange = TA::TiledRange({range0, range1}); + + TA::TArrayD default_dense(*GlobalFixture::world, 
some_values); + TA::TSpArrayD default_sparse(*GlobalFixture::world, some_values); + + auto result_dense = retile(default_dense, trange); + auto result_sparse = retile(default_sparse, trange); + + BOOST_CHECK_EQUAL(result_dense.trange(), trange); + BOOST_CHECK_EQUAL(result_sparse.trange(), trange); } BOOST_AUTO_TEST_CASE(retile_more) { @@ -69,17 +67,20 @@ BOOST_AUTO_TEST_CASE(retile_more) { return tile.norm(); }; + auto arr_source0 = + TA::make_array(world, tr_source, set_random_tensor_tile); + auto arr_target0 = TA::retile(arr_source0, tr_target); + auto get_elem = [](auto const& arr, auto const& eix) { auto tix = arr.trange().element_to_tile(eix); auto&& tile = arr.find(tix).get(false); return tile(eix); }; - auto arr_source0 = - TA::make_array(world, tr_source, set_random_tensor_tile); - auto arr_target0 = TA::retile(arr_source0, tr_target); - for (auto&& eix : elem_rng) { + auto tix = arr_source0.trange().element_to_tile(eix); + BOOST_REQUIRE(arr_source0.is_zero(tix) == arr_target0.is_zero(tix)); + if (arr_source0.is_zero(tix)) continue; BOOST_REQUIRE(get_elem(arr_source0, eix) == get_elem(arr_target0, eix)); } @@ -94,8 +95,11 @@ BOOST_AUTO_TEST_CASE(retile_more) { world.gop.fence(); for (auto&& eix : elem_rng) { + auto tix = arr_source.trange().element_to_tile(eix); + BOOST_REQUIRE(arr_source.is_zero(tix) == arr_target.is_zero(tix)); + if (arr_source.is_zero(tix)) continue; BOOST_REQUIRE(get_elem(arr_source, eix) == get_elem(arr_target, eix)); } } -BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file +BOOST_AUTO_TEST_SUITE_END() From d0d7b73e9efc11b238d77785450a2369cf78b276 Mon Sep 17 00:00:00 2001 From: Samuel Powell Date: Thu, 29 Aug 2024 16:12:51 -0400 Subject: [PATCH 584/592] Initial steps, min. 
working to attach debugger externally --- src/TiledArray/util/bug.cpp | 48 +++++++++++++++++++++++++++++++++++++ src/TiledArray/util/bug.h | 3 +++ 2 files changed, 51 insertions(+) diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index ff37f14343..74024fff7b 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -180,6 +180,11 @@ void Debugger::default_cmd() { } } +const std::string Debugger::gdb_cmd_ = + "gdb -ex \"set variable debugger_ready_=1\" --pid=$(PID) $(EXEC)"; +const std::string Debugger::lldb_cmd_ = + "lldb -p $(PID) -o \"expr debugger_ready_=1\""; + void Debugger::resolve_cmd_alias() { if (cmd_ == "gdb_xterm") { cmd_ = @@ -192,6 +197,28 @@ void Debugger::resolve_cmd_alias() { } } +std::string Debugger::replace_macros(std::string str) { + if (!str.empty()) { + int pid = getpid(); + std::string::size_type pos; + std::string pidvar("$(PID)"); + while ((pos = str.find(pidvar)) != std::string::npos) { + std::string pidstr; + pidstr += std::to_string(pid); + str.replace(pos, pidvar.size(), pidstr); + } + std::string execvar("$(EXEC)"); + while ((pos = str.find(execvar)) != std::string::npos) { + str.replace(pos, execvar.size(), exec_); + } + std::string prefixvar("$(PREFIX)"); + while ((pos = str.find(prefixvar)) != std::string::npos) { + str.replace(pos, prefixvar.size(), prefix_); + } + } + return str; +} + void Debugger::set_cmd(const char *cmd) { if (cmd) { cmd_ = cmd; @@ -262,6 +289,27 @@ void Debugger::debug(const char *reason) { ; } } + } // Here, need handling of cmd_ empty + if (sleep_) { + std::cout << prefix_ << "Debugger: sleeping " << sleep_ + << " seconds to wait for debugger ..." 
<< std::endl; + sleep(sleep_); + } + if (wait_for_debugger_) { + std::cout << prefix_ << "Debugger: waiting for the user ..."; + if (cmd_.empty()) { + std::cout << " attach debugger to process " + << std::to_string(getpid()) + << " as follows:" << std::endl + << prefix_ << "Debugger: - if using gdb: " + << replace_macros(gdb_cmd_) << std::endl + << prefix_ << "Debugger: - if using lldb: " + << replace_macros(lldb_cmd_); + } + + std::cout << prefix_ << ": waiting for the user ..." << std::endl; + while (!debugger_ready_) + ; } } diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index 2d217ceaee..0aadd3927e 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -384,6 +384,9 @@ class Debugger { private: /// Replaces alias in cmd_ with its full form void resolve_cmd_alias(); + std::string replace_macros(std::string cmd); + static const std::string gdb_cmd_; + static const std::string lldb_cmd_; }; /// Use this to create a Debugger object and make it the default From 23dc95404cea77df70f7a89391bab0949a040709 Mon Sep 17 00:00:00 2001 From: Samuel Powell Date: Mon, 16 Sep 2024 11:25:10 -0400 Subject: [PATCH 585/592] Allow attaching debugger via cl, enable prelaunch actions, cleanup --- src/TiledArray/util/bug.cpp | 129 ++++++++++++++---------------------- src/TiledArray/util/bug.h | 20 ++++-- 2 files changed, 66 insertions(+), 83 deletions(-) diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index 74024fff7b..57e96c162d 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -77,7 +77,6 @@ Debugger::~Debugger() { for (int i = 0; i < NSIG; i++) { if (mysigs_[i]) signals[i] = nullptr; } - delete[] mysigs_; } void Debugger::init() { @@ -91,7 +90,7 @@ void Debugger::init() { debug_ = 1; wait_for_debugger_ = 1; - mysigs_ = new int[NSIG]; + mysigs_ = std::make_unique(NSIG); for (int i = 0; i < NSIG; i++) { mysigs_[i] = 0; } @@ -187,13 +186,9 @@ const std::string Debugger::lldb_cmd_ = void 
Debugger::resolve_cmd_alias() { if (cmd_ == "gdb_xterm") { - cmd_ = - "xterm -title \"$(PREFIX)$(EXEC)\" -e gdb -ex \"set variable " - "debugger_ready_=1\" --pid=$(PID) $(EXEC) &"; + cmd_ = "xterm -title \"$(PREFIX)$(EXEC)\" -e " + gdb_cmd_ + " &"; } else if (cmd_ == "lldb_xterm") { - cmd_ = - "xterm -title \"$(PREFIX)$(EXEC)\" -e lldb -p $(PID) -o \"expr " - "debugger_ready_=1\" &"; + cmd_ = "xterm -title \"$(PREFIX)$(EXEC)\" -e " + lldb_cmd_ + " &"; } } @@ -222,10 +217,10 @@ std::string Debugger::replace_macros(std::string str) { void Debugger::set_cmd(const char *cmd) { if (cmd) { cmd_ = cmd; - resolve_cmd_alias(); } else { cmd_.resize(0); } + this->resolve_cmd_alias(); } void Debugger::debug(const char *reason) { @@ -236,80 +231,50 @@ void Debugger::debug(const char *reason) { std::cout << "no reason given"; std::cout << std::endl; - if (!cmd_.empty()) { - int pid = getpid(); - // contruct the command name - std::string cmd = cmd_; - std::string::size_type pos; - std::string pidvar("$(PID)"); - while ((pos = cmd.find(pidvar)) != std::string::npos) { - std::string pidstr; - pidstr += std::to_string(pid); - cmd.replace(pos, pidvar.size(), pidstr); - } - std::string execvar("$(EXEC)"); - while ((pos = cmd.find(execvar)) != std::string::npos) { - cmd.replace(pos, execvar.size(), exec_); - } - std::string prefixvar("$(PREFIX)"); - while ((pos = cmd.find(prefixvar)) != std::string::npos) { - cmd.replace(pos, prefixvar.size(), prefix_); - } - // start the debugger - // before starting the debugger de-register signal handler for SIGTRAP to - // let the debugger take over - release(SIGTRAP); + const std::string cmd = replace_macros(cmd_); + // start the debugger + // before starting the debugger de-register signal handler for SIGTRAP to + // let the debugger take over + release(SIGTRAP); + int system_retvalue = 0; + if (!cmd.empty()) { std::cout << prefix_ << "Debugger: starting \"" << cmd << "\"" << std::endl; - debugger_ready_ = 0; - const auto system_retvalue = 
system(cmd.c_str()); - if (system_retvalue != 0) { // call to system() failed - std::cout << prefix_ - << "Failed debugger launch: system() did not succeed ..." - << std::endl; - } else { // call to system() succeeded - // wait until the debugger is ready - if (sleep_) { - std::cout << prefix_ << "Sleeping " << sleep_ - << " seconds to wait for debugger ..." << std::endl; - sleep(sleep_); - } - if (wait_for_debugger_) { - std::string make_ready_message; - if (cmd_.find(" gdb ") != std::string::npos || - cmd_.find(" lldb ") != std::string::npos) { - make_ready_message = - " configure debugging session (set breakpoints/watchpoints, " - "etc.) then type 'c' to continue running"; - } - - std::cout << prefix_ << ": waiting for the user ..." - << make_ready_message << std::endl; - while (!debugger_ready_) - ; - } - } - } // Here, need handling of cmd_ empty - if (sleep_) { - std::cout << prefix_ << "Debugger: sleeping " << sleep_ - << " seconds to wait for debugger ..." << std::endl; - sleep(sleep_); + system_retvalue = system(cmd.c_str()); } - if (wait_for_debugger_) { - std::cout << prefix_ << "Debugger: waiting for the user ..."; - if (cmd_.empty()) { - std::cout << " attach debugger to process " - << std::to_string(getpid()) - << " as follows:" << std::endl - << prefix_ << "Debugger: - if using gdb: " - << replace_macros(gdb_cmd_) << std::endl - << prefix_ << "Debugger: - if using lldb: " - << replace_macros(lldb_cmd_); + if (system_retvalue != 0) { + ExEnv::outn() << prefix_ + << "Failed debugger launch: system() did not succeed ..." + << std::endl; + } else { // call to system() succeeded + // wait until the debugger is ready + if (sleep_) { + std::cout << prefix_ << "Debugger: sleeping " << sleep_ + << " seconds to wait for debugger ..." 
<< std::endl; + sleep(sleep_); } + if (wait_for_debugger_) { + std::cout << prefix_ << "Debugger: waiting for the user ..."; + if (cmd_.find(" gdb ") != std::string::npos || + cmd_.find(" lldb ") != std::string::npos) { + std::cout << + " configure debugging session (set breakpoints/watchpoints, " + "etc.) then type 'c' to continue running"; + } else if (cmd.empty()) { + std::cout << " attach debugger to process " + << std::to_string(getpid()) + << " as follows:" << std::endl + << prefix_ << "Debugger: - if using gdb: " + << replace_macros(gdb_cmd_) << std::endl + << prefix_ << "Debugger: - if using lldb: " + << replace_macros(lldb_cmd_); + } + std::cout << std::endl; - std::cout << prefix_ << ": waiting for the user ..." << std::endl; - while (!debugger_ready_) - ; + debugger_ready_ = 0; + while (!debugger_ready_) + ; + } } } @@ -334,6 +299,10 @@ void Debugger::got_signal(int sig) { else signame = "UNKNOWN SIGNAL"; + for (auto const &action: actions_) { + action(); + } + actions_.clear(); if (traceback_) { traceback(signame); } @@ -403,6 +372,10 @@ void Debugger::__traceback(const std::string &prefix, const char *reason) { std::cout << result.str(nframes_to_skip) << std::endl; } +void Debugger::register_prelaunch_action(std::function action) { + actions_.push_back(action); +} + void create_debugger(const char *cmd, const char *exec, std::int64_t rank) { auto debugger = std::make_shared(); if (cmd) debugger->set_cmd(cmd); diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index 0aadd3927e..f2f284169a 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -291,7 +291,7 @@ class Debugger { bool sleep_; bool wait_for_debugger_; bool handle_sigint_; - int *mysigs_; + std::unique_ptr mysigs_; void init(); @@ -325,11 +325,11 @@ class Debugger { @param reason optional string specifying the reason for traceback */ virtual void traceback(const char *reason); - /// Turn on or off debugging on a signel. The default is on. 
+ /// Turn on or off debugging on a signal. The default is on. virtual void set_debug_on_signal(int); - /// Turn on or off traceback on a signel. The default is on. + /// Turn on or off traceback on a signal. The default is on. virtual void set_traceback_on_signal(int); - /// Turn on or off exit after a signel. The default is on. + /// Turn on or off exit after a signal. The default is on. virtual void set_exit_on_signal(int); /** Turn on or off running an infinite loop after the debugger is started. This loop gives the debugger a chance to attack to the process. @@ -370,7 +370,7 @@ class Debugger { virtual void default_cmd(); /** Set the name of the executable for the current process. It is up to the programmer to set this, even if the Debugger - is initialized with the KeyVal constructor. */ + is initialized with the constructor. */ virtual void set_exec(const char *); /// Called when signal sig is received. This is mainly for internal use. @@ -381,12 +381,22 @@ class Debugger { /// Return the global default debugger. 
static std::shared_ptr default_debugger(); + /// Register a (one-time) action to be executed when debugger is launched + /// @param action an action to be executed + /// @note multiple actions registered via this will be executed in order of + /// their registration + void register_prelaunch_action(std::function action); + private: /// Replaces alias in cmd_ with its full form void resolve_cmd_alias(); + /// Replace macros (\c PID , \c EXEC , \c PREFIX ) in \p cmd by their values + /// \param cmd a string + /// \return processed str std::string replace_macros(std::string cmd); static const std::string gdb_cmd_; static const std::string lldb_cmd_; + std::vector> actions_; // prelaunch actions }; /// Use this to create a Debugger object and make it the default From 2f2df83870cd3e5f0449720512e08609851d7652 Mon Sep 17 00:00:00 2001 From: Samuel Powell Date: Mon, 16 Sep 2024 11:42:46 -0400 Subject: [PATCH 586/592] C-c, C-v --- src/TiledArray/util/bug.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index 57e96c162d..41121b52ca 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -243,7 +243,7 @@ void Debugger::debug(const char *reason) { system_retvalue = system(cmd.c_str()); } if (system_retvalue != 0) { - ExEnv::outn() << prefix_ + std::cout << prefix_ << "Failed debugger launch: system() did not succeed ..." 
<< std::endl; } else { // call to system() succeeded From 99decc18f3c60a41e7877be8112bb664234cdf30 Mon Sep 17 00:00:00 2001 From: Samuel Powell Date: Wed, 18 Sep 2024 12:58:28 -0400 Subject: [PATCH 587/592] Omitted header include --- src/TiledArray/util/bug.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index f2f284169a..38ae55198f 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -30,6 +30,7 @@ #include #include +#include #include #include #include From 31ade880cc894eb12e535a8f2209e44fcf7ff573 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 10 Nov 2024 12:55:46 -0500 Subject: [PATCH 588/592] bug.cpp: use std::{signal,system} --- src/TiledArray/util/bug.cpp | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index 41121b52ca..0105635f37 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -105,14 +105,14 @@ static void handler(int sig) { void Debugger::handle(int sig) { if (sig >= NSIG) return; typedef void (*handler_type)(int); - signal(sig, (handler_type)handler); + std::signal(sig, (handler_type)handler); signals[sig] = this; mysigs_[sig] = 1; } void Debugger::release(int sig) { if (sig >= NSIG) return; - signal(sig, SIG_DFL); + std::signal(sig, SIG_DFL); signals[sig] = nullptr; mysigs_[sig] = 0; } @@ -231,7 +231,6 @@ void Debugger::debug(const char *reason) { std::cout << "no reason given"; std::cout << std::endl; - const std::string cmd = replace_macros(cmd_); // start the debugger // before starting the debugger de-register signal handler for SIGTRAP to @@ -240,13 +239,13 @@ void Debugger::debug(const char *reason) { int system_retvalue = 0; if (!cmd.empty()) { std::cout << prefix_ << "Debugger: starting \"" << cmd << "\"" << std::endl; - system_retvalue = system(cmd.c_str()); + system_retvalue = std::system(cmd.c_str()); } if 
(system_retvalue != 0) { std::cout << prefix_ - << "Failed debugger launch: system() did not succeed ..." - << std::endl; - } else { // call to system() succeeded + << "Failed debugger launch: system() did not succeed ..." + << std::endl; + } else { // call to system() succeeded // wait until the debugger is ready if (sleep_) { std::cout << prefix_ << "Debugger: sleeping " << sleep_ @@ -257,17 +256,17 @@ void Debugger::debug(const char *reason) { std::cout << prefix_ << "Debugger: waiting for the user ..."; if (cmd_.find(" gdb ") != std::string::npos || cmd_.find(" lldb ") != std::string::npos) { - std::cout << - " configure debugging session (set breakpoints/watchpoints, " - "etc.) then type 'c' to continue running"; + std::cout + << " configure debugging session (set breakpoints/watchpoints, " + "etc.) then type 'c' to continue running"; } else if (cmd.empty()) { - std::cout << " attach debugger to process " - << std::to_string(getpid()) + std::cout << " attach debugger to process " << std::to_string(getpid()) << " as follows:" << std::endl - << prefix_ << "Debugger: - if using gdb: " - << replace_macros(gdb_cmd_) << std::endl - << prefix_ << "Debugger: - if using lldb: " - << replace_macros(lldb_cmd_); + << prefix_ + << "Debugger: - if using gdb: " << replace_macros(gdb_cmd_) + << std::endl + << prefix_ + << "Debugger: - if using lldb: " << replace_macros(lldb_cmd_); } std::cout << std::endl; @@ -299,7 +298,7 @@ void Debugger::got_signal(int sig) { else signame = "UNKNOWN SIGNAL"; - for (auto const &action: actions_) { + for (auto const &action : actions_) { action(); } actions_.clear(); From b376b06b9f1301200de5ed0464feaf08ca2f16c0 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Sun, 10 Nov 2024 13:02:24 -0500 Subject: [PATCH 589/592] bug.h: reformat --- src/TiledArray/util/bug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index 38ae55198f..5367497b62 100644 --- 
a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -395,9 +395,10 @@ class Debugger { /// \param cmd a string /// \return processed str std::string replace_macros(std::string cmd); + static const std::string gdb_cmd_; static const std::string lldb_cmd_; - std::vector> actions_; // prelaunch actions + std::vector> actions_; // prelaunch actions }; /// Use this to create a Debugger object and make it the default From abb7886c9a99a9f432d83467710b6d01843193f4 Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Mon, 18 Nov 2024 14:16:37 -0500 Subject: [PATCH 590/592] More efficient contractions involving modes of the outer tensor in ToT involving expressions. --- src/TiledArray/einsum/tiledarray.h | 182 ++++++++++++----------- src/TiledArray/expressions/cont_engine.h | 54 ++++++- src/TiledArray/tensor/tensor.h | 1 + src/TiledArray/tile_op/contract_reduce.h | 1 - 4 files changed, 142 insertions(+), 96 deletions(-) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index 40076ed0ce..e341e646a9 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -420,6 +420,9 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using ResultTensor = typename ArrayC::value_type; using ResultShape = typename ArrayC::shape_type; + auto const& tnsrExprA = A; + auto const& tnsrExprB = B; + auto a = std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); Einsum::Index c = std::get<0>(cs); @@ -536,16 +539,10 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, // the evaluation can be delegated to the expression layer // for distarrays of both nested and non-nested tensor tiles. // *) If no Hadamard indices are present (!h) the evaluation - // can be delegated to the expression _only_ for distarrays with - // non-nested tensor tiles. 
- // This is because even if Hadamard indices are not present, a contracted - // index might be present pertinent to the outer tensor in case of a - // nested-tile distarray, which is especially handled within this - // function because expression layer cannot handle that yet. + // can be delegated to the expression layer. // - if ((h && !(i || e)) // pure Hadamard - || (IsArrayToT && !(i || h)) // ToT result from outer-product - || (IsArrayT && !h)) // T from general product without Hadamard + if ((h && !(i || e)) // pure Hadamard + || !h) // no Hadamard { ArrayC C; C(std::string(c) + inner.c) = A * B; @@ -577,21 +574,6 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, return C; } - // - // when contraction happens in the outer tensor - // need to evaluate specially.. - // - if (IsArrayToT && i.size() > 0) { - auto annot_c = std::string(h + e + i) + inner.c; - auto temp1 = einsum(A, B, idx(annot_c), world); - auto temp2 = reduce_modes(temp1, i.size()); - - auto annot_c_ = std::string(h + e) + inner.c; - decltype(temp2) result; - result(std::string(c) + inner.c) = temp2(annot_c_); - return result; - } - using ::Einsum::index::permutation; using TiledArray::Permutation; @@ -640,79 +622,103 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using Index = Einsum::Index; - if constexpr (AreArraySame && - AreArraySame) { - if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); + } + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } - auto pa = A.permutation; - auto pb = 
B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - ResultTensor tile(TiledArray::Range{batch}, - typename ResultTensor::value_type{}); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) - continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - using Ix = ::Einsum::Index; - if constexpr (AreArrayToT) { - auto aik = ai.batch(k); - auto bik = bi.batch(k); - auto vol = aik.total_size(); - TA_ASSERT(vol == bik.total_size()); - - auto &el = tile({k}); - using TensorT = std::remove_reference_t; - - auto mult_op = [&inner](auto const &l, - auto const &r) -> TensorT { - return inner.h ? 
TA::detail::tensor_hadamard(l, inner.A, r, - inner.B, inner.C) - : TA::detail::tensor_contract( - l, inner.A, r, inner.B, inner.C); - }; - - for (auto i = 0; i < vol; ++i) - el.add_to(mult_op(aik.data()[i], bik.data()[i])); - - } else { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; - } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + using Ix = ::Einsum::Index; + if constexpr (AreArrayToT) { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + using TensorT = std::remove_reference_t; + + auto mult_op = [&inner](auto const &l, auto const &r) -> TensorT { + return inner.h ? 
TA::detail::tensor_hadamard(l, inner.A, r, + inner.B, inner.C) + : TA::detail::tensor_contract(l, inner.A, r, + inner.B, inner.C); + }; + + for (auto i = 0; i < vol; ++i) + el.add_to(mult_op(aik.data()[i], bik.data()[i])); + + } else if constexpr (!AreArraySame) { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + + for (auto i = 0; i < vol; ++i) + if constexpr (IsArrayToT) { + el.add_to(aik.data()[i].scale(bik.data()[i])); + } else { + el.add_to(bik.data()[i].scale(aik.data()[i])); + } + + } else { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; } } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); } - return C.array; + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } + return C.array; } // generalized contraction + if constexpr (IsArrayToT) { + if (inner.C != inner.h + inner.e) { + // when inner tensor permutation is non-trivial (could be potentially + // elided by extending this function (@c einsum) to take into account + // of inner tensor's permutations) + auto temp_annot = std::string(c) + ";" + std::string(inner.h + inner.e); + ArrayC temp = einsum(tnsrExprA, tnsrExprB, + Einsum::idx(temp_annot), world); + ArrayC result; + result(std::string(c) + inner.c) = temp(temp_annot); + return result; + } + } + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), &range_map = std::as_const(range_map)](auto &term) { auto ei = (e + i & term.idx); diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index f0a94c7e05..3d0ef11c10 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -279,25 +279,62 @@ class 
ContEngine : public BinaryEngine { outer_size(left_indices_), outer_size(right_indices_), (!implicit_permute_outer_ ? std::move(outer_perm) : Permutation{})); } else { + + auto make_total_perm = [this]() -> BipartitePermutation { + if (this->product_type() != TensorProduct::Contraction + || this->implicit_permute_inner_) + return this->implicit_permute_outer_ + ? BipartitePermutation() + : BipartitePermutation(outer(this->perm_)); + + // Here, + // this->product_type() is Tensor::Contraction, and, + // this->implicit_permute_inner_ is false + + return this->inner_product_type() == TensorProduct::Scale + ? BipartitePermutation(outer(this->perm_)) + : this->perm_; + }; + + auto total_perm = make_total_perm(); + // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type( left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - (!implicit_permute_outer_ ? std::move(outer_perm) : Permutation{}), + total_perm, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(outer_perm); shape_ = ContEngine_::make_shape(outer_perm); } else { // Initialize non-permuted structure + if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { op_ = op_type(left_op, right_op, factor_, outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_)); } else { + + auto make_total_perm = [this]() -> BipartitePermutation { + if (this->product_type() != TensorProduct::Contraction + || this->implicit_permute_inner_) + return {}; + + // Here, + // this->product_type() is Tensor::Contraction, and, + // this->implicit_permute_inner_ is false + + return this->inner_product_type() == TensorProduct::Scale + ? 
BipartitePermutation(outer(this->perm_)) + : this->perm_; + }; + + auto total_perm = make_total_perm(); + // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->element_nonreturn_op_); + total_perm, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -509,12 +546,15 @@ class ContEngine : public BinaryEngine { inner_size(this->left_indices_), inner_size(this->right_indices_)); this->element_nonreturn_op_ = - [contrreduce_op](result_tile_element_type& result, - const left_tile_element_type& left, - const right_tile_element_type& right) { + [contrreduce_op, permute_inner = this->product_type() != + TensorProduct::Contraction]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { contrreduce_op(result, left, right); - if (!TA::empty(result)) - result = contrreduce_op(result); // permutations of result are applied as "postprocessing" + // permutations of result are applied as "postprocessing" + if (permute_inner && !TA::empty(result)) + result = contrreduce_op(result); }; } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index bd6fb8f3e5..a394594b8e 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -1630,6 +1630,7 @@ class Tensor { template ::value>::type* = nullptr> Tensor add(const Right& right) const& { + if (right.empty()) return *this; return binary( right, [](const value_type& l, const value_t& r) -> decltype(auto) { diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index f0654f1431..2a5e90ea5d 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -332,7 +332,6 @@ class 
ContractReduce : public ContractReduceBase { if constexpr (!ContractReduceBase_::plain_tensors) { TA_ASSERT(this->elem_muladd_op()); - // not yet implemented gemm(result, left, right, ContractReduceBase_::gemm_helper(), this->elem_muladd_op()); } else { // plain tensors From b6614c61a6680bb7de319c7aab506b704fc30aeb Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Nov 2024 08:42:31 -0500 Subject: [PATCH 591/592] Bug fix: do not try to multiply empty tensor(s). --- src/TiledArray/einsum/tiledarray.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index e341e646a9..ace7caa15a 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -664,6 +664,7 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, using TensorT = std::remove_reference_t; auto mult_op = [&inner](auto const &l, auto const &r) -> TensorT { + if (l.empty() || r.empty()) return TensorT{}; return inner.h ? TA::detail::tensor_hadamard(l, inner.A, r, inner.B, inner.C) : TA::detail::tensor_contract(l, inner.A, r, From 42578f79c23c06ae11f35b260ad8f276a9ab2d4c Mon Sep 17 00:00:00 2001 From: Bimal Gaudel Date: Thu, 21 Nov 2024 12:28:32 -0500 Subject: [PATCH 592/592] More zero tensor checking. --- src/TiledArray/tensor/kernels.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h index a2530f2f5d..0b0767ed81 100644 --- a/src/TiledArray/tensor/kernels.h +++ b/src/TiledArray/tensor/kernels.h @@ -417,14 +417,15 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... 
tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto volume = result.range().volume(); - - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { + auto volume = result.total_size(); + for (decltype(volume) ord = 0; ord < volume; ++ord) { + if constexpr (is_tensor_of_tensor_v) + if (((tensors.data()[ord].range().volume() == 0) || ...)) continue; if constexpr (std::is_invocable_r_v) - op(result.at_ordinal(ord), tensors.at_ordinal(ord)...); + op(result.data()[ord], tensors.data()[ord]...); else - inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); + inplace_tensor_op(op, result.data()[ord], tensors.data()[ord]...); } }