diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 143c88f8ea..4c6a097d9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,20 +64,10 @@ jobs: sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV - - name: Prepare ccache timestamp - id: ccache_cache_timestamp - shell: cmake -P {0} - run: | - string(TIMESTAMP current_date "%Y-%m-%d-%H;%M;%S" UTC) - message("::set-output name=timestamp::${current_date}") - - - name: Setup ccache cache files - uses: actions/cache@v1.1.0 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 with: - path: ${{github.workspace}}/build/.ccache - key: ${{ matrix.config.name }}-ccache-${{ steps.ccache_cache_timestamp.outputs.timestamp }} - restore-keys: | - ${{ matrix.config.name }}-ccache- + key: ccache-${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }} - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash @@ -89,7 +79,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash run: | - ccache -p && ccache -z && cmake --build . --target tiledarray && cmake --build . --target examples && ccache -s + ccache -p && ccache -z && cmake --build . --target tiledarray ta_test examples && ccache -s - name: Test working-directory: ${{github.workspace}}/build @@ -97,5 +87,4 @@ jobs: #run: ctest -C $${{matrix.build_type}} run: | source ${{github.workspace}}/ci/openmpi.env - cmake --build . --target ta_test cmake --build . 
--target check-tiledarray diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8b675a692c..33a8d0c9bf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,6 +25,14 @@ before_script: # TODO optimize ta_test build memory consumption - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1} - echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL" + # configure ccache + - export CCACHE_DIR=/root/.ccache + - export CCACHE_COMPRESS=true + - export CCACHE_COMPRESSLEVEL=6 + # print out the ccache configuration + - ccache -p + # zero out the ccache statistics + - ccache -z ubuntu: stage: build @@ -64,3 +72,8 @@ ubuntu: ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] + + +after_script: + # print out the ccache statistics + - ccache -s diff --git a/CMakeLists.txt b/CMakeLists.txt index 101b1b0d16..a130211293 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -299,6 +299,16 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src) ########################## add_custom_target(External-tiledarray) +# ccache is an optional dep but must be found first so that the rest of dependencies can use it +find_program(CCACHE ccache) +if(CCACHE) + mark_as_advanced(CCACHE) + message (STATUS "Found ccache: ${CCACHE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling CUDA") +endif(CCACHE) + # required deps: # 1. derive runtime (CUDA/HIP/...) first since others may depend on it if(ENABLE_CUDA) @@ -336,15 +346,7 @@ if(ENABLE_SCALAPACK) include(external/scalapackpp.cmake) endif() -# optional deps: -# 1. 
ccache -find_program(CCACHE ccache) -if(CCACHE) - mark_as_advanced(CCACHE) - message (STATUS "Found ccache: ${CCACHE}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") - set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") -endif(CCACHE) +# other optional deps: # 2. TTG # N.B. make sure TA configures MADNESS correctly #if (TA_TTG) diff --git a/INSTALL.md b/INSTALL.md index 0e573bb050..ed0ba5046c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -41,9 +41,9 @@ Both methods are supported. However, for most users we _strongly_ recommend to b - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* - [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4b3757cc2b5862f93589afc1e37523e543779c7a . If usable BTAS installation is not found, TiledArray will download and compile +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 93a9a5cec2a8fa87fba3afe8056607e6062a9058 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. @@ -69,7 +69,7 @@ Optional prerequisites: - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. 
CUDA 11 or later is required. - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag v2024.02.1). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 8c85866107f78a58403e20a2ae8e1f24c9852287). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). - [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. 
If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite diff --git a/external/librett.cmake b/external/librett.cmake index afebabb486..5eca3314ce 100644 --- a/external/librett.cmake +++ b/external/librett.cmake @@ -98,6 +98,13 @@ else() "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND LIBRETT_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() + if (BUILD_SHARED_LIBS) set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) diff --git a/external/umpire.cmake b/external/umpire.cmake index 37152e98d2..5b7a4f4078 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -152,6 +152,13 @@ else() ) endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND UMPIRE_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() + if (BUILD_SHARED_LIBS) set(UMPIRE_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) @@ -170,8 +177,6 @@ else() DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} GIT_REPOSITORY ${UMPIRE_URL} GIT_TAG ${UMPIRE_TAG} - #--Patch step----------------- - PATCH_COMMAND patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/external/umpire.finalize_io.patch #--Configure step------------- SOURCE_DIR ${EXTERNAL_SOURCE_DIR} LIST_SEPARATOR :: @@ -218,6 +223,8 @@ else() "$;$;$;$;$;$;$" INTERFACE_LINK_LIBRARIES "$;$" + INTERFACE_COMPILE_DEFINITIONS + FMT_HEADER_ONLY=1 ) install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) diff --git a/external/umpire.finalize_io.patch 
b/external/umpire.finalize_io.patch deleted file mode 100644 index fa78727d7f..0000000000 --- a/external/umpire.finalize_io.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/src/umpire/util/io.cpp b/src/umpire/util/io.cpp -index 806fb9e3..551c5e82 100644 ---- a/src/umpire/util/io.cpp -+++ b/src/umpire/util/io.cpp -@@ -52,10 +52,23 @@ std::ostream& error() - - namespace util { - -+namespace detail { -+OutputBuffer& s_log_buffer_accessor() -+{ -+ static OutputBuffer buffer; -+ return buffer; -+} -+OutputBuffer& s_error_buffer_accessor() -+{ -+ static OutputBuffer buffer; -+ return buffer; -+} -+} -+ - void initialize_io(const bool enable_log) - { -- static util::OutputBuffer s_log_buffer; -- static util::OutputBuffer s_error_buffer; -+ OutputBuffer& s_log_buffer = detail::s_log_buffer_accessor(); -+ OutputBuffer& s_error_buffer = detail::s_error_buffer_accessor(); - - s_log_buffer.setConsoleStream(nullptr); - s_error_buffer.setConsoleStream(&std::cerr); -@@ -121,6 +134,16 @@ void initialize_io(const bool enable_log) - MPI::logMpiInfo(); - } - -+void finalize_io() -+{ -+ detail::s_log_buffer_accessor().sync(); -+ detail::s_log_buffer_accessor().setConsoleStream(nullptr); -+ detail::s_log_buffer_accessor().setFileStream(nullptr); -+ detail::s_error_buffer_accessor().sync(); -+ detail::s_error_buffer_accessor().setConsoleStream(nullptr); -+ detail::s_error_buffer_accessor().setFileStream(nullptr); -+} -+ - void flush_files() - { - log().flush(); diff --git a/external/versions.cmake b/external/versions.cmake index 3363908bf3..d9d47a3bf2 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -11,19 +11,19 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 
96ac90e8f193ccfaf16f346b4652927d2d362e75) +set(TA_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) +set(TA_TRACKED_BTAS_TAG 1cfcb12647c768ccd83b098c64cda723e1275e49) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG v2024.02.1) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG 20839b2e8e8972070dd8f75c7f00d50d6c399716) +set(TA_TRACKED_UMPIRE_TAG 8c85866107f78a58403e20a2ae8e1f24c9852287) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2024.02.1) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) diff --git a/python/src/TiledArray/python/array.h b/python/src/TiledArray/python/array.h index 782846df4c..e3cc1c79b7 100644 --- a/python/src/TiledArray/python/array.h +++ b/python/src/TiledArray/python/array.h @@ -208,7 +208,7 @@ void make_array_class(py::object m, const char *name) { py::return_value_policy::reference) .def_property_readonly("trange", &array::trange) .def_property_readonly("shape", &array::shape) - .def("fill", &Array::fill, py::arg("value"), + .def("fill", &Array::template fill<>, py::arg("value"), py::arg("skip_set") = false) .def("init", &array::init_tiles) // Array object needs be alive while iterator is used */ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 80f2a49710..a16c05d0b2 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -225,7 +225,7 @@ if(HIP_FOUND OR CUDA_FOUND) TiledArray/external/cuda.h TiledArray/device/cpu_cuda_vector.h) 
endif(CUDA_FOUND) -endif(CUDA_FOUND OR HIP_FOUND) +endif(HIP_FOUND OR CUDA_FOUND) set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h index 92680722cf..7d5b59d7c1 100644 --- a/src/TiledArray/array_impl.h +++ b/src/TiledArray/array_impl.h @@ -198,6 +198,17 @@ std::ostream& operator<<(std::ostream& os, const TileConstReference& a) { return os; } +/// Callback used to update counter (typically, task counter) +template +struct IncrementCounter : public madness::CallbackInterface { + AtomicInt& counter; + IncrementCounter(AtomicInt& counter) : counter(counter) {} + void notify() override { + ++counter; + delete this; + } +}; + } // namespace detail } // namespace TiledArray @@ -773,20 +784,24 @@ class ArrayImpl : public TensorImpl, /// \tparam Op The type of the functor/function /// \param[in] op The operation used to generate tiles /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not set. Strong throw /// guarantee. /// \throw TiledArray::Exception if a tile is already set and skip_set is /// false. Weak throw guarantee. - template - void init_tiles(Op&& op, bool skip_set = false) { + template + std::int64_t init_tiles(Op&& op, bool skip_set = false) { // lifetime management of op depends on whether it is a lvalue ref (i.e.
has // an external owner) or an rvalue ref // - if op is an lvalue ref: pass op to tasks // - if op is an rvalue ref pass make_shared_function(op) to tasks auto op_shared_handle = make_op_shared_handle(std::forward(op)); + std::int64_t ntiles_initialized{0}; auto it = this->pmap()->begin(); const auto end = this->pmap()->end(); + std::atomic ntask_completed{0}; for (; it != end; ++it) { const auto& index = *it; if (!this->is_zero(index)) { @@ -795,19 +810,39 @@ class ArrayImpl : public TensorImpl, if (fut.probe()) continue; } if constexpr (Exec == HostExecutor::MADWorld) { - Future tile = this->world().taskq.add( - [this_sptr = this->shared_from_this(), - index = ordinal_type(index), op_shared_handle]() -> value_type { + Future tile = + this->world().taskq.add([this_sptr = this->shared_from_this(), + index = ordinal_type(index), + op_shared_handle, this]() -> value_type { return op_shared_handle( this_sptr->trange().make_tile_range(index)); }); + ++ntiles_initialized; + if constexpr (fence == Fence::Local) { + tile.register_callback( + new IncrementCounter( + ntask_completed)); + } set(index, std::move(tile)); } else { static_assert(Exec == HostExecutor::Thread); set(index, op_shared_handle(this->trange().make_tile_range(index))); + ++ntiles_initialized; } } } + + if constexpr (fence == Fence::Local) { + if constexpr (Exec == HostExecutor::MADWorld) { + if (ntiles_initialized > 0) + this->world().await([&ntask_completed, ntiles_initialized]() { + return ntask_completed == ntiles_initialized; + }); + } + } else if constexpr (fence == Fence::Global) { + this->world().gop.fence(); + } + return ntiles_initialized; } }; // class ArrayImpl diff --git a/src/TiledArray/conversions/foreach.h b/src/TiledArray/conversions/foreach.h index 20f2d36ec3..2c77c91a0f 100644 --- a/src/TiledArray/conversions/foreach.h +++ b/src/TiledArray/conversions/foreach.h @@ -283,11 +283,10 @@ inline std:: arg.trange().tiles_range(), 0); // Construct the task function used to construct the result 
tiles. - madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto op_shared_handle = make_op_shared_handle(std::forward(op)); - const auto task = [op_shared_handle, &counter, &tile_norms]( + const auto task = [op_shared_handle, &tile_norms]( const ordinal_type ord, const_if_t& arg_tile, const ArgTiles&... arg_tiles) -> result_value_type { @@ -295,7 +294,6 @@ inline std:: auto result_tile = op_caller(std::move(op_shared_handle), tile_norms.at_ordinal(ord), arg_tile, arg_tiles...); - ++counter; return result_tile; }; @@ -310,7 +308,9 @@ inline std:: continue; auto result_tile = world.taskq.add(task, ord, arg.find_local(ord), args.find(ord)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, use the (scaled) // norms of the first arg @@ -324,7 +324,9 @@ inline std:: auto result_tile = world.taskq.add(task, ord, detail::get_sparse_tile(ord, arg), detail::get_sparse_tile(ord, args)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, find max // (scaled) norms of all args @@ -339,9 +341,10 @@ inline std:: } // Wait for tile norm data to be collected. 
- if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_created == ntask_completed; + }); // Construct the new array result_array_type result( diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index 6f5ada0bba..1295e6f8e4 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -26,6 +26,7 @@ #ifndef TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED #define TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED +#include "TiledArray/array_impl.h" #include "TiledArray/external/madness.h" #include "TiledArray/shape.h" #include "TiledArray/type_traits.h" @@ -79,6 +80,10 @@ inline Array make_array( // Make an empty result array Array result(world, trange); + // Counters used to detect completion of the tile-construction tasks. + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; + // Iterate over local tiles of arg for (const auto index : *result.pmap()) { // Spawn a task to evaluate the tile @@ -89,11 +94,20 @@ inline Array make_array( return tile; }, trange.make_tile_range(index)); - + ++ntask_created; + tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); // Store result tile - result.set(index, tile); + result.set(index, std::move(tile)); } + // Wait for tile tasks to complete + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); + return result; } @@ -150,26 +164,28 @@ inline Array make_array( trange.tiles_range(), 0); // Construct the task function used to construct the result tiles.
- madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto task = [&](const ordinal_type index) -> value_type { value_type tile; tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); - ++counter; return tile; }; for (const auto index : *pmap) { auto result_tile = world.taskq.add(task, index); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); tiles.emplace_back(index, std::move(result_tile)); } // Wait for tile norm data to be collected. - if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); // Construct the new array Array result(world, trange, diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index 8ed2c8b043..cb9d094f34 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -906,23 +906,29 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill_local(const element_type& value = element_type(), - bool skip_set = false) { - init_tiles( + template + std::int64_t fill_local(const element_type& value = element_type(), + bool skip_set = false) { + return init_tiles( [value](const range_type& range) { return value_type(range, value); }, skip_set); } /// Fill all local tiles with the specified value + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \param[in] value What each local tile should be filled with. 
/// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is uninitialized. Strong throw /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill(const element_type& value = numeric_type(), bool skip_set = false) { - fill_local(value, skip_set); + template + std::int64_t fill(const element_type& value = numeric_type(), + bool skip_set = false) { + return fill_local(value, skip_set); } /// Fill all local tiles with random values @@ -934,18 +940,21 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// generate random values of type T this function will be disabled via SFINAE /// and attempting to use it will lead to a compile-time error. /// + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam T The type of random value to generate. Defaults to /// element_type. /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already initialized. Weak throw guarantee. 
template > - void fill_random(bool skip_set = false) { - init_elements( + std::int64_t fill_random(bool skip_set = false) { + return init_elements( [](const auto&) { return detail::MakeRandom::generate_value(); }); } @@ -978,6 +987,8 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// return tile; /// }); /// \endcode + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam Op The type of the functor/function /// \param[in] op The operation used to generate tiles /// \param[in] skip_set If false, will throw if any tiles are already set @@ -985,9 +996,11 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if a tile is already set and skip_set is /// false. Weak throw guarantee. - template - void init_tiles(Op&& op, bool skip_set = false) { - impl_ref().template init_tiles(std::forward(op), skip_set); + template + std::int64_t init_tiles(Op&& op, bool skip_set = false) { + return impl_ref().template init_tiles(std::forward(op), + skip_set); } /// Initialize elements of local, non-zero tiles with a user provided functor @@ -1009,15 +1022,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \tparam Op Type of the function/functor which will generate the elements. /// \param[in] op The operation used to generate elements /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarnatee. /// \throw TiledArray::Exception if skip_set is false and a local, non-zero /// tile is already initialized. Weak throw /// guarantee. 
- template - void init_elements(Op&& op, bool skip_set = false) { + template + std::int64_t init_elements(Op&& op, bool skip_set = false) { auto op_shared_handle = make_op_shared_handle(std::forward(op)); - init_tiles( + return init_tiles( [op = std::move(op_shared_handle)]( const TiledArray::Range& range) -> value_type { // Initialize the tile with the given range object diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 652b835fab..e33aea5c18 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -203,6 +203,14 @@ using Array enum class HostExecutor { Thread, MADWorld, Default = MADWorld }; +/// fence types +enum class Fence { + Global, //!< global fence (`world.gop.fence()`) + Local, //!< local fence (all local work done, equivalent to + //!< `world.taskq.fence()` in the absence of active messages) + No //!< no fence +}; + namespace conversions { /// user defined conversions diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h index d60b23db94..eac0c65e92 100644 --- a/src/TiledArray/special/diagonal_array.h +++ b/src/TiledArray/special/diagonal_array.h @@ -157,7 +157,8 @@ std::enable_if_t::value, void> write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { using Tile = typename Array::value_type; - A.init_tiles( + // N.B.
Fence::Local ensures lifetime of the diagonals range + A.template init_tiles( // Task to create each tile [diagonals_begin](const Range &rng) { // Compute range of diagonal elements in the tile @@ -221,7 +222,6 @@ diagonal_array(World &world, TiledRange const &trange, if constexpr (is_dense_v) { Array A(world, trange); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); - A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } else { // Compute shape and init the Array @@ -231,7 +231,6 @@ diagonal_array(World &world, TiledRange const &trange, ShapeType shape(shape_norm, trange); Array A(world, trange, shape); detail::write_diag_tiles_to_array_rng(A, diagonals_begin); - A.world().taskq.fence(); // ensure tasks outlive the diagonals_begin view return A; } abort(); // unreachable