Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into kmp5/experimental/u…
Browse files Browse the repository at this point in the history
…pkeep_btas
  • Loading branch information
kmp5VT committed Sep 28, 2024
2 parents 0179033 + ec51edb commit 307f517
Show file tree
Hide file tree
Showing 90 changed files with 3,697 additions and 1,619 deletions.
49 changes: 35 additions & 14 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,26 @@ jobs:
strategy:
fail-fast: false
matrix:
os : [ macos-latest ]
cxx : [ clang++, /opt/homebrew/bin/g++-11 ]
os : [ macos-latest, ubuntu-22.04 ]
build_type : [ Release, Debug ]
task_backend: [ Pthreads, PaRSEC ]
prerequisites : [ gcc@11 boost eigen open-mpi bison scalapack ]
include:
- os: ubuntu-22.04
cc: /usr/bin/gcc-12
cxx: /usr/bin/g++-12
- os: macos-latest
cc: clang
cxx: clang++

name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }} ${{ matrix.task_backend }}"
runs-on: ${{ matrix.os }}
env:
CXX : ${{ matrix.cxx }}
CCACHE_DIR : ${{github.workspace}}/build/.ccache
CCACHE_COMPRESS : true
CCACHE_COMPRESSLEVEL : 6
OMPI_MCA_btl_vader_single_copy_mechanism : none
PARSEC_MCA_runtime_bind_threads : 0
BUILD_CONFIG : >
-DMADNESS_TASK_BACKEND=${{ matrix.task_backend }}
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
Expand All @@ -33,37 +43,48 @@ jobs:
steps:
- uses: actions/checkout@v2

- uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: 'latest-stable'

- name: Host system info
shell: bash
run: cmake -P ${{github.workspace}}/ci/host_system_info.cmake

- name: Install ${{matrix.prerequisites}}

- name: Install prerequisite MacOS packages
if: ${{ matrix.os == 'macos-latest' }}
run: |
brew install ${{matrix.prerequisites}}
echo "/usr/local/opt/bison/bin" >> $GITHUB_PATH
brew install ninja boost eigen open-mpi bison scalapack ccache
echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV
- name: Install prerequisites Ubuntu packages
if: ${{ matrix.os == 'ubuntu-22.04' }}
run: |
wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main"
sudo apt-get update
sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen
sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so
echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV
- name: Setup ccache
uses: hendrikmuhs/ccache-action@v1.2
with:
key: ccache-${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }}

- name: "Configure build: ${{ env.BUILD_CONFIG }}"
shell: bash
run: |
set -x;
cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log)
cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml)
- name: Build
working-directory: ${{github.workspace}}/build
shell: bash
run: |
cmake --build . --target tiledarray
cmake --build . --target examples
ccache -p && ccache -z && cmake --build . --target tiledarray ta_test examples && ccache -s
- name: Test
working-directory: ${{github.workspace}}/build
shell: bash
#run: ctest -C $${{matrix.build_type}}
run: |
source ${{github.workspace}}/ci/openmpi.env
cmake --build . --target ta_test
cmake --build . --target check-tiledarray
32 changes: 16 additions & 16 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,18 @@ before_script:
# TODO optimize ta_test build memory consumption
- export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1}
- echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL"
# configure ccache
- export CCACHE_DIR=/root/.ccache
- export CCACHE_COMPRESS=true
- export CCACHE_COMPRESSLEVEL=6
# print out the ccache configuration
- ccache -p
# zero out the ccache statistics
- ccache -z

ubuntu:
stage: build
tags:
- docker
- ${RUNNER_TAGS}
timeout: 3h
image: valeevgroup/${IMAGE}
Expand Down Expand Up @@ -58,22 +65,15 @@ ubuntu:
metrics: build/metrics.txt
parallel:
matrix:
- IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ]
- IMAGE : [ "ubuntu:22.04" ]
CXX: [ g++ ]
BUILD_TYPE : [ "Release" ]
BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ]
BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ]
# ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL
RUNNER_TAGS: [ linux ]
- IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ]
CXX: [ g++, clang++-13 ]
BUILD_TYPE : [ "Release", "Debug" ]
ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ]
RUNNER_TAGS: [ linux ]
- IMAGE : [ "ubuntu:22.04", "ubuntu:20.04" ]
CXX: [ g++ ]
BUILD_TYPE : [ "Release", "Debug" ]
BUILD_TYPE : [ "RelWithDebInfo" ]
TA_PYTHON : [ "TA_PYTHON=OFF" ]
ENABLE_CUDA : [ "ENABLE_CUDA=ON" ]
TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ]
RUNNER_TAGS: [ cuda ]


after_script:
# print out the ccache statistics
- ccache -s
30 changes: 13 additions & 17 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,6 @@ option(TA_TRACE_GLOBAL_COMM_STATS "Enable tracing of communication stats of glob
add_feature_info(TASK_TRACE_DEBUG TA_TRACE_GLOBAL_COMM_STATS "Debug communication stats of global objects (DistEval's and DIstributedStorage) TiledArray")
set(TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ${TA_TRACE_GLOBAL_COMM_STATS})

option(TA_RANGEV3 "Enable Range-V3 library" OFF)
add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library")

option(TA_TTG "Enable search/build of TTG library" OFF)
add_feature_info(TA_TTG TA_TTG "TTG library")

Expand Down Expand Up @@ -302,6 +299,16 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
##########################
add_custom_target(External-tiledarray)

# ccache is an optional dep but must be found first so that the rest of dependencies can use it
find_program(CCACHE ccache)
if(CCACHE)
mark_as_advanced(CCACHE)
message (STATUS "Found ccache: ${CCACHE}")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++")
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C")
set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling CUDA")
endif(CCACHE)

# required deps:
# 1. derive runtime (CUDA/HIP/...) first since others may depend on it
if(ENABLE_CUDA)
Expand All @@ -310,6 +317,7 @@ endif()
if(ENABLE_HIP)
include(external/hip.cmake)
endif()
include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake)
include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake)
if (TA_TTG)
include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake)
Expand Down Expand Up @@ -338,20 +346,8 @@ if(ENABLE_SCALAPACK)
include(external/scalapackpp.cmake)
endif()

# optional deps:
# 1. ccache
find_program(CCACHE ccache)
if(CCACHE)
mark_as_advanced(CCACHE)
message (STATUS "Found ccache: ${CCACHE}")
set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++")
set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C")
endif(CCACHE)
# 2. range-v3
if (TA_RANGEV3)
include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake)
endif(TA_RANGEV3)
# 3. TTG
# other optional deps:
# 2. TTG
# N.B. make sure TA configures MADNESS correctly
#if (TA_TTG)
# include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake)
Expand Down
9 changes: 4 additions & 5 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
- Boost.Container: header-only
- Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing*
- Boost.Range: header-only, *only used for unit testing*
- [BTAS](http://github.com/ValeevGroup/BTAS), tag 4e8f5233aa7881dccdfcc37ce07128833926d3c2 . If usable BTAS installation is not found, TiledArray will download and compile
- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later.
- [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile
BTAS from source. *This is the recommended way to compile BTAS for all users*.
- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 8788aea9758bfe6479cc23d39e6c77b7528009db .
- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 95589b0d020a076f93d02eead6da654b23dd3d91 .
Only the MADworld runtime and the BLAS/LAPACK C API components of MADNESS are used by TiledArray.
If a usable MADNESS installation is not found, TiledArray will download and compile
MADNESS from source. *This is the recommended way to compile MADNESS for all users*.
Expand All @@ -68,13 +69,12 @@ Optional prerequisites:
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required.
- [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably.
- [LibreTT](https://github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](https://github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via https://github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](https://github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece).
- [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 20839b2e8e8972070dd8f75c7f00d50d6c399716).
- [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 8c85866107f78a58403e20a2ae8e1f24c9852287).
- [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
- [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
- [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite
- [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS
- Python3 interpreter -- to test (optionally-built) Python bindings
- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards.
- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 3fe4a06dbf4b05091269488aab38223da1f8cb8e).

Many of these dependencies can be installed with a package manager,
Expand Down Expand Up @@ -416,7 +416,6 @@ support may be added.
* `TA_ASSERT_POLICY` -- Set to `TA_ASSERT_IGNORE` to disable `TA_ASSERT` assertions, `TA_ASSERT_THROW` to cause `TA_ASSERT` assertions to throw, `TA_ASSERT_ABORT` to cause `TA_ASSERT` assertions to abort. The default is `TA_ASSERT_IGNORE` if CMake uses a single-configuration generator and `CMAKE_BUILD_TYPE` is set to `Release` or `MinSizeRel`, else the default is `TA_ASSERT_THROW`.
* `BUILD_TESTING` -- Set to `OFF` to disable building unit tests. The default is `ON`.
* `TA_TRACE_TASKS` -- Set to `ON` to enable tracing of MADNESS tasks using custom task tracer. Note that standard profilers/tracers are generally useless (except in the trivial cases) with MADWorld-based programs since the submission context of tasks is not captured by standard tracing tools; this makes it impossible in a nontrivial program to attribute tasks to source code. WARNING: task tracing will greatly increase the memory requirements. [Default=OFF].
* `TA_RANGEV3` -- Set to `ON` to find or fetch the Range-V3 library and enable additional tests of TA components with constructs anticipated to be supported in the future. [Default=OFF].
* `TA_TTG` -- Set to `ON` to find or fetch the TTG library. [Default=OFF].
* `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates.
* `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`.
Expand Down
5 changes: 5 additions & 0 deletions cmake/tiledarray-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ include(CMakeFindDependencyMacro)

@Boost_CONFIG_FILE_CONTENTS@

if (NOT TARGET range-v3::range-v3)
get_filename_component(range-v3_DIR "@range-v3_CONFIG@" DIRECTORY)
find_dependency(range-v3 QUIET REQUIRED HINTS "${range-v3_DIR}")
endif(NOT TARGET range-v3::range-v3)

if (NOT TARGET BTAS::BTAS)
get_filename_component(BTAS_DIR "@BTAS_CONFIG@" DIRECTORY)
find_dependency(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}")
Expand Down
4 changes: 2 additions & 2 deletions examples/device/ta_cc_abcd_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,8 +182,8 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
const double flops_per_fma =
(complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
std::pow(n_uocc, 4) / std::pow(1024., 3);
const double n_gflop =
flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;

using deviceTile =
btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
Expand Down
45 changes: 17 additions & 28 deletions examples/device/ta_dense_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,45 +36,41 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
using RT = TiledArray::detail::scalar_t<Storage>;
constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;

const std::size_t Tm = Nm / Bm;
const std::size_t Tn = Nn / Bn;
const std::size_t Tk = Nk / Bk;

const std::int64_t nflops =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real
// 1 add takes 2/1 flops for complex/real
* static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
static_cast<std::int64_t>(Nk);

// Construct TiledRange
std::vector<unsigned int> blocking_m;
for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
const std::size_t Tm = blocking_m.size() - 1;

std::vector<unsigned int> blocking_n;
for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
const std::size_t Tn = blocking_n.size() - 1;

std::vector<unsigned int> blocking_k;
for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
const std::size_t Tk = blocking_k.size();

if (world.rank() == 0)
std::cout << "TiledArray: dense matrix multiply test...\n"
<< "Number of nodes = " << world.size()
<< "\nSize of A = " << Nm << "x" << Nk << " ("
<< double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of A block = " << Bm << "x" << Bk
<< "\nSize of (largest) A block = " << Bm << "x" << Bk
<< "\nSize of B = " << Nk << "x" << Nn << " ("
<< double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of B block = " << Bk << "x" << Bn
<< "\nSize of (largest) B block = " << Bk << "x" << Bn
<< "\nSize of C = " << Nm << "x" << Nn << " ("
<< double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
<< "\nSize of C block = " << Bm << "x" << Bn
<< "\nSize of (largest) C block = " << Bm << "x" << Bn
<< "\n# of blocks of C = " << Tm * Tn
<< "\nAverage # of blocks of C/node = "
<< double(Tm * Tn) / double(world.size()) << "\n";

// Construct TiledRange
std::vector<unsigned int> blocking_m;
blocking_m.reserve(Tm + 1);
for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);

std::vector<unsigned int> blocking_n;
blocking_n.reserve(Tn + 1);
for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);

std::vector<unsigned int> blocking_k;
blocking_k.reserve(Tk + 1);
for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);

// Structure of c
std::vector<TiledArray::TiledRange1> blocking_C;
blocking_C.reserve(2);
Expand Down Expand Up @@ -255,11 +251,6 @@ int try_main(int argc, char **argv) {
std::cerr << "Error: block sizes must be greater than zero.\n";
return 1;
}
if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) {
std::cerr
<< "Error: dimension size must be evenly divisible by block size.\n";
return 1;
}
const long nrepeat = (argc >= 8 ? atol(argv[7]) : 5);
if (nrepeat <= 0) {
std::cerr << "Error: number of repetitions must be greater than zero.\n";
Expand Down Expand Up @@ -324,9 +315,7 @@ int try_main(int argc, char **argv) {
std::cout << "error(GetDeviceProperties) = " << error << std::endl;
}
std::cout << "Device #" << device_id << ": " << prop.name << std::endl
<< " managedMemory = " << prop.managedMemory << std::endl
<< " singleToDoublePrecisionPerfRatio = "
<< prop.singleToDoublePrecisionPerfRatio << std::endl;
<< " managedMemory = " << prop.managedMemory << std::endl;
int result;
error = TiledArray::device::deviceGetAttribute(
&result, TiledArray::device::DevAttrUnifiedAddressing, device_id);
Expand Down
Loading

0 comments on commit 307f517

Please sign in to comment.