
Merge pull request #418 from ValeevGroup/evaleev/feature/hip
implements HIP/ROCm support
evaleev authored Sep 25, 2023
2 parents 881bb5a + 604e119 commit 4fbb104
Showing 69 changed files with 3,150 additions and 3,298 deletions.
11 changes: 6 additions & 5 deletions CMakeLists.txt
@@ -130,10 +130,8 @@ add_feature_info(TBB ENABLE_TBB "Intel Thread-Building Blocks (TBB) supports pro
option(ENABLE_CUDA "Enable use of CUDA with TiledArray" OFF)
add_feature_info(CUDA ENABLE_CUDA "NVIDIA CUDA support for GPU")

if(ENABLE_CUDA)
option(ENABLE_CUDA_ERROR_CHECK "TiledArray will always check errors in CUDA calls" ON)
add_feature_info(CUDA_ERROR_CHECK ENABLE_CUDA_ERROR_CHECK "Checks CUDA Error")
endif()
option(ENABLE_HIP "Enable use of HIP with TiledArray" OFF)
add_feature_info(HIP ENABLE_HIP "AMD HIP/ROCm support for GPU")

option(ENABLE_GPERFTOOLS "Enable linking with Gperftools" OFF)
add_feature_info(GPERFTOOLS ENABLE_GPERFTOOLS "Google Performance Tools provide fast memory allocation and performance profiling")
@@ -306,10 +304,13 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src)
add_custom_target(External-tiledarray)

# required deps:
# 1. CUDA first since others may depend on it
# 1. derive runtime (CUDA/HIP/...) first since others may depend on it
if(ENABLE_CUDA)
include(external/cuda.cmake)
endif()
if(ENABLE_HIP)
include(external/hip.cmake)
endif()
if (TA_TTG)
include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake)
endif(TA_TTG)
18 changes: 11 additions & 7 deletions INSTALL.md
@@ -32,7 +32,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b

See the current [Travis CI matrix](.travis.yml) for the most up-to-date list of compilers that are known to work.

- [CMake](https://cmake.org/), version 3.15 or higher; if CUDA support is needed, CMake 3.18 or higher is required.
- [CMake](https://cmake.org/), version 3.15 or higher; if {CUDA,HIP} support is needed, CMake {3.18,3.21} or higher is required.
- [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub)
- [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing)
- [Boost libraries](www.boost.org/), version 1.59 or higher (will be downloaded automatically, if missing). The following principal Boost components are used:
@@ -63,8 +63,11 @@ Compiling BTAS requires the following prerequisites:
- BLAS and LAPACK libraries

Optional prerequisites:
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing:
- [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag f5ebdbbba9c9689aa4613a5469021db2dacd8e46).
- for execution on GPGPUs:
- device programming runtime:
- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required.
- [HIP/ROCm compiler and runtime](https://rocm.docs.amd.com/) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but via its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP are used interchangeably.
- [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece).
- [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82).
- [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later).
- [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing:
@@ -323,17 +326,18 @@ To discover and configure the use of Intel MKL consider these suggestions:

Also note that even if the OpenMP or TBB backend is used, TiledArray will by default set the number of threads used by MKL kernels to 1, regardless of the values of the environment variables `MKL_NUM_THREADS`/`OMP_NUM_THREADS`. The number of threads can be changed programmatically in your application by calling the MKL function `mkl_set_num_threads()`.

## CUDA
## GPGPU support

Support for execution on CUDA-enabled hardware is controlled by the following variables:
Support for execution on NVIDIA and AMD GPGPUs is controlled by the following variables:

* `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF].
* `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.).
* `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF].
* `ENABLE_HIP` -- Set to `ON` to turn on HIP/ROCm support. [Default=OFF].
* `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should not normally be needed; it is strongly recommended to let TiledArray build and install LibreTT.
* `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should not normally be needed; it is strongly recommended to let TiledArray build and install Umpire.

For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info.
- For the CUDA compiler and toolkit to be discoverable, the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info.
- For the ROCm platform to be discoverable, add its prefix path (e.g., `/opt/rocm`) to `CMAKE_PREFIX_PATH`, as sketched below.
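
For example, a HIP build might be configured with a CMake cache-preload script like the following minimal sketch (passed via `cmake -C`); the `/opt/rocm` prefix is an assumption and should match the local ROCm installation:

```cmake
# hip-config.cmake -- hypothetical cache-preload script, used as:
#   cmake -C hip-config.cmake <source-dir>
# Assumes ROCm is installed under /opt/rocm; adjust for your system.
set(ENABLE_HIP ON CACHE BOOL "Enable use of HIP with TiledArray")
set(CMAKE_PREFIX_PATH /opt/rocm CACHE PATH "Prefix path used to discover the ROCm platform")
```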

## Eigen 3

10 changes: 8 additions & 2 deletions cmake/modules/FindOrFetchBTAS.cmake
@@ -17,8 +17,14 @@ if (NOT TARGET BTAS::BTAS)
set(_linalgpp_use_standard_linalg_kits TRUE)
endif(DEFINED BLA_VENDOR)

if (NOT TILEDARRAY_HAS_CUDA)
# tell BLAS++/LAPACK++ to ignore CUDA
if (TILEDARRAY_HAS_CUDA)
# tell BLAS++/LAPACK++ to also look for CUDA
set(gpu_backend cuda CACHE STRING "The device backend to use for Linalg++")
elseif (TILEDARRAY_HAS_HIP)
# tell BLAS++/LAPACK++ to also look for HIP
set(gpu_backend hip CACHE STRING "The device backend to use for Linalg++")
else ()
# tell BLAS++/LAPACK++ to not look for device backends
set(gpu_backend none CACHE STRING "The device backend to use for Linalg++")
endif()

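In the hunk above, `gpu_backend` is set as a CACHE variable without `FORCE`, so a value already present in the cache takes precedence. A user could therefore pre-set it to make BLAS++/LAPACK++ skip device backends even when CUDA or HIP is enabled; a hypothetical override (a sketch, not part of this PR):

```cmake
# Because FindOrFetchBTAS.cmake uses set(... CACHE ...) without FORCE,
# pre-populating the cache -- e.g. via `cmake -Dgpu_backend=none ...` or a
# -C preload script containing this line -- makes its later set() calls no-ops.
set(gpu_backend none CACHE STRING "The device backend to use for Linalg++")
```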
1 change: 1 addition & 0 deletions doc/devsamp/wiki/user-guide-2.cpp
@@ -36,6 +36,7 @@ TA::Tensor<T> make_tile2(const TA::Range& range, const double v) {

// Fill array x with value v
void init_array(TA::TArrayD& x, const double v) {
using std::begin, std::end;
// Add local tiles to x
for (auto it = begin(x); it != end(x); ++it) {
// Construct a tile using a MADNESS task.
2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
@@ -29,7 +29,7 @@ endif()

# Add Subdirectories
add_subdirectory (cc)
add_subdirectory (cuda)
add_subdirectory (device)
add_subdirectory (dgemm)
add_subdirectory (demo)
add_subdirectory (scalapack)
47 changes: 0 additions & 47 deletions examples/cuda/cuda_librett.cpp

This file was deleted.

14 changes: 7 additions & 7 deletions examples/cuda/CMakeLists.txt → examples/device/CMakeLists.txt
@@ -23,14 +23,14 @@
#


if(CUDA_FOUND)
if(CUDA_FOUND OR HIP_FOUND)

foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda)
foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device)

# Add executable
add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
add_dependencies(examples-tiledarray ${_exec})
# Add executable
add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray")
add_dependencies(examples-tiledarray ${_exec})

endforeach()
endforeach()

endif(CUDA_FOUND)
endif()
70 changes: 29 additions & 41 deletions examples/cuda/cuda_task.cpp → examples/device/device_task.cpp
@@ -2,8 +2,10 @@
// Created by Chong Peng on 11/14/18.
//

#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/cuda/cuda_task_fn.h>
#include <TiledArray/device/blas.h>
#include <TiledArray/device/btas_um_tensor.h>
#include <TiledArray/device/device_task_fn.h>

#include <tiledarray.h>

using value_type = double;
@@ -26,10 +28,10 @@ void verify(const tile_type& tile, value_type value, std::size_t index) {
}
}

tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream,
std::size_t index) {
CudaSafeCall(
cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()));
tile_type scale(const tile_type& arg, value_type a,
const TiledArray::device::stream_t* stream, std::size_t index) {
DeviceSafeCall(TiledArray::device::setDevice(
TiledArray::deviceEnv::instance()->current_device_id()));
/// make result Tensor
using Storage = typename tile_type::tensor_type::storage_type;
Storage result_storage;
@@ -40,36 +42,33 @@ tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream,
std::move(result_storage));

/// copy the original Tensor
const auto& handle = TiledArray::cuBLASHandlePool::handle();
CublasSafeCall(cublasSetStream(handle, *stream));

CublasSafeCall(TiledArray::cublasCopy(handle, result.size(), arg.data(), 1,
device_data(result.storage()), 1));
auto& queue = TiledArray::BLASQueuePool::queue(*stream);

CublasSafeCall(TiledArray::cublasScal(handle, result.size(), &a,
device_data(result.storage()), 1));
blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1,
queue);

// cudaStreamSynchronize(stream);

TiledArray::synchronize_stream(stream);
blas::scal(result.size(), a, device_data(result.storage()), 1, queue);

// std::stringstream stream_str;
// stream_str << *stream;
// std::string message = "run scale on Tensor: " + std::to_string(index) + "
// on stream: " + stream_str.str() +'\n'; std::cout << message;
// std::string message = "run scale on Tensor: " + std::to_string(index) +
// "on stream: " + stream_str.str() + '\n';
// std::cout << message;

TiledArray::device::synchronize_stream(stream);

return tile_type(std::move(result));
}

void process_task(madness::World* world,
const std::vector<cudaStream_t>* streams, std::size_t ntask) {
void process_task(madness::World* world, std::size_t ntask) {
const std::size_t iter = 50;
const std::size_t M = 1000;
const std::size_t N = 1000;

std::size_t n_stream = streams->size();
std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams();

for (std::size_t i = 0; i < iter; i++) {
auto& stream = (*streams)[i % n_stream];
auto& stream = TiledArray::deviceEnv::instance()->stream(i % n_stream);

TiledArray::Range range{M, N};

@@ -78,10 +77,11 @@ void process_task(madness::World* world,
const double scale_factor = 2.0;

// function pointer to the scale function to call
tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*,
std::size_t) = &::scale;
tile_type (*scale_fn)(const tile_type&, double,
const TiledArray::device::stream_t*, std::size_t) =
&::scale;

madness::Future<tile_type> scale_future = madness::add_cuda_task(
madness::Future<tile_type> scale_future = madness::add_device_task(
*world, ::scale, tensor, scale_factor, &stream, ntask * iter + i);

/// this should not start until scale_taskfn is finished
@@ -92,27 +92,15 @@
int try_main(int argc, char** argv) {
auto& world = TiledArray::get_default_world();

const std::size_t n_stream = 5;
const std::size_t n_tasks = 5;

std::vector<cudaStream_t> streams(n_stream);
for (auto& stream : streams) {
// create the streams
CudaSafeCall(cudaStreamCreate(&stream));
// std::cout << "stream: " << stream << "\n";
}

// add process_task to different tasks/threads
for (auto i = 0; i < n_tasks; i++) {
world.taskq.add(process_task, &world, &streams, i);
world.taskq.add(process_task, &world, i);
}

world.gop.fence();

for (auto& stream : streams) {
// create the streams
cudaStreamDestroy(stream);
}
return 0;
}

@@ -121,12 +109,12 @@ int main(int argc, char* argv[]) {
try {
// Initialize runtime
try_main(argc, argv);
} catch (thrust::system::detail::bad_alloc& ex) {
} catch (std::exception& ex) {
std::cout << ex.what() << std::endl;

size_t free_mem, total_mem;
auto result = cudaMemGetInfo(&free_mem, &total_mem);
std::cout << "CUDA memory stats: {total,free} = {" << total_mem << ","
auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem);
std::cout << "device memory stats: {total,free} = {" << total_mem << ","
<< free_mem << "}" << std::endl;
} catch (...) {
std::cerr << "unknown exception" << std::endl;
@@ -17,7 +17,7 @@
*
*/

#include <TiledArray/cuda/btas_um_tensor.h>
#include <TiledArray/device/btas_um_tensor.h>
#include <TiledArray/version.h>
#include <tiledarray.h>
#include <iostream>
@@ -185,14 +185,14 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
std::pow(n_uocc, 4) / std::pow(1024., 3);

using CUDATile =
btas::Tensor<T, TA::Range, TiledArray::cuda_um_btas_varray<T>>;
using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
using deviceTile =
btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
using deviceMatrix = TA::DistArray<TA::Tile<deviceTile>>;

// Construct tensors
CUDAMatrix t2(world, trange_oovv);
CUDAMatrix v(world, trange_vvvv);
CUDAMatrix t2_v;
deviceMatrix t2(world, trange_oovv);
deviceMatrix v(world, trange_vvvv);
deviceMatrix t2_v;
// To validate, fill input tensors with random data, otherwise just with 1s
// if (do_validate) {
// rand_fill_array(t2);
@@ -245,7 +245,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
auto result = dot_length * 0.2 * 0.3;

auto verify = [&world, &threshold, &result,
&dot_length](const TA::Tile<CUDATile>& tile) {
&dot_length](const TA::Tile<deviceTile>& tile) {
auto n_elements = tile.size();
for (std::size_t i = 0; i < n_elements; i++) {
double abs_err = fabs(tile[i] - result);
(remaining changed files not shown)