jumbo "multidevice" bundle
- initial support for multiple devices
- introduced device::Stream
- sticky handling of streams should allow device tasks with multiple ops (e.g., scale+permute) to work correctly
evaleev committed Sep 26, 2023
1 parent 3e59ff2 commit 64f5a9b
Showing 21 changed files with 564 additions and 571 deletions.
10 changes: 4 additions & 6 deletions doc/dox/dev/Optimization-Guide.md
@@ -18,10 +18,8 @@ is devoted to communication. [Default = number of cores reported by ]

## MPI

## CUDA
## GPU/Device compute runtimes

In addition to [the environment variables that control the CUDA runtime behavior](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars), several environment variables control specifically the execution of TiledArray on CUDA devices:
* `TA_CUDA_NUM_STREAMS` -- The number of [CUDA streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of others. For small tasks this may need to be increased. [Default=3]
* `CUDA_VISIBLE_DEVICES` -- This CUDA runtime environment variable is queried by TiledArray to determine whether CUDA devices on a multi-GPU node have been pre-mapped to MPI ranks.
* By default (i.e. when # of MPI ranks on a node <= # of _available_ CUDA devices) TiledArray will map 1 device (in the order of increasing rank) to each MPI rank.
* If # of available CUDA devices < # of MPI ranks on a node _and_ `CUDA_VISIBLE_DEVICES` is set TiledArray will assume that the user mapped the devices to the MPI ranks appropriately (e.g. using a resource manager like `jsrun`) and only checks that each rank has access to 1 CUDA device.
In addition to the environment variables that control the runtime behavior of [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) and [HIP/ROCm](https://rocm.docs.amd.com/en/latest/search.html?q=environment+variables), several environment variables control specifically the execution of TiledArray on compute devices:
* `TA_DEVICE_NUM_STREAMS` -- The number of [compute streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of the others. For small tasks this may need to be increased. In addition to the compute streams, TiledArray also creates 2 dedicated streams per device for data transfers to/from that device. [Default=3]
* `CUDA_VISIBLE_DEVICES`/`HIP_VISIBLE_DEVICES` -- These runtime environment variables can be used to map CUDA/HIP devices, respectively, on a multi-device node to MPI ranks. This mapping is usually the responsibility of the resource manager, so setting these variables by hand should normally not be needed. By default TiledArray assigns the compute devices of a multi-device node to the MPI ranks on that node in round-robin fashion (see the sketch below).
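
The default round-robin assignment can be pictured with a tiny helper. This is purely illustrative (it is not TiledArray code); `local_rank` and `num_visible_devices` are assumed inputs supplied by the MPI runtime and the device runtime, respectively.

```cpp
// Illustration only: the default mapping of visible devices to the MPI ranks
// of a single node. TiledArray's actual logic lives in its device environment.
inline int default_device_for_rank(int local_rank, int num_visible_devices) {
  // ranks 0,1,2,... on a node cycle through devices 0..num_visible_devices-1
  return local_rank % num_visible_devices;
}
```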
27 changes: 12 additions & 15 deletions examples/device/device_task.cpp
@@ -15,8 +15,8 @@ using tile_type = TA::Tile<tensor_type>;
/// verify that the elements in the tile are equal to value
void verify(const tile_type& tile, value_type value, std::size_t index) {
// const auto size = tile.size();
std::string message = "verify Tensor: " + std::to_string(index) + '\n';
std::cout << message;
// std::string message = "verify Tensor: " + std::to_string(index) + '\n';
// std::cout << message;
for (auto& num : tile) {
if (num != value) {
std::string error("Error: " + std::to_string(num) + " " +
@@ -29,33 +29,31 @@ void verify(const tile_type& tile, value_type value, std::size_t index) {
}

tile_type scale(const tile_type& arg, value_type a,
const TiledArray::device::stream_t* stream, std::size_t index) {
DeviceSafeCall(TiledArray::device::setDevice(
TiledArray::deviceEnv::instance()->current_device_id()));
TiledArray::device::Stream stream, std::size_t index) {
/// make result Tensor
using Storage = typename tile_type::tensor_type::storage_type;
Storage result_storage;
auto result_range = arg.range();
make_device_storage(result_storage, arg.size(), *stream);
make_device_storage(result_storage, arg.size(), stream);

typename tile_type::tensor_type result(std::move(result_range),
std::move(result_storage));

/// copy the original Tensor
auto& queue = TiledArray::BLASQueuePool::queue(*stream);
auto& queue = TiledArray::BLASQueuePool::queue(stream);

blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1,
queue);

blas::scal(result.size(), a, device_data(result.storage()), 1, queue);

// std::stringstream stream_str;
// stream_str << *stream;
// stream_str << stream;
// std::string message = "run scale on Tensor: " + std::to_string(index) +
// "on stream: " + stream_str.str() + '\n';
// std::cout << message;

TiledArray::device::synchronize_stream(stream);
TiledArray::device::sync_madness_task_with(stream);

return tile_type(std::move(result));
}
@@ -65,10 +63,10 @@ void process_task(madness::World* world, std::size_t ntask) {
const std::size_t M = 1000;
const std::size_t N = 1000;

std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams();
std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams_total();

for (std::size_t i = 0; i < iter; i++) {
auto& stream = TiledArray::deviceEnv::instance()->stream(i % n_stream);
auto stream = TiledArray::deviceEnv::instance()->stream(i % n_stream);

TiledArray::Range range{M, N};

@@ -77,12 +75,11 @@
const double scale_factor = 2.0;

// function pointer to the scale function to call
tile_type (*scale_fn)(const tile_type&, double,
const TiledArray::device::stream_t*, std::size_t) =
&::scale;
tile_type (*scale_fn)(const tile_type&, double, TiledArray::device::Stream,
std::size_t) = &::scale;

madness::Future<tile_type> scale_future = madness::add_device_task(
*world, ::scale, tensor, scale_factor, &stream, ntask * iter + i);
*world, ::scale, tensor, scale_factor, stream, ntask * iter + i);

/// this should not start until the scale task is finished
world->taskq.add(verify, scale_future, scale_factor, ntask * iter + i);
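
Pulling the scattered hunks above together, the device-task pattern introduced by this commit looks roughly like the sketch below. It is a condensed, hedged paraphrase of device_task.cpp, not a complete program: the tensor alias is an assumption (the example's own alias is outside this diff), while the device identifiers (`make_device_storage`, `device_data`, `BLASQueuePool::queue`, `sync_madness_task_with`, `add_device_task`) are taken verbatim from the hunks above.

```cpp
#include <tiledarray.h>
#include <TiledArray/device/btas_um_tensor.h>

// Assumed aliases (the example's own definitions are not shown in this diff):
using value_type = double;
using tensor_type = TiledArray::btasUMTensorVarray<value_type>;  // assumption
using tile_type = TiledArray::Tile<tensor_type>;

// A device task: allocate on the task's Stream, enqueue BLAS work on the
// queue bound to that same Stream, then register the Stream with MADNESS.
tile_type scale_on_device(const tile_type& arg, value_type a,
                          TiledArray::device::Stream stream) {
  using Storage = typename tile_type::tensor_type::storage_type;
  Storage result_storage;
  auto result_range = arg.range();
  make_device_storage(result_storage, arg.size(), stream);
  typename tile_type::tensor_type result(std::move(result_range),
                                         std::move(result_storage));

  auto& queue = TiledArray::BLASQueuePool::queue(stream);
  blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1,
             queue);
  blas::scal(result.size(), a, device_data(result.storage()), 1, queue);

  // tell the device-task runtime which stream this task's work was enqueued
  // on, so task completion is synchronized with it
  TiledArray::device::sync_madness_task_with(stream);
  return tile_type(std::move(result));
}

// Launch side: pick a Stream (round-robin over all streams in a loop) and
// pass it to the device task by value.
madness::Future<tile_type> launch_scale(madness::World& world,
                                        const tile_type& t, double a) {
  const auto n = TiledArray::deviceEnv::instance()->num_streams_total();
  auto stream = TiledArray::deviceEnv::instance()->stream(0 % n);
  return madness::add_device_task(world, scale_on_device, t, a, stream);
}
```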
4 changes: 2 additions & 2 deletions examples/device/ta_dense_device.cpp
@@ -303,13 +303,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_devices = TA::deviceEnv::instance()->num_devices();
int num_devices = TA::deviceEnv::instance()->num_visible_devices();

if (num_devices <= 0) {
throw std::runtime_error("No GPUs Found!\n");
}

int device_id = TA::deviceEnv::instance()->current_device_id();
const int device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
7 changes: 4 additions & 3 deletions examples/device/ta_reduce_device.cpp
@@ -17,9 +17,10 @@
*
*/

#include <TiledArray/device/btas_um_tensor.h>
#include <tiledarray.h>

#include <TiledArray/device/btas_um_tensor.h>

template <typename Tile>
void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
const long Nn, const long Bn, const long nrepeat) {
@@ -291,13 +292,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_devices = TA::deviceEnv::instance()->num_devices();
int num_devices = TA::deviceEnv::instance()->num_visible_devices();

if (num_devices <= 0) {
throw std::runtime_error("No GPUs Found!\n");
}

int device_id = TA::deviceEnv::instance()->current_device_id();
const int device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
4 changes: 2 additions & 2 deletions examples/device/ta_vector_device.cpp
@@ -308,13 +308,13 @@ int try_main(int argc, char **argv) {
<< runtimeVersion << std::endl;

{ // print device properties
int num_devices = TA::deviceEnv::instance()->num_devices();
int num_devices = TA::deviceEnv::instance()->num_visible_devices();

if (num_devices <= 0) {
throw std::runtime_error("No GPUs Found!\n");
}

int device_id = TA::deviceEnv::instance()->current_device_id();
const int device_id = TA::deviceEnv::instance()->current_device_id();

int mpi_size = world.size();
int mpi_rank = world.rank();
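
The same device-environment query appears in all three example drivers above. Condensed into one hedged sketch (the printing is illustrative; the accessors are the renamed ones from the hunks):

```cpp
#include <tiledarray.h>
#include <iostream>
#include <stdexcept>

// Sketch: query the device environment the way the example drivers do.
void report_device(TiledArray::World& world) {
  const int num_devices = TA::deviceEnv::instance()->num_visible_devices();
  if (num_devices <= 0) throw std::runtime_error("No GPUs Found!\n");

  const int device_id = TA::deviceEnv::instance()->current_device_id();
  std::cout << "rank " << world.rank() << "/" << world.size() << " sees "
            << num_devices << " device(s), uses device " << device_id
            << std::endl;
}
```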
14 changes: 7 additions & 7 deletions src/TiledArray/device/blas.cpp
@@ -31,27 +31,27 @@ bool BLASQueuePool::initialized() { return !queues_.empty(); }

void BLASQueuePool::initialize() {
if (initialized()) return;
queues_.reserve(deviceEnv::instance()->num_streams());
for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams();
queues_.reserve(deviceEnv::instance()->num_streams_total());
for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams_total();
++sidx) {
auto stream = deviceEnv::instance()->stream(
auto q = deviceEnv::instance()->stream(
sidx); // blaspp for some reason wants non-const lvalue ref to stream
queues_.emplace_back(std::make_unique<blas::Queue>(0, stream));
queues_.emplace_back(std::make_unique<blas::Queue>(q.device, q.stream));
}
}

void BLASQueuePool::finalize() { queues_.clear(); }

blas::Queue& BLASQueuePool::queue(std::size_t ordinal) {
TA_ASSERT(initialized());
TA_ASSERT(ordinal < deviceEnv::instance()->num_streams());
TA_ASSERT(ordinal < deviceEnv::instance()->num_streams_total());
return *(queues_[ordinal]);
}

blas::Queue& BLASQueuePool::queue(device::stream_t const& stream) {
blas::Queue& BLASQueuePool::queue(device::Stream const& stream) {
TA_ASSERT(initialized());
for (auto&& q : queues_) {
if (q->stream() == stream) return *q;
if (q->device() == stream.device && q->stream() == stream.stream) return *q;
}
throw TiledArray::Exception(
"no matching device stream found in the BLAS queue pool");
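
As a usage note, here is a minimal sketch of how the pool above is exercised, assuming TiledArray's device environment has already been set up. In the library itself `initialize()`/`finalize()` are driven by TiledArray's own startup and shutdown, so this is illustrative only.

```cpp
#include <TiledArray/device/blas.h>

// Illustrative only: look up blas::Queue objects by ordinal or by Stream.
void blas_queue_pool_demo() {
  TiledArray::BLASQueuePool::initialize();  // one blas::Queue per stream

  // by ordinal (must be less than num_streams_total())
  blas::Queue& q0 = TiledArray::BLASQueuePool::queue(0);
  (void)q0;

  // by Stream: the lookup matches both the device and the stream handle
  auto s = TiledArray::deviceEnv::instance()->stream(0);
  blas::Queue& qs = TiledArray::BLASQueuePool::queue(s);
  (void)qs;

  TiledArray::BLASQueuePool::finalize();
}
```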
33 changes: 18 additions & 15 deletions src/TiledArray/device/blas.h
@@ -28,19 +28,13 @@

#ifdef TILEDARRAY_HAS_DEVICE

#include <TiledArray/external/device.h>

#include <TiledArray/error.h>
#include <TiledArray/external/device.h>
#include <TiledArray/tensor/complex.h>

#include <TiledArray/math/blas.h>
#include <blas/device.hh>

namespace TiledArray {

/*
* cuBLAS interface functions
*/

/**
* BLASQueuePool is a singleton controlling a pool of blas::Queue objects:
* - queues map to stream 1-to-1, so do not call Queue::set_stream to maintain
@@ -54,20 +48,29 @@ struct BLASQueuePool {
static void finalize();

static blas::Queue &queue(std::size_t ordinal = 0);
static blas::Queue &queue(const device::stream_t &stream);
static blas::Queue &queue(const device::Stream &s);

private:
static std::vector<std::unique_ptr<blas::Queue>> queues_;
};

namespace detail {
/// maps a (tile) Range to a blas::Queue; if the current task has already
/// pushed work onto a device::Stream (as indicated by
/// madness_task_current_stream()), the Queue bound to that Stream is
/// returned instead
/// @param[in] range the Range used to select the device::Stream on which
/// objects associated with it are computed
/// @return the blas::Queue (bound to a device::Stream) to use for work
/// associated with Range \p range
template <typename Range>
blas::Queue &get_blasqueue_based_on_range(const Range &range) {
// TODO better way to get stream based on the id of tensor
auto stream_ord = range.offset() % device::Env::instance()->num_streams();
return BLASQueuePool::queue(stream_ord);
blas::Queue &blasqueue_for(const Range &range) {
auto stream_opt = device::madness_task_current_stream();
if (!stream_opt) {
auto stream_ord =
range.offset() % device::Env::instance()->num_streams_total();
return BLASQueuePool::queue(stream_ord);
} else
return BLASQueuePool::queue(*stream_opt);
}
} // namespace detail

} // namespace TiledArray

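Finally, `detail::blasqueue_for` is what makes streams "sticky" across the ops of a single device task (cf. the scale+permute note in the commit message): if the current task is already associated with a Stream, the queue on that Stream is returned; otherwise the Range's offset selects a stream round-robin. A hedged caller sketch, with the device-resident storage purely hypothetical:

```cpp
#include <TiledArray/device/blas.h>

// Sketch: pick the BLAS queue for device work associated with a tile's Range.
// blasqueue_for() consults device::madness_task_current_stream() first; only
// if no stream has been recorded for the current task does it fall back to
// range.offset() % num_streams_total().
template <typename Storage>
void scale_on_selected_queue(const TiledArray::Range& range, Storage& data,
                             double a) {
  blas::Queue& q = TiledArray::detail::blasqueue_for(range);
  // hypothetical device-resident data; the scaling is enqueued on the queue
  // bound to the selected stream
  blas::scal(range.volume(), a, device_data(data), 1, q);
}
```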
