diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7753a3436d..4c6a097d9a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,16 +12,26 @@ jobs: strategy: fail-fast: false matrix: - os : [ macos-latest ] - cxx : [ clang++, /usr/local/bin/g++-10 ] + os : [ macos-latest, ubuntu-22.04 ] build_type : [ Release, Debug ] task_backend: [ Pthreads, PaRSEC ] - prerequisites : [ gcc@10 boost eigen open-mpi bison scalapack ] + include: + - os: ubuntu-22.04 + cc: /usr/bin/gcc-12 + cxx: /usr/bin/g++-12 + - os: macos-latest + cc: clang + cxx: clang++ name: "${{ matrix.os }}: ${{ matrix.cxx }} ${{ matrix.build_type }} ${{ matrix.task_backend }}" runs-on: ${{ matrix.os }} env: CXX : ${{ matrix.cxx }} + CCACHE_DIR : ${{github.workspace}}/build/.ccache + CCACHE_COMPRESS : true + CCACHE_COMPRESSLEVEL : 6 + OMPI_MCA_btl_vader_single_copy_mechanism : none + PARSEC_MCA_runtime_bind_threads : 0 BUILD_CONFIG : > -DMADNESS_TASK_BACKEND=${{ matrix.task_backend }} -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} @@ -33,31 +43,43 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: maxim-lobanov/setup-xcode@v1 - with: - xcode-version: '<14' - - name: Host system info shell: bash run: cmake -P ${{github.workspace}}/ci/host_system_info.cmake - - name: Install ${{matrix.prerequisites}} + + - name: Install prerequisite macOS packages + if: ${{ matrix.os == 'macos-latest' }} run: | - brew install ${{matrix.prerequisites}} - echo "/usr/local/opt/bison/bin" >> $GITHUB_PATH + brew install ninja boost eigen open-mpi bison scalapack ccache + echo "MPIEXEC=/opt/homebrew/bin/mpiexec" >> $GITHUB_ENV + + - name: Install prerequisite Ubuntu packages + if: ${{ matrix.os == 'ubuntu-22.04' }} + run: | + wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null + sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" + sudo apt-get update + sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison libscalapack-mpi-dev cmake doxygen + sudo ln -s /usr/lib/x86_64-linux-gnu/libscalapack-openmpi.so /usr/lib/x86_64-linux-gnu/libscalapack.so + echo "MPIEXEC=/usr/bin/mpiexec" >> $GITHUB_ENV + + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ccache-${{ matrix.os }}-${{ matrix.build_type }}-${{ matrix.task_backend }} - name: "Configure build: ${{ env.BUILD_CONFIG }}" shell: bash run: | set -x; - cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeOutput.log && cat CMakeFiles/CMakeError.log) + cmake -B${{github.workspace}}/build $BUILD_CONFIG || (cat CMakeFiles/CMakeConfigureLog.yaml) - name: Build working-directory: ${{github.workspace}}/build shell: bash run: | - cmake --build . --target tiledarray - cmake --build . --target examples + ccache -p && ccache -z && cmake --build . --target tiledarray ta_test examples && ccache -s - name: Test working-directory: ${{github.workspace}}/build @@ -65,5 +87,4 @@ jobs: #run: ctest -C $${{matrix.build_type}} run: | source ${{github.workspace}}/ci/openmpi.env - cmake --build . --target ta_test cmake --build .
--target check-tiledarray diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 93850215f1..33a8d0c9bf 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,6 +15,7 @@ variables: TA_UT_CTEST_TIMEOUT=3000 ${TA_PYTHON} ${ENABLE_CUDA} + CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda ${BLA_VENDOR} ${BLA_THREADS} ${ENABLE_SCALAPACK} @@ -24,11 +25,18 @@ before_script: # TODO optimize ta_test build memory consumption - export CMAKE_BUILD_PARALLEL_LEVEL=${CMAKE_BUILD_PARALLEL_LEVEL:=1} - echo "CMAKE_BUILD_PARALLEL_LEVEL=$CMAKE_BUILD_PARALLEL_LEVEL" + # configure ccache + - export CCACHE_DIR=/root/.ccache + - export CCACHE_COMPRESS=true + - export CCACHE_COMPRESSLEVEL=6 + # print out the ccache configuration + - ccache -p + # zero out the ccache statistics + - ccache -z ubuntu: stage: build tags: - - docker - ${RUNNER_TAGS} timeout: 3h image: valeevgroup/${IMAGE} @@ -52,26 +60,20 @@ ubuntu: - build/CMakeCache.txt - build/CMakeFiles/CMakeOutput.log - build/CMakeFiles/CMakeError.log + - build/CMakeFiles/CMakeConfigureLog.yaml reports: metrics: build/metrics.txt parallel: matrix: - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] + - IMAGE : [ "ubuntu:22.04" ] CXX: [ g++ ] - BUILD_TYPE : [ "Release" ] - BLA_VENDOR : [ "BLAS_PREFERENCE_LIST=IntelMKL" ] - BLA_THREADS : [ "IntelMKL_THREAD_LAYER=tbb" ] - # ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - TA_PYTHON : [ "TA_PYTHON=OFF" ] # needs to be fixed for MKL - RUNNER_TAGS: [ linux ] - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] - CXX: [ g++, clang++-9 ] - BUILD_TYPE : [ "Release", "Debug" ] - ENABLE_SCALAPACK : [ "ENABLE_SCALAPACK=ON", "ENABLE_SCALAPACK=OFF" ] - RUNNER_TAGS: [ linux ] - - IMAGE : [ "ubuntu:18.04", "ubuntu:20.04" ] - CXX: [ g++ ] - BUILD_TYPE : [ "Release", "Debug" ] + BUILD_TYPE : [ "RelWithDebInfo" ] + TA_PYTHON : [ "TA_PYTHON=OFF" ] ENABLE_CUDA : [ "ENABLE_CUDA=ON" ] TA_TARGETS : [ "tiledarray examples-tiledarray check_serial-tiledarray" ] RUNNER_TAGS: [ cuda ] + + +after_script: + # print out the ccache statistics + - ccache -s diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 23f1509ca1..fd5c27bf6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -38,5 +38,5 @@ repos: name: Format C/C++ code using clang-format. 
language: system files: \.(c|cc|cxx|cpp|h|hpp|hxx)$ - entry: clang-format -i - args: [--style=file] + entry: bin/admin/clang-format.sh + args: [--style=file, -i] diff --git a/CMakeLists.txt b/CMakeLists.txt index a50f0a789f..a130211293 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,25 +78,23 @@ enable_language(C) # C needed even for basic platform introspection # Set install paths ============================================================ -set(TILEDARRAY_INSTALL_BINDIR "bin" - CACHE PATH "TiledArray binary install directory") -set(TILEDARRAY_INSTALL_INCLUDEDIR "include" - CACHE PATH "TiledArray INCLUDE install directory") -set(TILEDARRAY_INSTALL_LIBDIR "lib" - CACHE PATH "TiledArray LIB install directory") -set(TILEDARRAY_INSTALL_SHAREDIR "share/tiledarray/${TILEDARRAY_EXT_VERSION}" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DATADIR "${TILEDARRAY_INSTALL_SHAREDIR}/data" - CACHE PATH "TiledArray DATA install directory") -set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_SHAREDIR}/doc" - CACHE PATH "TiledArray DOC install directory") -set(TILEDARRAY_INSTALL_CMAKEDIR "lib/cmake/tiledarray" - CACHE PATH "TiledArray CMAKE install directory") +include(GNUInstallDirs) +set(TILEDARRAY_INSTALL_BINDIR "${CMAKE_INSTALL_BINDIR}" + CACHE PATH "TiledArray binary install directory") +set(TILEDARRAY_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}" + CACHE PATH "TiledArray INCLUDE install directory") +set(TILEDARRAY_INSTALL_LIBDIR "${CMAKE_INSTALL_LIBDIR}" + CACHE PATH "TiledArray LIB install directory") +set(TILEDARRAY_INSTALL_DATADIR "${CMAKE_INSTALL_DATAROOTDIR}/tiledarray/${TILEDARRAY_EXT_VERSION}" + CACHE PATH "TiledArray DATA install directory") +set(TILEDARRAY_INSTALL_DOCDIR "${TILEDARRAY_INSTALL_DATADIR}/doc" + CACHE PATH "TiledArray DOC install directory") +set(TILEDARRAY_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/tiledarray" + CACHE PATH "TiledArray CMAKE install directory") # Add module directory and modules ============================================= list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/modules/) include(CMakePushCheckState) -include(GNUInstallDirs) include(AppendFlags) include(RedefaultableOption) include(DetectMADNESSConfig) @@ -111,6 +109,7 @@ include(LoadFetchContent) include(CMakeDependentOption) include(CMakePackageConfigHelpers) include(FeatureSummary) +include(CTest) # testing, defines BUILD_TESTING set(MPI_CXX_SKIP_MPICXX TRUE CACHE BOOL "MPI_CXX_SKIP_MPICXX") @@ -130,10 +129,8 @@ add_feature_info(TBB ENABLE_TBB "Intel Thread-Building Blocks (TBB) supports pro option(ENABLE_CUDA "Enable use of CUDA with TiledArray" OFF) add_feature_info(CUDA ENABLE_CUDA "NVIDIA CUDA support for GPU") -if(ENABLE_CUDA) - option(ENABLE_CUDA_ERROR_CHECK "TiledArray will always check errors in CUDA calls" ON) - add_feature_info(CUDA_ERROR_CHECK ENABLE_CUDA_ERROR_CHECK "Checks CUDA Error") -endif() +option(ENABLE_HIP "Enable use of HIP with TiledArray" OFF) +add_feature_info(HIP ENABLE_HIP "AMD HIP/ROCm support for GPU") option(ENABLE_GPERFTOOLS "Enable linking with Gperftools" OFF) add_feature_info(GPERFTOOLS ENABLE_GPERFTOOLS "Google Performance Tools provide fast memory allocation and performance profiling") @@ -168,12 +165,16 @@ if(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) set(TA_TILE_OPS_LOG_LEVEL 1) endif(TA_ENABLE_TILE_OPS_LOGGING AND NOT DEFINED TA_TILE_OPS_LOG_LEVEL) -option(TA_RANGEV3 "Enable Range-V3 library" OFF) -add_feature_info(TA_RANGEV3 TA_RANGEV3 "Range-V3 ranges library")
+option(TA_TRACE_GLOBAL_COMM_STATS "Enable tracing of communication stats of global objects (DistEval's and DistributedStorage) in TiledArray" OFF) +add_feature_info(TASK_TRACE_DEBUG TA_TRACE_GLOBAL_COMM_STATS "Debug communication stats of global objects (DistEval's and DistributedStorage) in TiledArray") +set(TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE ${TA_TRACE_GLOBAL_COMM_STATS}) option(TA_TTG "Enable search/build of TTG library" OFF) add_feature_info(TA_TTG TA_TTG "TTG library") +option(IntelMKL_FAIR_DISPATCH "Enable fair dispatch in Intel MKL" OFF) +add_feature_info(IntelMKL_FAIR_DISPATCH IntelMKL_FAIR_DISPATCH "Use of fair dispatch in Intel MKL") + # Enable shared library support options redefaultable_option(TA_ASSUMES_ASLR_DISABLED "TiledArray assumes the Address Space Layout Randomization (ASLR) to be disabled" OFF) add_feature_info(ASSUMES_ASLR_DISABLED TA_ASSUMES_ASLR_DISABLED @@ -288,13 +289,6 @@ set_property( CACHE TA_ASSERT_POLICY PROPERTY STRINGS TA_ASSERT_THROW TA_ASSERT_ABORT TA_ASSERT_IGNORE) -# if building unit tests default to throw to be able to test TA_ASSERT statements -if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW)) - if (BUILD_TESTING) - message(FATAL_ERROR "TA_ASSERT_POLICY=${TA_ASSERT_POLICY} requires BUILD_TESTING=OFF") - endif(BUILD_TESTING) -endif() - ########################## # Include source directories ########################## @@ -305,15 +299,29 @@ include_directories(${PROJECT_SOURCE_DIR}/src ${PROJECT_BINARY_DIR}/src) ########################## add_custom_target(External-tiledarray) +# ccache is an optional dep but must be found first so that the rest of dependencies can use it +find_program(CCACHE ccache) +if(CCACHE) + mark_as_advanced(CCACHE) + message (STATUS "Found ccache: ${CCACHE}") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") + set(CMAKE_CUDA_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling CUDA") +endif(CCACHE) + # required deps: -# 1. CUDA first since others may depend on it +# 1. device runtime (CUDA/HIP/...) first since others may depend on it if(ENABLE_CUDA) include(external/cuda.cmake) endif() +if(ENABLE_HIP) + include(external/hip.cmake) +endif() +include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake) +include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake) if (TA_TTG) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) endif(TA_TTG) -include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchMADWorld.cmake) detect_MADNESS_configuration() include(external/eigen.cmake) # the FetchContent-based version will not work due to BLT target name conflicts @@ -331,28 +339,15 @@ if (ENABLE_WFN91_LINALG_DISCOVERY_KIT) include(FetchWfn91LinAlgModules) include(FindLinalg) endif(ENABLE_WFN91_LINALG_DISCOVERY_KIT) -# BTAS does a better job of building and checking Boost since it uses Boost::serialization -# it also memorized the location of its config for use from install tree +# Boost is to be discovered by the top cmake project, and every (sub)project needs to make sure it has all of its targets +include(external/boost.cmake) include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBTAS.cmake) -include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake) if(ENABLE_SCALAPACK) include(external/scalapackpp.cmake) endif() -# optional deps: -# 1.
ccache -find_program(CCACHE ccache) -if(CCACHE) - mark_as_advanced(CCACHE) - message (STATUS "Found ccache: ${CCACHE}") - set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C++") - set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE}" CACHE STRING "Compiler launcher to use for compiling C") -endif(CCACHE) -# 2. range-v3 -if (TA_RANGEV3) - include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchRangeV3.cmake) -endif(TA_RANGEV3) -# 3. TTG +# other optional deps: +# 1. TTG # N.B. make sure TA configures MADNESS correctly #if (TA_TTG) # include(${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchTTG.cmake) @@ -369,7 +364,7 @@ add_subdirectory(doc) ########################## # checking/testing ########################## -include(CTest) +# N.B. CTest was included above if (BUILD_TESTING) set(_ctest_args -V -R "tiledarray/unit/run-np.*") set(_ctest_args_serial -V -R "tiledarray/unit/run-np-1") @@ -377,6 +372,12 @@ if (BUILD_TESTING) list(APPEND _ctest_args --timeout ${TA_UT_CTEST_TIMEOUT}) list(APPEND _ctest_args_serial --timeout ${TA_UT_CTEST_TIMEOUT}) endif(DEFINED TA_UT_CTEST_TIMEOUT) + + # if building unit tests need to configure with TA_ASSERT_POLICY=TA_ASSERT_THROW to be able to test TA_ASSERT statements + if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW)) + message(WARNING "BUILD_TESTING=ON requires configuring with TA_ASSERT_POLICY=TA_ASSERT_THROW to engage REQUIRE_THROWS() tests; will skip these tests") + endif() + add_custom_target_subproject(tiledarray check USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args}) add_custom_target_subproject(tiledarray check_serial USES_TERMINAL COMMAND ${CMAKE_CTEST_COMMAND} ${_ctest_args_serial}) add_subdirectory(tests) @@ -431,7 +432,7 @@ CONFIGURE_FILE( # install config files install(FILES ${PROJECT_BINARY_DIR}/tiledarray.pc - DESTINATION lib/pkgconfig) + DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") # include extra cmake files install(FILES @@ -480,8 +481,11 @@ ADD_CUSTOM_TARGET(release COMMENT "Switch CMAKE_BUILD_TYPE to Release" ) -feature_summary(WHAT ALL - DESCRIPTION "=== TiledArray Package/Feature Info ===") +if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) + feature_summary(WHAT ALL + DESCRIPTION "=== TiledArray Package/Feature Info ===") + feature_summary(FILENAME ${CMAKE_CURRENT_BINARY_DIR}/features.log WHAT ALL) +endif() option(TA_PYTHON "Build TA python module" OFF) if (TA_PYTHON) diff --git a/INSTALL.md b/INSTALL.md index ad33841e44..ed0ba5046c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -32,7 +32,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b See the current [Travis CI matrix](.travis.yml) for the most up-to-date list of compilers that are known to work. -- [CMake](https://cmake.org/), version 3.15 or higher; if CUDA support is needed, CMake 3.18 or higher is required. +- [CMake](https://cmake.org/), version 3.15 or higher; if CUDA support is needed, CMake 3.18 or higher is required; if HIP support is needed, CMake 3.21 or higher is required. - [Git](https://git-scm.com/) 1.8 or later (required to obtain TiledArray and MADNESS source code from GitHub) - [Eigen](http://eigen.tuxfamily.org/), version 3.3.5 or higher; if CUDA is enabled then 3.3.7 is required (will be downloaded automatically, if missing) - [Boost libraries](www.boost.org/), version 1.59 or higher (will be downloaded automatically, if missing). The following principal Boost components are used: @@ -40,9 +40,10 @@ Both methods are supported.
However, for most users we _strongly_ recommend to b - Boost.Container: header-only - Boost.Test: header-only or (optionally) as a compiled library, *only used for unit testing* - Boost.Range: header-only, *only used for unit testing* -- [BTAS](http://github.com/ValeevGroup/BTAS), tag 6fcb6451bc7ca46a00534a30c51dc5c230c39ac3 . If usable BTAS installation is not found, TiledArray will download and compile +- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20 and later. +- [BTAS](http://github.com/ValeevGroup/BTAS), tag 1cfcb12647c768ccd83b098c64cda723e1275e49 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*. -- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 0b44ef319643cb9721fbe17d294987c146e6460e . +- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 93a9a5cec2a8fa87fba3afe8056607e6062a9058 . Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*. @@ -63,16 +64,18 @@ Compiling BTAS requires the following prerequisites: - BLAS and LAPACK libraries Optional prerequisites: -- [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on CUDA-enabled accelerators. CUDA 11 or later is required. Support for CUDA also requires the following additional prerequisites, both of which will be built and installed automatically if missing: - - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, HIP, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 68abe31a9ec6fd2fd9ffbcd874daa80457f947da). - - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag f9640e0fa4245691cdd434e4f719ac5f7d455f82). +- for execution on GPGPUs: + - device programming runtime: + - [CUDA compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on NVIDIA's CUDA-enabled accelerators. CUDA 11 or later is required. + - [HIP/ROCm compiler and runtime](https://developer.nvidia.com/cuda-zone) -- for execution on AMD's ROCm-enabled accelerators. Note that TiledArray does not use ROCm directly but its C++ Heterogeneous-Compute Interface for Portability, `HIP`; although HIP can also be used to program CUDA-enabled devices, in TiledArray it is used only to program ROCm devices, hence ROCm and HIP will be used interchangeably. + - [LibreTT](github.com/victor-anisimov/LibreTT) -- free tensor transpose library for CUDA, ROCm, and SYCL platforms that is based on the [original cuTT library](github.com/ap-hynninen/cutt) extended to provide thread-safety improvements (via github.com/ValeevGroup/cutt) and extended to non-CUDA platforms by [@victor-anisimov](github.com/victor-anisimov) (tag 6eed30d4dd2a5aa58840fe895dcffd80be7fbece). + - [Umpire](github.com/LLNL/Umpire) -- portable memory manager for heterogeneous platforms (tag 8c85866107f78a58403e20a2ae8e1f24c9852287). - [Doxygen](http://www.doxygen.nl/) -- for building documentation (version 1.8.12 or later). 
- [ScaLAPACK](http://www.netlib.org/scalapack/) -- a distributed-memory linear algebra package. If detected, the following C++ components will also be sought and downloaded, if missing: - [scalapackpp](https://github.com/wavefunction91/scalapackpp.git) -- a modern C++ (C++17) wrapper for ScaLAPACK (tag 6397f52cf11c0dfd82a79698ee198a2fce515d81); pulls and builds the following additional prerequisite - [blacspp](https://github.com/wavefunction91/blacspp.git) -- a modern C++ (C++17) wrapper for BLACS - Python3 interpreter -- to test (optionally-built) Python bindings -- [Range-V3](https://github.com/ericniebler/range-v3.git) -- a Ranges library that served as the basis for Ranges component of C++20; only used for some unit testing of the functionality anticipated to be supported by future C++ standards. -- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs. +- [TTG](https://github.com/TESSEorg/ttg.git) -- C++ implementation of the Template Task Graph programming model for fine-grained flow-graph composition of distributed memory programs (tag 3fe4a06dbf4b05091269488aab38223da1f8cb8e). Many of these dependencies can be installed with a package manager, such as Homebrew on OS X or apt-get on Debian Linux distributions; @@ -323,17 +326,18 @@ To discover and configure the use of Intel MKL consider these suggestions: Also note that even if OpenMP or TBB backends are used, TiledArray will by default set the number of threads to be used by MKL kernels to 1, regardless of the value of environment variables `MKL_NUM_THREADS`/`OMP_NUM_THREADS`. It is possible to change the number of threads to be used programmatically in your application by calling MKL function `mkl_set_num_threads()`. -## CUDA +## GPGPU support -Support for execution on CUDA-enabled hardware is controlled by the following variables: +Support for execution on NVIDIA and AMD GPGPUs is controlled by the following variables: * `ENABLE_CUDA` -- Set to `ON` to turn on CUDA support. [Default=OFF]. * `CMAKE_CUDA_HOST_COMPILER` -- Set to the path to the host C++ compiler to be used by the CUDA compiler. CUDA compilers used to be notorious for only being able to use specific C++ host compilers, but support for more recent C++ host compilers has improved. The default is determined by the CUDA compiler and the user environment variables (`PATH` etc.). -* `ENABLE_CUDA_ERROR_CHECK` -- Set to `ON` to turn on assertions for successful completion of calls to CUDA runtime and libraries. [Default=OFF]. +* `ENABLE_HIP` -- Set to `ON` to turn on HIP/ROCm support. [Default=OFF]. * `LIBRETT_INSTALL_DIR` -- the installation prefix of the pre-installed LibreTT library. This should normally not be needed; it is strongly recommended to let TiledArray build and install LibreTT. * `UMPIRE_INSTALL_DIR` -- the installation prefix of the pre-installed Umpire library. This should normally not be needed; it is strongly recommended to let TiledArray build and install Umpire. -For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable. Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. +- For the CUDA compiler and toolkit to be discoverable the CUDA compiler (`nvcc`) should be in the `PATH` environment variable.
Refer to the [FindCUDAToolkit module](https://cmake.org/cmake/help/latest/module/FindCUDAToolkit.html) for more info. +- For the ROCm platform to be discoverable add its prefix path (e.g., `/opt/rocm`) to `CMAKE_PREFIX_PATH` ## Eigen 3 @@ -412,13 +416,13 @@ support may be added. * `TA_ASSERT_POLICY` -- Set to `TA_ASSERT_IGNORE` to disable `TA_ASSERT` assertions, `TA_ASSERT_THROW` to cause `TA_ASSERT` assertions to throw, `TA_ASSERT_ABORT` to cause `TA_ASSERT` assertions to abort. The default is `TA_ASSERT_IGNORE` if CMake uses a single-configuration generator and `CMAKE_BUILD_TYPE` is set to `Release` or `MinSizeRel`, else the default is `TA_ASSERT_THROW`. * `BUILD_TESTING` -- Set to `OFF` to disable building unit tests. The default is `ON`. * `TA_TRACE_TASKS` -- Set to `ON` to enable tracing of MADNESS tasks using a custom task tracer. Note that standard profilers/tracers are generally useless (except in the trivial cases) with MADWorld-based programs since the submission context of tasks is not captured by standard tracing tools; this makes it impossible in a nontrivial program to attribute tasks to source code. WARNING: task tracing will greatly increase the memory requirements. [Default=OFF]. * `TA_TTG` -- Set to `ON` to find or fetch the TTG library. [Default=OFF]. * `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates. * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`. * `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout. * `TA_TENSOR_MEM_TRACE` -- Set to `ON` to *trace* host memory allocations used by TA::Tensor. This turns on support for tracking memory used by `Tensor` objects; such tracking must be enabled programmatically. This can greatly increase memory consumption by the application and is only intended for expert developers troubleshooting memory use by TiledArray. * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s. +* `IntelMKL_FAIR_DISPATCH` -- To use the Intel MKL library on non-Intel (e.g., AMD) CPUs, set to `ON` to use fair kernel dispatch. [Default=OFF].
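For concreteness, here is a minimal sketch of a configure command combining several of the variables documented above (the source/build paths and option values are illustrative assumptions, not project-mandated defaults):

```bash
# out-of-source configure; source tree assumed to live in ./tiledarray
cmake -S tiledarray -B build \
      -DCMAKE_BUILD_TYPE=Release \
      -DBUILD_TESTING=ON \
      -DTA_ASSERT_POLICY=TA_ASSERT_THROW \
      -DTA_UT_CTEST_TIMEOUT=3000

# build, then run the unit tests via the check target
cmake --build build
cmake --build build --target check-tiledarray
```

Note that, per the CMakeLists.txt change above, configuring with `BUILD_TESTING=ON` and any other `TA_ASSERT_POLICY` now only warns and skips the `REQUIRE_THROWS()` tests instead of failing the configure step.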
# Build TiledArray diff --git a/bin/admin/clang-format.sh b/bin/admin/clang-format.sh new file mode 100755 index 0000000000..3531dcc1b3 --- /dev/null +++ b/bin/admin/clang-format.sh @@ -0,0 +1,94 @@ +#!/bin/bash + +# these are the versions of clang-format that are supported +# should be ordered from oldest to newest to make sure the newest is picked +supported_clang_format_versions="16 17" +preferred_clang_format_version="" # prefer most recent supported clang-format version +for v in $supported_clang_format_versions; do + preferred_clang_format_version=$v +done + +# append common locations of clang-format to PATH +unameOut="$(uname -s)" +case "${unameOut}" in + Darwin*) + extra_path="" + # this prefers more recent versions + for v in $supported_clang_format_versions; do + extra_path=/opt/homebrew/opt/llvm@$v/bin:/opt/homebrew/opt/clang-format@$v/bin:$extra_path + done + # prepend paths + export PATH=$extra_path:$PATH:/opt/homebrew/bin + ;; +esac + +path_to_clang_format=`which clang-format` +have_supported_clang_format_version=0 +if [[ "X$path_to_clang_format" != "X" ]]; then + + # check clang-format version + clang_format_version=`clang-format --version | sed 's/.* version //' | awk -F'[.]' '{print $1}'` + + #echo "supported_clang_format_versions=\"$supported_clang_format_versions\" clang_format_version=$clang_format_version" + + # check whether the found clang-format version is supported + for v in $supported_clang_format_versions; do + if [[ $clang_format_version -eq $v ]]; then + have_supported_clang_format_version=1 + break + fi + done +fi + +if [[ $have_supported_clang_format_version -eq 0 ]]; then + echo "WARNING: found clang-format with unsupported version $clang_format_version (supported versions: $supported_clang_format_versions)" + + # look for docker + path_to_docker=`which docker` + if [[ "X$path_to_docker" = "X" ]]; then + echo "ERROR: docker is not found either, PATH=$PATH, install one of supported clang-format versions (any of these: $supported_clang_format_versions) or install docker" + exit 1 + fi + + # is docker up? + docker info >/dev/null 2>&1 + if [[ $?
-ne 0 ]]; then + echo "ERROR: docker is found but not running, start it" + exit 1 + fi + + # use docker to run clang-format + mount_path=$(readlink -f "$HOME") + + # convert file names in the arguments to relative paths + args="" + for i in "$@"; do + # skip options + if [[ "$i" == -* ]]; then + args="$args $i" + continue + fi + abs_file_path=$(readlink -f "$i") + if [[ "X$abs_file_path" = "X" ]]; then + echo "ERROR: given file $i is not found" + exit 1 + fi + + dir=$(dirname $abs_file_path) + file_path_relative_to_project_root=$(basename $abs_file_path) + while [[ "$dir" != "$mount_path" && "$dir" != "/" ]]; do + file_path_relative_to_project_root="$(basename $dir)/$file_path_relative_to_project_root" + dir=$(dirname $dir) + #echo "dir=$dir file_path_relative_to_project_root=$file_path_relative_to_project_root" + done + if [[ "$dir" == "/" ]]; then + echo "ERROR: given file $i (absolute path $abs_file_path) is not under \$HOME=$mount_path, cannot use docker-based clang-format in this case" + exit 1 + fi + args="$args /hostHOME/$file_path_relative_to_project_root" + done + docker run --platform linux/x86_64 -v "$mount_path":/hostHOME xianpengshen/clang-tools:$preferred_clang_format_version clang-format $args +else + #echo "found $path_to_clang_format with required version $clang_format_version" + clang-format "$@" +fi diff --git a/bin/admin/dependency-versions-update-hook.py b/bin/admin/dependency-versions-update-hook.py index 686b98b49a..f7f652c1bd 100755 --- a/bin/admin/dependency-versions-update-hook.py +++ b/bin/admin/dependency-versions-update-hook.py @@ -59,23 +59,7 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' tokens = line.split() if len(tokens) < 3: continue - if tokens[1].find('TRACKED_BOOST') != -1: - if tokens[1].find('PREVIOUS') != -1: - boost_old_version = tokens[2] - else: - boost_new_version = tokens[2] - elif tokens[1].find('INSTALL_BOOST') != -1: - if tokens[1].find('VERSION') != -1: - if tokens[1].find('PREVIOUS') != -1: - boost_old_install_version = tokens[2] - else: - boost_new_install_version = tokens[2] - else: # URL_HASH - if tokens[1].find('PREVIOUS') != -1: - boost_old_install_url_hash = tokens[2] - else: - boost_new_install_url_hash = tokens[2] - elif tokens[1].find('TRACKED_EIGEN') != -1: + if tokens[1].find('TRACKED_EIGEN') != -1: if tokens[1].find('PREVIOUS') != -1: eigen_old_version = tokens[2] else: eigen_new_version = tokens[2] @@ -126,12 +110,14 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' scalapackpp_old_tag = tokens[2] else: scalapackpp_new_tag = tokens[2] + elif tokens[1].find('TTG') != -1: + if tokens[1].find('PREVIOUS') != -1: + ttg_old_tag = tokens[2] + else: + ttg_new_tag = tokens[2] any_files_changed = False -# Boost version in INSTALL.md -any_files_changed |= replace_dep_id(topsrc, 'md', 'Boost', boost_old_version, boost_new_version, 'boost.org/), version ', ' or higher') - # Eigen version in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'Eigen', eigen_old_version, eigen_new_version, 'eigen.tuxfamily.org), version ', ' or higher') # Eigen install version in eigen.cmake @@ -155,6 +141,9 @@ def replace_dep_id(topsrc, file_ext, dep_name, old_id, new_id, search_prefix = ' # SCALAPACKPP tag in INSTALL.md any_files_changed |= replace_dep_id(topsrc, 'md', 'SCALAPACKPP', scalapackpp_old_tag, scalapackpp_new_tag, '', '') +# TTG tag in INSTALL.md +any_files_changed |= replace_dep_id(topsrc, 'md', 'TTG', ttg_old_tag, ttg_new_tag, '', '') + if any_files_changed: sys.exit(1) else:
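For reference, the wrapper script added above (`bin/admin/clang-format.sh`) is invoked exactly like plain `clang-format`; a hypothetical invocation from the repository root (the file path is illustrative):

```bash
# format a header in place; the wrapper falls back to the dockerized
# clang-format automatically when no supported native version is on PATH
bin/admin/clang-format.sh --style=file -i src/TiledArray/range.h
```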
diff --git a/ci/.build-project index aeb7c73787..f7a7033755 100755 --- a/ci/.build-project +++ b/ci/.build-project @@ -75,8 +75,19 @@ cmd "source ci/openmpi.env" cmd "echo 'localhost slots=2' > /etc/openmpi/openmpi-default-hostfile" if [[ "$vars" =~ \"-DBLAS_PREFERENCE_LIST=IntelMKL ]]; then cmd "make -C /home/ValeevGroup install/intel-mkl" - cmd "source /opt/intel/mkl/bin/mklvars.sh intel64" + cmd "source /opt/intel/oneapi/mkl/latest/env/vars.sh" cmd "echo MKLROOT=\$MKLROOT" + # DevOps installs OneAPI MKL which requires OneAPI TBB ... although MKL bundles TBB, + # the systemwide TBB package is found first and the MKL-bundled TBB cannot be + # discovered properly anyway; + # unfortunately the default libtbb-dev package on ubuntu 20.04 is pre-OneAPI, so let's + # get rid of it + install "full" OneAPI TBB + if [[ "$vars" =~ \"-DIntelMKL_THREAD_LAYER=tbb ]]; then + cmd "(apt show libtbb2 && apt remove -y libtbb2) || echo \"no need to install libtbb2\"" + fi + cmd "apt-get -yq install intel-oneapi-tbb-devel" + cmd "source /opt/intel/oneapi/tbb/latest/env/vars.sh" + cmd "echo TBBROOT=\$TBBROOT" fi if [[ "$vars" =~ \"-D([a-zA-Z]+_)?ENABLE_CUDA=(ON|TRUE|1|YES)\" ]]; then cmd "make -C /home/ValeevGroup install/cuda" diff --git a/cmake/modules/FindOrFetchBTAS.cmake b/cmake/modules/FindOrFetchBTAS.cmake index 764ec7046e..35ad3dd200 100644 --- a/cmake/modules/FindOrFetchBTAS.cmake +++ b/cmake/modules/FindOrFetchBTAS.cmake @@ -13,12 +13,18 @@ if (NOT TARGET BTAS::BTAS) # BTAS will load BLAS++/LAPACK++ ... if those use CMake's FindBLAS/FindLAPACK (as indicated by defined BLA_VENDOR) # will need to specify Fortran linkage convention ... manually for now, switching to NWX's linear algebra discovery # is necessary to handle all the corner cases for automatic discovery - if (BLA_VENDOR) + if (DEFINED BLA_VENDOR) set(_linalgpp_use_standard_linalg_kits TRUE) - endif(BLA_VENDOR) + endif(DEFINED BLA_VENDOR) - if (NOT TILEDARRAY_HAS_CUDA) - # tell BLAS++/LAPACK++ to ignore CUDA + if (TILEDARRAY_HAS_CUDA) + # tell BLAS++/LAPACK++ to also look for CUDA + set(gpu_backend cuda CACHE STRING "The device backend to use for Linalg++") + elseif (TILEDARRAY_HAS_HIP) + # tell BLAS++/LAPACK++ to also look for HIP + set(gpu_backend hip CACHE STRING "The device backend to use for Linalg++") + else () + # tell BLAS++/LAPACK++ to not look for device backends set(gpu_backend none CACHE STRING "The device backend to use for Linalg++") endif() diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake deleted file mode 100644 index 6ddb2a3b8d..0000000000 --- a/cmake/modules/FindOrFetchBoost.cmake +++ /dev/null @@ -1,79 +0,0 @@ -# Limit scope of the search if BOOST_ROOT or BOOST_INCLUDEDIR is provided.
-if (BOOST_ROOT OR BOOST_INCLUDEDIR) - set(Boost_NO_SYSTEM_PATHS TRUE) -endif() - -# try find_package -if (NOT TARGET Boost::boost) - - # detect which Boost targets I already have - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt}) - set(ta_imported_boost_${tgt} 0) - else() - set(ta_imported_boost_${tgt} 1) - endif() - endforeach() - - include(FindPackageRegimport) - find_package_regimport(Boost ${TA_TRACKED_BOOST_VERSION} QUIET) - if (TARGET Boost::boost) - message(STATUS "Found Boost ${Boost_VERSION}: ${Boost_INCLUDE_DIRS}") - endif(TARGET Boost::boost) - - # Boost::* targets by default are not GLOBAL, so to allow users of TA to safely use them we need to make them global - # more discussion here: https://gitlab.kitware.com/cmake/cmake/-/issues/17256 - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt} AND ta_imported_boost_${tgt}) - get_target_property(_boost_tgt_${tgt}_is_imported_global Boost::${tgt} IMPORTED_GLOBAL) - if (NOT _boost_tgt_${tgt}_is_imported_global) - set_target_properties(Boost::${tgt} PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - unset(_boost_tgt_${tgt}_is_imported_global) - endif() - endforeach() - -endif (NOT TARGET Boost::boost) - -# if not found, build via FetchContent -if (NOT TARGET Boost::boost) - include (FetchContent) - cmake_minimum_required (VERSION 3.14.0) # for FetchContent_MakeAvailable - - FetchContent_Declare( - CMAKEBOOST - GIT_REPOSITORY https://github.com/Orphis/boost-cmake - ) - FetchContent_MakeAvailable(CMAKEBOOST) - FetchContent_GetProperties(CMAKEBOOST - SOURCE_DIR CMAKEBOOST_SOURCE_DIR - BINARY_DIR CMAKEBOOST_BINARY_DIR - ) - - # current boost-cmake/master does not install boost correctly, so warn that installed TiledArray will not be usable - # boost-cmake/install_rules https://github.com/Orphis/boost-cmake/pull/45 is supposed to fix it but is inactive - message(WARNING "Building Boost from source makes TiledArray unusable from the install location! 
Install Boost using package manager or manually and reconfigure/reinstall TiledArray to fix this") - if (NOT TARGET Boost::headers) - add_library(Boost::headers ALIAS Boost::boost) - endif() - foreach(_lib serialization regex locale locale_deps thread chrono) # these are non-header-only components used by MPQC - if (TARGET Boost_${_lib}) - install(TARGETS Boost_${_lib} EXPORT btas COMPONENT boost-libs) - if (NOT TARGET Boost::${_lib}) - add_library(Boost::${_lib} ALIAS Boost_${_lib}) - endif() - endif() - endforeach() -# export(EXPORT tiledarray -# FILE "${PROJECT_BINARY_DIR}/boost-targets.cmake") -# install(EXPORT tiledarray -# FILE "boost-targets.cmake" -# DESTINATION "${TILEDARRAY_INSTALL_CMAKEDIR}" -# COMPONENT boost-libs) - -endif(NOT TARGET Boost::boost) - -# postcond check -if (NOT TARGET Boost::boost) - message(FATAL_ERROR "FindOrFetchBoost could not make Boost::boost target available") -endif(NOT TARGET Boost::boost) diff --git a/cmake/modules/FindOrFetchMADWorld.cmake b/cmake/modules/FindOrFetchMADWorld.cmake index 7be76bac5a..5961a4f05d 100644 --- a/cmake/modules/FindOrFetchMADWorld.cmake +++ b/cmake/modules/FindOrFetchMADWorld.cmake @@ -12,7 +12,15 @@ if (NOT TARGET MADworld) # TA-specific configuration set(MADNESS_BUILD_MADWORLD_ONLY ON CACHE BOOL "Whether to build MADNESS runtime only") - set(ENABLE_PARSEC OFF CACHE BOOL "Whether to use PaRSEC as the task backend of MADWorld") + if (TA_TTG) + if (NOT DEFINED MADNESS_TASK_BACKEND) + set(MADNESS_TASK_BACKEND PaRSEC CACHE STRING "The task backend to use for MADNESS tasks") + else () + if (NOT(${MADNESS_TASK_BACKEND} STREQUAL PaRSEC)) + message(FATAL_ERROR "must set MADNESS_TASK_BACKEND=PaRSEC if configuring with TA_TTG=ON") + endif() + endif() + endif() set(MPI_THREAD "multiple" CACHE INTERNAL "MADNESS requires MPI_THREAD_MULTIPLE") set(MADNESS_ASSUMES_ASLR_DISABLED ${TA_ASSUMES_ASLR_DISABLED} CACHE BOOL "Whether MADNESS assumes ASLR to be disabled") set(MPI_CXX_SKIP_MPICXX ON CACHE BOOL "Whether to disable search for C++ MPI-2 bindings")
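The `TA_TTG`/`MADNESS_TASK_BACKEND` coupling enforced above means the two options must agree at configure time; a minimal sketch of a conforming configure line (paths illustrative):

```bash
# TA_TTG=ON forces the PaRSEC task backend for MADNESS tasks; any other
# explicitly set MADNESS_TASK_BACKEND value aborts the configure step
cmake -S tiledarray -B build \
      -DTA_TTG=ON \
      -DMADNESS_TASK_BACKEND=PaRSEC
```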
diff --git a/cmake/tiledarray-config.cmake.in b/cmake/tiledarray-config.cmake.in index 845f5225c3..abff1952ea 100644 --- a/cmake/tiledarray-config.cmake.in +++ b/cmake/tiledarray-config.cmake.in @@ -12,11 +12,22 @@ set(TILEDARRAY_EXT_VERSION "@TILEDARRAY_EXT_VERSION@") @PACKAGE_INIT@ +include(CMakeFindDependencyMacro) + # Include library IMPORT targets + +@Boost_CONFIG_FILE_CONTENTS@ + +if (NOT TARGET range-v3::range-v3) + get_filename_component(range-v3_DIR "@range-v3_CONFIG@" DIRECTORY) + find_dependency(range-v3 QUIET REQUIRED HINTS "${range-v3_DIR}") +endif(NOT TARGET range-v3::range-v3) + if (NOT TARGET BTAS::BTAS) get_filename_component(BTAS_DIR "@BTAS_CONFIG@" DIRECTORY) - find_package(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}") + find_dependency(BTAS 1.0.0 QUIET CONFIG REQUIRED HINTS "${BTAS_DIR}") endif() + if(NOT TARGET MADworld) # if madness installed separately, use the madness install discovered when configuring TA set(MADNESS_CONFIG_DIR "@MADNESS_CONFIG_DIR@") @@ -34,11 +45,11 @@ endif() # if TA is a CUDA-dependent library it needs CUDA to link properly ... unfortunately CMake is not able to do this correctly # see https://gitlab.kitware.com/cmake/cmake/issues/18614 # so try workarounds -set(TILEDARRAY_HAS_CUDA "@CUDA_FOUND@") +set(TILEDARRAY_HAS_CUDA "@TILEDARRAY_HAS_CUDA@") if(TILEDARRAY_HAS_CUDA) cmake_minimum_required(VERSION 3.17) if (NOT TARGET CUDA::cublas) - find_package(CUDAToolkit REQUIRED COMPONENTS cublas nvToolsExt) + find_dependency(CUDAToolkit REQUIRED COMPONENTS cublas nvtx3) endif(NOT TARGET CUDA::cublas) set(CMAKE_CUDA_HOST_COMPILER "@CMAKE_CUDA_HOST_COMPILER@") # workaround from https://gitlab.kitware.com/cmake/cmake/issues/18614#note_485631 diff --git a/doc/devsamp/wiki/user-guide-2.cpp b/doc/devsamp/wiki/user-guide-2.cpp index da7664c8d4..ebef5be776 100644 --- a/doc/devsamp/wiki/user-guide-2.cpp +++ b/doc/devsamp/wiki/user-guide-2.cpp @@ -36,6 +36,7 @@ TA::Tensor make_tile2(const TA::Range& range, const double v) { // Fill array x with value v void init_array(TA::TArrayD& x, const double v) { + using std::begin, std::end; // Add local tiles to a for (auto it = begin(x); it != end(x); ++it) { // Construct a tile using a MADNESS task. diff --git a/doc/dox/dev/Optimization-Guide.md b/doc/dox/dev/Optimization-Guide.md index 229cf82d0f..49fefb196e 100644 --- a/doc/dox/dev/Optimization-Guide.md +++ b/doc/dox/dev/Optimization-Guide.md @@ -18,10 +18,8 @@ is devoted to communication. [Default = number of cores reported by ] ## MPI -## CUDA +## GPU/Device compute runtimes -In addition to [the environment variables that control the CUDA runtime behavior](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars), several environment variables control specifically the execution of TiledArray on CUDA devices: -* `TA_CUDA_NUM_STREAMS` -- The number of [CUDA streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of others. For small tasks this may need to be increased. [Default=3] -* `CUDA_VISIBLE_DEVICES` -- This CUDA runtime environment variable is queried by TiledArray to determine whether CUDA devices on a multi-GPU node have been pre-mapped to MPI ranks. - * By default (i.e. when # of MPI ranks on a node <= # of _available_ CUDA devices) TiledArray will map 1 device (in the order of increasing rank) to each MPI rank. - * If # of available CUDA devices < # of MPI ranks on a node _and_ `CUDA_VISIBLE_DEVICES` is set TiledArray will assume that the user mapped the devices to the MPI ranks appropriately (e.g. using a resource manager like `jsrun`) and only checks that each rank has access to 1 CUDA device. +In addition to the environment variables that control the runtime behavior of [CUDA](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) and [HIP/ROCm](https://rocm.docs.amd.com/en/latest/search.html?q=environment+variables), several environment variables control specifically the execution of TiledArray on compute devices: +* `TA_DEVICE_NUM_STREAMS` -- The number of [compute streams](https://developer.download.nvidia.com/CUDA/training/StreamsAndConcurrencyWebinar.pdf) used to execute tasks on each device. Each stream can be viewed as a thread in a threadpool, with tasks in a given stream executing in order, but each stream executing independently of others. For small tasks this may need to be increased. In addition to streams for compute tasks, TiledArray also creates 2 dedicated streams for data transfers to/from each device. [Default=3] +* `CUDA_VISIBLE_DEVICES`/`HIP_VISIBLE_DEVICES` -- These runtime environment variables can be used to map CUDA/HIP devices, respectively, on a multi-device node to MPI ranks. It is usually the responsibility of the resource manager to control this mapping, thus normally it should not be needed. By default TiledArray will assign compute devices on a multidevice node round robin to each MPI rank.
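A quick sketch of how these knobs might be used when launching a job (the launcher, rank count, and binary name are illustrative assumptions):

```bash
# more device streams can help workloads dominated by many small tasks
export TA_DEVICE_NUM_STREAMS=8
# the resource manager normally maps devices to ranks; otherwise TiledArray
# assigns devices round robin, so setting CUDA_VISIBLE_DEVICES is optional
mpirun -n 2 ./ta_app
```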
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f74d35345a..d240192893 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -29,8 +29,8 @@ endif() # Add Subdirectories add_subdirectory (cc) -add_subdirectory (cuda) -add_subdirectory (dgemm) +add_subdirectory (device) +add_subdirectory (gemm) add_subdirectory (demo) add_subdirectory (scalapack) add_subdirectory (fock) diff --git a/examples/cuda/ta_dense_cuda.cpp b/examples/cuda/ta_dense_cuda.cpp deleted file mode 100644 index 14f692329b..0000000000 --- a/examples/cuda/ta_dense_cuda.cpp +++ /dev/null @@ -1,476 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see .
- * - */ - -#define CUDA_API_PER_THREAD_DEFAULT_STREAM - -#include - -// clang-format off - -#include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" -#include -// clang-format on - -#include - -namespace TiledArray { - -/// -/// cuda gemm interface function on left*right -/// - -template -btas::Tensor> gemm( - const btas::Tensor> &left, - const btas::Tensor> &right, - T factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper); -} - -/// -/// cuda gemm interface function on result = left*right -/// - -template -void gemm(btas::Tensor> &result, - const btas::Tensor> &left, - const btas::Tensor> &right, - T factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper); -} - -/// -/// cuda axpy interface function -/// - -template -void add_to(btas::Tensor> &result, - const btas::Tensor> &arg) { - btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); -} - -/// -/// cuda dot interface function -/// - -template -typename btas::Tensor>::value_type -squared_norm( - const btas::Tensor> &arg) { - return btas_tensor_squared_norm_cuda_impl(arg); -} - -template -typename btas::Tensor>::value_type -norm(const btas::Tensor> &arg) { - return std::sqrt(squared_norm(arg)); -} - -/// to host for CPU GPU Array -template -void to_host( - TiledArray::DistArray>>, - Policy> &cpu_cuda_array) { - auto to_host = - [](TiledArray::Tile< - btas::Tensor>> &tile) { - auto &stream = detail::get_stream_based_on_range(tile.range()); - - // do norm on GPU - auto tile_norm = norm(tile.tensor()); - - TiledArray::to_execution_space( - tile.tensor().storage(), stream); - - return tile_norm; - }; - - foreach_inplace(cpu_cuda_array, to_host); - cpu_cuda_array.world().gop.fence(); - cudaDeviceSynchronize(); -}; - -/// to device for CPU GPU array -template -void to_device( - TiledArray::DistArray>>, - Policy> &cpu_gpu_array) { - auto to_device = - [](TiledArray::Tile< - btas::Tensor>> &tile) { - auto &stream = detail::get_stream_based_on_range(tile.range()); - - TiledArray::to_execution_space( - tile.tensor().storage(), stream); - - return norm(tile.tensor()); - }; - - foreach_inplace(cpu_gpu_array, to_device); - cpu_gpu_array.world().gop.fence(); - cudaDeviceSynchronize(); -}; - -} // namespace TiledArray - -template -void do_main_body(TiledArray::World &world, const long Nm, const long Bm, - const long Nn, const long Bn, const long Nk, const long Bk, - const long nrepeat) { - using Real = typename Storage::value_type; - - const std::size_t Tm = Nm / Bm; - const std::size_t Tn = Nn / Bn; - const std::size_t Tk = Nk / Bk; - - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test...\n" - << "Number of nodes = " << world.size() - << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk - << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn - << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn - << "\n# of blocks of C = " << Tm * Tn - << "\nAverage # of blocks of C/node = " - << double(Tm * Tn) / double(world.size()) << "\n"; - - // Construct TiledRange - std::vector blocking_m; - blocking_m.reserve(Tm + 1); - for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i); - - 
std::vector blocking_n; - blocking_n.reserve(Tn + 1); - for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i); - - std::vector blocking_k; - blocking_k.reserve(Tk + 1); - for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i); - - // Structure of c - std::vector blocking_C; - blocking_C.reserve(2); - blocking_C.push_back( - TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end())); - blocking_C.push_back( - TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end())); - - // Structure of a - std::vector blocking_A; - blocking_A.reserve(2); - blocking_A.push_back( - TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end())); - blocking_A.push_back( - TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end())); - - // Structure of b - std::vector blocking_B; - blocking_B.reserve(2); - blocking_B.push_back( - TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end())); - blocking_B.push_back( - TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end())); - - TiledArray::TiledRange // TRange for c - trange_c(blocking_C.begin(), blocking_C.end()); - - TiledArray::TiledRange // TRange for a - trange_a(blocking_A.begin(), blocking_A.end()); - - TiledArray::TiledRange // TRange for b - trange_b(blocking_B.begin(), blocking_B.end()); - - using value_type = typename Storage::value_type; - using CUDATile = btas::Tensor; - using CUDAMatrix = TA::DistArray>; - using TAMatrix = TA::DistArray>; - - CUDAMatrix c(world, trange_c); - value_type val_a = 0.03; - value_type val_b = 0.02; - - { - // Construct and initialize arrays - - TAMatrix a_host(world, trange_a); - TAMatrix b_host(world, trange_b); - - a_host.fill(val_a); - b_host.fill(val_b); - CUDAMatrix a = TA::ta_tensor_to_um_tensor>(a_host); - CUDAMatrix b = TA::ta_tensor_to_um_tensor>(b_host); - - world.gop.fence(); - - // TA::to_device(a); - // TA::to_device(b); - - // c("m,n") = a("m,k") * b("k,n"); - - // start profiler - cudaProfilerStart(); - - // Start clock - const double wall_time_start = madness::wall_time(); - - // Do matrix multiplication - for (int i = 0; i < nrepeat; ++i) { - double iter_time_start = madness::wall_time(); - // c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n"); - c("m,n") = a("m,k") * b("k,n"); - double iter_time_stop = madness::wall_time(); - if (world.rank() == 0) - std::cout << "Iteration " << i + 1 - << " wall time: " << (iter_time_stop - iter_time_start) - << "\n"; - } - // Stop clock - const double wall_time_stop = madness::wall_time(); - - // stop profiler - cudaProfilerStop(); - - if (world.rank() == 0) - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(nrepeat) - << " sec\nAverage GFLOPS = " - << double(nrepeat) * 2.0 * double(Nn * Nm * Nm) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; - } - - double threshold = - std::numeric_limits::epsilon(); - auto dot_length = Nk; - // auto result = dot_length * val_a * val_b + val_a - val_b; - auto result = dot_length * val_a * val_b; - - auto verify = [&world, &threshold, &result, - &dot_length](TA::Tile &tile) { - auto n_elements = tile.size(); - for (std::size_t i = 0; i < n_elements; i++) { - double abs_err = fabs(tile[i] - result); - // double abs_val = fabs(tile[i]); - double rel_err = abs_err / result / dot_length; - if (rel_err > threshold) { - std::cout << "Node: " << world.rank() << " Tile: " << tile.range() - << " id: " << i - << std::string(" gpu: " + std::to_string(tile[i]) + - " cpu: " + std::to_string(result) + "\n"); - break; - } - } - }; - - for (auto iter = c.begin(); iter != 
c.end(); iter++) { - world.taskq.add(verify, c.find(iter.index())); - } - - world.gop.fence(); - - if (world.rank() == 0) { - std::cout << "Verification Passed" << std::endl; - } -} - -int try_main(int argc, char **argv) { - // Initialize runtime - TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv); - - // Get command line arguments - if (argc < 6) { - std::cout << "multiplies A(Nm,Nk) * B(Nk,Nn), with dimensions m, n, and k " - "blocked by Bm, Bn, and Bk, respectively" - << std::endl - << "Usage: " << argv[0] - << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [real = double] " - "[storage type = cuda_um_btas_varray]\n"; - return 0; - } - const long Nm = atol(argv[1]); - const long Bm = atol(argv[2]); - const long Nn = atol(argv[3]); - const long Bn = atol(argv[4]); - const long Nk = atol(argv[5]); - const long Bk = atol(argv[6]); - if (Nm <= 0 || Nn <= 0 || Nk <= 0) { - std::cerr << "Error: dimensions must be greater than zero.\n"; - return 1; - } - if (Bm <= 0 || Bn <= 0 || Bk <= 0) { - std::cerr << "Error: block sizes must be greater than zero.\n"; - return 1; - } - if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) { - std::cerr - << "Error: dimension size must be evenly divisible by block size.\n"; - return 1; - } - const long nrepeat = (argc >= 8 ? atol(argv[7]) : 5); - if (nrepeat <= 0) { - std::cerr << "Error: number of repetitions must be greater than zero.\n"; - return 1; - } - - const auto real_type_str = - (argc >= 9) ? std::string(argv[8]) : std::string("double"); - - if (real_type_str != "float" && real_type_str != "double") { - std::cerr << "Error: invalid real type: " << real_type_str - << "\n Valid option includes: float or " - "double. \n"; - } - - const auto storage_type = - (argc >= 10) ? std::string(argv[9]) : std::string{"cuda_um_btas_varray"}; - - if (storage_type != "cuda_um_btas_varray" && - storage_type != "cuda_um_thrust_vector" && - storage_type != "cpu_cuda_vector") { - std::cerr << "Error: invalid storage type: " << storage_type - << "\n Valid option includes: cuda_um_vector or " - "cuda_um_btas_varray or cuda_um_thrust_vector " - "or cpu_cuda_vector. 
\n"; - } - std::cout << "Storage type: " << storage_type << "<" << real_type_str << ">" - << std::endl; - // auto to_bool = [](const std::string &str) { - // return (str == "true" || str == "True" || str == "TRUE" || str == "1" || - // str == "yes" || str == "Yes" || str == "YES"); - // }; - - int driverVersion, runtimeVersion; - auto error = cudaDriverGetVersion(&driverVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaDriverGetVersion) = " << error << std::endl; - } - error = cudaRuntimeGetVersion(&runtimeVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl; - } - std::cout << "CUDA {driver,runtime} versions = " << driverVersion << "," - << runtimeVersion << std::endl; - - { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); - - if (num_cuda_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); - } - - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); - - int mpi_size = world.size(); - int mpi_rank = world.rank(); - - for (int i = 0; i < mpi_size; i++) { - if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; - } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl - << " managedMemory = " << prop.managedMemory << std::endl - << " singleToDoublePrecisionPerfRatio = " - << prop.singleToDoublePrecisionPerfRatio << std::endl; - int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); - std::cout << " attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); - std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; - } - size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << " {total,free} memory = {" << total_mem << "," << free_mem - << "}" << std::endl; - } - world.gop.fence(); - } - } // print device properties - - // if (storage_type == "cpu_cuda_vector") { - // if (real_type_str == "double") - // do_main_body>(world, Nm, Bm, Nn, - // Bn, - // Nk, Bk, nrepeat); - // else - // do_main_body>(world, Nm, Bm, Nn, - // Bn, - // Nk, Bk, nrepeat); - // } else if (storage_type == "cuda_um_btas_varray") { - if (storage_type == "cuda_um_btas_varray") { - if (real_type_str == "double") - do_main_body>( - world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - else - do_main_body>(world, Nm, Bm, Nn, - Bn, Nk, Bk, nrepeat); - } - // else if (storage_type == "cuda_um_thrust_vector") { - // if (real_type_str == "double") - // do_main_body>( - // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - // else - // do_main_body>( - // world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat); - // } - else { - throw std::runtime_error("Invalid storage type!\n"); - } - - return 0; -} - -int main(int argc, char *argv[]) { - try { - try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { - std::cout << ex.what() << std::endl; - - size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," 
- << free_mem << "}" << std::endl; - } catch (std::exception &ex) { - std::cout << ex.what() << std::endl; - } catch (...) { - std::cerr << "unknown exception" << std::endl; - } - - return 0; -} diff --git a/examples/demo/CMakeLists.txt b/examples/demo/CMakeLists.txt index c4c533cbf0..c2da9cb36e 100644 --- a/examples/demo/CMakeLists.txt +++ b/examples/demo/CMakeLists.txt @@ -16,8 +16,10 @@ # along with this program. If not, see . # -# Create the ta_fock_build executable - -# Add the demo executable +# Standard TA demo to accompany the keynote slides add_ta_executable(demo "demo.cpp" "tiledarray") add_dependencies(examples-tiledarray demo) + +# TA demo snippets for the paper +add_ta_executable(demo2 "demo2.cpp" "tiledarray") +add_dependencies(examples-tiledarray demo2) diff --git a/examples/demo/demo.cpp b/examples/demo/demo.cpp index d6c1612d95..05f9a25bf5 100644 --- a/examples/demo/demo.cpp +++ b/examples/demo/demo.cpp @@ -39,7 +39,7 @@ auto make_tile(const TA::Range &range) { int main(int argc, char *argv[]) { using namespace std; - std::srand(2017); + TA::srand(2017); TA::World &world = TA::initialize(argc, argv); using namespace TA; @@ -88,7 +88,6 @@ int main(int argc, char *argv[]) { SparseShape shape(shape_tensor, TR); TSpArrayD a1(world, TR, shape); a1.fill_random(); // for deterministic fill: - // TA::srand(seed); // a1.fill_random(); cout << "a1:\n" << a1 << endl; world.gop.fence(); diff --git a/examples/demo/demo2.cpp b/examples/demo/demo2.cpp new file mode 100644 index 0000000000..7ef5ca45c8 --- /dev/null +++ b/examples/demo/demo2.cpp @@ -0,0 +1,158 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2023 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ *
+ */
+
+#ifndef EXAMPLES_DEMO_DEMO2_CPP_
+#define EXAMPLES_DEMO_DEMO2_CPP_
+
+#include <cassert>
+#include <iostream>
+#include <memory>
+
+#include <tiledarray.h>
+
+int main(int argc, char* argv[]) {
+  using namespace std;
+
+  TA::srand(2017);
+  TA::World& world = TA::initialize(argc, argv);
+
+  using namespace TA;
+
+  // requires compiler new enough to support unicode characters in variable
+  // names
+#ifndef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+#ifdef TILEDARRAY_CXX_COMPILER_IS_GCC
+#if __GNUC__ >= 10
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#elif !defined(TILEDARRAY_CXX_COMPILER_IS_ICC)
+#define TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES 1
+#endif
+#endif  // !defined(TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES)
+
+#ifdef TILEDARRAY_CXX_COMPILER_SUPPORTS_UNICODE_VARIABLES
+
+  // $\rho \equiv \mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  Range ρ{{1, 11}, {-1, 9}};
+  // lower bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.lobound() == Index{1, -1}));
+  // upper bound of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.upbound() == Index{11, 9}));
+  // extent of $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert((ρ.extent() == Index{10, 10}));
+  // 1st dimension of ρ is $\mathbb{Z}_{1,11}$
+  assert((ρ.dim(0) == Range1{1, 11}));
+  // 2nd dimension of ρ is $\mathbb{Z}_{-1,9}$
+  assert((ρ.dim(1) == Range1{-1, 9}));
+  // the number of elements in $\mathbb{Z}_{1,11} \otimes \mathbb{Z}_{-1,9}$
+  assert(ρ.volume() == 100);
+  // row-major order
+  assert((ρ.stride() == Index{10, 1}));
+  assert((ρ.ordinal({1, -1}) == 0));
+  assert((ρ.ordinal({1, 0}) == 1));
+  assert((ρ.ordinal({10, 8}) + 1 == ρ.volume()));
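+  // NB for this ρ (row-major, stride {10,1}, lobound {1,-1}):
+  //    ordinal(idx) = (idx[0] - 1) * 10 + (idx[1] + 1)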
99 " + for (auto&& v : t1) cout << v << " "; + // same as $\code{t0}$, using existing buffer + shared_ptr v(new double[ρ.volume()]); + TensorD t2(ρ, v); // t2 and v co-manage buffer lifetime + v[0] = 1.; + assert(t2(1, -1) == 1.); + // same as $\code{t0}$, using existing (unmanaged) buffer + auto t3 = make_map(v.get(), ρ); + v[0] = 2.; + assert(t3(1, -1) == 2.); + // Tensor has shallow-copy semantics + auto t4 = t0; + t0(1, -1) = 3.; + assert(t4(1, -1) == 3.); + + // clang-format on + + // default instance of $\code{DistArray}$ is + // a {\em dense} array of $\code{double}$s + // NB can use TArrayD instead of DistArray<> + DistArray<> a0(τ); + a0.fill(1.); // fill $\code{da}$ with 1s + // every tile exists in a dense array + assert(!a0.is_zero({0, 0})); + // grab a ${\em future}$ to the {0,0} tile + auto t00 = a0.find({0, 0}); + + // shape of a {\em sparse} array over τ + // tiles with even ordinals ({0,0}, {0,2}, {1,1}) are zero + SparseShape s(TensorF(τ.tiles_range(), {0, 1, 0, 1, 0, 1}), τ); + // a sparse array of $\code{double}$s + // TSpArrayX $\equiv$ DistArray + TSpArrayD a1(τ, s); + // only some tiles are nonzero in sparse array + assert(a1.is_zero({0, 0})); + assert(!a1.is_zero({0, 1})); + +#endif // defined(TILEDARRAY_CXX_COMPILER_SUPPORT_UNICODE_VARIABLES) + + return 0; +} + +#endif /* EXAMPLES_DEMO_DEMO2_CPP_ */ diff --git a/examples/cuda/CMakeLists.txt b/examples/device/CMakeLists.txt similarity index 72% rename from examples/cuda/CMakeLists.txt rename to examples/device/CMakeLists.txt index 5d7f56c86e..bab6aa8e05 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/device/CMakeLists.txt @@ -23,14 +23,14 @@ # -if(CUDA_FOUND) +if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_HIP) - foreach(_exec cuda_librett cuda_task ta_dense_cuda ta_cc_abcd_cuda ta_vector_cuda ta_reduce_cuda) + foreach(_exec device_task ta_dense_device ta_cc_abcd_device ta_vector_device ta_reduce_device) - # Add executable - add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") - add_dependencies(examples-tiledarray ${_exec}) + # Add executable + add_ta_executable(${_exec} "${_exec}.cpp" "tiledarray") + add_dependencies(examples-tiledarray ${_exec}) - endforeach() + endforeach() -endif(CUDA_FOUND) +endif() diff --git a/examples/cuda/cuda_task.cpp b/examples/device/device_task.cpp similarity index 51% rename from examples/cuda/cuda_task.cpp rename to examples/device/device_task.cpp index a019523b6e..bfd75ac51c 100644 --- a/examples/cuda/cuda_task.cpp +++ b/examples/device/device_task.cpp @@ -2,8 +2,10 @@ // Created by Chong Peng on 11/14/18. 
 //
 
-#include
-#include
+#include
+#include
+#include
+
 #include
 
 using value_type = double;
@@ -13,8 +15,8 @@ using tile_type = TA::Tile;
 
 /// verify the elements in tile are equal to value
 void verify(const tile_type& tile, value_type value, std::size_t index) {
   //  const auto size = tile.size();
-  std::string message = "verify Tensor: " + std::to_string(index) + '\n';
-  std::cout << message;
+  //  std::string message = "verify Tensor: " + std::to_string(index) + '\n';
+  //  std::cout << message;
   for (auto& num : tile) {
     if (num != value) {
       std::string error("Error: " + std::to_string(num) + " " +
@@ -26,50 +28,45 @@ void verify(const tile_type& tile, value_type value, std::size_t index) {
   }
 }
 
-tile_type scale(const tile_type& arg, value_type a, const cudaStream_t* stream,
-                std::size_t index) {
-  CudaSafeCall(
-      cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()));
+tile_type scale(const tile_type& arg, value_type a,
+                TiledArray::device::Stream stream, std::size_t index) {
   /// make result Tensor
   using Storage = typename tile_type::tensor_type::storage_type;
   Storage result_storage;
   auto result_range = arg.range();
-  make_device_storage(result_storage, arg.size(), *stream);
+  make_device_storage(result_storage, arg.size(), stream);
 
   typename tile_type::tensor_type result(std::move(result_range),
                                          std::move(result_storage));
 
   /// copy the original Tensor
-  const auto& handle = TiledArray::cuBLASHandlePool::handle();
-  CublasSafeCall(cublasSetStream(handle, *stream));
+  auto& queue = TiledArray::BLASQueuePool::queue(stream);
 
-  CublasSafeCall(TiledArray::cublasCopy(handle, result.size(), arg.data(), 1,
-                                        device_data(result.storage()), 1));
+  blas::copy(result.size(), arg.data(), 1, device_data(result.storage()), 1,
+             queue);
 
-  CublasSafeCall(TiledArray::cublasScal(handle, result.size(), &a,
-                                        device_data(result.storage()), 1));
+  blas::scal(result.size(), a, device_data(result.storage()), 1, queue);
 
-  // cudaStreamSynchronize(stream);
+  // std::stringstream stream_str;
+  // stream_str << stream;
+  // std::string message = "run scale on Tensor: " + std::to_string(index) +
+  //                       "on stream: " + stream_str.str() + '\n';
+  // std::cout << message;
 
-  TiledArray::synchronize_stream(stream);
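+  // NB (per its name): sync_madness_task_with() ties completion of the
+  // enclosing MADNESS task to this stream, i.e. the task is not marked done
+  // until the copy/scal work enqueued above has finished on the device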
+  TiledArray::device::sync_madness_task_with(stream);
 
-  // std::stringstream stream_str;
-  // stream_str << *stream;
-  // std::string message = "run scale on Tensor: " + std::to_string(index) + "
-  // on stream: " + stream_str.str() +'\n'; std::cout << message;
 
   return tile_type(std::move(result));
 }
 
-void process_task(madness::World* world,
-                  const std::vector<cudaStream_t>* streams, std::size_t ntask) {
+void process_task(madness::World* world, std::size_t ntask) {
   const std::size_t iter = 50;
   const std::size_t M = 1000;
   const std::size_t N = 1000;
 
-  std::size_t n_stream = streams->size();
+  std::size_t n_stream = TiledArray::deviceEnv::instance()->num_streams_total();
 
   for (std::size_t i = 0; i < iter; i++) {
-    auto& stream = (*streams)[i % n_stream];
+    auto stream = TiledArray::deviceEnv::instance()->stream(i % n_stream);
 
     TiledArray::Range range{M, N};
 
@@ -78,11 +75,11 @@ void process_task(madness::World* world,
     const double scale_factor = 2.0;
 
     // function pointer to the scale function to call
-    tile_type (*scale_fn)(const tile_type&, double, const cudaStream_t*,
+    tile_type (*scale_fn)(const tile_type&, double, TiledArray::device::Stream,
                           std::size_t) = &::scale;
 
-    madness::Future<tile_type> scale_future = madness::add_cuda_task(
-        *world, ::scale, tensor, scale_factor, &stream, ntask * iter + i);
+    madness::Future<tile_type> scale_future = madness::add_device_task(
+        *world, ::scale, tensor, scale_factor, stream, ntask * iter + i);
 
     /// this should not start until scale_taskfn is finished
     world->taskq.add(verify, scale_future, scale_factor, ntask * iter + i);
   }
 }
 
@@ -92,27 +89,15 @@
 int try_main(int argc, char** argv) {
   auto& world = TiledArray::get_default_world();
 
-  const std::size_t n_stream = 5;
   const std::size_t n_tasks = 5;
 
-  std::vector<cudaStream_t> streams(n_stream);
-  for (auto& stream : streams) {
-    // create the streams
-    CudaSafeCall(cudaStreamCreate(&stream));
-    //    std::cout << "stream: " << stream << "\n";
-  }
-
   // add process_task to different tasks/threads
   for (auto i = 0; i < n_tasks; i++) {
-    world.taskq.add(process_task, &world, &streams, i);
+    world.taskq.add(process_task, &world, i);
   }
 
   world.gop.fence();
 
-  for (auto& stream : streams) {
-    // create the streams
-    cudaStreamDestroy(stream);
-  }
   return 0;
 }
 
@@ -121,12 +106,12 @@ int main(int argc, char* argv[]) {
   try {
     // Initialize runtime
     try_main(argc, argv);
-  } catch (thrust::system::detail::bad_alloc& ex) {
+  } catch (std::exception& ex) {
     std::cout << ex.what() << std::endl;
 
     size_t free_mem, total_mem;
-    auto result = cudaMemGetInfo(&free_mem, &total_mem);
-    std::cout << "CUDA memory stats: {total,free} = {" << total_mem << ","
+    auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem);
+    std::cout << "device memory stats: {total,free} = {" << total_mem << ","
               << free_mem << "}" << std::endl;
   } catch (...) {
     std::cerr << "unknown exception" << std::endl;
diff --git a/examples/cuda/ta_cc_abcd_cuda.cpp b/examples/device/ta_cc_abcd_device.cpp
similarity index 95%
rename from examples/cuda/ta_cc_abcd_cuda.cpp
rename to examples/device/ta_cc_abcd_device.cpp
index 0887c90562..02d7781b12 100644
--- a/examples/cuda/ta_cc_abcd_cuda.cpp
+++ b/examples/device/ta_cc_abcd_device.cpp
@@ -17,7 +17,7 @@
  *
  */
 
-#include
+#include
 #include
 #include
 #include
@@ -182,17 +182,17 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   const double flops_per_fma =
       (complex_T ? 8 : 2);  // 1 multiply takes 6/1 flops for complex/real
                             // 1 add takes 2/1 flops for complex/real
-  const double n_gflop = flops_per_fma * std::pow(n_occ, 2) *
-                         std::pow(n_uocc, 4) / std::pow(1024., 3);
+  const double n_gflop =
+      flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9;
 
-  using CUDATile =
-      btas::Tensor<T, TA::Range, TiledArray::cuda_um_btas_varray<T>>;
-  using CUDAMatrix = TA::DistArray<TA::Tile<CUDATile>>;
+  using deviceTile =
+      btas::Tensor<T, TA::Range, TiledArray::device_um_btas_varray<T>>;
+  using deviceMatrix = TA::DistArray<TA::Tile<deviceTile>>;
 
   // Construct tensors
-  CUDAMatrix t2(world, trange_oovv);
-  CUDAMatrix v(world, trange_vvvv);
-  CUDAMatrix t2_v;
+  deviceMatrix t2(world, trange_oovv);
+  deviceMatrix v(world, trange_vvvv);
+  deviceMatrix t2_v;
   // To validate, fill input tensors with random data, otherwise just with 1s
   //  if (do_validate) {
   //    rand_fill_array(t2);
@@ -245,7 +245,7 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ,
   auto result = dot_length * 0.2 * 0.3;
 
   auto verify = [&world, &threshold, &result,
-                 &dot_length](const TA::Tile<CUDATile>& tile) {
+                 &dot_length](const TA::Tile<deviceTile>& tile) {
     auto n_elements = tile.size();
     for (std::size_t i = 0; i < n_elements; i++) {
       double abs_err = fabs(tile[i] - result);
diff --git a/examples/device/ta_dense_device.cpp b/examples/device/ta_dense_device.cpp
new file mode 100644
index 0000000000..30333c7edc
--- /dev/null
+++ b/examples/device/ta_dense_device.cpp
@@ -0,0 +1,378 @@
+/*
+ *  This file is a part of TiledArray.
+ *  Copyright (C) 2018  Virginia Tech
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+// clang-format off
+#include <tiledarray.h>
+#include <TiledArray/device/btas_um_tensor.h>
+// clang-format on
+
+#ifdef TILEDARRAY_HAS_CUDA
+#include <cuda_profiler_api.h>
+#endif  // TILEDARRAY_HAS_CUDA
+
+template <typename Storage>
+void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
+                  const long Nn, const long Bn, const long Nk, const long Bk,
+                  const long nrepeat) {
+  using T = TiledArray::detail::numeric_t<Storage>;
+  using RT = TiledArray::detail::scalar_t<Storage>;
+  constexpr auto complex_T = TiledArray::detail::is_complex_v<T>;
+
+  const std::int64_t nflops =
+      (complex_T ? 8 : 2)  // 1 multiply takes 6/1 flops for complex/real
+                           // 1 add takes 2/1 flops for complex/real
+      * static_cast<std::int64_t>(Nn) * static_cast<std::int64_t>(Nm) *
+      static_cast<std::int64_t>(Nk);
+
+  // Construct TiledRange
+  std::vector<long> blocking_m;
+  for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
+  const std::size_t Tm = blocking_m.size() - 1;
+
+  std::vector<long> blocking_n;
+  for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
+  const std::size_t Tn = blocking_n.size() - 1;
+
+  std::vector<long> blocking_k;
+  for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
+  const std::size_t Tk = blocking_k.size() - 1;
+
+  if (world.rank() == 0)
+    std::cout << "TiledArray: dense matrix multiply test...\n"
+              << "Number of nodes = " << world.size()
+              << "\nSize of A = " << Nm << "x" << Nk << " ("
+              << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
+              << "\nSize of (largest) A block = " << Bm << "x" << Bk
+              << "\nSize of B = " << Nk << "x" << Nn << " ("
+              << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
+              << "\nSize of (largest) B block = " << Bk << "x" << Bn
+              << "\nSize of C = " << Nm << "x" << Nn << " ("
+              << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
+              << "\nSize of (largest) C block = " << Bm << "x" << Bn
+              << "\n# of blocks of C = " << Tm * Tn
+              << "\nAverage # of blocks of C/node = "
+              << double(Tm * Tn) / double(world.size()) << "\n";
+
+  // Structure of c
+  std::vector<TiledArray::TiledRange1> blocking_C;
+  blocking_C.reserve(2);
+  blocking_C.push_back(
+      TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end()));
+  blocking_C.push_back(
+      TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end()));
+
+  // Structure of a
+  std::vector<TiledArray::TiledRange1> blocking_A;
+  blocking_A.reserve(2);
+  blocking_A.push_back(
+      TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end()));
+  blocking_A.push_back(
+      TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end()));
+
+  // Structure of b
+  std::vector<TiledArray::TiledRange1> blocking_B;
+  blocking_B.reserve(2);
+  blocking_B.push_back(
+      TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end()));
+  blocking_B.push_back(
+      TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end()));
+
+  TiledArray::TiledRange  // TRange for c
+      trange_c(blocking_C.begin(), blocking_C.end());
+
+  TiledArray::TiledRange  // TRange for a
+      trange_a(blocking_A.begin(), blocking_A.end());
+
+  TiledArray::TiledRange  // TRange for b
+      trange_b(blocking_B.begin(), blocking_B.end());
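+  // NB: Storage below is assumed to be a device unified-memory vector type
+  // (e.g. TiledArray::device_um_btas_varray<T>, cf. the dispatch in
+  // try_main), so DeviceTile data is addressable from both host and device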
+
+  using DeviceTile = btas::Tensor<T, TA::Range, Storage>;
+  using DeviceMatrix = TA::DistArray<TA::Tile<DeviceTile>>;
+  using PinnedTile =
+      btas::Tensor>>;
+  using PinnedMatrix = TA::DistArray<TA::Tile<PinnedTile>>;
+  // using TAMatrix = TA::DistArray>;
+
+  DeviceMatrix c(world, trange_c);
+  auto val_a = 0.03;
+  auto val_b = 0.02;
+
+  {
+    // Construct and initialize arrays
+
+    PinnedMatrix a_host(world, trange_a);
+    PinnedMatrix b_host(world, trange_b);
+
+    a_host.fill(val_a);
+    b_host.fill(val_b);
+    DeviceMatrix a = TA::ta_tensor_to_um_tensor<TA::Tile<DeviceTile>>(a_host);
+    DeviceMatrix b = TA::ta_tensor_to_um_tensor<TA::Tile<DeviceTile>>(b_host);
+
+    world.gop.fence();
+
+    //    TA::to_device(a);
+    //    TA::to_device(b);
+
+    //    c("m,n") = a("m,k") * b("k,n");
+
+#ifdef TILEDARRAY_HAS_CUDA
+    // start profiler
+    cudaProfilerStart();
+#endif  // TILEDARRAY_HAS_CUDA
+
+    double total_time = 0.0;
+    double total_gflop_rate = 0.0;
+
+    // Do matrix multiplication
+    for (int i = 0; i < nrepeat; ++i) {
+      double iter_time_start = madness::wall_time();
+      //      c("m,n") = a("m,k") * b("k,n") + a("m,n") - b("m,n");
+      c("m,n") = a("m,k") * b("k,n");
+      c.world().gop.fence();  // fence since GEMM can return early
+      double iter_time_stop = madness::wall_time();
+      const double iter_time = iter_time_stop - iter_time_start;
+      total_time += iter_time;
+      const double gflop_rate = double(nflops) / (iter_time * 1.e9);
+      total_gflop_rate += gflop_rate;
+      if (world.rank() == 0)
+        std::cout << "Iteration " << i + 1 << " wall time: " << iter_time
+                  << " GFLOPS=" << gflop_rate << "\n";
+    }
+
+#ifdef TILEDARRAY_HAS_CUDA
+    // stop profiler
+    cudaProfilerStop();
+#endif  // TILEDARRAY_HAS_CUDA
+
+    if (world.rank() == 0)
+      std::cout << "Average wall time = " << total_time / double(nrepeat)
+                << " sec\nAverage GFLOPS = "
+                << total_gflop_rate / double(nrepeat) << "\n";
+  }
+
+  double threshold = std::numeric_limits<RT>::epsilon();
+  auto dot_length = Nk;
+  //  auto result = dot_length * val_a * val_b + val_a - val_b;
+  T result;
+  if constexpr (complex_T) {
+    result = T(dot_length * val_a * val_b, 0.);
+  } else
+    result = dot_length * val_a * val_b;
+
+  auto verify = [&world, &threshold, &result,
+                 &dot_length](TA::Tile<DeviceTile> &tile) {
+    auto n_elements = tile.size();
+    for (std::size_t i = 0; i < n_elements; i++) {
+      double abs_err = std::abs(tile[i] - result);
+      //      double abs_val = fabs(tile[i]);
+      double rel_err = abs_err / std::abs(result) / dot_length;
+      if (rel_err > threshold) {
+        auto to_string = [](const auto &v) {
+          constexpr bool complex_T =
+              TiledArray::detail::is_complex_v<std::decay_t<decltype(v)>>;
+          if constexpr (complex_T) {
+            std::string result;
+            result = "{" + std::to_string(v.real()) + "," +
+                     std::to_string(v.imag()) + "}";
+            return result;
+          } else
+            return std::to_string(v);
+        };
+        std::cout << "Node: " << world.rank() << " Tile: " << tile.range()
+                  << " id: " << i
+                  << std::string(" gpu: " + to_string(tile[i]) +
+                                 " cpu: " + to_string(result) + "\n");
+        break;
+      }
+    }
+  };
+
+  for (auto iter = c.begin(); iter != c.end(); iter++) {
+    world.taskq.add(verify, c.find(iter.index()));
+  }
+
+  world.gop.fence();
+
+  if (world.rank() == 0) {
+    std::cout << "Verification Passed" << std::endl;
+  }
+}
+
+int try_main(int argc, char **argv) {
+  // Initialize runtime
+  TiledArray::World &world = TA_SCOPED_INITIALIZE(argc, argv);
+
+  // Get command line arguments
+  if (argc < 7) {
+    std::cout << "multiplies A(Nm,Nk) * B(Nk,Nn), with dimensions m, n, and k "
+                 "blocked by Bm, Bn, and Bk, respectively"
+              << std::endl
+              << "Usage: " << argv[0]
+              << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] "
+                 "[storage type = device_um_btas_varray]\n";
+    return 0;
+  }
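+  // an illustrative invocation (hypothetical sizes):
+  //   ta_dense_device 1024 128 1024 128 1024 128 5 double device_um_btas_varray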
and Bk, respectively" + << std::endl + << "Usage: " << argv[0] + << " Nm Bm Nn Bn Nk Bk [# of repetitions = 5] [scalar = double] " + "[storage type = device_um_btas_varray]\n"; + return 0; + } + const long Nm = atol(argv[1]); + const long Bm = atol(argv[2]); + const long Nn = atol(argv[3]); + const long Bn = atol(argv[4]); + const long Nk = atol(argv[5]); + const long Bk = atol(argv[6]); + if (Nm <= 0 || Nn <= 0 || Nk <= 0) { + std::cerr << "Error: dimensions must be greater than zero.\n"; + return 1; + } + if (Bm <= 0 || Bn <= 0 || Bk <= 0) { + std::cerr << "Error: block sizes must be greater than zero.\n"; + return 1; + } + const long nrepeat = (argc >= 8 ? atol(argv[7]) : 5); + if (nrepeat <= 0) { + std::cerr << "Error: number of repetitions must be greater than zero.\n"; + return 1; + } + + const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double"); + if (scalar_type_str != "double" && scalar_type_str != "float" && + scalar_type_str != "zdouble" && scalar_type_str != "zfloat") { + std::cerr << "Error: invalid real type " << scalar_type_str << ".\n"; + std::cerr << " valid real types are \"double\", \"float\", " + "\"zdouble\", and \"zfloat\".\n"; + return 1; + } + + const auto storage_type = (argc >= 10) ? std::string(argv[9]) + : std::string{"device_um_btas_varray"}; + + if (storage_type != "device_um_btas_varray") { + std::cerr << "Error: invalid storage type: " << storage_type + << "\n Valid option includes: " + "device_um_btas_varray \n"; + } + std::cout << "Storage type: " << storage_type << "<" << scalar_type_str << ">" + << std::endl; + // auto to_bool = [](const std::string &str) { + // return (str == "true" || str == "True" || str == "TRUE" || str == "1" || + // str == "yes" || str == "Yes" || str == "YES"); + // }; + + int driverVersion, runtimeVersion; + auto error = TiledArray::device::driverVersion(&driverVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(DriverGetVersion) = " << error << std::endl; + } + error = TiledArray::device::runtimeVersion(&runtimeVersion); + if (error != TiledArray::device::Success) { + std::cout << "error(RuntimeGetVersion) = " << error << std::endl; + } + std::cout << "device {driver,runtime} versions = " << driverVersion << "," + << runtimeVersion << std::endl; + + { // print device properties + int num_devices = TA::deviceEnv::instance()->num_visible_devices(); + + if (num_devices <= 0) { + throw std::runtime_error("No GPUs Found!\n"); + } + + const int device_id = TA::deviceEnv::instance()->current_device_id(); + + int mpi_size = world.size(); + int mpi_rank = world.rank(); + + for (int i = 0; i < mpi_size; i++) { + if (i == mpi_rank) { + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TiledArray::device::deviceProp_t prop; + auto error = TiledArray::device::getDeviceProperties(&prop, device_id); + if (error != TiledArray::device::Success) { + std::cout << "error(GetDeviceProperties) = " << error << std::endl; + } + std::cout << "Device #" << device_id << ": " << prop.name << std::endl + << " managedMemory = " << prop.managedMemory << std::endl; + int result; + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrUnifiedAddressing, device_id); + std::cout << " attrUnifiedAddressing = " << result << std::endl; + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrConcurrentManagedAccess, + device_id); + std::cout << " attrConcurrentManagedAccess = " << result << std::endl; + error = 
+        if (error != TiledArray::device::Success) {
+          std::cout << "error(device::setDevice) = " << error << std::endl;
+        }
+        size_t free_mem, total_mem;
+        error = TiledArray::device::memGetInfo(&free_mem, &total_mem);
+        std::cout << "  {total,free} memory = {" << total_mem << "," << free_mem
+                  << "}" << std::endl;
+      }
+      world.gop.fence();
+    }
+  }  // print device properties
+
+  if (storage_type == "device_um_btas_varray") {
+    if (scalar_type_str == "double")
+      do_main_body<TiledArray::device_um_btas_varray<double>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "float")
+      do_main_body<TiledArray::device_um_btas_varray<float>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zdouble")
+      do_main_body<TiledArray::device_um_btas_varray<std::complex<double>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else if (scalar_type_str == "zfloat")
+      do_main_body<TiledArray::device_um_btas_varray<std::complex<float>>>(
+          world, Nm, Bm, Nn, Bn, Nk, Bk, nrepeat);
+    else {
+      abort();  // unreachable
+    }
+  } else {
+    throw std::runtime_error("Invalid storage type!\n");
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    try_main(argc, argv);
+  } catch (std::exception &ex) {
+    std::cout << ex.what() << std::endl;
+
+    size_t free_mem, total_mem;
+    auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem);
+    std::cout << "device memory stats: {total,free} = {" << total_mem << ","
+              << free_mem << "}" << std::endl;
+  } catch (...) {
+    std::cerr << "unknown exception" << std::endl;
+  }
+
+  return 0;
+}
diff --git a/examples/cuda/ta_reduce_cuda.cpp b/examples/device/ta_reduce_device.cpp
similarity index 83%
rename from examples/cuda/ta_reduce_cuda.cpp
rename to examples/device/ta_reduce_device.cpp
index e453069892..96d1bdbda4 100644
--- a/examples/cuda/ta_reduce_cuda.cpp
+++ b/examples/device/ta_reduce_device.cpp
@@ -17,15 +17,9 @@
  *
  */
 
-#define CUDA_API_PER_THREAD_DEFAULT_STREAM
-
-#include
-
-// clang-format off
-
 #include
-#include
-// clang-format on
+
+#include
 
 template <typename T>
 void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
@@ -237,7 +231,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm,
 }
 
 template <typename T>
-using cudaTile = TiledArray::Tile<TiledArray::btasUMTensorVarray<T>>;
+using deviceTile = TiledArray::Tile<TiledArray::btasUMTensorVarray<T>>;
 
 int try_main(int argc, char **argv) {
   // Initialize runtime
@@ -286,57 +280,56 @@ int try_main(int argc, char **argv) {
   }
 
   int driverVersion, runtimeVersion;
-  auto error = cudaDriverGetVersion(&driverVersion);
-  if (error != cudaSuccess) {
-    std::cout << "error(cudaDriverGetVersion) = " << error << std::endl;
+  auto error = TiledArray::device::driverVersion(&driverVersion);
+  if (error != TiledArray::device::Success) {
+    std::cout << "error(driverVersion) = " << error << std::endl;
   }
-  error = cudaRuntimeGetVersion(&runtimeVersion);
-  if (error != cudaSuccess) {
-    std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl;
+  error = TiledArray::device::runtimeVersion(&runtimeVersion);
+  if (error != TiledArray::device::Success) {
+    std::cout << "error(runtimeVersion) = " << error << std::endl;
   }
-  std::cout << "CUDA {driver,runtime} versions = " << driverVersion << ","
+  std::cout << "device {driver,runtime} versions = " << driverVersion << ","
             << runtimeVersion << std::endl;
 
   {  // print device properties
-    int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices();
+    int num_devices = TA::deviceEnv::instance()->num_visible_devices();
 
-    if (num_cuda_devices <= 0) {
-      throw std::runtime_error("No CUDA-Enabled GPUs Found!\n");
+    if (num_devices <= 0) {
+      throw std::runtime_error("No GPUs Found!\n");
    }
 
-    int cuda_device_id =
TA::cudaEnv::instance()->current_cuda_device_id(); + const int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); for (int i = 0; i < mpi_size; i++) { if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TiledArray::device::deviceProp_t prop; + auto error = TiledArray::device::getDeviceProperties(&prop, device_id); + if (error != TiledArray::device::Success) { + std::cout << "error(getDeviceProperties) = " << error << std::endl; } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl + std::cout << "Device #" << device_id << ": " << prop.name << std::endl << " managedMemory = " << prop.managedMemory << std::endl << " singleToDoublePrecisionPerfRatio = " << prop.singleToDoublePrecisionPerfRatio << std::endl; int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrUnifiedAddressing, device_id); std::cout << " attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); + error = TiledArray::device::deviceGetAttribute( + &result, TiledArray::device::DevAttrConcurrentManagedAccess, + device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + error = TiledArray::device::setDevice(device_id); + if (error != TiledArray::device::Success) { + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); + error = TiledArray::device::memGetInfo(&free_mem, &total_mem); std::cout << " {total,free} memory = {" << total_mem << "," << free_mem << "}" << std::endl; } @@ -348,7 +341,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -359,7 +352,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -373,15 +366,13 @@ int try_main(int argc, char **argv) { int main(int argc, char *argv[]) { try { try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { + } catch (std::exception &ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TiledArray::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << free_mem << "}" << std::endl; - } catch (std::exception &ex) { - std::cout << ex.what() << std::endl; } catch (...) 
{ std::cerr << "unknown exception" << std::endl; } diff --git a/examples/cuda/ta_vector_cuda.cpp b/examples/device/ta_vector_device.cpp similarity index 83% rename from examples/cuda/ta_vector_cuda.cpp rename to examples/device/ta_vector_device.cpp index 1593a68e8b..4507ee64f7 100644 --- a/examples/cuda/ta_vector_cuda.cpp +++ b/examples/device/ta_vector_device.cpp @@ -17,17 +17,9 @@ * */ -#define CUDA_API_PER_THREAD_DEFAULT_STREAM - -#include - -// clang-format off - -#include -#include -#include "TiledArray/cuda/cpu_cuda_vector.h" +#include #include -// clang-format on +#include template void do_main_body(TiledArray::World &world, const long Nm, const long Bm, @@ -255,7 +247,7 @@ void do_main_body(TiledArray::World &world, const long Nm, const long Bm, } template -using cudaTile = TiledArray::Tile>; +using deviceTile = TiledArray::Tile>; int try_main(int argc, char **argv) { // Initialize runtime @@ -304,57 +296,55 @@ int try_main(int argc, char **argv) { } int driverVersion, runtimeVersion; - auto error = cudaDriverGetVersion(&driverVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaDriverGetVersion) = " << error << std::endl; + auto error = TA::device::driverVersion(&driverVersion); + if (error != TA::device::Success) { + std::cout << "error(DriverGetVersion) = " << error << std::endl; } - error = cudaRuntimeGetVersion(&runtimeVersion); - if (error != cudaSuccess) { - std::cout << "error(cudaRuntimeGetVersion) = " << error << std::endl; + error = TA::device::runtimeVersion(&runtimeVersion); + if (error != TA::device::Success) { + std::cout << "error(RuntimeGetVersion) = " << error << std::endl; } - std::cout << "CUDA {driver,runtime} versions = " << driverVersion << "," + std::cout << "device {driver,runtime} versions = " << driverVersion << "," << runtimeVersion << std::endl; { // print device properties - int num_cuda_devices = TA::cudaEnv::instance()->num_cuda_devices(); + int num_devices = TA::deviceEnv::instance()->num_visible_devices(); - if (num_cuda_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); + if (num_devices <= 0) { + throw std::runtime_error("No GPUs Found!\n"); } - int cuda_device_id = TA::cudaEnv::instance()->current_cuda_device_id(); + const int device_id = TA::deviceEnv::instance()->current_device_id(); int mpi_size = world.size(); int mpi_rank = world.rank(); for (int i = 0; i < mpi_size; i++) { if (i == mpi_rank) { - std::cout << "CUDA Device Information for MPI Process Rank: " - << mpi_rank << std::endl; - cudaDeviceProp prop; - auto error = cudaGetDeviceProperties(&prop, cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaGetDeviceProperties) = " << error - << std::endl; + std::cout << "Device Information for MPI Process Rank: " << mpi_rank + << std::endl; + TA::device::deviceProp_t prop; + auto error = TA::device::getDeviceProperties(&prop, device_id); + if (error != TA::device::Success) { + std::cout << "error(GetDeviceProperties) = " << error << std::endl; } - std::cout << "Device #" << cuda_device_id << ": " << prop.name - << std::endl + std::cout << "Device #" << device_id << ": " << prop.name << std::endl << " managedMemory = " << prop.managedMemory << std::endl << " singleToDoublePrecisionPerfRatio = " << prop.singleToDoublePrecisionPerfRatio << std::endl; int result; - error = cudaDeviceGetAttribute(&result, cudaDevAttrUnifiedAddressing, - cuda_device_id); + error = TA::device::deviceGetAttribute( + &result, TA::device::DevAttrUnifiedAddressing, device_id); std::cout << " 
attrUnifiedAddressing = " << result << std::endl; - error = cudaDeviceGetAttribute( - &result, cudaDevAttrConcurrentManagedAccess, cuda_device_id); + error = TA::device::deviceGetAttribute( + &result, TA::device::DevAttrConcurrentManagedAccess, device_id); std::cout << " attrConcurrentManagedAccess = " << result << std::endl; - error = cudaSetDevice(cuda_device_id); - if (error != cudaSuccess) { - std::cout << "error(cudaSetDevice) = " << error << std::endl; + error = TA::device::setDevice(device_id); + if (error != TA::device::Success) { + std::cout << "error(device::setDevice) = " << error << std::endl; } size_t free_mem, total_mem; - error = cudaMemGetInfo(&free_mem, &total_mem); + error = TA::device::memGetInfo(&free_mem, &total_mem); std::cout << " {total,free} memory = {" << total_mem << "," << free_mem << "}" << std::endl; } @@ -366,7 +356,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -377,7 +367,7 @@ int try_main(int argc, char **argv) { if (world.rank() == 0) { std::cout << "\n GPU vector operations. \n\n"; } - do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); + do_main_body>(world, Nm, Bm, Nn, Bn, nrepeat); if (world.rank() == 0) { std::cout << "\n CPU vector operations. \n\n"; @@ -391,15 +381,13 @@ int try_main(int argc, char **argv) { int main(int argc, char *argv[]) { try { try_main(argc, argv); - } catch (thrust::system::detail::bad_alloc &ex) { + } catch (std::exception &ex) { std::cout << ex.what() << std::endl; size_t free_mem, total_mem; - auto result = cudaMemGetInfo(&free_mem, &total_mem); - std::cout << "CUDA memory stats: {total,free} = {" << total_mem << "," + auto result = TA::device::memGetInfo(&free_mem, &total_mem); + std::cout << "device memory stats: {total,free} = {" << total_mem << "," << free_mem << "}" << std::endl; - } catch (std::exception &ex) { - std::cout << ex.what() << std::endl; } catch (...) { std::cerr << "unknown exception" << std::endl; } diff --git a/examples/dgemm/ta_dense_asymm.cpp b/examples/dgemm/ta_dense_asymm.cpp deleted file mode 100644 index d33fd6192a..0000000000 --- a/examples/dgemm/ta_dense_asymm.cpp +++ /dev/null @@ -1,187 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2013 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - */ - -#include -#include -#include - -int main(int argc, char** argv) { - // Initialize runtime - TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv); - - // Get command line arguments - if (argc < 6) { - std::cout << "multiplies A(Nm,Nk) * B(Nk,Nn), with dimensions m, n, and k " - "blocked by Bm, Bn, and Bk, respectively" - << std::endl - << "Usage: " << argv[0] - << " Nm Bm Nn Bn Nk Bk [repetitions=5] [real=double]\n"; - return 0; - } - const long Nm = atol(argv[1]); - const long Bm = atol(argv[2]); - const long Nn = atol(argv[3]); - const long Bn = atol(argv[4]); - const long Nk = atol(argv[5]); - const long Bk = atol(argv[6]); - if (Nm <= 0 || Nn <= 0 || Nk <= 0) { - std::cerr << "Error: dimensions must be greater than zero.\n"; - return 1; - } - if (Bm <= 0 || Bn <= 0 || Bk <= 0) { - std::cerr << "Error: block sizes must be greater than zero.\n"; - return 1; - } - if ((Nm % Bm) != 0ul || Nn % Bn != 0ul || Nk % Bk != 0ul) { - std::cerr - << "Error: dimension size must be evenly divisible by block size.\n"; - return 1; - } - const long repeat = (argc >= 8 ? atol(argv[7]) : 5); - if (repeat <= 0) { - std::cerr << "Error: number of repetitions must be greater than zero.\n"; - return 1; - } - - const std::string real_type_str = (argc >= 9 ? argv[8] : "double"); - if (real_type_str != "double" && real_type_str != "float") { - std::cerr << "Error: invalid real type " << real_type_str << ".\n"; - return 1; - } - - const std::size_t Tm = Nm / Bm; - const std::size_t Tn = Nn / Bn; - const std::size_t Tk = Nk / Bk; - - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test...\n" - << "Number of nodes = " << world.size() - << "\nSize of A = " << Nm << "x" << Nk << " (" - << double(Nm * Nk * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of A block = " << Bm << "x" << Bk - << "\nSize of B = " << Nk << "x" << Nn << " (" - << double(Nk * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of B block = " << Bk << "x" << Bn - << "\nSize of C = " << Nm << "x" << Nn << " (" - << double(Nm * Nn * sizeof(double)) / 1.0e9 << " GB)" - << "\nSize of C block = " << Bm << "x" << Bn - << "\n# of blocks of C = " << Tm * Tn - << "\nAverage # of blocks of C/node = " - << double(Tm * Tn) / double(world.size()) << "\n"; - - // Construct TiledRange - std::vector blocking_m; - blocking_m.reserve(Tm + 1); - for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i); - - std::vector blocking_n; - blocking_n.reserve(Tn + 1); - for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i); - - std::vector blocking_k; - blocking_k.reserve(Tk + 1); - for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i); - - // Structure of c - std::vector blocking_C; - blocking_C.reserve(2); - blocking_C.push_back( - TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end())); - blocking_C.push_back( - TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end())); - - // Structure of a - std::vector blocking_A; - blocking_A.reserve(2); - blocking_A.push_back( - TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end())); - blocking_A.push_back( - TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end())); - - // Structure of b - std::vector blocking_B; - blocking_B.reserve(2); - blocking_B.push_back( - TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end())); - blocking_B.push_back( - TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end())); - - TiledArray::TiledRange // TRange for c - trange_c(blocking_C.begin(), blocking_C.end()); - - TiledArray::TiledRange // TRange for a 
- trange_a(blocking_A.begin(), blocking_A.end()); - - TiledArray::TiledRange // TRange for b - trange_b(blocking_B.begin(), blocking_B.end()); - - auto run = [&](auto* tarray_ptr) { - using Array = std::decay_t>; - - // Construct and initialize arrays - Array a(world, trange_a); - Array b(world, trange_b); - Array c(world, trange_c); - a.fill(1.0); - b.fill(1.0); - - // Start clock - world.gop.fence(); - const double wall_time_start = madness::wall_time(); - - // Do matrix multiplication - for (int i = 0; i < repeat; ++i) { - c("m,n") = a("m,k") * b("k,n"); - world.gop.fence(); - if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n"; - } - - // Stop clock - const double wall_time_stop = madness::wall_time(); - - if (world.rank() == 0) - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * 2.0 * double(Nn * Nm * Nk) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; - }; - - // by default use TiledArray tensors - constexpr bool use_btas = false; - // btas::Tensor instead - if (real_type_str == "double") { - if constexpr (!use_btas) - run(static_cast(nullptr)); - else - run(static_cast>>*>( - nullptr)); - } else { - if constexpr (!use_btas) - run(static_cast(nullptr)); - else - run(static_cast>>*>( - nullptr)); - } - - return 0; -} diff --git a/examples/dgemm/ta_dense_new_tile.cpp b/examples/dgemm/ta_dense_new_tile.cpp deleted file mode 100644 index 79dae8a579..0000000000 --- a/examples/dgemm/ta_dense_new_tile.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2013 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - */ - -#include -#include -#include - -using Tile_t = TiledArray::Tile>; -using Array_t = TiledArray::DistArray; - -void set_tiles(double val, Array_t& a) { - auto const& trange = a.trange(); - - auto pmap = a.pmap(); - const auto end = pmap->end(); - for (auto it = pmap->begin(); it != end; ++it) { - auto range = trange.make_tile_range(*it); - a.set(*it, Tile_t(TiledArray::Tensor(range, val))); - } -} - -int main(int argc, char** argv) { - int rc = 0; - - try { - // Initialize runtime - TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv); - - // Get command line arguments - if (argc < 2) { - std::cout << "Usage: " << argv[0] - << " matrix_size block_size [repetitions]\n"; - return 0; - } - const long matrix_size = atol(argv[1]); - const long block_size = atol(argv[2]); - if (matrix_size <= 0) { - std::cerr << "Error: matrix size must be greater than zero.\n"; - return 1; - } - if (block_size <= 0) { - std::cerr << "Error: block size must be greater than zero.\n"; - return 1; - } - if ((matrix_size % block_size) != 0ul) { - std::cerr << "Error: matrix size must be evenly divisible by block " - "size.\n"; - return 1; - } - const long repeat = (argc >= 4 ? 
atol(argv[3]) : 5); - if (repeat <= 0) { - std::cerr << "Error: number of repetitions must be greater than zero.\n"; - return 1; - } - - const std::size_t num_blocks = matrix_size / block_size; - const std::size_t block_count = num_blocks * num_blocks; - - if (world.rank() == 0) - std::cout << "TiledArray: dense matrix multiply test..." - << "\nGit description: " << TiledArray::git_description() - << "\nNumber of nodes = " << world.size() - << "\nMatrix size = " << matrix_size << "x" - << matrix_size << "\nBlock size = " << block_size - << "x" << block_size << "\nMemory per matrix = " - << double(matrix_size * matrix_size * sizeof(double)) / 1.0e9 - << " GB\nNumber of blocks = " << block_count - << "\nAverage blocks/node = " - << double(block_count) / double(world.size()) << "\n"; - - const double flop = - 2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9; - - // Construct TiledRange - std::vector blocking; - blocking.reserve(num_blocks + 1); - for (long i = 0l; i <= matrix_size; i += block_size) blocking.push_back(i); - - std::vector blocking2( - 2, TiledArray::TiledRange1(blocking.begin(), blocking.end())); - - TiledArray::TiledRange trange(blocking2.begin(), blocking2.end()); - - // Construct and initialize arrays - Array_t a(world, trange); - Array_t b(world, trange); - Array_t c(world, trange); - set_tiles(1.0, a); - set_tiles(1.0, b); - - TiledArray::TArrayD a_check(world, trange); - TiledArray::TArrayD b_check(world, trange); - TiledArray::TArrayD c_check(world, trange); - a_check.fill(1.0); - b_check.fill(1.0); - - // Start clock - world.gop.fence(); - if (world.rank() == 0) - std::cout << "Starting iterations: " - << "\n"; - - double total_time = 0.0; - - // Do matrix multiplication - for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - c("m,n") = a("m,k") * b("k,n"); - c_check("m,n") = a_check("m,k") * b_check("k,n"); - // world.gop.fence(); - const double time = madness::wall_time() - start; - total_time += time; - if (world.rank() == 0) - std::cout << "Iteration " << i + 1 << " time=" << time - << " GFLOPS=" << flop / time << "\n"; - auto check_it = c_check.begin(); - for (auto it = c.begin(); it != c.end() && check_it != c_check.end(); - ++it, ++check_it) { - auto tile_diff = it->get().tensor().subt(check_it->get()).norm(); - if (tile_diff >= 1e-15) { - std::cout << "Tile " << it.ordinal() << " failed test " - << " with norm diff " << tile_diff << std::endl; - assert(false); - } - } - } - - // Print results - if (world.rank() == 0) - std::cout << "Average wall time = " << total_time / double(repeat) - << " sec\nAverage GFLOPS = " - << double(repeat) * flop / total_time << "\n"; - - } catch (TiledArray::Exception& e) { - std::cerr << "!! TiledArray exception: " << e.what() << "\n"; - rc = 1; - } catch (madness::MadnessException& e) { - std::cerr << "!! MADNESS exception: " << e.what() << "\n"; - rc = 1; - } catch (SafeMPI::Exception& e) { - std::cerr << "!! SafeMPI exception: " << e.what() << "\n"; - rc = 1; - } catch (std::exception& e) { - std::cerr << "!! std exception: " << e.what() << "\n"; - rc = 1; - } catch (...) { - std::cerr << "!! 
exception: unknown exception\n"; - rc = 1; - } - - return rc; -} diff --git a/examples/dgemm/CMakeLists.txt b/examples/gemm/CMakeLists.txt similarity index 94% rename from examples/dgemm/CMakeLists.txt rename to examples/gemm/CMakeLists.txt index 47df67bf36..5808cdec6e 100644 --- a/examples/dgemm/CMakeLists.txt +++ b/examples/gemm/CMakeLists.txt @@ -26,7 +26,7 @@ # Create example executable foreach(_exec ta_blas ta_eigen ta_band ta_dense ta_sparse ta_dense_nonuniform - ta_dense_asymm ta_sparse_grow ta_dense_new_tile + ta_dense_asymm ta_sparse_grow ta_cc_abcd) # Add executable diff --git a/examples/dgemm/README b/examples/gemm/README similarity index 92% rename from examples/dgemm/README rename to examples/gemm/README index bbb80e88c0..de156f154d 100644 --- a/examples/dgemm/README +++ b/examples/gemm/README @@ -12,9 +12,9 @@ Applications usage: ta_band matrix_size block_size band_width [repetitions] - blas matrix_size [repetitions] + ta_blas matrix_size [repetitions] - eigen matrix_size [repetitions] + ta_eigen matrix_size [repetitions] Argument definitions: diff --git a/examples/dgemm/block_size_data_process.py b/examples/gemm/block_size_data_process.py similarity index 100% rename from examples/dgemm/block_size_data_process.py rename to examples/gemm/block_size_data_process.py diff --git a/examples/dgemm/block_size_scan.sh b/examples/gemm/block_size_scan.sh similarity index 100% rename from examples/dgemm/block_size_scan.sh rename to examples/gemm/block_size_scan.sh diff --git a/examples/dgemm/ta_band.cpp b/examples/gemm/ta_band.cpp similarity index 86% rename from examples/dgemm/ta_band.cpp rename to examples/gemm/ta_band.cpp index d55550cebd..0743ef734b 100644 --- a/examples/dgemm/ta_band.cpp +++ b/examples/gemm/ta_band.cpp @@ -17,6 +17,7 @@ * */ +#include #include #include @@ -104,38 +105,33 @@ int main(int argc, char** argv) { for (; j < j_end; ++j, ++ij) shape_tensor[ij] = 1.0; } - TiledArray::SparseShape shape(shape_tensor, trange); + TiledArray::SparseShape shape( + shape_tensor, trange, /* per_element_norms_already = */ true); // Construct and initialize arrays TiledArray::TSpArrayD a(world, trange, shape); TiledArray::TSpArrayD b(world, trange, shape); - TiledArray::TSpArrayD c(world, trange); + TiledArray::TSpArrayD c; a.fill(1.0); b.fill(1.0); - // Start clock - world.gop.fence(); - const double wall_time_start = madness::wall_time(); - // Do matrix multiplication + world.gop.fence(); for (int i = 0; i < repeat; ++i) { - c("m,n") = a("m,k") * b("k,n"); - world.gop.fence(); + TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();) if (world.rank() == 0) std::cout << "Iteration " << i + 1 << "\n"; } - // Stop clock - const double wall_time_stop = madness::wall_time(); - // Print results - const long flop = 2.0 * c("m,n").sum().get(); + const auto gflops_per_call = 2.0 * c("m,n").sum().get() / 1.e9; if (world.rank() == 0) { - std::cout << "Average wall time = " - << (wall_time_stop - wall_time_start) / double(repeat) - << "\nAverage GFLOPS = " - << double(repeat) * double(flop) / - (wall_time_stop - wall_time_start) / 1.0e9 - << "\n"; + auto durations = TiledArray::duration_statistics(); + std::cout << "Average wall time = " << durations.mean + << " s\nAverage GFLOPS = " + << gflops_per_call * durations.mean_reciprocal + << "\nMedian wall time = " << durations.median + << " s\nMedian GFLOPS = " + << gflops_per_call / durations.median << "\n"; } } catch (TiledArray::Exception& e) { diff --git a/examples/dgemm/ta_blas.cpp b/examples/gemm/ta_blas.cpp similarity index 
79% rename from examples/dgemm/ta_blas.cpp rename to examples/gemm/ta_blas.cpp
index 0a4feff383..c97f5bbedc 100644
--- a/examples/dgemm/ta_blas.cpp
+++ b/examples/gemm/ta_blas.cpp
@@ -17,13 +17,14 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include
 #include
 
 int main(int argc, char** argv) {
   // Get command line arguments
   if (argc < 2) {
-    std::cout << "Usage: " << argv[0] << " matrix_size [repetitions]\n";
+    std::cout << "Usage: " << argv[0] << " matrix_size [repetitions = 5]\n";
     return 0;
   }
   const long matrix_size = atol(argv[1]);
@@ -66,31 +67,25 @@ int main(int argc, char** argv) {
   const integer m = matrix_size, n = matrix_size, k = matrix_size;
   const integer lda = matrix_size, ldb = matrix_size, ldc = matrix_size;
 
-  // Start clock
-  const double wall_time_start = madness::wall_time();
-
-  // Do matrix multiplcation
-  // Note: If TiledArray has not been configured with blas, this will be an
-  // eigen call.
+  // Do matrix multiplication
   for (int i = 0; i < repeat; ++i) {
-    gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    TA_RECORD_DURATION(
+        gemm(opa, opb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc));
   }
-
-  // Stop clock
-  const double wall_time_stop = madness::wall_time();
+  auto durations = TiledArray::duration_statistics();
 
   // Cleanup memory
   free(a);
   free(b);
   free(c);
 
-  std::cout << "Average wall time = "
-            << (wall_time_stop - wall_time_start) / double(repeat)
-            << "\nAverage GFLOPS = "
-            << double(repeat) * 2.0 *
-                   double(matrix_size * matrix_size * matrix_size) /
-                   (wall_time_stop - wall_time_start) / 1.0e9
-            << "\n";
+  const auto gflops_per_call =
+      2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9;
+  std::cout << "Average wall time = " << durations.mean
+            << "\nAverage GFLOPS = "
+            << gflops_per_call * durations.mean_reciprocal
+            << "\nMedian wall time = " << durations.median
+            << "\nMedian GFLOPS = " << gflops_per_call / durations.median
+            << std::endl;
 
   return 0;
 }
diff --git a/examples/dgemm/ta_cc_abcd.cpp b/examples/gemm/ta_cc_abcd.cpp
similarity index 75%
rename from examples/dgemm/ta_cc_abcd.cpp
rename to examples/gemm/ta_cc_abcd.cpp
index c1881063d4..f038f09ea0 100644
--- a/examples/dgemm/ta_cc_abcd.cpp
+++ b/examples/gemm/ta_cc_abcd.cpp
@@ -17,6 +17,7 @@
  *
  */
 
+#include <TiledArray/util/time.h>
 #include
 #include
 #include
@@ -33,17 +34,30 @@ bool to_bool(const char* str) {
 // if n = average tile size
 // this will produce tiles of these sizes: n+1, n-1, n+3, n-3, etc.
 // the last tile absorbs the remainder
-std::vector<unsigned int> make_tiling(unsigned int range_size,
-                                      unsigned int ntiles) {
-  const auto average_tile_size = range_size / ntiles;
-  TA_ASSERT(average_tile_size > ntiles);
-  std::vector<unsigned int> result(ntiles + 1);
-  result[0] = 0;
-  for (long t = 0; t != ntiles - 1; ++t) {
-    result[t + 1] =
-        result[t] + average_tile_size + ((t % 2 == 0) ? (t + 1) : (-t));
+std::vector<unsigned int> make_nonuniform_tiling(unsigned int range_size,
+                                                 int tile_size) {
+  std::vector<unsigned int> result;
+  result.push_back(0);
+  for (long t = 0; true; ++t) {
+    unsigned int next_tile_boundary =
+        result.back() + tile_size +
+        std::max(static_cast<int>((t % 2 == 0) ? (t + 1) : (-t)),
+                 1 - tile_size);
+    if (next_tile_boundary >= range_size) break;
+    result.push_back(next_tile_boundary);
   }
-  result[ntiles] = range_size;
+  if (result.back() != range_size) result.push_back(range_size);
+  return result;
+}
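+// (e.g. make_nonuniform_tiling(10, 4) returns boundaries {0, 5, 8, 10},
+//  i.e. tile sizes {5, 3, 2})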
+
+// makes tiles as uniform as possible
+std::vector<unsigned int> make_uniform_tiling(unsigned int range_size,
+                                              int tile_size) {
+  std::vector<unsigned int> result;
+  for (unsigned int t = 0; t <= range_size; t += tile_size) {
+    result.push_back(t);
+  }
+  if (result.back() != range_size) result.push_back(range_size);
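+  // (e.g. range_size = 10, tile_size = 4 gives {0, 4, 8, 10}, i.e. tile
+  //  sizes {4, 4, 2}: uniform except for the final remainder tile)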
"true" : "false") << std::endl; // Construct TiledRange1's - std::vector tiling_occ = make_tiling(n_occ, nblk_occ); - std::vector tiling_uocc = make_tiling(n_uocc, nblk_uocc); + std::vector tiling_occ = + uniform_tiling ? make_uniform_tiling(n_occ, b_occ) + : make_nonuniform_tiling(n_occ, b_occ); + std::vector tiling_uocc = + uniform_tiling ? make_uniform_tiling(n_uocc, b_uocc) + : make_nonuniform_tiling(n_uocc, b_uocc); auto trange_occ = TA::TiledRange1(tiling_occ.begin(), tiling_occ.end()); auto trange_uocc = TA::TiledRange1(tiling_uocc.begin(), tiling_uocc.end()); - - if (use_complex) - cc_abcd>(world, trange_occ, trange_uocc, repeat); - else + auto print_tile_sizes = [](const auto& tiling) { + auto b = tiling.begin(); + for (auto current = b + 1; current != tiling.end(); ++current) { + std::cout << *current - *(current - 1) << " "; + } + std::cout << std::endl; + }; + std::cout << " occ tile sizes: "; + print_tile_sizes(tiling_occ); + std::cout << "uocc tile sizes: "; + print_tile_sizes(tiling_uocc); + + if (scalar_type_str == "double") cc_abcd(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "zdouble") + cc_abcd>(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "float") + cc_abcd(world, trange_occ, trange_uocc, repeat); + else if (scalar_type_str == "zfloat") + cc_abcd>(world, trange_occ, trange_uocc, repeat); TA::finalize(); @@ -174,13 +211,13 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, const double flops_per_fma = (complex_T ? 8 : 2); // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real - const double n_gflop = flops_per_fma * std::pow(n_occ, 2) * - std::pow(n_uocc, 4) / std::pow(1024., 3); + const double gflops_per_call = + flops_per_fma * std::pow(n_occ, 2) * std::pow(n_uocc, 4) / 1e9; // Construct tensors - TA::TArrayD t2(world, trange_oovv); - TA::TArrayD v(world, trange_vvvv); - TA::TArrayD t2_v; + TA::TSpArray t2(world, trange_oovv); + TA::TSpArray v(world, trange_vvvv); + TA::TSpArray t2_v; // To validate, fill input tensors with random data, otherwise just with 1s if (do_validate) { rand_fill_array(t2); @@ -196,50 +233,49 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::cout << "Starting iterations: " << "\n"; - double total_time = 0.0; - double total_gflop_rate = 0.0; - // Do matrix multiplication for (int i = 0; i < repeat; ++i) { - const double start = madness::wall_time(); - + auto tp_start = TiledArray::now(); // this is how the user would express this contraction - if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); + if (true) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d"); // this demonstrates to the PaRSEC team what happens under the hood of the // expression above - if (true) { + if (false) { tensor_contract_444(t2_v, t2, v); // to validate replace: false -> true if (do_validate) { // obtain reference result using the high-level DSL - TA::TArrayD t2_v_ref; + TA::TSpArray t2_v_ref; t2_v_ref("i,j,a,b") = t2("i,j,c,d") * v("c,d,a,b"); - TA::TArrayD error; + TA::TSpArray error; error("i,j,a,b") = t2_v_ref("i,j,a,b") - t2_v("i,j,a,b"); std::cout << "Validating the result (ignore the timings/performance!): " "||ref_result - result||_2^2 = " << error("i,j,a,b").squared_norm().get() << std::endl; } } + t2_v.world().gop.fence(); + TiledArray::record_duration_since(tp_start); - const double stop = madness::wall_time(); - const double time = stop - start; - total_time += time; - const double gflop_rate = n_gflop / time; - total_gflop_rate += 
// Construct tensors
- TA::TArrayD t2(world, trange_oovv);
- TA::TArrayD v(world, trange_vvvv);
- TA::TArrayD t2_v;
+ TA::TSpArray<T> t2(world, trange_oovv);
+ TA::TSpArray<T> v(world, trange_vvvv);
+ TA::TSpArray<T> t2_v;
// To validate, fill input tensors with random data, otherwise just with 1s if (do_validate) { rand_fill_array(t2);
@@ -196,50 +233,49 @@ void cc_abcd(TA::World& world, const TA::TiledRange1& trange_occ, std::cout << "Starting iterations: " << "\n";
- double total_time = 0.0;
- double total_gflop_rate = 0.0;
-
// Do matrix multiplication for (int i = 0; i < repeat; ++i) {
- const double start = madness::wall_time();
-
+ auto tp_start = TiledArray::now();
// this is how the user would express this contraction
- if (false) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
+ if (true) t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d");
// this demonstrates to the PaRSEC team what happens under the hood of the // expression above
- if (true) {
+ if (false) {
tensor_contract_444(t2_v, t2, v);
// to validate replace: false -> true if (do_validate) { // obtain reference result using the high-level DSL
- TA::TArrayD t2_v_ref;
+ TA::TSpArray<T> t2_v_ref;
t2_v_ref("i,j,a,b") = t2("i,j,c,d") * v("c,d,a,b");
- TA::TArrayD error;
+ TA::TSpArray<T> error;
error("i,j,a,b") = t2_v_ref("i,j,a,b") - t2_v("i,j,a,b"); std::cout << "Validating the result (ignore the timings/performance!): " "||ref_result - result||_2^2 = " << error("i,j,a,b").squared_norm().get() << std::endl; } }
+ t2_v.world().gop.fence();
+ TiledArray::record_duration_since(tp_start);
- const double stop = madness::wall_time();
- const double time = stop - start;
- total_time += time;
- const double gflop_rate = n_gflop / time;
- total_gflop_rate += gflop_rate;
+ const double time = TiledArray::durations().back();
+ const double gflop_rate = gflops_per_call / time;
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; }
// Print results
- if (world.rank() == 0)
- std::cout << "Average wall time = "
- << total_time / static_cast<double>(repeat)
- << " sec\nAverage GFLOPS = "
- << total_gflop_rate / static_cast<double>(repeat) << "\n";
+ if (world.rank() == 0) {
+ auto durations = TiledArray::duration_statistics();
+ std::cout << "Average wall time = " << durations.mean
+ << " s\nAverage GFLOPS = "
+ << gflops_per_call * durations.mean_reciprocal
+ << "\nMedian wall time = " << durations.median
+ << " s\nMedian GFLOPS = "
+ << gflops_per_call / durations.median << "\n";
+ }
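+  // Accounting notes: t2_v("i,j,a,b") = t2("i,j,c,d") * v("a,b,c,d") forms
+  // n_occ^2 * n_uocc^2 output elements, each a sum over n_uocc^2 products,
+  // i.e. n_occ^2 * n_uocc^4 FMAs, which at flops_per_fma flops per FMA gives
+  // gflops_per_call above. "Average GFLOPS" scales durations.mean_reciprocal
+  // (presumably the mean of 1/t over the iterations), i.e. it averages the
+  // per-iteration rates; gflops_per_call / durations.mean would instead give
+  // the rate of the mean time, which is never larger.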
}
template
@@ -358,6 +394,7 @@ template <typename Tile, typename Policy> void tensor_contract_444(TA::DistArray<Tile, Policy>& tv, const TA::DistArray<Tile, Policy>& t, const TA::DistArray<Tile, Policy>& v) {
+ using Shape = typename Policy::shape_type;
// for convenience, obtain the tiled ranges for the two kinds of dimensions // used to define t, v, and tv auto trange_occ = t.trange().dim(0); // the first dimension of t is occ
@@ -379,10 +416,10 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv, auto ncols = n_uocc * n_uocc; TA::detail::ProcGrid proc_grid(world, nrowtiles, ncoltiles, nrows, ncols); std::shared_ptr<TA::Pmap> pmap;
- auto t_eval = make_array_eval(t, t.world(), TA::DenseShape(),
+ auto t_eval = make_array_eval(t, t.world(), Shape(),
proc_grid.make_row_phase_pmap(ninttiles), TA::Permutation(), make_array_noop());
- auto v_eval = make_array_eval(v, v.world(), TA::DenseShape(),
+ auto v_eval = make_array_eval(v, v.world(), Shape(),
proc_grid.make_col_phase_pmap(ninttiles), TA::Permutation(), make_array_noop());
@@ -401,7 +438,7 @@ void tensor_contract_444(TA::DistArray<Tile, Policy>& tv, // 2. there will be a dummy output ArrayEval, its Futures will be set by the // PTG auto contract =
- make_contract_eval(t_eval, v_eval, world, TA::DenseShape(), pmap,
+ make_contract_eval(t_eval, v_eval, world, Shape(), pmap,
TA::Permutation(), make_contract(4u, 4u, 4u));
// eval() just schedules the Summa task and proceeds
diff --git a/examples/dgemm/ta_dense.cpp b/examples/gemm/ta_dense.cpp
similarity index 89%
rename from examples/dgemm/ta_dense.cpp rename to examples/gemm/ta_dense.cpp index 82506b1d0d..c0ffebd4dc 100644
--- a/examples/dgemm/ta_dense.cpp
+++ b/examples/gemm/ta_dense.cpp
@@ -17,6 +17,7 @@ * */
+#include <TiledArray/util/time.h>
#include #include #include
@@ -129,7 +130,7 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, const auto n = trange.elements_range().extent()[0]; const auto complex_T = TiledArray::detail::is_complex<T>::value;
- const double gflop =
+ const double gflops_per_call =
(complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real * double(n * n * n) / 1.0e9;
@@ -168,28 +169,26 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange, std::cout << "Starting iterations: " << "\n";
- double total_time = 0.0;
- double total_gflop_rate = 0.0;
-
// Do matrix multiplication for (int i = 0; i < repeat; ++i) {
- const double start = madness::wall_time();
- c("m,n") = a("m,k") * b("k,n");
- memtrace("c=a*b");
- const double time = madness::wall_time() - start;
- total_time += time;
- const double gflop_rate = gflop / time;
- total_gflop_rate += gflop_rate;
+ TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");)
+ const auto time = TiledArray::durations().back();
+ const double gflop_rate = gflops_per_call / time;
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time << " GFLOPS=" << gflop_rate << "\n"; }
// Print results
- if (world.rank() == 0)
- std::cout << "Average wall time = " << total_time / double(repeat)
- << " sec\nAverage GFLOPS = "
- << total_gflop_rate / double(repeat) << "\n";
+ if (world.rank() == 0) {
+ auto durations = TiledArray::duration_statistics();
+ std::cout << "Average wall time = " << durations.mean
+ << " s\nAverage GFLOPS = "
+ << gflops_per_call * durations.mean_reciprocal
+ << "\nMedian wall time = " << durations.median
+ << " s\nMedian GFLOPS = "
+ << gflops_per_call / durations.median << "\n";
+ }
} // array lifetime scope memtrace("stop");
diff --git a/examples/gemm/ta_dense_asymm.cpp b/examples/gemm/ta_dense_asymm.cpp
new file mode 100644
index 0000000000..356d838ec0
--- /dev/null
+++ b/examples/gemm/ta_dense_asymm.cpp
@@ -0,0 +1,245 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2013 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+int main(int argc, char** argv) {
+ // Initialize runtime
+ TiledArray::World& world = TA_SCOPED_INITIALIZE(argc, argv);
+
+ // Get command line arguments
+ if (argc < 7) {
+ std::cout << "multiplies A(Nm,Nk) * B(Nk,Nn), with dimensions m, n, and k " "blocked by Bm, Bn, and Bk, respectively" << std::endl << "Usage: " << argv[0] << " Nm Bm Nn Bn Nk Bk [repetitions=5] [scalar=double] " "[do_memtrace=0]\n";
+ return 0;
+ }
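+  // Illustrative invocation (hypothetical sizes):
+  //   ta_dense_asymm 1024 128 512 64 2048 256 10 double
+  // i.e. C(1024,512) = A(1024,2048) * B(2048,512) with 128x256 blocks of A
+  // and 256x64 blocks of B, 10 repetitions, double precision.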
+ const long Nm = atol(argv[1]);
+ const long Bm = atol(argv[2]);
+ const long Nn = atol(argv[3]);
+ const long Bn = atol(argv[4]);
+ const long Nk = atol(argv[5]);
+ const long Bk = atol(argv[6]);
+ if (Nm <= 0 || Nn <= 0 || Nk <= 0) {
+ std::cerr << "Error: dimensions must be greater than zero.\n";
+ return 1;
+ }
+ if (Bm <= 0 || Bn <= 0 || Bk <= 0) {
+ std::cerr << "Error: block sizes must be greater than zero.\n";
+ return 1;
+ }
+ const long repeat = (argc >= 8 ? atol(argv[7]) : 5);
+ if (repeat <= 0) {
+ std::cerr << "Error: number of repetitions must be greater than zero.\n";
+ return 1;
+ }
+
+ const std::string scalar_type_str = (argc >= 9 ? argv[8] : "double");
+ if (scalar_type_str != "double" && scalar_type_str != "float" &&
+ scalar_type_str != "zdouble" && scalar_type_str != "zfloat") {
+ std::cerr << "Error: invalid scalar type " << scalar_type_str << ".\n";
+ std::cerr << " valid scalar types are \"double\", \"float\", " "\"zdouble\", and \"zfloat\".\n";
+ return 1;
+ }
+
+ const bool do_memtrace = (argc >= 10 ? std::atol(argv[9]) : false);
+
+ // Construct TiledRange
+ std::vector<unsigned int> blocking_m;
+ for (long i = 0l; i <= Nm; i += Bm) blocking_m.push_back(i);
+ if (blocking_m.back() != Nm) blocking_m.push_back(Nm);
+
+ std::vector<unsigned int> blocking_n;
+ for (long i = 0l; i <= Nn; i += Bn) blocking_n.push_back(i);
+ if (blocking_n.back() != Nn) blocking_n.push_back(Nn);
+
+ std::vector<unsigned int> blocking_k;
+ for (long i = 0l; i <= Nk; i += Bk) blocking_k.push_back(i);
+ if (blocking_k.back() != Nk) blocking_k.push_back(Nk);
+
+ const std::size_t Tm = blocking_m.size() - 1;
+ const std::size_t Tn = blocking_n.size() - 1;
+ const std::size_t Tk = blocking_k.size() - 1;
+
+ // Structure of c
+ std::vector<TiledArray::TiledRange1> blocking_C;
+ blocking_C.reserve(2);
+ blocking_C.push_back(
+ TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end()));
+ blocking_C.push_back(
+ TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end()));
+
+ // Structure of a
+ std::vector<TiledArray::TiledRange1> blocking_A;
+ blocking_A.reserve(2);
+ blocking_A.push_back(
+ TiledArray::TiledRange1(blocking_m.begin(), blocking_m.end()));
+ blocking_A.push_back(
+ TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end()));
+
+ // Structure of b
+ std::vector<TiledArray::TiledRange1> blocking_B;
+ blocking_B.reserve(2);
+ blocking_B.push_back(
+ TiledArray::TiledRange1(blocking_k.begin(), blocking_k.end()));
+ blocking_B.push_back(
+ TiledArray::TiledRange1(blocking_n.begin(), blocking_n.end()));
+
+ TiledArray::TiledRange // TRange for c
+ trange_c(blocking_C.begin(), blocking_C.end());
+
+ TiledArray::TiledRange // TRange for a
+ trange_a(blocking_A.begin(), blocking_A.end());
+
+ TiledArray::TiledRange // TRange for b
+ trange_b(blocking_B.begin(), blocking_B.end());
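+  // e.g. Nm = 10, Bm = 4 gives blocking_m = {0, 4, 8, 10}: Tm = 3 tiles of
+  // sizes 4, 4, 2, the last tile absorbing the remainder; trange_a then
+  // pairs the m and k boundaries into a rank-2 TiledRange.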
+
+ auto run = [&](auto* tarray_ptr) {
+ using Array = std::decay_t<std::remove_pointer_t<decltype(tarray_ptr)>>;
+ using T = TiledArray::detail::numeric_t<Array>;
+ const auto complex_T = TiledArray::detail::is_complex_v<T>;
+ const double gflops_per_call =
+ (complex_T ? 8 : 2) // 1 multiply takes 6/1 flops for complex/real // 1 add takes 2/1 flops for complex/real
+ * static_cast<double>(Nn) * static_cast<double>(Nm) *
+ static_cast<double>(Nk) / 1.e9;
+
+ if (world.rank() == 0)
+ std::cout << "TiledArray: dense matrix multiply test...\n"
+ << "Number of nodes = " << world.size()
+ << "\nScalar type = " << scalar_type_str
+ << "\nSize of A = " << Nm << "x" << Nk << " ("
+ << double(Nm * Nk * sizeof(T)) / 1.0e9 << " GB)"
+ << "\nSize of (largest) A block = " << Bm << "x" << Bk
+ << "\nSize of B = " << Nk << "x" << Nn << " ("
+ << double(Nk * Nn * sizeof(T)) / 1.0e9 << " GB)"
+ << "\nSize of (largest) B block = " << Bk << "x" << Bn
+ << "\nSize of C = " << Nm << "x" << Nn << " ("
+ << double(Nm * Nn * sizeof(T)) / 1.0e9 << " GB)"
+ << "\nSize of (largest) C block = " << Bm << "x" << Bn
+ << "\n# of blocks of C = " << Tm * Tn
+ << "\nAverage # of blocks of C/node = "
+ << double(Tm * Tn) / double(world.size()) << "\n";
+
+ auto memtrace = [do_memtrace, &world](const std::string& str) -> void {
+ if (do_memtrace) {
+ world.gop.fence();
+ madness::print_meminfo(world.rank(), str);
+ } else {
+ world.gop.fence();
+ }
+#ifdef TA_TENSOR_MEM_PROFILE
+ {
+ std::cout
+ << str << ": TA::Tensor allocated "
+ << TA::hostEnv::instance()->host_allocator_getActualHighWatermark()
+ << " bytes and used "
+ << TA::hostEnv::instance()->host_allocator().getHighWatermark()
+ << " bytes" << std::endl;
+ }
+#endif
+ };
+
+ memtrace("start");
+ { // array lifetime scope
+ // Construct and initialize arrays
+ Array a(world, trange_a);
+ Array b(world, trange_b);
+ Array c(world, trange_c);
+ a.fill(1.0);
+ b.fill(1.0);
+ memtrace("allocated a and b");
+
+ // Start clock
+ world.gop.fence();
+ if (world.rank() == 0)
+ std::cout << "Starting iterations: "
+ << "\n";
+
+ // Do matrix multiplication
+ for (int i = 0; i < repeat; ++i) {
+ TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); memtrace("c=a*b");)
+ const double time = TiledArray::durations().back();
+ const double gflop_rate = gflops_per_call / time;
+ if (world.rank() == 0)
+ std::cout << "Iteration " << i + 1 << " time=" << time
+ << " GFLOPS=" << gflop_rate << "\n";
+ }
+
+ if (world.rank() == 0) {
+ auto durations = TiledArray::duration_statistics();
+ std::cout << "Average wall time = " << durations.mean
+ << " s\nAverage GFLOPS = "
+ << gflops_per_call * durations.mean_reciprocal
+ << "\nMedian wall time = " << durations.median
+ << " s\nMedian GFLOPS = "
+ << gflops_per_call / durations.median << "\n";
+ }
+
+ } // array lifetime scope
+ memtrace("stop");
+ };
+
+ // by default use TiledArray tensors; set use_btas to true to use
+ // btas::Tensor instead
+ constexpr bool use_btas = false;
+ if (scalar_type_str == "double") {
+ if constexpr (!use_btas)
+ run(static_cast<TiledArray::TArrayD*>(nullptr));
+ else
+ run(static_cast<TiledArray::DistArray<
+ TiledArray::Tile<btas::Tensor<double, TiledArray::Range>>>*>(
+ nullptr));
+ } else if (scalar_type_str == "float") {
+ if constexpr (!use_btas)
+ run(static_cast<TiledArray::TArrayF*>(nullptr));
+ else
+ run(static_cast<TiledArray::DistArray<
+ TiledArray::Tile<btas::Tensor<float, TiledArray::Range>>>*>(
+ nullptr));
+ } else if (scalar_type_str == "zdouble") {
+ if constexpr (!use_btas)
+ run(static_cast<TiledArray::TArrayZ*>(nullptr));
+ else
+ run(static_cast<TiledArray::DistArray<TiledArray::Tile<
+ btas::Tensor<std::complex<double>, TiledArray::Range>>>*>(
+ nullptr));
+ } else if (scalar_type_str == "zfloat") {
+ if constexpr (!use_btas)
+ run(static_cast<TiledArray::TArrayC*>(nullptr));
+ else
+ run(static_cast<TiledArray::DistArray<TiledArray::Tile<
+ btas::Tensor<std::complex<float>, TiledArray::Range>>>*>(
+ nullptr));
+ } else {
+ abort(); // unreachable
+ }
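+
+  // run() never dereferences its argument: the null pointer only carries the
+  // Array type, which the generic lambda recovers via decltype(tarray_ptr),
+  // so no Array object is constructed at the call site.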
+
+ return 0;
+}
diff --git a/examples/dgemm/ta_dense_nonuniform.cpp b/examples/gemm/ta_dense_nonuniform.cpp
similarity index 87%
rename from examples/dgemm/ta_dense_nonuniform.cpp rename to examples/gemm/ta_dense_nonuniform.cpp index c01a4ece11..20e8cce712 100644
--- a/examples/dgemm/ta_dense_nonuniform.cpp
+++ b/examples/gemm/ta_dense_nonuniform.cpp
@@ -17,6 +17,7 @@ * */
+#include <TiledArray/util/time.h>
#include #include #include
@@ -58,7 +59,7 @@ int main(int argc, char** argv) { const long num_blocks = matrix_size / block_size; const long block_count = num_blocks * num_blocks;
- const double flop =
+ const double gflops_per_call =
2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9;
// Construct TiledRange
@@ -121,25 +122,25 @@ int main(int argc, char** argv) { std::cout << "Starting iterations: " << "\n";
- double total_time = 0.0;
-
// Do matrix multiplication for (int i = 0; i < repeat; ++i) {
- const double start = madness::wall_time();
- c("m,n") = a("m,k") * b("k,n");
- // world.gop.fence();
- const double time = madness::wall_time() - start;
- total_time += time;
+ TA_RECORD_DURATION(c("m,n") = a("m,k") * b("k,n"); world.gop.fence();)
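+      // the assignment only schedules tasks, so the fence is deliberately
+      // inside the timed region: without it the recorded duration would
+      // measure task submission rather than completion of the distributed
+      // multiply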
+ const double time = TiledArray::durations().back();
if (world.rank() == 0) std::cout << "Iteration " << i + 1 << " time=" << time
- << " GFLOPS=" << flop / time << "\n";
+ << " GFLOPS=" << gflops_per_call / time << "\n";
}
// Print results
- if (world.rank() == 0)
- std::cout << "Average wall time = " << total_time / double(repeat)
- << " sec\nAverage GFLOPS = "
- << double(repeat) * flop / total_time << "\n";
+ if (world.rank() == 0) {
+ auto durations = TiledArray::duration_statistics();
+ std::cout << "Average wall time = " << durations.mean
+ << " s\nAverage GFLOPS = "
+ << gflops_per_call * durations.mean_reciprocal
+ << "\nMedian wall time = " << durations.median
+ << " s\nMedian GFLOPS = "
+ << gflops_per_call / durations.median << "\n";
+ }
} catch (TiledArray::Exception& e) { std::cerr << "!! TiledArray exception: " << e.what() << "\n";
diff --git a/examples/dgemm/ta_eigen.cpp b/examples/gemm/ta_eigen.cpp
similarity index 76%
rename from examples/dgemm/ta_eigen.cpp rename to examples/gemm/ta_eigen.cpp index 0aa5474cd6..018de9a81f 100644
--- a/examples/dgemm/ta_eigen.cpp
+++ b/examples/gemm/ta_eigen.cpp
@@ -17,6 +17,7 @@ * */
+#include <TiledArray/util/time.h>
#include #include
@@ -50,24 +51,16 @@ int main(int argc, char** argv) { b.fill(1.0); c.fill(0.0);
- // Start clock
- const double wall_time_start = madness::wall_time();
-
- // Do matrix multiplcation
+ // Do matrix multiplication
for (int i = 0; i < repeat; ++i) {
- c.noalias() = 1.0 * a * b + 0.0 * c;
+ TA_RECORD_DURATION(c.noalias() = 1.0 * a * b + 0.0 * c);
}
- // Stop clock
- const double wall_time_stop = madness::wall_time();
-
- std::cout << "Average wall time = "
- << (wall_time_stop - wall_time_start) / double(repeat)
- << "\nAverage GFLOPS = "
- << double(repeat) * 2.0 *
- double(matrix_size * matrix_size * matrix_size) /
- (wall_time_stop - wall_time_start) / 1.0e9
- << "\n";
+ auto durations = TiledArray::duration_statistics();
+ std::cout << "Average wall time = " << durations.mean << "\nAverage GFLOPS = "
+ << (2.0 * double(matrix_size * matrix_size * matrix_size) / 1.0e9) *
+ durations.mean_reciprocal
+ << std::endl;
return 0; }
diff --git a/examples/dgemm/ta_sparse.cpp b/examples/gemm/ta_sparse.cpp
similarity index 100%
rename from examples/dgemm/ta_sparse.cpp rename to examples/gemm/ta_sparse.cpp
diff --git a/examples/dgemm/ta_sparse_grow.cpp b/examples/gemm/ta_sparse_grow.cpp
similarity index 100%
rename from examples/dgemm/ta_sparse_grow.cpp rename to examples/gemm/ta_sparse_grow.cpp
diff --git a/external/boost.cmake b/external/boost.cmake
new file mode 100644
index 0000000000..c89b2e3667
--- /dev/null
+++ b/external/boost.cmake
@@ -0,0 +1,39 @@
+# Boost can be discovered by every (sub)package but only the top package can *build* it ...
+# in either case must declare the components used by TA
+set(required_components
+ headers
+ algorithm
+ container
+ iterator
+ random
+ tuple
+)
+if (BUILD_TESTING)
+ list(APPEND required_components
+ test
+ )
+endif()
+if (DEFINED Boost_REQUIRED_COMPONENTS)
+ list(APPEND Boost_REQUIRED_COMPONENTS
+ ${required_components})
+ list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS)
+else()
+ set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to be discovered or built")
+endif()
+set(optional_components
+ serialization # BTAS
+)
+if (DEFINED Boost_OPTIONAL_COMPONENTS)
+ list(APPEND Boost_OPTIONAL_COMPONENTS
+ ${optional_components}
+ )
+ list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS)
+else()
+ set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to be discovered or built")
+endif()
+
+if (NOT DEFINED Boost_FETCH_IF_MISSING)
+ set(Boost_FETCH_IF_MISSING 1)
+endif()
+
+include(${vg_cmake_kit_SOURCE_DIR}/modules/FindOrFetchBoost.cmake)
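+# e.g. if a superproject has already set Boost_REQUIRED_COMPONENTS=regex,
+# the blocks above append TA's components and deduplicate, leaving
+# regex;headers;algorithm;container;iterator;random;tuple (plus test when
+# BUILD_TESTING is enabled)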
diff --git a/external/cuda.cmake b/external/cuda.cmake
index 3b2eb6ce37..74bd953e65 100644
--- a/external/cuda.cmake
+++ b/external/cuda.cmake
@@ -5,27 +5,35 @@ set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_EXTENSIONS OFF) set(CMAKE_CUDA_STANDARD_REQUIRED ON) set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+# N.B. need relaxed constexpr for std::complex
+# see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#constexpr-functions
+if (DEFINED CMAKE_CUDA_FLAGS)
+ set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr ${CMAKE_CUDA_FLAGS}")
+else()
+ set(CMAKE_CUDA_FLAGS "--expt-relaxed-constexpr")
+endif()
+# if CMAKE_CUDA_HOST_COMPILER not set, set it to CMAKE_CXX_COMPILER, else NVCC will grab something from PATH
+if (NOT DEFINED CMAKE_CUDA_HOST_COMPILER)
+ set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}" CACHE STRING "The host C++ compiler to be used by the CUDA compiler")
+endif()
+
enable_language(CUDA)
set(CUDA_FOUND TRUE) set(TILEDARRAY_HAS_CUDA 1 CACHE BOOL "Whether TiledArray has CUDA support")
-if(ENABLE_CUDA_ERROR_CHECK)
- set (TILEDARRAY_CHECK_CUDA_ERROR 1)
-endif(ENABLE_CUDA_ERROR_CHECK)
# find CUDA toolkit # NB CUDAToolkit does NOT have COMPONENTS find_package(CUDAToolkit REQUIRED)
-foreach (library cublas;nvToolsExt)
+foreach (library cublas;nvtx3)
if (NOT TARGET CUDA::${library}) message(FATAL_ERROR "CUDA::${library} not found") endif() endforeach()
if (NOT DEFINED CUDAToolkit_ROOT)
- get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_INCLUDE_DIR}/../" ABSOLUTE CACHE)
+ get_filename_component(CUDAToolkit_ROOT "${CUDAToolkit_LIBRARY_DIR}/../" ABSOLUTE CACHE)
endif(NOT DEFINED CUDAToolkit_ROOT)
# sanitize implicit dirs if CUDA host compiler != C++ compiler
diff --git a/external/eigen.cmake b/external/eigen.cmake
index f2d28076dd..57bbead90d 100644
--- a/external/eigen.cmake
+++ b/external/eigen.cmake
@@ -103,8 +103,10 @@ else() message("** Will build Eigen from ${EIGEN3_URL}")
ExternalProject_Add(eigen3
- PREFIX ${CMAKE_INSTALL_PREFIX}
- #--Download step--------------
+ PREFIX ${FETCHCONTENT_BASE_DIR}
+ STAMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts
+ TMP_DIR ${FETCHCONTENT_BASE_DIR}/eigen3-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable
+ #--Download step--------------
DOWNLOAD_DIR ${EXTERNAL_SOURCE_DIR} URL ${EIGEN3_URL} URL_HASH ${EIGEN3_URL_HASH}
diff --git a/external/hip.cmake b/external/hip.cmake
new file mode 100644
index 0000000000..a76f543454
--- /dev/null
+++ b/external/hip.cmake
@@ -0,0 +1,31 @@
+# cmake 3.21 introduced HIP language support
+cmake_minimum_required(VERSION 3.21.0)
+set(CMAKE_HIP_STANDARD 17)
+set(CMAKE_HIP_EXTENSIONS OFF)
+set(CMAKE_HIP_STANDARD_REQUIRED ON)
+enable_language(HIP)
+
+set(HIP_FOUND TRUE)
+set(TILEDARRAY_HAS_HIP 1 CACHE BOOL "Whether TiledArray has HIP support")
+set(TILEDARRAY_CHECK_HIP_ERROR 1 CACHE BOOL "Whether TiledArray will check HIP errors")
+
+# find HIP components
+find_package(hipblas REQUIRED)
+find_package(rocprim REQUIRED) # for rocthrust, per https://github.com/ROCmSoftwarePlatform/rocThrust#using-rocthrust-in-a-project
+find_package(rocthrust REQUIRED)
+
+foreach (library hipblas;rocthrust)
+ if (NOT TARGET roc::${library})
+ message(FATAL_ERROR "roc::${library} not found")
+ endif()
+endforeach()
+
+##
+## Umpire
+##
+include(external/umpire.cmake)
+
+##
+## LibreTT
+##
+include(external/librett.cmake)
diff --git a/external/librett.cmake b/external/librett.cmake
index a34dbf7869..5eca3314ce 100644
--- a/external/librett.cmake
+++ b/external/librett.cmake
@@ -35,12 +35,12 @@ else() set(LIBRETT_TAG ${TA_TRACKED_LIBRETT_TAG}) endif (NOT LIBRETT_TAG)
- message("** Will clone LibreTT from ${LIBRETT_URL}")
+ if (CMAKE_PREFIX_PATH)
+ set(LIBRETT_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH})
+ endif()
+ list(APPEND LIBRETT_CMAKE_PREFIX_PATH ${_UMPIRE_INSTALL_DIR})
- # need to change the
separator of list to avoid issues with ExternalProject parsing -# set(CUDA_FLAGS "${CUDA_NVCC_FLAGS}") -# string(REPLACE ";" "::" CUDA_FLAGS "${CUDA_NVCC_FLAGS}") - #message(STATUS "CUDA_FLAGS: " "${CUDA_FLAGS}") + message("** Will clone LibreTT from ${LIBRETT_URL}") set(LIBRETT_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} @@ -62,27 +62,49 @@ else() -DCMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} -DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS} -DCMAKE_AR=${CMAKE_AR} - -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} - -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} - -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} -DENABLE_UMPIRE=OFF # N.B. ThreadSafeUMDynamicPool this no longer exists!!! Must teach LibreTT to take allocate/deallocate methods # from the user code -DLIBRETT_USES_THIS_UMPIRE_ALLOCATOR=ThreadSafeUMDynamicPool - -DCMAKE_PREFIX_PATH=${_UMPIRE_INSTALL_DIR} + -DCMAKE_PREFIX_PATH=${LIBRETT_CMAKE_PREFIX_PATH} -DENABLE_NO_ALIGNED_ALLOC=ON - -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} - -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} - -DENABLE_CUDA=ON ) - if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND LIBRETT_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) - endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + if (ENABLE_CUDA) + list(APPEND LIBRETT_CMAKE_ARGS + -DENABLE_CUDA=ON + -DCMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER} + -DCMAKE_CUDA_STANDARD=${CMAKE_CUDA_STANDARD} + -DCMAKE_CUDA_EXTENSIONS=${CMAKE_CUDA_EXTENSIONS} + -DCMAKE_CUDA_HOST_COMPILER=${CMAKE_CUDA_HOST_COMPILER} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} + ) + if (DEFINED CMAKE_CUDA_ARCHITECTURES) + list(APPEND LIBRETT_CMAKE_ARGS "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + endif() + if (ENABLE_HIP) + list(APPEND LIBRETT_CMAKE_ARGS + -DENABLE_HIP=ON + -DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER} + -DCMAKE_HIP_STANDARD=${CMAKE_HIP_STANDARD} + -DCMAKE_HIP_EXTENSIONS=${CMAKE_HIP_EXTENSIONS} + ) + if (DEFINED CMAKE_HIP_ARCHITECTURES) + list(APPEND LIBRETT_CMAKE_ARGS "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}") + endif(DEFINED CMAKE_HIP_ARCHITECTURES) + endif() if (CMAKE_TOOLCHAIN_FILE) set(LIBRETT_CMAKE_ARGS "${LIBRETT_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND LIBRETT_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() + if (BUILD_SHARED_LIBS) set(LIBRETT_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) @@ -94,7 +116,7 @@ else() message(STATUS "custom target librett is expected to build these byproducts: ${LIBRETT_BUILD_BYPRODUCTS}") ExternalProject_Add(librett - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/librett-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- @@ -133,7 +155,7 @@ else() ") # Add LibreTT dependency to External - add_dependencies(External-tiledarray librett-build) + add_dependencies(External-tiledarray librett) set(_LIBRETT_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) @@ -148,6 +170,20 @@ set_target_properties(TiledArray_LIBRETT INTERFACE_LINK_LIBRARIES "$;$" ) +if (ENABLE_CUDA) + set_target_properties(TiledArray_LIBRETT + PROPERTIES + INTERFACE_COMPILE_DEFINITIONS + "LIBRETT_USES_CUDA=1" + ) +endif() +if (ENABLE_HIP) + 
set_target_properties(TiledArray_LIBRETT + PROPERTIES + INTERFACE_COMPILE_DEFINITIONS + "LIBRETT_USES_HIP=1" + ) +endif() install(TARGETS TiledArray_LIBRETT EXPORT tiledarray COMPONENT tiledarray) diff --git a/external/umpire.cmake b/external/umpire.cmake index 1ee9dde48b..5b7a4f4078 100644 --- a/external/umpire.cmake +++ b/external/umpire.cmake @@ -14,6 +14,21 @@ if(_UMPIRE_INSTALL_DIR) # find_package(umpire REQUIRED) message(STATUS "Umpire found at ${_UMPIRE_INSTALL_DIR}") + add_library(TiledArray_UMPIRE INTERFACE) + + set_target_properties( + TiledArray_UMPIRE + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES + "${_UMPIRE_INSTALL_DIR}/include" + INTERFACE_LINK_LIBRARIES + "umpire" + INTERFACE_LINK_DIRECTORIES + "${_UMPIRE_INSTALL_DIR}/lib/" + ) + + install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) + elseif(TA_EXPERT) message("** Umpire was not found") @@ -48,18 +63,23 @@ else() set(enable_umpire_asserts ON) endif() - # as of now BLT only supports up to C++17, so limit CMAKE_CXX_STANDARD + # as of now BLT only supports up to C++20, so limit CMAKE_CXX_STANDARD set(BLT_CXX_STD ${CMAKE_CXX_STANDARD}) - set(BLT_CXX_STD_MAX 17) + set(BLT_CXX_STD_MAX 20) if (BLT_CXX_STD GREATER ${BLT_CXX_STD_MAX}) set(BLT_CXX_STD ${BLT_CXX_STD_MAX}) endif() + if (CMAKE_PREFIX_PATH) + set(UMPIRE_CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}) + endif() + set(UMPIRE_CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EXTERNAL_INSTALL_DIR} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${UMPIRE_CMAKE_PREFIX_PATH} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} @@ -82,6 +102,7 @@ else() -DENABLE_EXAMPLES=OFF -DENABLE_LOGGING=OFF -DENABLE_ASSERTS=${enable_umpire_asserts} + -DENABLE_CLANGFORMAT=OFF ) # caveat: on recent Ubuntu default libstdc++ provides filesystem, but if using older gcc (gcc-8) must link against @@ -102,15 +123,42 @@ else() -DCUDA_TOOLKIT_ROOT_DIR=${CUDAToolkit_ROOT} ) if (DEFINED CMAKE_CUDA_ARCHITECTURES) - list(APPEND UMPIRE_CMAKE_ARGS -DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}) + list(APPEND UMPIRE_CMAKE_ARGS "-DCMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") endif(DEFINED CMAKE_CUDA_ARCHITECTURES) + # BLT will need FindCUDA until https://github.com/LLNL/blt/pull/585 is merged + # with CMake 3.28.1 needs to set CMP0146 to OLD + if (POLICY CMP0146) + list(APPEND UMPIRE_CMAKE_ARGS -DCMAKE_POLICY_DEFAULT_CMP0146=OLD) + endif() + # as of CMake 3.28+ FindCUDA seems to require CUDA_TOOLKIT_ROOT_DIR to be defined + if (DEFINED CUDA_TOOLKIT_ROOT_DIR) + list(APPEND UMPIRE_CMAKE_ARGS "-DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR}") + endif() endif(ENABLE_CUDA) + if (ENABLE_HIP) + list(APPEND UMPIRE_CMAKE_ARGS + -DENABLE_HIP=ON + -DCMAKE_HIP_COMPILER=${CMAKE_HIP_COMPILER} + -DCMAKE_HIP_STANDARD=${CMAKE_HIP_STANDARD} + -DCMAKE_HIP_EXTENSIONS=${CMAKE_HIP_EXTENSIONS} + ) + if (DEFINED CMAKE_HIP_ARCHITECTURES) + list(APPEND UMPIRE_CMAKE_ARGS "-DCMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}") + endif(DEFINED CMAKE_HIP_ARCHITECTURES) + endif(ENABLE_HIP) if (CMAKE_TOOLCHAIN_FILE) set(UMPIRE_CMAKE_ARGS "${UMPIRE_CMAKE_ARGS}" "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}" ) endif(CMAKE_TOOLCHAIN_FILE) + foreach(lang C CXX CUDA) + if (DEFINED CMAKE_${lang}_COMPILER_LAUNCHER) + list(APPEND UMPIRE_CMAKE_ARGS + "-DCMAKE_${lang}_COMPILER_LAUNCHER=${CMAKE_${lang}_COMPILER_LAUNCHER}") + endif() + endforeach() 
+ if (BUILD_SHARED_LIBS) set(UMPIRE_DEFAULT_LIBRARY_SUFFIX ${CMAKE_SHARED_LIBRARY_SUFFIX}) else(BUILD_SHARED_LIBS) @@ -122,7 +170,7 @@ else() message(STATUS "custom target Umpire is expected to build these byproducts: ${UMPIRE_BUILD_BYPRODUCTS}") ExternalProject_Add(Umpire - PREFIX ${CMAKE_INSTALL_PREFIX} + PREFIX ${FETCHCONTENT_BASE_DIR} STAMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts TMP_DIR ${FETCHCONTENT_BASE_DIR}/umpire-ep-artifacts # needed in case CMAKE_INSTALL_PREFIX is not writable #--Download step-------------- @@ -145,7 +193,8 @@ else() ) # TiledArray_UMPIRE target depends on existence of these directories to be usable from the build tree at configure time - execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src/umpire/tpl/camp/include") + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_SOURCE_DIR}/src/tpl/umpire/camp/include") + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_BUILD_DIR}/src/tpl/umpire/camp/include") execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory "${EXTERNAL_BUILD_DIR}/include") # do install of Umpire as part of building TiledArray's install target @@ -164,23 +213,24 @@ else() set(_UMPIRE_INSTALL_DIR ${EXTERNAL_INSTALL_DIR}) -endif(_UMPIRE_INSTALL_DIR) - -# manually add Umpire library -add_library(TiledArray_UMPIRE INTERFACE) + add_library(TiledArray_UMPIRE INTERFACE) -set_target_properties( - TiledArray_UMPIRE - PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES - "$;$;$;$" - INTERFACE_LINK_LIBRARIES - "$;$" - ) + set_target_properties( + TiledArray_UMPIRE + PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES + "$;$;$;$;$;$;$" + INTERFACE_LINK_LIBRARIES + "$;$" + INTERFACE_COMPILE_DEFINITIONS + FMT_HEADER_ONLY=1 + ) install(TARGETS TiledArray_UMPIRE EXPORT tiledarray COMPONENT tiledarray) +endif(_UMPIRE_INSTALL_DIR) + #TODO test Umpire endif(NOT TARGET TiledArray_UMPIRE) diff --git a/external/versions.cmake b/external/versions.cmake index 8f18337801..9007dde279 100644 --- a/external/versions.cmake +++ b/external/versions.cmake @@ -1,15 +1,7 @@ # for each dependency track both current and previous id (the variable for the latter must contain PREVIOUS) # to be able to auto-update them -set(TA_TRACKED_VGCMAKEKIT_TAG 3cbfe7c1e2e2667964b737e6abcc44d173fb9775) - -# Boost explicitly downgraded to 1.59 from 1.68 -set(TA_TRACKED_BOOST_VERSION 1.59) -set(TA_TRACKED_BOOST_PREVIOUS_VERSION 1.68) -set(TA_INSTALL_BOOST_VERSION 1.70.0) -set(TA_INSTALL_BOOST_PREVIOUS_VERSION 1.70.0) -set(TA_INSTALL_BOOST_URL_HASH 882b48708d211a5f48e60b0124cf5863c1534cd544ecd0664bb534a4b5d506e9) -set(TA_INSTALL_BOOST_PREVIOUS_URL_HASH 882b48708d211a5f48e60b0124cf5863c1534cd544ecd0664bb534a4b5d506e9) +set(TA_TRACKED_VGCMAKEKIT_TAG 72bb8f049e68443e817ce7299f0d1dabfaf01b7e) # N.B. 
may need to update INSTALL.md manually with the CUDA-specific version set(TA_TRACKED_EIGEN_VERSION 3.3.5) @@ -19,26 +11,26 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7) set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626) set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496) -set(TA_TRACKED_MADNESS_TAG 0b44ef319643cb9721fbe17d294987c146e6460e) -set(TA_TRACKED_MADNESS_PREVIOUS_TAG 29a2bf3d3c2670c608b7bfdf2299d76fbc20e041) +set(TA_TRACKED_MADNESS_TAG 93a9a5cec2a8fa87fba3afe8056607e6062a9058) +set(TA_TRACKED_MADNESS_PREVIOUS_TAG 95589b0d020a076f93d02eead6da654b23dd3d91) set(TA_TRACKED_MADNESS_VERSION 0.10.1) set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1) -set(TA_TRACKED_BTAS_TAG 4a1304cea6677255dc6f70705469b8c387713ccc) -set(TA_TRACKED_BTAS_PREVIOUS_TAG 474ddc095cbea12a1d28aca5435703dd9f69b166) +set(TA_TRACKED_BTAS_TAG 4cd283245027f19a7ea6c18f5a7195361dd89900) +set(TA_TRACKED_BTAS_PREVIOUS_TAG 4b3757cc2b5862f93589afc1e37523e543779c7a) -set(TA_TRACKED_LIBRETT_TAG 68abe31a9ec6fd2fd9ffbcd874daa80457f947da) -set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 7e27ac766a9038df6aa05613784a54a036c4b796) +set(TA_TRACKED_LIBRETT_TAG 6eed30d4dd2a5aa58840fe895dcffd80be7fbece) +set(TA_TRACKED_LIBRETT_PREVIOUS_TAG 354e0ccee54aeb2f191c3ce2c617ebf437e49d83) -set(TA_TRACKED_UMPIRE_TAG f9640e0fa4245691cdd434e4f719ac5f7d455f82) -set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v6.0.0) +set(TA_TRACKED_UMPIRE_TAG 8c85866107f78a58403e20a2ae8e1f24c9852287) +set(TA_TRACKED_UMPIRE_PREVIOUS_TAG v2024.02.1) set(TA_TRACKED_SCALAPACKPP_TAG 6397f52cf11c0dfd82a79698ee198a2fce515d81) set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG 711ef363479a90c88788036f9c6c8adb70736cbf ) -set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) -set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006) +set(TA_TRACKED_RANGEV3_TAG 0.12.0) +set(TA_TRACKED_RANGEV3_PREVIOUS_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864) set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg) -set(TA_TRACKED_TTG_TAG a9a1a55b45f7503da39d8466a1a421155ac5ca2a) -set(TA_TRACKED_TTG_PREVIOUS_TAG 1251bec25e07a74a05e5cd4cdec181a95a9baa66) +set(TA_TRACKED_TTG_TAG 3fe4a06dbf4b05091269488aab38223da1f8cb8e) +set(TA_TRACKED_TTG_PREVIOUS_TAG 26da9b40872660b864794658d4fdeee1a95cb4d6) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 690b35979d..99e29e2a83 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,14 +1,18 @@ cmake_minimum_required(VERSION 3.12) +project(python-tiledarray) + +if (NOT TARGET Python::Module) + find_package(Python COMPONENTS Interpreter Development REQUIRED) +endif() + FetchContent_Declare( pybind11 GIT_REPOSITORY https://github.com/ValeevGroup/pybind11.git - GIT_TAG 80d452484c5409444b0ec19383faa84bb7a4d351 # v2.4.3 + GIT_TAG ValeevGroup/v2.11 ) FetchContent_MakeAvailable(pybind11) -project(python-tiledarray) - set(CMAKE_CXX_STANDARD 17) add_compile_options(-Wall) @@ -39,11 +43,11 @@ if (BUILD_TESTING) # check for presence of prerequisite modules foreach(_mod pytest numpy) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import ${_mod}" + execute_process(COMMAND ${Python_EXECUTABLE} -c "import ${_mod}" OUTPUT_QUIET ERROR_QUIET RESULTS_VARIABLE check_for_${_mod}) if (check_for_${_mod}) - message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${PYTHON_EXECUTABLE} -m pip install ${_mod}\" and rerun cmake") + message(FATAL_ERROR "Python module \"${_mod}\" is not installed; install via \"${Python_EXECUTABLE} -m pip install 
${_mod}\" and rerun cmake") endif(check_for_${_mod}) endforeach(_mod)
@@ -51,7 +55,7 @@ if (BUILD_TESTING) add_test( NAME tiledarray/unit/python/run # need to use pytest to find tiledarray module properly
- COMMAND ${PYTHON_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v
+ COMMAND ${Python_EXECUTABLE} -m pytest ${PROJECT_SOURCE_DIR}/test_tiledarray.py -v
WORKING_DIRECTORY ${PROJECT_BINARY_DIR} ) set_tests_properties(tiledarray/unit/python/run
diff --git a/python/src/TiledArray/python/array.h b/python/src/TiledArray/python/array.h
index 782846df4c..e3cc1c79b7 100644
--- a/python/src/TiledArray/python/array.h
+++ b/python/src/TiledArray/python/array.h
@@ -208,7 +208,7 @@ void make_array_class(py::object m, const char *name) { py::return_value_policy::reference) .def_property_readonly("trange", &array::trange) .def_property_readonly("shape", &array::shape)
- .def("fill", &Array::fill, py::arg("value"),
+ .def("fill", &Array::template fill<>, py::arg("value"),
py::arg("skip_set") = false) .def("init", &array::init_tiles) // Array object needs to be alive while iterator is used */
diff --git a/python/src/TiledArray/python/trange.h b/python/src/TiledArray/python/trange.h
index 488421291d..8c008c1fa9 100644
--- a/python/src/TiledArray/python/trange.h
+++ b/python/src/TiledArray/python/trange.h
@@ -45,7 +45,6 @@ auto list(const TiledRange &trange) { return v; }
-// template<>
inline TiledRange make_trange(std::vector<std::vector<size_t> > trange) { std::vector<TiledRange1> trange1; for (auto tr : trange) {
@@ -58,11 +57,7 @@ inline TiledRange make_trange(std::vector<std::vector<size_t> > trange) { inline TiledRange make_trange(std::vector<size_t> shape, size_t block) { std::vector<TiledRange1> trange1; for (size_t i = 0; i < shape.size(); ++i) {
- std::vector<size_t> tr1;
- for (size_t j = 0; j <= (shape[i] + block - 1); j += block) {
- tr1.push_back(std::min(j, shape[i]));
- }
- trange1.push_back(TiledRange1(tr1.begin(), tr1.end()));
+ trange1.emplace_back(TiledRange1::make_uniform(shape[i], block));
} return TiledRange(trange1.begin(), trange1.end()); }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index afd67dc797..776b85f4a1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -100,7 +100,6 @@ TiledArray/dist_eval/contraction_eval.h TiledArray/dist_eval/dist_eval.h TiledArray/dist_eval/unary_eval.h TiledArray/einsum/index.h
-TiledArray/einsum/index.cpp
TiledArray/einsum/range.h TiledArray/einsum/string.h TiledArray/expressions/add_engine.h
@@ -134,8 +133,8 @@ TiledArray/expressions/index_list.h TiledArray/external/btas.h TiledArray/external/madness.h TiledArray/external/umpire.h
+TiledArray/host/env.cpp
TiledArray/host/env.h
-TiledArray/host/allocator.h
TiledArray/math/blas.h TiledArray/math/gemm_helper.h TiledArray/math/outer.h
@@ -194,39 +193,39 @@ TiledArray/util/backtrace.h TiledArray/util/bug.h TiledArray/util/function.h TiledArray/util/initializer_list.h
+TiledArray/util/invoke.h
TiledArray/util/logger.h
-TiledArray/util/ptr_registry.cpp
TiledArray/util/ptr_registry.h
-TiledArray/util/random.cpp
TiledArray/util/random.h TiledArray/util/singleton.h TiledArray/util/threads.h
-TiledArray/util/threads.cpp
TiledArray/util/thread_specific.h TiledArray/util/time.h TiledArray/util/vector.h )
-if(CUDA_FOUND)
-
+if(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA)
list(APPEND TILEDARRAY_HEADER_FILES
- TiledArray/external/cuda.h
- TiledArray/external/librett.h
- TiledArray/cuda/cublas.h
- TiledArray/cuda/btas_cublas.h
- TiledArray/cuda/btas_um_tensor.h
- TiledArray/cuda/cpu_cuda_vector.h
- TiledArray/cuda/cuda_task_fn.h
-
TiledArray/cuda/kernel/mult_kernel.h - TiledArray/cuda/kernel/mult_kernel_impl.h - TiledArray/cuda/kernel/reduce_kernel.h - TiledArray/cuda/kernel/reduce_kernel_impl.h - TiledArray/cuda/platform.h - TiledArray/cuda/thrust.h - TiledArray/cuda/um_allocator.h - TiledArray/cuda/um_storage.h) - -endif(CUDA_FOUND) + TiledArray/external/device.h + TiledArray/external/librett.h + TiledArray/device/blas.cpp + TiledArray/device/blas.h + TiledArray/device/btas.h + TiledArray/device/btas_um_tensor.h + TiledArray/device/device_task_fn.h + TiledArray/device/kernel/mult_kernel.h + TiledArray/device/kernel/reduce_kernel.h + TiledArray/device/kernel/thrust/mult_kernel.h + TiledArray/device/kernel/thrust/reduce_kernel.h + TiledArray/device/platform.h + TiledArray/device/thrust.h + TiledArray/device/um_storage.h) + if(TILEDARRAY_HAS_CUDA) + list(APPEND TILEDARRAY_HEADER_FILES + TiledArray/external/cuda.h + TiledArray/device/cpu_cuda_vector.h) + endif(TILEDARRAY_HAS_CUDA) +endif(TILEDARRAY_HAS_HIP OR TILEDARRAY_HAS_CUDA) set(TILEDARRAY_SOURCE_FILES TiledArray/tiledarray.cpp @@ -236,10 +235,15 @@ TiledArray/tensor_impl.cpp TiledArray/array_impl.cpp TiledArray/dist_array.cpp TiledArray/version.cpp -TiledArray/util/backtrace.cpp -TiledArray/util/bug.cpp +TiledArray/einsum/index.cpp +TiledArray/expressions/permopt.cpp TiledArray/math/linalg/basic.cpp TiledArray/math/linalg/rank-local.cpp +TiledArray/util/backtrace.cpp +TiledArray/util/bug.cpp +TiledArray/util/ptr_registry.cpp +TiledArray/util/random.cpp +TiledArray/util/threads.cpp ) # feed TILEDARRAY_GIT_REVISION and TILEDARRAY_GIT_DESCRIPTION to TiledArray/version.cpp only to avoid recompiling everything set_source_files_properties( @@ -250,26 +254,51 @@ set_source_files_properties( # the list of libraries on which TiledArray depends on, will be cached later # when FetchContent umpire: set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers umpire) -set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE) +set(_TILEDARRAY_DEPENDENCIES MADworld TiledArray_Eigen BTAS::BTAS blaspp_headers TiledArray_UMPIRE range-v3::range-v3) -# TODO better ways to handle tiledarray cuda dependency -if(CUDA_FOUND) +if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP) - list(APPEND TILEDARRAY_SOURCE_FILES - TiledArray/cuda/btas_um_tensor.cpp - TiledArray/cuda/cpu_cuda_vector.cu - TiledArray/cuda/kernel/mult_kernel.cu - TiledArray/cuda/kernel/reduce_kernel.cu - TiledArray/cuda/um_storage.cu) + set(TILEDARRAY_DEVICE_SOURCE_FILES + TiledArray/device/btas_um_tensor.cpp + ) - set_source_files_properties(TiledArray/cuda/btas_um_tensor.cpp + if(TILEDARRAY_HAS_CUDA) + + list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES + TiledArray/device/cpu_cuda_vector.cu + TiledArray/device/kernel/thrust/mult_kernel.cu + TiledArray/device/kernel/thrust/reduce_kernel.cu + TiledArray/device/um_storage.cu) + + foreach( f IN LISTS TILEDARRAY_DEVICE_SOURCE_FILES ) + set_source_files_properties( "${f}" PROPERTIES - LANGUAGE CUDA) + INCLUDE_DIRECTORIES "${CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES}") + endforeach() + + # the list of libraries on which TiledArray depends on + list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cudart CUDA::cublas CUDA::nvtx3) - # the list of libraries on which TiledArray depends on - list(APPEND _TILEDARRAY_DEPENDENCIES CUDA::cublas CUDA::nvToolsExt TiledArray_LIBRETT) + endif(TILEDARRAY_HAS_CUDA) -endif(CUDA_FOUND) + if (TILEDARRAY_HAS_HIP) + list(APPEND TILEDARRAY_DEVICE_SOURCE_FILES + TiledArray/device/kernel/thrust/mult_kernel.hip + 
TiledArray/device/kernel/thrust/reduce_kernel.hip
+ )
+
+ list(APPEND _TILEDARRAY_DEPENDENCIES hip::host
+ # N.B. linking to rocthrust makes all files in tiledarray target compiled as HIP ...
+ # seemingly (like with CUDA thrust) linking to this target is not needed
+ # roc::rocthrust
+ )
+ endif()
+
+ # LibreTT needed for either CUDA or HIP
+ list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_LIBRETT)
+
+ list(APPEND TILEDARRAY_SOURCE_FILES "${TILEDARRAY_DEVICE_SOURCE_FILES}")
+endif(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP)
if( TARGET TiledArray_SCALAPACK ) list(APPEND _TILEDARRAY_DEPENDENCIES TiledArray_SCALAPACK)
@@ -280,6 +309,16 @@ if( TARGET ttg-parsec ) list(APPEND _TILEDARRAY_DEPENDENCIES ttg-parsec) endif()
+if (IntelMKL_FAIR_DISPATCH AND BLAS_IS_MKL)
+ message(WARNING "created tiledarray_mkl_dispatch")
+ add_library(tiledarray_mkl_dispatch OBJECT
+ TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
+ TiledArray/external/agnerfog/intel_mkl_feature_patch.c
+ )
+ # N.B. --allow-multiple-definition is a GNU linker extension
+ list(APPEND _TILEDARRAY_DEPENDENCIES $<TARGET_OBJECTS:tiledarray_mkl_dispatch> -Wl,--allow-multiple-definition)
+endif()
+
# cache deps as TILEDARRAY_PRIVATE_LINK_LIBRARIES set(TILEDARRAY_PRIVATE_LINK_LIBRARIES ${_TILEDARRAY_DEPENDENCIES} CACHE STRING "List of libraries on which TiledArray depends on")
@@ -296,7 +335,7 @@ add_library(tiledarray ${TILEDARRAY_SOURCE_FILES} ${TILEDARRAY_HEADER_FILES}) target_link_libraries(${targetname} PUBLIC ${TILEDARRAY_PRIVATE_LINK_LIBRARIES}) target_link_libraries(${targetname} PUBLIC MADworld)
- target_link_libraries(${targetname} PUBLIC Boost::boost)
+ target_link_libraries(${targetname} PUBLIC Boost::headers)
# build all external deps before building tiledarray add_dependencies(${targetname} External-tiledarray)
@@ -306,10 +345,10 @@ add_library(tiledarray ${TILEDARRAY_SOURCE_FILES} ${TILEDARRAY_HEADER_FILES}) target_compile_options(${targetname} PUBLIC ${CMAKE_CXX_FLAG_LIST}) target_compile_features(${targetname} PUBLIC "cxx_std_${CMAKE_CXX_STANDARD}")
- if (CUDA_FOUND)
+ if (TILEDARRAY_HAS_CUDA)
target_include_directories(${targetname} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) target_compile_features(tiledarray PUBLIC "cuda_std_${CMAKE_CUDA_STANDARD}")
- endif (CUDA_FOUND)
+ endif (TILEDARRAY_HAS_CUDA)
if (LAPACK_INCLUDE_DIRS) target_include_directories(${targetname} PUBLIC ${LAPACK_INCLUDE_DIRS})
diff --git a/src/TiledArray/array_impl.h b/src/TiledArray/array_impl.h
index beb8ba3e09..7d5b59d7c1 100644
--- a/src/TiledArray/array_impl.h
+++ b/src/TiledArray/array_impl.h
@@ -30,6 +30,7 @@
#include #include #include
+#include
namespace TiledArray { namespace detail {
@@ -197,6 +198,17 @@ std::ostream& operator<<(std::ostream& os, const TileConstReference& a) { return os; }
+/// Callback used to update counter (typically, task counter)
+template <typename AtomicInt>
+struct IncrementCounter : public madness::CallbackInterface {
+ AtomicInt& counter;
+ IncrementCounter(AtomicInt& counter) : counter(counter) {}
+ void notify() override {
+ ++counter;
+ delete this;
+ }
+};
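+/// Usage sketch (cf. init_tiles() below): attach to a Future so that task
+/// completion bumps a counter the caller can await:
+/// \code
+/// tile.register_callback(new IncrementCounter<std::atomic<std::int64_t>>(n));
+/// \endcode
+/// notify() deletes the object because the callback fires at most once.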
+
} // namespace detail } // namespace TiledArray
@@ -407,7 +419,8 @@ class ArrayIterator { /// \note It is the user's responsibility to ensure the process maps on all /// nodes are identical.
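+/// \note ArrayImpl inherits from std::enable_shared_from_this so that
+/// asynchronous operations (e.g. the tasks spawned by init_tiles()) can
+/// extend the lifetime of the implementation object via shared_from_this().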
template <typename Tile, typename Policy>
-class ArrayImpl : public TensorImpl<Policy> {
+class ArrayImpl : public TensorImpl<Policy>,
+ public std::enable_shared_from_this<ArrayImpl<Tile, Policy>> {
public: typedef ArrayImpl<Tile, Policy> ArrayImpl_; ///< This object type typedef TensorImpl<Policy> TensorImpl_; ///< The base class of this object
@@ -423,6 +436,9 @@ class ArrayImpl : public TensorImpl<Policy> { typedef typename TensorImpl_::pmap_interface pmap_interface; ///< process map interface type typedef Tile value_type; ///< Tile or data type
+ typedef typename Tile::value_type
+ element_type; ///< The value type of a tile. It is the numeric_type for
+ ///< tensor-of-scalars tiles.
typedef typename eval_trait<Tile>::type eval_type; ///< The tile evaluation type typedef typename numeric_type<Tile>::type
@@ -440,6 +456,68 @@ class ArrayImpl : public TensorImpl<Policy> { private: storage_type data_; ///< Tile container
+ public:
+ static madness::AtomicInt cleanup_counter_;
+
+ /// Array deleter function
+
+ /// This function schedules a task for lazy cleanup. Array objects are
+ /// deleted only after the object has been deleted in all processes.
+ /// \param pimpl The implementation pointer to be deleted.
+ static void lazy_deleter(const ArrayImpl_* const pimpl) {
+ if (pimpl) {
+ if (madness::initialized()) {
+ World& world = pimpl->world();
+ const madness::uniqueidT id = pimpl->id();
+ cleanup_counter_++;
+
+ // wait for all DelayedSet's to vanish
+ world.await([&]() { return (pimpl->num_live_ds() == 0); }, true);
+
+ try {
+ world.gop.lazy_sync(id, [pimpl]() {
+ delete pimpl;
+ ArrayImpl_::cleanup_counter_--;
+ });
+ } catch (madness::MadnessException& e) {
+ fprintf(stderr,
+ "!! ERROR TiledArray: madness::MadnessException thrown in "
+ "DistArray::lazy_deleter().\n"
+ "%s\n"
+ "!! ERROR TiledArray: The exception has been absorbed.\n"
+ "!! ERROR TiledArray: rank=%i\n",
+ e.what(), world.rank());
+
+ cleanup_counter_--;
+ delete pimpl;
+ } catch (std::exception& e) {
+ fprintf(stderr,
+ "!! ERROR TiledArray: std::exception thrown in "
+ "DistArray::lazy_deleter().\n"
+ "%s\n"
+ "!! ERROR TiledArray: The exception has been absorbed.\n"
+ "!! ERROR TiledArray: rank=%i\n",
+ e.what(), world.rank());
+
+ cleanup_counter_--;
+ delete pimpl;
+ } catch (...) {
+ fprintf(stderr,
+ "!! ERROR TiledArray: An unknown exception was thrown in "
+ "DistArray::lazy_deleter().\n"
+ "!! ERROR TiledArray: The exception has been absorbed.\n"
+ "!! ERROR TiledArray: rank=%i\n",
+ world.rank());
+
+ cleanup_counter_--;
+ delete pimpl;
+ }
+ } else {
+ delete pimpl;
+ }
+ }
+ }
+
public: /// Constructor
@@ -453,7 +531,32 @@ class ArrayImpl : public TensorImpl<Policy> { ArrayImpl(World& world, const trange_type& trange, const shape_type& shape, const std::shared_ptr<pmap_interface>& pmap) : TensorImpl_(world, trange, shape, pmap),
- data_(world, trange.tiles_range().volume(), pmap) {}
+ data_(world, trange.tiles_range().volume(), pmap) {
+ // Validate the process map
+ TA_ASSERT(pmap->size() == trange.tiles_range().volume() &&
+ "TiledArray::DistArray::DistArray() -- The size of the process "
+ "map is not "
+ "equal to the number of tiles in the TiledRange object.");
+ TA_ASSERT(pmap->rank() ==
+ typename pmap_interface::size_type(world.rank()) &&
+ "TiledArray::DistArray::DistArray() -- The rank of the process "
+ "map is not equal to that "
+ "of the world object.");
+ TA_ASSERT(pmap->procs() ==
+ typename pmap_interface::size_type(world.size()) &&
+ "TiledArray::DistArray::DistArray() -- The number of processes "
+ "in the process map is not "
+ "equal to that of the world object.");
+
+ // Validate the shape
+ TA_ASSERT(
+ !shape.empty() &&
+ "TiledArray::DistArray::DistArray() -- The shape is not initialized.");
+ TA_ASSERT(shape.validate(trange.tiles_range()) &&
+ "TiledArray::DistArray::DistArray() -- The range of the shape is "
+ "not equal to "
+ "the tiles range.");
+ }
/// Virtual destructor virtual ~ArrayImpl() {}
@@ -636,10 +739,117 @@ class ArrayImpl : public TensorImpl<Policy> { /// DistributedStorage /// @return const reference to the atomic counter of live DelayedSet requests
- const madness::AtomicInt& num_live_ds() const { return data_.num_live_ds(); }
+ const std::atomic<std::size_t>& num_live_ds() const {
+ return data_.num_live_ds();
+ }
+
+ /// Reports the number of live DelayedForward requests for this object's
+ /// DistributedStorage
+
+ /// @return const reference to the atomic counter of live DelayedForward
+ /// requests
+ const std::atomic<std::size_t>& num_live_df() const {
+ return data_.num_live_df();
+ }
+
+ /// Initialize (local) tiles with a user provided functor
+
+ /// This function is used to initialize the local, non-zero tiles of the array
+ /// via a function (or functor). The work is done in parallel, therefore \c op
+ /// must be a thread safe function/functor. The signature of the functor
+ /// should be:
+ /// \code
+ /// value_type op(const range_type&)
+ /// \endcode
+ /// For example, in the following code, the array tiles are initialized with
+ /// random numbers from 0 to 1:
+ /// \code
+ /// array.init_tiles([] (const TiledArray::Range& range) ->
+ /// TiledArray::Tensor<double>
+ /// {
+ /// // Initialize the tile with the given range object
+ /// TiledArray::Tensor<double> tile(range);
+ ///
+ /// // Initialize the random number generator
+ /// std::default_random_engine generator;
+ /// std::uniform_real_distribution<double> distribution(0.0,1.0);
+ ///
+ /// // Fill the tile with random numbers
+ /// for(auto& value : tile)
+ /// value = distribution(generator);
+ ///
+ /// return tile;
+ /// });
+ /// \endcode
+ /// \tparam Op The type of the functor/function
+ /// \param[in] op The operation used to generate tiles
+ /// \param[in] skip_set If false, will throw if any tiles are already set
+ /// \return the total number of tiles that have been (or will be) initialized
+ /// \throw TiledArray::Exception if the PIMPL is not set. Strong throw
+ /// guarantee.
+ /// \throw TiledArray::Exception if a tile is already set and skip_set is
+ /// false.
Weak throw guarantee.
+ template <HostExecutor Exec, Fence fence, typename Op>
+ std::int64_t init_tiles(Op&& op, bool skip_set = false) {
+ // lifetime management of op depends on whether it is a lvalue ref (i.e. has
+ // an external owner) or an rvalue ref
+ // - if op is an lvalue ref: pass op to tasks
+ // - if op is an rvalue ref pass make_shared_function(op) to tasks
+ auto op_shared_handle = make_op_shared_handle(std::forward<Op>(op));
+
+ std::int64_t ntiles_initialized{0};
+ auto it = this->pmap()->begin();
+ const auto end = this->pmap()->end();
+ std::atomic<std::int64_t> ntask_completed{0};
+ for (; it != end; ++it) {
+ const auto& index = *it;
+ if (!this->is_zero(index)) {
+ if (skip_set) {
+ auto& fut = this->get_local(index);
+ if (fut.probe()) continue;
+ }
+ if constexpr (Exec == HostExecutor::MADWorld) {
+ Future<value_type> tile =
+ this->world().taskq.add([this_sptr = this->shared_from_this(),
+ index = ordinal_type(index),
+ op_shared_handle, this]() -> value_type {
+ return op_shared_handle(
+ this_sptr->trange().make_tile_range(index));
+ });
+ ++ntiles_initialized;
+ if constexpr (fence == Fence::Local) {
+ tile.register_callback(
+ new IncrementCounter<std::atomic<std::int64_t>>(
+ ntask_completed));
+ }
+ set(index, std::move(tile));
+ } else {
+ static_assert(Exec == HostExecutor::Thread);
+ set(index, op_shared_handle(this->trange().make_tile_range(index)));
+ ++ntiles_initialized;
+ }
+ }
+ }
+
+ if constexpr (fence == Fence::Local) {
+ if constexpr (Exec == HostExecutor::MADWorld) {
+ if (ntiles_initialized > 0)
+ this->world().await([&ntask_completed, ntiles_initialized]() {
+ return ntask_completed == ntiles_initialized;
+ });
+ }
+ } else if constexpr (fence == Fence::Global) {
+ this->world().gop.fence();
+ }
+ return ntiles_initialized;
+ }
}; // class ArrayImpl
+template <typename Tile, typename Policy>
+madness::AtomicInt ArrayImpl<Tile, Policy>::cleanup_counter_;
#ifndef TILEDARRAY_HEADER_ONLY
extern template class ArrayImpl<Tensor<double>, DensePolicy>;
@@ -662,6 +872,167 @@ extern template class ArrayImpl<Tensor<std::complex<double>>, SparsePolicy>;
#endif // TILEDARRAY_HEADER_ONLY
+template <typename Tile, typename Policy>
+void write_tile_block(madness::uniqueidT target_array_id,
+ std::size_t target_tile_ord,
+ const Tile& target_tile_contribution) {
+ auto* world_ptr = World::world_from_id(target_array_id.get_world_id());
+ auto target_array_ptr_opt =
+ world_ptr->ptr_from_id<typename ArrayImpl<Tile, Policy>::storage_type>(
+ target_array_id);
+ TA_ASSERT(target_array_ptr_opt);
+ TA_ASSERT((*target_array_ptr_opt)->is_local(target_tile_ord));
+ (*target_array_ptr_opt)
+ ->get_local(target_tile_ord)
+ .get()
+ .block(target_tile_contribution.range()) = target_tile_contribution;
+}
+
+template <typename Tile, typename Policy>
+std::shared_ptr<ArrayImpl<Tile, Policy>> make_with_new_trange(
+ const std::shared_ptr<ArrayImpl<Tile, Policy>>& source_array_sptr,
+ const TiledRange& target_trange,
+ typename ArrayImpl<Tile, Policy>::element_type new_value_fill =
+ typename ArrayImpl<Tile, Policy>::element_type{}) {
+ TA_ASSERT(source_array_sptr);
+ auto& source_array = *source_array_sptr;
+ auto& world = source_array.world();
+ const auto rank = source_array.trange().rank();
+ TA_ASSERT(rank == target_trange.rank());
+
+ // compute metadata
+ // - list of target tile indices and the corresponding Range1 for each 1-d
+ // source tile
+ using target_tiles_t = std::vector<std::pair<std::size_t, Range1>>;
+ using mode_target_tiles_t = std::vector<target_tiles_t>;
+ using all_target_tiles_t = std::vector<mode_target_tiles_t>;
+
+ all_target_tiles_t all_target_tiles(target_trange.rank());
+ // for each mode ...
+ for (auto d = 0; d != target_trange.rank(); ++d) {
+ mode_target_tiles_t& mode_target_tiles = all_target_tiles[d];
+ auto& target_tr1 = target_trange.dim(d);
+ auto& target_element_range = target_tr1.elements_range();
+ // ... and each tile in that mode ...
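+  //     (e.g. a source tile [0,4) against a target tiling {[0,3), [3,8)}
+  //     contributes {(0, [0,3)), (1, [3,4))}: the target tile index paired
+  //     with the overlapping element subrange)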
+ for (auto&& source_tile : source_array.trange().dim(d)) { + mode_target_tiles.emplace_back(); + auto& target_tiles = mode_target_tiles.back(); + auto source_tile_lo = source_tile.lobound(); + auto source_tile_up = source_tile.upbound(); + auto source_element_idx = source_tile_lo; + // ... find all target tiles that overlap with it + if (target_element_range.overlaps_with(source_tile)) { + while (source_element_idx < source_tile_up) { + if (target_element_range.includes(source_element_idx)) { + auto target_tile_idx = + target_tr1.element_to_tile(source_element_idx); + auto target_tile = target_tr1.tile(target_tile_idx); + auto target_lo = + std::max(source_element_idx, target_tile.lobound()); + auto target_up = std::min(source_tile_up, target_tile.upbound()); + target_tiles.emplace_back(target_tile_idx, + Range1(target_lo, target_up)); + source_element_idx = target_up; + } else if (source_element_idx < target_element_range.lobound()) { + source_element_idx = target_element_range.lobound(); + } else if (source_element_idx >= target_element_range.upbound()) + break; + } + } + } + } + + // estimate the shape, if sparse + // use max value for each nonzero tile, then will recompute after tiles are + // assigned + using shape_type = typename Policy::shape_type; + shape_type target_shape; + const auto& target_tiles_range = target_trange.tiles_range(); + if constexpr (!is_dense_v) { + // each rank computes contributions to the shape norms from its local tiles + Tensor target_shape_norms(target_tiles_range, 0); + auto& source_trange = source_array.trange(); + const auto e = source_array.cend(); + for (auto it = source_array.cbegin(); it != e; ++it) { + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); + } + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_idx[d] = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]].first; + } + target_shape_norms(target_tile_idx) = std::numeric_limits::max(); + } + } + world.gop.max(target_shape_norms.data(), target_shape_norms.size()); + target_shape = SparseShape(target_shape_norms, target_trange); + } + + using Array = ArrayImpl; + auto target_array_sptr = std::shared_ptr( + new Array( + source_array.world(), target_trange, target_shape, + Policy::default_pmap(world, target_trange.tiles_range().volume())), + Array::lazy_deleter); + auto& target_array = *target_array_sptr; + target_array.init_tiles([value = new_value_fill](const Range& range) { + return typename Array::value_type(range, value); + }); + target_array.world().gop.fence(); + + // loop over local tiles and send their contributions to the targets + { + auto& source_trange = source_array.trange(); + const auto e = source_array.cend(); + auto& target_tiles_range = target_trange.tiles_range(); + for (auto it = source_array.cbegin(); it != e; ++it) { + const auto& source_tile = *it; + auto source_tile_idx = it.index(); + + // make range for iterating over all possible target tile idx combinations + TA::Index target_tile_ord_extent_range(rank); + for (auto d = 0; d != rank; ++d) { + target_tile_ord_extent_range[d] = + all_target_tiles[d][source_tile_idx[d]].size(); +
} + + // loop over every target tile combination + TA::Range target_tile_ord_extent(target_tile_ord_extent_range); + for (auto& target_tile_ord : target_tile_ord_extent) { + TA::Index target_tile_idx(rank); + container::svector target_tile_rngs1(rank); + for (auto d = 0; d != rank; ++d) { + std::tie(target_tile_idx[d], target_tile_rngs1[d]) = + all_target_tiles[d][source_tile_idx[d]][target_tile_ord[d]]; + } + TA_ASSERT(source_tile.future().probe()); + Tile target_tile_contribution( + source_tile.get().block(target_tile_rngs1)); + auto target_tile_idx_ord = target_tiles_range.ordinal(target_tile_idx); + auto target_proc = target_array.pmap()->owner(target_tile_idx_ord); + world.taskq.add(target_proc, &write_tile_block, + target_array.id(), target_tile_idx_ord, + target_tile_contribution); + } + } + } + // data is mutated in place, so must wait for all tasks to complete + target_array.world().gop.fence(); + // WARNING!! need to truncate in DistArray ctor + + return target_array_sptr; +} + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/block_range.h b/src/TiledArray/block_range.h index 08096c96ea..06f8ecd629 100644 --- a/src/TiledArray/block_range.h +++ b/src/TiledArray/block_range.h @@ -85,7 +85,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -132,7 +132,7 @@ class BlockRange : public Range { upper[d] = upper_bound_d; // Check input dimensions TA_ASSERT(lower[d] >= range.lobound(d)); - TA_ASSERT(lower[d] < upper[d]); + TA_ASSERT(lower[d] <= upper[d]); TA_ASSERT(upper[d] <= range.upbound(d)); extent[d] = upper[d] - lower[d]; TA_ASSERT(extent[d] == @@ -177,9 +177,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound A sequence of lower bounds for each dimension /// \param upper_bound A sequence of upper bounds for each dimension + /// \note Zero-extent blocks along any mode are possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. - /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -204,9 +205,10 @@ class BlockRange : public Range { /// \param range the host Range /// \param lower_bound An initializer list of lower bounds for each dimension /// \param upper_bound An initializer list of upper bounds for each dimension + /// \note Zero-extent blocks along any mode are possible, i.e. `lower_bound[d] == upper_bound[d]` is supported /// \throw TiledArray::Exception When the size of \p lower_bound is not /// equal to that of \p upper_bound. - /// \throw TiledArray::Exception When `lower_bound[i] >= upper_bound[i]` + /// \throw TiledArray::Exception When `lower_bound[i] > upper_bound[i]` // clang-format on template && @@ -247,7 +249,8 @@ class BlockRange : public Range { /// \endcode /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v ) /// \param bounds A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bounds[i].lower>=bounds[i].upper` for any \c i . + /// \note Zero-extent blocks along any mode are possible, i.e.
`bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -264,8 +267,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {std::make_pair(0,4), std::pair{1,6}, std::pair(2,8)}); /// \endcode /// \tparam GPair a generalized pair of integral types - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode are possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template BlockRange(const Range& range, const std::initializer_list& bounds, @@ -290,8 +294,9 @@ class BlockRange : public Range { /// BlockRange br0(r, {{0,4}, {1,6}, {2,8}}); /// \endcode /// \tparam Index An integral type - /// \param bound A range of {lower,upper} bounds for each dimension - /// \throw TiledArray::Exception When `bound[i].lower>=bound[i].upper` for any \c i . + /// \param bounds A range of {lower,upper} bounds for each dimension + /// \note Zero-extent blocks along any mode are possible, i.e. `bounds[d].lower == bounds[d].upper` is supported + /// \throw TiledArray::Exception When `bounds[i].lower>bounds[i].upper` for any \c i . // clang-format on template >> @@ -354,6 +359,8 @@ class BlockRange : public Range { /// \return The ordinal index in the /// \throw TiledArray::Exception When \c index is not included in this range ordinal_type ordinal(ordinal_type ord) const { + // ordinals are useless for zero-volume ranges + TA_ASSERT(volume() != 0); // Check that ord is contained by this range. TA_ASSERT(Range::includes_ordinal(ord)); @@ -414,7 +421,7 @@ class BlockRange : public Range { template void serialize(Archive& ar) const { Range::serialize(ar); - ar& block_offset_; + ar & block_offset_; } }; // BlockRange diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in index 0c4d5d5cbc..483847067f 100644 --- a/src/TiledArray/config.h.in +++ b/src/TiledArray/config.h.in @@ -76,7 +76,20 @@ /* Define if TiledArray configured with CUDA support */ #cmakedefine TILEDARRAY_HAS_CUDA @TILEDARRAY_HAS_CUDA@ -#cmakedefine TILEDARRAY_CHECK_CUDA_ERROR @TILEDARRAY_CHECK_CUDA_ERROR@ + +/* Define if TiledArray configured with HIP support */ +#cmakedefine TILEDARRAY_HAS_HIP @TILEDARRAY_HAS_HIP@ + +// Umpire and LibreTT are limited to 1 device runtime at a time, so is TA +#if defined(TILEDARRAY_HAS_HIP) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME HIP +# define TILEDARRAY_DEVICE_RUNTIME_STR "HIP" +#elif defined(TILEDARRAY_HAS_CUDA) +# define TILEDARRAY_HAS_DEVICE 1 +# define TILEDARRAY_DEVICE_RUNTIME CUDA +# define TILEDARRAY_DEVICE_RUNTIME_STR "CUDA" +#endif /* Is TA::Tensor memory profiling enabled? */ #cmakedefine TA_TENSOR_MEM_PROFILE 1 @@ -100,6 +113,8 @@ #endif // !defined(TILEDARRAY_HAS_BTAS) #if defined(TILEDARRAY_HAS_BTAS) && defined(BTAS_HAS_INTEL_MKL) # define TILEDARRAY_HAS_INTEL_MKL +/* use fair dispatch in Intel MKL? */ +#cmakedefine IntelMKL_FAIR_DISPATCH #endif /* Add macro TILEDARRAY_FORCE_INLINE which does as the name implies.
*/ @@ -161,6 +176,9 @@ #cmakedefine TA_ENABLE_TILE_OPS_LOGGING 1 #define TA_TILE_OPS_LOG_LEVEL 0@TA_TILE_OPS_LOG_LEVEL@ +/* Enables collection of communication statistics for global objects (DistEval and DistributedStorage) */ +#cmakedefine TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE 1 + /* ----------- pragma helpers ---------------*/ #define TILEDARRAY_PRAGMA(x) _Pragma(#x) /* same as TILEDARRAY_PRAGMA(x), but expands x */ diff --git a/src/TiledArray/conversions/btas.h b/src/TiledArray/conversions/btas.h index 28e5790e8f..ab07e97b53 100644 --- a/src/TiledArray/conversions/btas.h +++ b/src/TiledArray/conversions/btas.h @@ -36,6 +36,9 @@ #include #include +#include +#include + namespace TiledArray { // clang-format off @@ -49,11 +52,12 @@ namespace TiledArray { /// \tparam Storage_ The storage type of the source btas::Tensor object /// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, /// optionally wrapped into TiledArray::Tile) -/// \param[in] src The source object; its subblock defined by the {lower,upper} -/// bounds \c {dst.lobound(),dst.upbound()} will be copied to \c dst +/// \param[in] src The source object; its subblock +/// `{dst.lobound(),dst.upbound()}` +/// will be copied to \c dst /// \param[out] dst The object that will contain the contents of the /// corresponding subblock of src -/// \throw TiledArray::Exception When the dimensions of \c src and \c dst do not +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not /// match. // clang-format on template @@ -73,6 +77,57 @@ inline void btas_subtensor_to_tensor( dst_view = src_view; } +// clang-format off +/// Copy a block of a btas::Tensor into a TiledArray::Tensor + +/// A block of btas::Tensor \c src will be copied into TiledArray::Tensor \c +/// dst. The block dimensions will be determined by the dimensions of the range +/// of \c dst . +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the source btas::Tensor object +/// \tparam Storage_ The storage type of the source btas::Tensor object +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \param[in] src The source object; its subblock +/// `{dst.lobound() + offset,dst.upbound() + offset}` +/// will be copied to \c dst +/// \param[out] dst The object that will contain the contents of the +/// corresponding subblock of src +/// \param[in] offset the offset to be applied to the coordinates of `dst.range()` to determine the block in \p src to be copied; this is needed if the DistArray that will contain \p dst will have a range whose lobound is different from `src.lobound()` +/// \throw TiledArray::Exception When the dimensions of \p src and \p dst do not +/// match.
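+/// \par
+/// A minimal usage sketch (hypothetical values, not taken from the source):
+/// assume a 0-based rank-2 btas::Tensor \c src and a destination DistArray
+/// whose element range starts at {10,10}; for a tile \c dst spanning
+/// `[12,14) x [12,14)` the offset is `src.lobound() - array.lobound()`:
+/// \code
+/// std::array<long, 2> offset{-10, -10};
+/// // copies the subblock [2,4) x [2,4) of src into dst
+/// btas_subtensor_to_tensor(src, dst, offset);
+/// \endcode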
+// clang-format on +template < + typename T, typename Range_, typename Storage_, typename Tensor_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void btas_subtensor_to_tensor( + const btas::Tensor& src, Tensor_& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto src_blk_range = + TiledArray::BlockRange(detail::make_ta_range(src_range), + ranges::views::zip(dst_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(dst_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_blk_range); + auto dst_view = TiledArray::make_map(data(dst), dst_range); + + dst_view = src_view; +} + // clang-format off /// Copy a TiledArray::Tensor into a block of a btas::Tensor @@ -86,8 +141,8 @@ inline void btas_subtensor_to_tensor( /// \tparam Storage_ The storage type of the destination btas::Tensor object /// \param[in] src The source object whose contents will be copied into /// a subblock of \c dst -/// \param[out] dst The destination object; its subblock defined by the -/// {lower,upper} bounds \c {src.lobound(),src.upbound()} will be +/// \param[out] dst The destination object; its subblock +/// `{src.lobound(),src.upbound()}` will be /// overwritten with the content of \c src /// \throw TiledArray::Exception When the dimensions /// of \c src and \c dst do not match. @@ -109,6 +164,57 @@ inline void tensor_to_btas_subtensor(const Tensor_& src, dst_view = src_view; } +// clang-format off +/// Copy a TiledArray::Tensor into a block of a btas::Tensor + +/// TiledArray::Tensor \c src will be copied into a block of btas::Tensor +/// \c dst. The block dimensions will be determined by the dimensions of the range +/// of \c src . +/// \tparam Tensor_ A tensor type (e.g., TiledArray::Tensor or btas::Tensor, +/// optionally wrapped into TiledArray::Tile) +/// \tparam T The tensor element type +/// \tparam Range_ The range type of the destination btas::Tensor object +/// \tparam Storage_ The storage type of the destination btas::Tensor object +/// \param[in] src The source object whose contents will be copied into +/// a subblock of \c dst +/// \param[out] dst The destination object; its subblock +/// `{src.lobound()+offset,src.upbound()+offset}` will be +/// overwritten with the content of \c src +/// \param[in] offset the offset to be applied to the coordinates of `src.range()` to determine the block in \p dst to be copied; this is needed if the DistArray that contains \p src has a range whose lobound is different from `dst.lobound()` +/// \throw TiledArray::Exception When the dimensions +/// of \c src and \c dst do not match.
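+/// \par
+/// A minimal usage sketch (hypothetical values, not taken from the source):
+/// for a tile \c src spanning `[12,14) x [12,14)` of a DistArray whose element
+/// range starts at {10,10}, writing into a 0-based btas::Tensor \c dst uses
+/// `offset = dst.lobound() - array.lobound()`:
+/// \code
+/// std::array<long, 2> offset{-10, -10};
+/// // overwrites the subblock [2,4) x [2,4) of dst with the contents of src
+/// tensor_to_btas_subtensor(src, dst, offset);
+/// \endcode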
+// clang-format on +template < + typename Tensor_, typename T, typename Range_, typename Storage_, + typename IntegerRange, + typename = std::enable_if_t>> +inline void tensor_to_btas_subtensor(const Tensor_& src, + btas::Tensor& dst, + IntegerRange&& offset) { + TA_ASSERT(dst.range().rank() == src.range().rank()); + TA_ASSERT(ranges::size(offset) == src.range().rank()); + + const auto& src_range = src.range(); + const auto& dst_range = dst.range(); + auto dst_blk_range = + TiledArray::BlockRange(detail::make_ta_range(dst_range), + ranges::views::zip(src_range.lobound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + }), + ranges::views::zip(src_range.upbound(), offset) | + ranges::views::transform([](auto&& i_j) { + auto&& [i, j] = i_j; + return i + j; + })); + using std::data; + auto src_view = TiledArray::make_const_map(data(src), src_range); + auto dst_view = TiledArray::make_map(data(dst), dst_blk_range); + + dst_view = src_view; +} + namespace detail { /// Task function for converting btas::Tensor subblock to a @@ -127,7 +233,13 @@ void counted_btas_subtensor_to_tensor(const BTAS_Tensor_* src, DistArray_* dst, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - btas_subtensor_to_tensor(*src, tensor); + auto offset = ranges::views::zip(ranges::views::all(src->range().lobound()), + dst->trange().elements_range().lobound()) | + ranges::views::transform([](const auto& s_d) { + auto&& [s, d] = s_d; + return s - d; + }); + btas_subtensor_to_tensor(*src, tensor, offset); dst->set(i, tensor); (*counter)++; } @@ -137,12 +249,24 @@ /// \tparam TA_Tensor_ a TiledArray::Tensor type /// \tparam BTAS_Tensor_ a btas::Tensor type /// \param src The source tensor -/// \param dst The destination tensor -/// \param counter The task counter -template -void counted_tensor_to_btas_subtensor(const TA_Tensor_& src, BTAS_Tensor_* dst, +/// \param src_array_lobound the lobound of the DistArray that contains src, +/// used to compute the offset to be applied to the coordinates of `src.range()` +/// to determine the block in \p dst to be copied into +/// \param dst The destination tensor +/// \param counter The task counter +template < + typename TA_Tensor_, typename BTAS_Tensor_, typename IntegerRange, + typename = std::enable_if_t>> +void counted_tensor_to_btas_subtensor(const TA_Tensor_& src, + IntegerRange src_array_lobound, + BTAS_Tensor_* dst, madness::AtomicInt* counter) { - tensor_to_btas_subtensor(src, *dst); + auto offset = ranges::views::zip(ranges::views::all(dst->range().lobound()), + src_array_lobound) | + ranges::views::transform([](const auto& d_s) { + auto&& [d, s] = d_s; + return d - s; + }); + tensor_to_btas_subtensor(src, *dst, offset); + (*counter)++; } @@ -267,41 +391,14 @@ DistArray_ btas_tensor_to_array( return array; } -/// Convert a TiledArray::DistArray object into a btas::Tensor object +namespace detail { -/// This function will copy the contents of \c src into a \c btas::Tensor -/// object. The copy operation is done in parallel, and this function will block -/// until all elements of \c src have been copied into the result array tiles. -/// The size of \c src.world().size() must be equal to 1 or \c src must be a -/// replicated TiledArray::DistArray. Usage: -/// \code -/// TiledArray::TArrayD -/// array(world, trange); -/// // Set tiles of array ...
-/// -/// auto t = array_to_btas_tensor(array); -/// \endcode -/// \tparam Tile the tile type of \c src -/// \tparam Policy the policy type of \c src -/// \tparam Range_ the range type of the result (either, btas::RangeNd or -/// TiledArray::Range) -/// \tparam Storage_ the storage type of the result -/// \param[in] src The TiledArray::DistArray object whose contents -/// will be copied to the result. -/// \return A \c btas::Tensor object that is a copy of \c src -/// \throw TiledArray::Exception When world size is greater than -/// 1 and \c src is not replicated -/// \param[in] target_rank the rank on which to create the BTAS tensor -/// containing the data of \c src ; if \c target_rank=-1 then -/// create the BTAS tensor on every rank (this requires -/// that \c src.is_replicated()==true ) -/// \return BTAS tensor object containing the data of \c src , if my rank equals -/// \c target_rank or \c target_rank==-1 , -/// default-initialized BTAS tensor otherwise. +/// \sa TiledArray::array_to_btas_tensor() template > -btas::Tensor array_to_btas_tensor( - const TiledArray::DistArray& src, int target_rank = -1) { +btas::Tensor +array_to_btas_tensor_impl(const TiledArray::DistArray& src, + const Range_& result_range, int target_rank) { // Test preconditions if (target_rank == -1 && src.world().size() > 1 && !src.pmap()->is_replicated()) @@ -314,13 +411,11 @@ btas::Tensor array_to_btas_tensor( using result_type = btas::Tensor::element_type, Range_, Storage_>; - using result_range_type = typename result_type::range_type; // Construct the result if (target_rank == -1 || src.world().rank() == target_rank) { // if array is sparse must initialize to zero - result_type result( - result_range_type(src.trange().elements_range().extent()), 0.0); + result_type result(result_range, 0.0); // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; @@ -329,8 +424,12 @@ btas::Tensor array_to_btas_tensor( for (std::size_t i = 0; i < src.size(); ++i) { if (!src.is_zero(i)) { src.world().taskq.add( - &detail::counted_tensor_to_btas_subtensor, - src.find(i), &result, &counter); + &detail::counted_tensor_to_btas_subtensor< + Tile, result_type, + std::decay_t< + decltype(src.trange().elements_range().lobound())>>, + src.find(i), src.trange().elements_range().lobound(), &result, + &counter); ++n; } } @@ -343,6 +442,59 @@ btas::Tensor array_to_btas_tensor( return result_type{}; } +} // namespace detail + +/// Convert a TiledArray::DistArray object into a btas::Tensor object + +/// This function will copy the contents of \c src into a \c btas::Tensor +/// object. The copy operation is done in parallel, and this function will block +/// until all elements of \c src have been copied into the result array tiles. +/// The size of \c src.world().size() must be equal to 1 or \c src must be a +/// replicated TiledArray::DistArray. Usage: +/// \code +/// TiledArray::TArrayD +/// array(world, trange); +/// // Set tiles of array ... +/// +/// auto t = array_to_btas_tensor(array); +/// \endcode +/// \tparam Tile the tile type of \c src +/// \tparam Policy the policy type of \c src +/// \tparam Range_ the range type of the result (either, btas::RangeNd or +/// TiledArray::Range) +/// \tparam Storage_ the storage type of the result +/// \param[in] src The TiledArray::DistArray object whose contents +/// will be copied to the result. 
+/// \param[in] target_rank the rank on which to create the BTAS tensor +/// containing the data of \c src ; if \c target_rank=-1 then +/// create the BTAS tensor on every rank (this requires +/// that \c src.is_replicated()==true ) +/// \return BTAS tensor object containing the data of \c src , if my rank equals +/// \c target_rank or \c target_rank==-1 , +/// default-initialized BTAS tensor otherwise. +/// \warning The range of \c src is +/// not preserved, i.e. the lobound of the result is zero. Use the +/// variant of this function tagged with preserve_lobound_t to +/// preserve the range. +/// \throw TiledArray::Exception When world size is greater than +/// 1 and \c src is not replicated +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, int target_rank = -1) { + return detail::array_to_btas_tensor_impl( + src, Range_(src.trange().elements_range().extent()), target_rank); +} + +template > +btas::Tensor array_to_btas_tensor( + const TiledArray::DistArray& src, preserve_lobound_t, + int target_rank = -1) { + return detail::array_to_btas_tensor_impl(src, src.trange().elements_range(), + target_rank); +} + } // namespace TiledArray #endif // TILEDARRAY_CONVERSIONS_BTAS_H__INCLUDED diff --git a/src/TiledArray/conversions/clone.h b/src/TiledArray/conversions/clone.h index b8c05df840..910d86e21d 100644 --- a/src/TiledArray/conversions/clone.h +++ b/src/TiledArray/conversions/clone.h @@ -26,6 +26,10 @@ #ifndef TILEDARRAY_CONVERSIONS_CLONE_H__INCLUDED #define TILEDARRAY_CONVERSIONS_CLONE_H__INCLUDED +#ifdef TILEDARRAY_HAS_DEVICE +#include "TiledArray/device/device_task_fn.h" +#endif + namespace TiledArray { /// Forward declarations @@ -53,12 +57,28 @@ inline DistArray clone(const DistArray& arg) { if (arg.is_zero(index)) continue; // Spawn a task to clone the tiles - Future tile = world.taskq.add( - [](const value_type& tile) -> value_type { - using TiledArray::clone; - return clone(tile); - }, - arg.find(index)); + + Future tile; + if constexpr (!detail::is_device_tile_v) { + tile = world.taskq.add( + [](const value_type& tile) -> value_type { + using TiledArray::clone; + return clone(tile); + }, + arg.find(index)); + } else { +#ifdef TILEDARRAY_HAS_DEVICE + tile = madness::add_device_task( + world, + [](const value_type& tile) -> value_type { + using TiledArray::clone; + return clone(tile); + }, + arg.find(index)); +#else + abort(); // unreachable +#endif + } // Store result tile result.set(index, tile); diff --git a/src/TiledArray/conversions/concat.h b/src/TiledArray/conversions/concat.h index 7c440c54e2..e7b3e9da55 100644 --- a/src/TiledArray/conversions/concat.h +++ b/src/TiledArray/conversions/concat.h @@ -64,7 +64,7 @@ DistArray concat( using std::begin; using std::end; - index b(r), e(r); // updated for concatted modes only + index b(r), e(r); // updated for concatenated modes only std::fill(begin(b), end(b), 0); for (auto i = 0ul; i != arrays.size(); ++i) { auto& tr = arrays[i].trange(); @@ -92,8 +92,12 @@ DistArray concat( DistArray result(*target_world, tr); const auto annot = detail::dummy_annotation(r); for (auto i = 0ul; i != arrays.size(); ++i) { - result(annot).block(tile_begin_end[i].first, tile_begin_end[i].second) = - arrays[i](annot); + if (arrays[i].trange().tiles_range().volume() != + 0) { // N.B. 
empty block range expression bug workaround + result.make_tsrexpr(annot).block(tile_begin_end[i].first, + tile_begin_end[i].second) = + arrays[i].make_tsrexpr(annot); + } } result.world().gop.fence(); diff --git a/src/TiledArray/conversions/dense_to_sparse.h b/src/TiledArray/conversions/dense_to_sparse.h index e5c23cf5ba..6147c01a56 100644 --- a/src/TiledArray/conversions/dense_to_sparse.h +++ b/src/TiledArray/conversions/dense_to_sparse.h @@ -27,7 +27,7 @@ to_sparse(DistArray const &dense_array) { const auto begin = dense_array.begin(); for (auto it = begin; it != end; ++it) { // write the norm of each local tile to the tensor - norm(it->get(), tile_norms[it.ordinal()]); + norm(it->get(), tile_norms[it.index()]); } // Construct a sparse shape the constructor will handle communicating the @@ -40,9 +40,9 @@ to_sparse(DistArray const &dense_array) { // sparse_array set the sparse array tile with a clone so as not to hold // a pointer to the original tile. for (auto it = begin; it != end; ++it) { - const auto ord = it.ordinal(); - if (!sparse_array.is_zero(ord)) { - sparse_array.set(ord, it->get().clone()); + const auto ix = it.index(); + if (!sparse_array.is_zero(ix)) { + sparse_array.set(ix, it->get().clone()); } } diff --git a/src/TiledArray/conversions/eigen.h b/src/TiledArray/conversions/eigen.h index 816a8bfe24..3caeecc178 100644 --- a/src/TiledArray/conversions/eigen.h +++ b/src/TiledArray/conversions/eigen.h @@ -196,20 +196,26 @@ eigen_map(T& tensor) { /// Copy a block of an Eigen matrix into a tensor -/// A block of \c matrix will be copied into \c tensor. The block -/// dimensions will be determined by the dimensions of the tensor's range. +// clang-format off +/// A block of \c matrix will be copied into \c tensor. If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])` x `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] - base_offsets[1])`, +/// else it is `[tensor.range().lobound()[0] - base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. +/// /// \tparam T A tensor type, e.g. TiledArray::Tensor /// \tparam Derived The derived type of an Eigen matrix /// \param[in] matrix The object that will be assigned the content of \c tensor -/// \param[out] tensor The object that will be assigned the content of \c matrix +/// \param[out] tensor The object that will contain the block of \c matrix +/// \param[in] base_offsets The base offsets for the tensor range (should be the lobound of the array that will contain tensor as a tile) /// \throw TiledArray::Exception When the dimensions of \c tensor are not equal /// to 1 or 2. /// \throw TiledArray::Exception When the range of \c tensor is outside the /// range of \c matrix .
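+/// \par
+/// A minimal sketch (hypothetical values, not taken from the source): for a
+/// DistArray whose element range starts at {5,5}, a rank-2 tile spanning
+/// `[7,9) x [7,9)` is filled from the Eigen block starting at (2,2):
+/// \code
+/// // tensor.range() == [7,9) x [7,9), base_offsets == {5,5}
+/// eigen_submatrix_to_tensor(matrix, tensor, {5, 5});
+/// // equivalent to eigen_map(tensor, 2, 2) = matrix.block(2, 2, 2, 2)
+/// \endcode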
+// clang-format on template >* = nullptr> -inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, - T& tensor) { +inline void eigen_submatrix_to_tensor( + const Eigen::MatrixBase& matrix, T& tensor, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -223,60 +229,71 @@ inline void eigen_submatrix_to_tensor(const Eigen::MatrixBase& matrix, if (tensor.range().rank() == 2u) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; + const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy matrix eigen_map(tensor, tensor_extent_0, tensor_extent_1) = matrix.block( - tensor_lower_0, tensor_lower_1, tensor_extent_0, tensor_extent_1); + tensor_lower_0 - base_offsets[0], tensor_lower_1 - base_offsets[1], + tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - const std::size_t tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; // Check that matrix is a vector. TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the row vector to tensor eigen_map(tensor, 1, tensor_extent_0) = - matrix.block(0, tensor_lower_0, 1, tensor_extent_0); + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy the column vector to tensor eigen_map(tensor, tensor_extent_0, 1) = - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1); + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1); } } } /// Copy the content of a tensor into an Eigen matrix block -/// The content of tensor will be copied into a block of matrix. The block -/// dimensions will be determined by the dimensions of the tensor's range. -/// \tparam T A tensor type, e.g. 
TiledArray::Tensor -/// \tparam Derived The derived type of an Eigen matrix -/// \param[in] tensor The object that will be copied to \c matrix -/// \param[out] matrix The object that will be assigned the content of \c tensor -/// \throw TiledArray::Exception When the dimensions of \c tensor are not equal -/// to 1 or 2. -/// \throw TiledArray::Exception When the range of \c tensor is outside the -/// range of \c matrix . +/// The content of tensor will be copied into a block of matrix. +/// If `tensor.rank()==2` +/// the block is `[tensor.range().lobound()[0] - base_offsets[0], +/// tensor.range().upbound()[0] - base_offsets[0]) x +/// `[tensor.range().lobound()[1] - base_offsets[1], tensor.range().upbound()[1] +/// - base_offsets[1])`, else it is `[tensor.range().lobound()[0] - +/// base_offsets[0], tensor.range().upbound()[0] - base_offsets[0])`. \tparam T +/// A tensor type, e.g. TiledArray::Tensor \tparam Derived The derived type of +/// an Eigen matrix \param[in] tensor The object that will be copied to \c +/// matrix \param[out] matrix The object that will be assigned the content of \c +/// tensor \param[in] base_offsets The base offsets for the tensor range (should +/// be lobound of the array that will contain tensor as a tile) \throw +/// TiledArray::Exception When the dimensions of \c tensor are not equal to 1 +/// or 2. \throw TiledArray::Exception When the range of \c tensor is outside +/// the range of \c matrix . template >* = nullptr> -inline void tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase& matrix) { +inline void tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase& matrix, + std::array base_offsets = {0, 0}) { [[maybe_unused]] typedef typename T::index1_type size_type; TA_ASSERT((tensor.range().rank() == 2u) || (tensor.range().rank() == 1u)); @@ -290,39 +307,44 @@ inline void tensor_to_eigen_submatrix(const T& tensor, if (tensor.range().rank() == 2) { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - const std::size_t tensor_lower_1 = tensor_lower[1]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - [[maybe_unused]] const std::size_t tensor_upper_1 = tensor_upper[1]; - const std::size_t tensor_extent_0 = tensor_extent[0]; - const std::size_t tensor_extent_1 = tensor_extent[1]; - - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); - TA_ASSERT(tensor_upper_1 <= std::size_t(matrix.cols())); + const size_type tensor_lower_0 = tensor_lower[0]; + const size_type tensor_lower_1 = tensor_lower[1]; + [[maybe_unused]] const size_type tensor_upper_0 = tensor_upper[0]; + [[maybe_unused]] const size_type tensor_upper_1 = tensor_upper[1]; + const size_type tensor_extent_0 = tensor_extent[0]; + const size_type tensor_extent_1 = tensor_extent[1]; + + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_extent_1 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); + TA_ASSERT(tensor_lower_1 >= base_offsets[1]); // Copy tensor into matrix - matrix.block(tensor_lower_0, tensor_lower_1, tensor_extent_0, + matrix.block(tensor_lower_0 - base_offsets[0], + tensor_lower_1 - base_offsets[1], tensor_extent_0, tensor_extent_1) = eigen_map(tensor, tensor_extent_0, tensor_extent_1); } else { // Get tensor range data - const std::size_t tensor_lower_0 = tensor_lower[0]; - [[maybe_unused]] const std::size_t tensor_upper_0 = tensor_upper[0]; - const std::size_t tensor_extent_0 = tensor_extent[0]; + const size_type tensor_lower_0 = tensor_lower[0]; + [[maybe_unused]] 
const size_type tensor_upper_0 = tensor_upper[0]; + const size_type tensor_extent_0 = tensor_extent[0]; TA_ASSERT((matrix.rows() == 1) || (matrix.cols() == 1)); if (matrix.rows() == 1) { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.cols())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.cols())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into row vector - matrix.block(0, tensor_lower_0, 1, tensor_extent_0) = + matrix.block(0, tensor_lower_0 - base_offsets[0], 1, tensor_extent_0) = eigen_map(tensor, 1, tensor_extent_0); } else { - TA_ASSERT(tensor_upper_0 <= std::size_t(matrix.rows())); + TA_ASSERT(tensor_extent_0 <= size_type(matrix.rows())); + TA_ASSERT(tensor_lower_0 >= base_offsets[0]); // Copy tensor into column vector - matrix.block(tensor_lower_0, 0, tensor_extent_0, 1) = + matrix.block(tensor_lower_0 - base_offsets[0], 0, tensor_extent_0, 1) = eigen_map(tensor, tensor_extent_0, 1); } } @@ -344,7 +366,12 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, const typename A::ordinal_type i, madness::AtomicInt* counter) { typename A::value_type tensor(array->trange().make_tile_range(i)); - eigen_submatrix_to_tensor(*matrix, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array->trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + eigen_submatrix_to_tensor(*matrix, tensor, array_lobound); array->set(i, tensor); (*counter)++; } @@ -357,10 +384,11 @@ void counted_eigen_submatrix_to_tensor(const Eigen::MatrixBase* matrix, /// \param tensor The tensor to be copied /// \param counter The task counter template -void counted_tensor_to_eigen_submatrix(const T& tensor, - Eigen::MatrixBase* matrix, - madness::AtomicInt* counter) { - tensor_to_eigen_submatrix(tensor, *matrix); +void counted_tensor_to_eigen_submatrix( + const T& tensor, Eigen::MatrixBase* matrix, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_submatrix(tensor, *matrix, base_offsets); (*counter)++; } @@ -524,6 +552,12 @@ array_to_eigen(const DistArray& array) { EigenMatrix matrix = EigenMatrix::Zero(array_extent[0], (rank == 2 ? array_extent[1] : 1)); + // array lobound, in case not base-0 + const auto* range_lobound_data = + array.trange().elements_range().lobound_data(); + std::array array_lobound{ + {range_lobound_data[0], range_lobound_data[1]}}; + // Spawn tasks to copy array tiles to the Eigen matrix madness::AtomicInt counter; counter = 0; @@ -533,7 +567,7 @@ array_to_eigen(const DistArray& array) { array.world().taskq.add( &detail::counted_tensor_to_eigen_submatrix< EigenMatrix, typename DistArray::value_type>, - array.find(i), &matrix, &counter); + array.find(i), &matrix, array_lobound, &counter); ++n; } } @@ -565,6 +599,7 @@ array_to_eigen(const DistArray& array) { /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -634,6 +669,7 @@ inline A row_major_buffer_to_array( /// // Create a range for the new array object /// std::vector blocks; /// for(std::size_t i = 0ul; i <= 100ul; i += 10ul) +/// // N.B. 
can create non-0-base range, replace i -> i + base_offse /// blocks.push_back(i); /// std::array blocks2 = /// {{ TiledArray::TiledRange1(blocks.begin(), blocks.end()), @@ -705,11 +741,13 @@ inline A column_major_buffer_to_array( /// match. // clang-format on template + typename Tensor_, std::size_t NumIndices_Sz = NumIndices_> inline void eigen_subtensor_to_tensor( const Eigen::Tensor& src, - Tensor_& dst) { + Tensor_& dst, + std::array base_offsets = {}) { TA_ASSERT(dst.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -718,6 +756,13 @@ inline void eigen_subtensor_to_tensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -725,8 +770,8 @@ inline void eigen_subtensor_to_tensor( }; const auto& dst_range = dst.range(); - auto src_block = - src.slice(to_array(dst_range.lobound()), to_array(dst_range.extent())); + auto src_block = src.slice(to_base0(to_array(dst_range.lobound())), + to_array(dst_range.extent())); auto dst_eigen_map = Eigen::TensorMap< Eigen::Tensor>( dst.data(), to_array(dst_range.extent())); @@ -758,11 +803,13 @@ inline void eigen_subtensor_to_tensor( /// of \c src and \c dst do not match. // clang-format on template + typename IndexType_, std::size_t NumIndices_Sz = NumIndices_> inline void tensor_to_eigen_subtensor( const Tensor_& src, - Eigen::Tensor& dst) { + Eigen::Tensor& dst, + std::array base_offsets = {}) { TA_ASSERT(src.range().rank() == NumIndices_); + static_assert(NumIndices_Sz == NumIndices_); auto to_array = [](const auto& seq) { TA_ASSERT(seq.size() == NumIndices_); @@ -771,6 +818,13 @@ inline void tensor_to_eigen_subtensor( return result; }; + auto to_base0 = [&](const auto& arr) { + TA_ASSERT(arr.size() == NumIndices_); + std::array result; + for (int i = 0; i < NumIndices_; ++i) result[i] = arr[i] - base_offsets[i]; + return result; + }; + [[maybe_unused]] auto reverse_extent_indices = []() { std::array result; std::iota(result.rbegin(), result.rend(), 0); @@ -778,8 +832,8 @@ inline void tensor_to_eigen_subtensor( }; const auto& src_range = src.range(); - auto dst_block = - dst.slice(to_array(src_range.lobound()), to_array(src_range.extent())); + auto dst_block = dst.slice(to_base0(to_array(src_range.lobound())), + to_array(src_range.extent())); auto src_eigen_map = Eigen::TensorMap< Eigen::Tensor>( src.data(), to_array(src_range.extent())); @@ -809,7 +863,13 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, const typename Range::index_type i, madness::AtomicInt* counter) { typename DistArray_::value_type tensor(dst->trange().make_tile_range(i)); - eigen_subtensor_to_tensor(*src, tensor); + // array lobound, in case not base-0 + const auto* range_lobound_data = + dst->trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + dst->trange().rank(), + array_lobound.begin()); + eigen_subtensor_to_tensor(*src, tensor, array_lobound); dst->set(i, tensor); (*counter)++; } @@ -822,10 +882,11 @@ void counted_eigen_subtensor_to_tensor(const Eigen_Tensor_* src, /// \param dst The destination tensor /// \param counter The task counter template -void counted_tensor_to_eigen_subtensor(const 
TA_Tensor_& src, - Eigen_Tensor_* dst, - madness::AtomicInt* counter) { - tensor_to_eigen_subtensor(src, *dst); +void counted_tensor_to_eigen_subtensor( + const TA_Tensor_& src, Eigen_Tensor_* dst, + std::array base_offsets, + madness::AtomicInt* counter) { + tensor_to_eigen_subtensor(src, *dst, base_offsets); (*counter)++; } @@ -1004,6 +1065,12 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, result_type result(src.trange().elements_range().extent()); result.setZero(); + const auto* range_lobound_data = + src.trange().elements_range().lobound_data(); + std::array array_lobound; + std::copy(range_lobound_data, range_lobound_data + src.trange().rank(), + array_lobound.begin()); + // Spawn tasks to copy array tiles to btas::Tensor madness::AtomicInt counter; counter = 0; @@ -1012,7 +1079,7 @@ Tensor array_to_eigen_tensor(const TiledArray::DistArray& src, if (!src.is_zero(i)) { src.world().taskq.add( &detail::counted_tensor_to_eigen_subtensor, - src.find(i), &result, &counter); + src.find(i), &result, array_lobound, &counter); ++n; } } diff --git a/src/TiledArray/conversions/foreach.h b/src/TiledArray/conversions/foreach.h index 9d219ac191..2c77c91a0f 100644 --- a/src/TiledArray/conversions/foreach.h +++ b/src/TiledArray/conversions/foreach.h @@ -283,11 +283,10 @@ inline std:: arg.trange().tiles_range(), 0); // Construct the task function used to construct the result tiles. - madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto op_shared_handle = make_op_shared_handle(std::forward(op)); - const auto task = [op_shared_handle, &counter, &tile_norms]( + const auto task = [op_shared_handle, &tile_norms]( const ordinal_type ord, const_if_t& arg_tile, const ArgTiles&... arg_tiles) -> result_value_type { @@ -295,7 +294,6 @@ inline std:: auto result_tile = op_caller(std::move(op_shared_handle), tile_norms.at_ordinal(ord), arg_tile, arg_tiles...); - ++counter; return result_tile; }; @@ -310,7 +308,9 @@ inline std:: continue; auto result_tile = world.taskq.add(task, ord, arg.find_local(ord), args.find(ord)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, use the (scaled) // norms of the first arg @@ -324,7 +324,9 @@ inline std:: auto result_tile = world.taskq.add(task, ord, detail::get_sparse_tile(ord, arg), detail::get_sparse_tile(ord, args)...); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new IncrementCounter(ntask_completed)); tiles.emplace_back(ord, std::move(result_tile)); if (op_returns_void) // if Op does not evaluate norms, find max // (scaled) norms of all args @@ -339,9 +341,10 @@ inline std:: } // Wait for tile norm data to be collected. 
- if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_created == ntask_completed; + }); // Construct the new array result_array_type result( @@ -463,7 +466,7 @@ inline std::enable_if_t, DistArray> foreach ( /// want to modify the elements of the array to be equal to the square /// root of the original value: /// \code -/// foreach(array, [] (TiledArray::TensorD& tile) { +/// foreach_inplace(array, [] (TiledArray::TensorD& tile) { /// tile.inplace_unary([&] (double& value) { value = std::sqrt(value); }); /// }); /// \endcode @@ -561,7 +564,7 @@ inline std::enable_if_t, DistArray> foreach ( /// example, if we want to modify the elements of the array to be equal to the /// square root of the original value: /// \code -/// foreach(array, [] (auto& tile) -> float { +/// foreach_inplace(array, [] (auto& tile) -> float { /// double norm_squared = 0.0; /// tile.inplace_unary([&] (double& value) { /// norm_squared += value; // Assume value >= 0 diff --git a/src/TiledArray/conversions/make_array.h b/src/TiledArray/conversions/make_array.h index cc2216e58a..1295e6f8e4 100644 --- a/src/TiledArray/conversions/make_array.h +++ b/src/TiledArray/conversions/make_array.h @@ -26,6 +26,7 @@ #ifndef TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED #define TILEDARRAY_CONVERSIONS_MAKE_ARRAY_H__INCLUDED +#include "TiledArray/array_impl.h" #include "TiledArray/external/madness.h" #include "TiledArray/shape.h" #include "TiledArray/type_traits.h" @@ -79,6 +80,10 @@ inline Array make_array( // Make an empty result array Array result(world, trange); + // Construct the task function used to construct the result tiles. + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; + // Iterate over local tiles of arg for (const auto index : *result.pmap()) { // Spawn a task to evaluate the tile @@ -89,11 +94,20 @@ inline Array make_array( return tile; }, trange.make_tile_range(index)); - + ++ntask_created; + tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); // Store result tile - result.set(index, tile); + result.set(index, std::move(tile)); } + // Wait for tile tasks to complete + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); + return result; } @@ -150,26 +164,28 @@ inline Array make_array( trange.tiles_range(), 0); // Construct the task function used to construct the result tiles. - madness::AtomicInt counter; - counter = 0; - int task_count = 0; + std::atomic ntask_completed{0}; + std::int64_t ntask_created{0}; auto task = [&](const ordinal_type index) -> value_type { value_type tile; - tile_norms[index] = op(tile, trange.make_tile_range(index)); - ++counter; + tile_norms.at_ordinal(index) = op(tile, trange.make_tile_range(index)); return tile; }; for (const auto index : *pmap) { auto result_tile = world.taskq.add(task, index); - ++task_count; + ++ntask_created; + result_tile.register_callback( + new detail::IncrementCounter( + ntask_completed)); tiles.emplace_back(index, std::move(result_tile)); } // Wait for tile norm data to be collected. 
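+  // N.B. completion is tracked by registering an IncrementCounter callback
+  // on each result future: the callback bumps ntask_completed when the
+  // future is set, and the await below spins until ntask_completed ==
+  // ntask_created, i.e. until every tile norm has been produced.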
- if (task_count > 0) - world.await( - [&counter, task_count]() -> bool { return counter == task_count; }); + if (ntask_created > 0) + world.await([&ntask_completed, ntask_created]() -> bool { + return ntask_completed == ntask_created; + }); // Construct the new array Array result(world, trange, diff --git a/src/TiledArray/conversions/retile.h b/src/TiledArray/conversions/retile.h index 26440166c4..9f0f4cab4a 100644 --- a/src/TiledArray/conversions/retile.h +++ b/src/TiledArray/conversions/retile.h @@ -22,8 +22,9 @@ #ifndef TILEDARRAY_RETILE_H #define TILEDARRAY_RETILE_H -#include "TiledArray/util/annotation.h" #include "TiledArray/special/diagonal_array.h" +#include "TiledArray/special/kronecker_delta.h" +#include "TiledArray/util/annotation.h" /// \name Retile function /// \brief Retiles a tensor with a provided TiledRange @@ -38,9 +39,11 @@ namespace TiledArray { -template -auto retile(const DistArray& tensor, - const TiledRange& new_trange) { +namespace detail { + +template +auto retile_v0(const DistArray& tensor, + const TiledRange& new_trange) { // Make sure ranks match auto rank = new_trange.rank(); auto tensor_rank = tensor.trange().rank(); @@ -67,11 +70,13 @@ auto retile(const DistArray& tensor, }; // Check the different dimensions and contract when needed - using tensor_type = DistArray; + using tensor_type = DistArray; auto start = detail::dummy_annotation(rank); tensor_type output_tensor; for (auto i = 0; i < rank; ++i) { - if (i == 0) { output_tensor(start) = tensor(start); } + if (i == 0) { + output_tensor(start) = tensor(start); + } if (new_trange.dim(i) != tensor.trange().dim(i)) { // Make identity for contraction TiledRange retiler{tensor.trange().dim(i), new_trange.dim(i)}; @@ -88,7 +93,80 @@ auto retile(const DistArray& tensor, return output_tensor; } -} // namespace TiledArray +template +auto retile_v1(const DistArray& tensor, + const TiledRange& new_trange) { + // Make sure ranks match + auto rank = new_trange.rank(); + auto tensor_rank = tensor.trange().rank(); + assert((rank == tensor_rank) && "TiledRanges are of different ranks"); + + // Makes the annotations for the contraction step + auto annotations = [&]() -> std::tuple { + std::ostringstream final, switcher; + final << "j0"; + switcher << "j0"; + for (unsigned int d = 1; d < rank; ++d) { + final << ",j" << d; + switcher << ",j" << d; + } + for (unsigned int d = 0; d < rank; ++d) { + switcher << ",i" << d; + } + return {final.str(), switcher.str()}; + }; + + // Check the different dimensions and contract when needed + using Array = DistArray; + container::svector retiler_ranges; + for (auto i = 0; i < rank; ++i) { + retiler_ranges.emplace_back(new_trange.dim(i)); + } + for (auto i = 0; i < rank; ++i) { + retiler_ranges.emplace_back(tensor.trange().dim(i)); + } + TA::TiledRange retiler_range(retiler_ranges); + TA::DistArray retiler( + tensor.world(), retiler_range, + SparseShape(kronecker_shape(retiler_range), retiler_range), + std::make_shared( + tensor.world(), retiler_range.tiles_range().volume())); + retiler.init_tiles([=](const TiledArray::Range& range) { + return KroneckerDeltaTile(range); + }); + + // Make indices for contraction + + // Retile + Array output; + auto start = detail::dummy_annotation(rank); + auto [finish, change] = annotations(); + output(finish) = retiler(change) * tensor(start); + + return output; +} + +template +auto retile_v2(const DistArray& source_array, + const TiledRange& target_trange) { + return DistArray(source_array, target_trange); +} + +} // namespace detail + +/// Creates a 
new DistArray with the same data as the input tensor, but with a +/// different trange. The primary use-case is to change tiling while keeping the +/// element range the same, but it can be used to select blocks of the data as +/// well as increasing the element range (with the new elements initialized to +/// zero) +/// \param array The DistArray whose data is to be retiled +/// \param target_trange The desired TiledRange of the output tensor +template +auto retile(const DistArray& array, + const TiledRange& target_trange) { + return detail::retile_v2(array, target_trange); +} +} // namespace TiledArray #endif // TILEDARRAY_RETILE_H diff --git a/src/TiledArray/conversions/sparse_to_dense.h b/src/TiledArray/conversions/sparse_to_dense.h index c5bdd812c5..7ee6e92049 100644 --- a/src/TiledArray/conversions/sparse_to_dense.h +++ b/src/TiledArray/conversions/sparse_to_dense.h @@ -52,8 +52,7 @@ to_dense(DistArray const& sparse_array) { Tile tile(sparse_array.find(ord).get().clone()); dense_array.set(ord, tile); } else { - // see DistArray::set(ordinal, element_type) - dense_array.set(ord, 0); + dense_array.set(ord, typename Tile::value_type{}); } } diff --git a/src/TiledArray/conversions/vector_of_arrays.h b/src/TiledArray/conversions/vector_of_arrays.h index 9de3bf8d09..29f4932ca5 100644 --- a/src/TiledArray/conversions/vector_of_arrays.h +++ b/src/TiledArray/conversions/vector_of_arrays.h @@ -5,7 +5,7 @@ #ifndef TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ #define TILEDARRAY_CONVERSIONS_VECTOR_OF_ARRAYS_H_ -#include +#include namespace TiledArray { diff --git a/src/TiledArray/cuda/btas_cublas.h b/src/TiledArray/cuda/btas_cublas.h deleted file mode 100644 index ea073d0a78..0000000000 --- a/src/TiledArray/cuda/btas_cublas.h +++ /dev/null @@ -1,622 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * July 24, 2018 - * - */ - -#ifndef TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED -#define TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED - -#include -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include - -#include -#include -#include -#include -#include - -namespace TiledArray { - -template >> -btas::Tensor btas_tensor_gemm_cuda_impl( - const btas::Tensor &left, - const btas::Tensor &right, Scalar factor, - const TiledArray::math::GemmHelper &gemm_helper) { - // Check that the arguments are not empty and have the correct ranks - TA_ASSERT(!left.empty()); - TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); - TA_ASSERT(!right.empty()); - TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); - - // Check that the inner dimensions of left and right match - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()), - std::cbegin(right.range().lobound()))); - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()), - std::cbegin(right.range().upbound()))); - TA_ASSERT(gemm_helper.left_right_congruent( - std::cbegin(left.range().extent()), std::cbegin(right.range().extent()))); - - // Compute gemm dimensions - using TiledArray::math::blas::integer; - integer m = 1, n = 1, k = 1; - gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); - - // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); - - T factor_t = T(factor); - T zero(0); - - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - - // typedef typename Tensor::storage_type storage_type; - auto result_range = - gemm_helper.make_result_range(left.range(), right.range()); - - auto &cuda_stream = detail::get_stream_based_on_range(result_range); - - // the result Tensor type - typedef btas::Tensor Tensor; - Tensor result; - - // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); - - // if stream is completed, use GPU - // if (stream_status == cudaSuccess) { - if (true) { - Storage result_storage; - make_device_storage(result_storage, result_range.area(), cuda_stream); - result = Tensor(std::move(result_range), std::move(result_storage)); - - // left and right are readonly!! 
- // cudaMemAdvise(device_data(left), left.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); - // cudaMemAdvise(device_data(right), right.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); - - // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - - CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), - to_cublas_op(gemm_helper.left_op()), n, m, k, - &factor_t, device_data(right.storage()), ldb, - device_data(left.storage()), lda, &zero, - device_data(result.storage()), n)); - - // wait for cuda calls to finish - // detail::thread_wait_cuda_stream(cuda_stream); - synchronize_stream(&cuda_stream); - } - // otherwise, use CPU - else { - Storage result_storage(result_range.area()); - result = Tensor(std::move(result_range), std::move(result_storage)); - - TiledArray::to_execution_space( - result.storage(), cuda_stream); - - // left and right are readonly!! - cudaMemAdvise(device_data(left), left.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); - cudaMemAdvise(device_data(right), right.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); - - // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - zero, result.data(), n); - } - - return result; -} - -template >> -void btas_tensor_gemm_cuda_impl( - btas::Tensor &result, - const btas::Tensor &left, - const btas::Tensor &right, Scalar factor, - const TiledArray::math::GemmHelper &gemm_helper) { - // Check that the result is not empty and has the correct rank - TA_ASSERT(!result.empty()); - TA_ASSERT(result.range().rank() == gemm_helper.result_rank()); - - // Check that the arguments are not empty and have the correct ranks - TA_ASSERT(!left.empty()); - TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); - TA_ASSERT(!right.empty()); - TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); - - // Check that the outer dimensions of left match the the corresponding - // dimensions in result - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_result_congruent(std::cbegin(left.range().lobound()), - std::cbegin(result.range().lobound()))); - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_result_congruent(std::cbegin(left.range().upbound()), - std::cbegin(result.range().upbound()))); - TA_ASSERT( - gemm_helper.left_result_congruent(std::cbegin(left.range().extent()), - std::cbegin(result.range().extent()))); - - // Check that the outer dimensions of right match the the corresponding - // dimensions in result - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent( - std::cbegin(right.range().lobound()), - std::cbegin(result.range().lobound()))); - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent( - std::cbegin(right.range().upbound()), - std::cbegin(result.range().upbound()))); - TA_ASSERT( - gemm_helper.right_result_congruent(std::cbegin(right.range().extent()), - std::cbegin(result.range().extent()))); - - // Check that the 
inner dimensions of left and right match - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()), - std::cbegin(right.range().lobound()))); - TA_ASSERT( - ignore_tile_position() || - gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()), - std::cbegin(right.range().upbound()))); - TA_ASSERT(gemm_helper.left_right_congruent( - std::cbegin(left.range().extent()), std::cbegin(right.range().extent()))); - - // Compute gemm dimensions - using TiledArray::math::blas::integer; - integer m, n, k; - gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); - - // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); - - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - T factor_t = T(factor); - T one(1); - // check if stream is busy - // auto stream_status = cudaStreamQuery(cuda_stream); - - // if stream is completed, use GPU - // if (stream_status == cudaSuccess) { - if (true) { - // left and right are readonly!! - // cudaMemAdvise(device_data(left), left.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); - // cudaMemAdvise(device_data(right), right.size() * sizeof(T), - // cudaMemAdviseSetReadMostly, - // cudaEnv::instance()->current_cuda_device_id()); - - // prefetch all data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); - - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasGemm(handle, to_cublas_op(gemm_helper.right_op()), - to_cublas_op(gemm_helper.left_op()), n, m, k, - &factor_t, device_data(right.storage()), ldb, - device_data(left.storage()), lda, &one, - device_data(result.storage()), n)); - synchronize_stream(&cuda_stream); - - // detail::thread_wait_cuda_stream(cuda_stream); - - } else { - // left and right are readonly!! 
- cudaMemAdvise(device_data(left), left.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); - cudaMemAdvise(device_data(right), right.size() * sizeof(T), - cudaMemAdviseSetReadMostly, - cudaEnv::instance()->current_cuda_device_id()); - - // prefetch data - TiledArray::to_execution_space( - left.storage(), cuda_stream); - TiledArray::to_execution_space( - right.storage(), cuda_stream); - TiledArray::to_execution_space( - result.storage(), cuda_stream); - - TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, factor_t, left.data(), lda, right.data(), ldb, - one, result.data(), n); - } -} - -/// result[i] = arg[i] -template -btas::Tensor btas_tensor_clone_cuda_impl( - const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - - Storage result_storage; - auto result_range = arg.range(); - auto &cuda_stream = detail::get_stream_based_on_range(result_range); - - make_device_storage(result_storage, arg.size(), cuda_stream); - btas::Tensor result(std::move(result_range), - std::move(result_storage)); - - // call cublasCopy - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - - CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), - 1, device_data(result.storage()), 1)); - - synchronize_stream(&cuda_stream); - return result; -} - -/// result[i] = a * arg[i] -template >> -btas::Tensor btas_tensor_scale_cuda_impl( - const btas::Tensor &arg, const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - // std::cout << "scale, tile offset: " << arg.range().offset() << " stream: " - // << arg.range().offset() % cudaEnv::instance()->num_cuda_streams() << "\n"; - - auto result = btas_tensor_clone_cuda_impl(arg); - - // call cublasScale - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall( - cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - - synchronize_stream(&cuda_stream); - - return result; -} - -/// result[i] *= a -template >> -void btas_tensor_scale_to_cuda_impl(btas::Tensor &result, - const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - // call cublasScale - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall( - cublasScal(handle, result.size(), &a, device_data(result.storage()), 1)); - - synchronize_stream(&cuda_stream); -} - -/// result[i] = arg1[i] - a * arg2[i] -template >> -btas::Tensor btas_tensor_subt_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2, const Scalar a) { - auto result = btas_tensor_clone_cuda_impl(arg1); - - // revert the sign of a - auto b = -a; - - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - if (in_memory_space(result.storage())) { - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &b, - device_data(arg2.storage()), 1, - device_data(result.storage()), 1)); - } else { - TA_ASSERT(false); - // btas::axpy(1.0, arg, result); - } - - 
synchronize_stream(&cuda_stream); - return result; -} - -/// result[i] -= a * arg1[i] -template >> -void btas_tensor_subt_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg1, - const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - // revert the sign of a - auto b = -a; - - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &b, - device_data(arg1.storage()), 1, - device_data(result.storage()), 1)); - synchronize_stream(&cuda_stream); -} - -/// result[i] = arg1[i] + a * arg2[i] -template >> -btas::Tensor btas_tensor_add_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2, const Scalar a) { - auto result = btas_tensor_clone_cuda_impl(arg1); - - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &a, - device_data(arg2.storage()), 1, - device_data(result.storage()), 1)); - - synchronize_stream(&cuda_stream); - return result; -} - -/// result[i] += a * arg[i] -template >> -void btas_tensor_add_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg, - const Scalar a) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - // TiledArray::to_execution_space(result.storage(),cuda_stream); - // TiledArray::to_execution_space(arg.storage(),cuda_stream); - - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasAxpy(handle, result.size(), &a, - device_data(arg.storage()), 1, - device_data(result.storage()), 1)); - - synchronize_stream(&cuda_stream); -} - -/// result[i] = result[i] * arg[i] -template -void btas_tensor_mult_to_cuda_impl(btas::Tensor &result, - const btas::Tensor &arg) { - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - auto &cuda_stream = detail::get_stream_based_on_range(result.range()); - - std::size_t n = result.size(); - - TA_ASSERT(n == arg.size()); - - mult_to_cuda_kernel(result.data(), arg.data(), n, cuda_stream, device_id); - synchronize_stream(&cuda_stream); -} - -/// result[i] = arg1[i] * arg2[i] -template -btas::Tensor btas_tensor_mult_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2) { - std::size_t n = arg1.size(); - - TA_ASSERT(arg2.size() == n); - - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - CudaSafeCall(cudaSetDevice(device_id)); - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); - - Storage result_storage; - make_device_storage(result_storage, n, cuda_stream); - btas::Tensor result(arg1.range(), - std::move(result_storage)); - - mult_cuda_kernel(result.data(), arg1.data(), arg2.data(), n, cuda_stream, - device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -// foreach(i) result += arg[i] * arg[i] -template -typename btas::Tensor::value_type -btas_tensor_squared_norm_cuda_impl(const btas::Tensor &arg) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - - auto &storage = arg.storage(); - using 
TiledArray::math::blas::integer; - integer size = storage.size(); - T result = 0; - if (in_memory_space(storage)) { - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasDot(handle, size, device_data(storage), 1, - device_data(storage), 1, &result)); - } else { - TA_ASSERT(false); - // result = TiledArray::math::dot(size, storage.data(), storage.data()); - } - synchronize_stream(&cuda_stream); - return result; -} - -// foreach(i) result += arg1[i] * arg2[i] -template -typename btas::Tensor::value_type btas_tensor_dot_cuda_impl( - const btas::Tensor &arg1, - const btas::Tensor &arg2) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - - auto &cuda_stream = detail::get_stream_based_on_range(arg1.range()); - - using TiledArray::math::blas::integer; - integer size = arg1.storage().size(); - - TA_ASSERT(size == arg2.storage().size()); - - T result = 0; - if (in_memory_space(arg1.storage()) && - in_memory_space(arg2.storage())) { - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - CublasSafeCall(cublasDot(handle, size, device_data(arg1.storage()), 1, - device_data(arg2.storage()), 1, &result)); - } else { - TA_ASSERT(false); - // result = TiledArray::math::dot(size, storage.data(), storage.data()); - } - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_sum_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = sum_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_product_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = product_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_min_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = min_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_max_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = max_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_absmin_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = absmin_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -template -T btas_tensor_absmax_cuda_impl(const btas::Tensor &arg) { - auto &cuda_stream = 
detail::get_stream_based_on_range(arg.range()); - auto device_id = cudaEnv::instance()->current_cuda_device_id(); - - auto &storage = arg.storage(); - auto n = storage.size(); - - auto result = absmax_cuda_kernel(arg.data(), n, cuda_stream, device_id); - - synchronize_stream(&cuda_stream); - return result; -} - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_BTAS_CUDA_CUBLAS_H__INCLUDED diff --git a/src/TiledArray/cuda/btas_um_tensor.cpp b/src/TiledArray/cuda/btas_um_tensor.cpp deleted file mode 100644 index 58c3981f18..0000000000 --- a/src/TiledArray/cuda/btas_um_tensor.cpp +++ /dev/null @@ -1,35 +0,0 @@ -// -// Created by Chong Peng on 7/24/18. -// - -// clang-format off -#include // provides c++17 features (stds::data, std::size) when compiling CUDA (i.e. c++14) -#include -// clang-format on - -#ifdef TILEDARRAY_HAS_CUDA - -template class btas::varray>; -template class btas::varray>; -template class btas::varray>; -template class btas::varray>; - -template class btas::Tensor>; -template class btas::Tensor>; -template class btas::Tensor>; -template class btas::Tensor>; - -template class TiledArray::Tile>>; -template class TiledArray::Tile>>; -template class TiledArray::Tile< - btas::Tensor>>; -template class TiledArray::Tile>>; - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/cublas.h b/src/TiledArray/cuda/cublas.h deleted file mode 100644 index a5d3da7afc..0000000000 --- a/src/TiledArray/cuda/cublas.h +++ /dev/null @@ -1,433 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
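The btas_um_tensor.cpp deleted above served a single purpose: it explicitly instantiated the unified-memory varray/Tensor/Tile templates in one translation unit so every other file could skip those (expensive) instantiations. A generic sketch of that compile-time technique, using a hypothetical Vec template rather than any TiledArray type:

    // lib.h -- declare the template; tell other TUs not to instantiate it
    #include <cstddef>

    template <typename T>
    struct Vec {
      T* data = nullptr;
      std::size_t size = 0;
      void resize(std::size_t n) { /* allocation logic elided */ }
    };

    extern template struct Vec<double>;  // suppress implicit instantiation

    // lib.cpp -- the one translation unit that owns the instantiation
    // (this is the role btas_um_tensor.cpp played for the UM tensor types):
    template struct Vec<double>;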
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * July 23, 2018 - * - */ - -#ifndef TILEDARRAY_MATH_CUBLAS_H__INCLUDED -#define TILEDARRAY_MATH_CUBLAS_H__INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include -#include -#include -#include - -#include - -#define CublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__) - -inline void __cublasSafeCall(cublasStatus_t err, const char *file, - const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR - if (CUBLAS_STATUS_SUCCESS != err) { - std::stringstream ss; - ss << "cublasSafeCall() failed at: " << file << "(" << line << ")"; - std::string what = ss.str(); - throw std::runtime_error(what); - } -#endif - - return; -} - -namespace TiledArray { - -/* - * cuBLAS interface functions - */ - -/** - * cuBLASHandlePool - * - * assign 1 cuBLAS handle / thread, use thread-local storage to manage - * - */ -class cuBLASHandlePool { - public: - static const cublasHandle_t &handle() { - static thread_local cublasHandle_t *handle_{nullptr}; - if (handle_ == nullptr) { - handle_ = new cublasHandle_t; - CublasSafeCall(cublasCreate(handle_)); - CublasSafeCall(cublasSetPointerMode(*handle_, CUBLAS_POINTER_MODE_HOST)); - } - return *handle_; - } -}; -// thread_local cublasHandle_t *cuBLASHandlePool::handle_; - -inline cublasOperation_t to_cublas_op(math::blas::Op cblas_op) { - cublasOperation_t result{}; - switch (cblas_op) { - case math::blas::Op::NoTrans: - result = CUBLAS_OP_N; - break; - case math::blas::Op::Trans: - result = CUBLAS_OP_T; - break; - case math::blas::Op::ConjTrans: - result = CUBLAS_OP_C; - break; - } - return result; -} - -/// GEMM interface functions - -template -cublasStatus_t cublasGemm(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const T *alpha, const T *A, int lda, const T *B, - int ldb, const T *beta, T *C, int ldc); -template <> -inline cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const float *alpha, const float *A, int lda, - const float *B, int ldb, const float *beta, float *C, int ldc) { - return cublasSgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); -} -template <> -inline cublasStatus_t cublasGemm( - cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, const double *alpha, const double *A, int lda, - const double *B, int ldb, const double *beta, double *C, int ldc) { - return cublasDgemm(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -/// AXPY interface functions - -template -cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, const Scalar *alpha, - const T *x, int incx, T *y, int incy); -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const float *alpha, - const float *x, int incx, - float *y, int incy) { - return cublasSaxpy(handle, n, alpha, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const double *alpha, - const double *x, int incx, - double *y, int incy) { - return cublasDaxpy(handle, n, alpha, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const int *alpha, const float *x, - int incx, float *y, int incy) { - const float alpha_float = float(*alpha); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const 
double *alpha, - const float *x, int incx, - float *y, int incy) { - const float alpha_float = float(*alpha); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const int *alpha, const double *x, - int incx, double *y, int incy) { - const double alpha_double = double(*alpha); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy(cublasHandle_t handle, int n, - const float *alpha, - const double *x, int incx, - double *y, int incy) { - const double alpha_double = double(*alpha); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasAxpy>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(-1.0); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const float *x, int incx, float *y, int incy) { - const float alpha_float = float(alpha->factor()); - return cublasSaxpy(handle, n, &alpha_float, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasAxpy>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(-1.0); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasAxpy>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - const double *x, int incx, double *y, int incy) { - const double alpha_double = double(alpha->factor()); - return cublasDaxpy(handle, n, &alpha_double, x, incx, y, incy); -} - -/// DOT interface functions - -template -cublasStatus_t 
cublasDot(cublasHandle_t handle, int n, const T *x, int incx, - const T *y, int incy, T *result); -template <> -inline cublasStatus_t cublasDot(cublasHandle_t handle, int n, - const float *x, int incx, const float *y, - int incy, float *result) { - return cublasSdot(handle, n, x, incx, y, incy, result); -} - -template <> -inline cublasStatus_t cublasDot(cublasHandle_t handle, int n, - const double *x, int incx, - const double *y, int incy, - double *result) { - return cublasDdot(handle, n, x, incx, y, incy, result); -} - -/// SCAL interface function -template -cublasStatus_t cublasScal(cublasHandle_t handle, int n, const Scalar *alpha, - T *x, int incx); - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const float *alpha, float *x, - int incx) { - return cublasSscal(handle, n, alpha, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const double *alpha, double *x, - int incx) { - return cublasDscal(handle, n, alpha, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const int *alpha, float *x, - int incx) { - const float alpha_float = float(*alpha); - return cublasSscal(handle, n, &alpha_float, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const double *alpha, float *x, - int incx) { - const float alpha_float = float(*alpha); - return cublasSscal(handle, n, &alpha_float, x, incx); -}; - -// -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const int *alpha, double *x, - int incx) { - const double alpha_double = double(*alpha); - return cublasDscal(handle, n, &alpha_double, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal(cublasHandle_t handle, int n, - const float *alpha, double *x, - int incx) { - const double alpha_double = double(*alpha); - return cublasDscal(handle, n, &alpha_double, x, incx); -}; - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasScal>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, float *x, - int incx) { - const float alpha_float = float(-1.0); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - float *x, int incx) { - const float alpha_float = float(alpha->factor()); - return cublasSscal(handle, n, &alpha_float, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - return CUBLAS_STATUS_SUCCESS; -} - -template <> -inline cublasStatus_t -cublasScal>( - cublasHandle_t handle, int n, - const detail::ComplexConjugate *alpha, double *x, - int incx) { - const double alpha_double = double(-1.0); - return cublasDscal(handle, n, 
&alpha_double, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - -template <> -inline cublasStatus_t cublasScal>( - cublasHandle_t handle, int n, const detail::ComplexConjugate *alpha, - double *x, int incx) { - const double alpha_double = double(alpha->factor()); - return cublasDscal(handle, n, &alpha_double, x, incx); -} - -/// COPY inerface function -template -cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx, - T *y, int incy); - -template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x, - int incx, float *y, int incy) { - return cublasScopy(handle, n, x, incx, y, incy); -} - -template <> -inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x, - int incx, double *y, int incy) { - return cublasDcopy(handle, n, x, incx, y, incy); -} - -} // end of namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_MATH_CUBLAS_H__INCLUDED diff --git a/src/TiledArray/cuda/kernel/mult_kernel.cu b/src/TiledArray/cuda/kernel/mult_kernel.cu deleted file mode 100644 index 8bbcae4927..0000000000 --- a/src/TiledArray/cuda/kernel/mult_kernel.cu +++ /dev/null @@ -1,67 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
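The cublasScal/cublasCopy wrappers that close out the deleted cublas.h above all follow one idiom: declare a function template with no generic definition, then provide inline full specializations that forward to the type-suffixed C entry points (cublasSscal vs. cublasDscal, and so on), converting mixed-precision scalars along the way. A minimal sketch of the idiom against a made-up C API (xscal_s/xscal_d are hypothetical names, not a real library):

    extern "C" void xscal_s(int n, const float* alpha, float* x, int incx);
    extern "C" void xscal_d(int n, const double* alpha, double* x, int incx);

    // no generic definition: using an uncovered type fails at link time
    template <typename T>
    void xscal(int n, const T* alpha, T* x, int incx);

    template <>
    inline void xscal<float>(int n, const float* alpha, float* x, int incx) {
      xscal_s(n, alpha, x, incx);  // forward to the float entry point
    }

    template <>
    inline void xscal<double>(int n, const double* alpha, double* x, int incx) {
      xscal_d(n, alpha, x, incx);  // forward to the double entry point
    }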
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * Aug 21, 2018 - * - */ - -#include -#include - - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -/// result[i] = result[i] * arg[i] -void mult_to_cuda_kernel(int *result, const int *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, - cudaStream_t stream, int device_id) { - mult_to_cuda_kernel_impl(result, arg, n, stream, device_id); -} - -/// result[i] = arg1[i] * arg2[i] -void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - -void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, - cudaStream_t stream, int device_id){ - mult_cuda_kernel_impl(result,arg1,arg2,n,stream,device_id); -} - - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/cuda/kernel/mult_kernel.h b/src/TiledArray/cuda/kernel/mult_kernel.h deleted file mode 100644 index 7c333e879a..0000000000 --- a/src/TiledArray/cuda/kernel/mult_kernel.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
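The mult_kernel.cu above and the mult_kernel.h header that follows split the work deliberately: the header declares plain, non-template overloads so that translation units compiled by an ordinary host C++ compiler can call the kernels, while the Thrust-based template implementations stay inside the .cu file that only nvcc sees. A sketch of that bridge pattern with a hypothetical scale kernel (names are illustrative):

    // scale_kernel.h -- plain C++, exposes no device code
    #include <cstddef>
    #include <cuda_runtime.h>  // only for cudaStream_t
    void scale_cuda_kernel(float* x, std::size_t n, float a,
                           cudaStream_t stream, int device_id);

    // scale_kernel.cu -- compiled by nvcc; the template stays private here
    // template <typename T>
    // void scale_cuda_kernel_impl(T* x, std::size_t n, T a,
    //                             cudaStream_t stream, int device_id);
    // void scale_cuda_kernel(float* x, std::size_t n, float a,
    //                        cudaStream_t stream, int device_id) {
    //   scale_cuda_kernel_impl(x, n, a, stream, device_id);
    // }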
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * Aug 21, 2018 - * - */ - -#ifndef TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED -#define TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -/// result[i] = result[i] * arg[i] -void mult_to_cuda_kernel(int *result, const int *arg, std::size_t n, - cudaStream_t stream, int device_id); - -void mult_to_cuda_kernel(float *result, const float *arg, std::size_t n, - cudaStream_t stream, int device_id); - -void mult_to_cuda_kernel(double *result, const double *arg, std::size_t n, - cudaStream_t stream, int device_id); - -/// result[i] = arg1[i] * arg2[i] -void mult_cuda_kernel(int *result, const int *arg1, const int *arg2, - std::size_t n, cudaStream_t stream, int device_id); - -void mult_cuda_kernel(float *result, const float *arg1, const float *arg2, - std::size_t n, cudaStream_t stream, int device_id); - -void mult_cuda_kernel(double *result, const double *arg1, const double *arg2, - std::size_t n, cudaStream_t stream, int device_id); - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_CUDA_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/cuda/kernel/reduce_kernel.cu b/src/TiledArray/cuda/kernel/reduce_kernel.cu deleted file mode 100644 index 1e1550260f..0000000000 --- a/src/TiledArray/cuda/kernel/reduce_kernel.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
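The reduce kernels removed in the next few hunks (sum/product/min/max/absmin/absmax in reduce_kernel.cu plus the reduce_kernel_impl.h templates) are all thin forwards to one-pass Thrust reductions. As a standalone sketch of the absmax case, assuming a CUDA toolkit with Thrust (the absolute-value functor mirrors the deleted detail::absolute_value):

    #include <cuda_runtime.h>
    #include <thrust/device_vector.h>
    #include <thrust/execution_policy.h>
    #include <thrust/functional.h>
    #include <thrust/transform_reduce.h>

    struct abs_value {
      __host__ __device__ double operator()(double v) const {
        return v < 0 ? -v : v;
      }
    };

    // absmax(x) = max_i |x_i|, computed on the device in a single pass
    double absmax(const thrust::device_vector<double>& x, cudaStream_t stream) {
      return thrust::transform_reduce(
          thrust::cuda::par.on(stream),  // run on the given stream
          x.begin(), x.end(),
          abs_value{},                   // transform: |v|
          0.0,                           // init (valid since |v| >= 0)
          thrust::maximum<double>());    // reduce: max
    }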
- * - * Chong Peng - * Department of Chemistry, Virginia Tech - * May 8, 2019 - * - */ - -#include -#include - - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -// foreach(i) result *= arg[i] -int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); - -} - -float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double product_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - - return product_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - - -// foreach(i) result += arg[i] -int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return sum_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = max(result, arg[i]) -int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float max_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double max_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return max_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = min(result, arg[i]) -int min_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float min_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double min_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return min_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = max(result, abs(arg[i])) -int absmax_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmax_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -// foreach(i) result = min(result, abs(arg[i])) -int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id){ - return absmin_reduce_cuda_kernel_impl(arg, n, stream, device_id); -} - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA diff --git 
a/src/TiledArray/cuda/kernel/reduce_kernel.h b/src/TiledArray/cuda/kernel/reduce_kernel.h deleted file mode 100644 index 857cad6c0c..0000000000 --- a/src/TiledArray/cuda/kernel/reduce_kernel.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * May 08, 2019 - * - */ - -#ifndef TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED -#define TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -namespace TiledArray { - -// foreach(i) result *= arg[i] -int product_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float product_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double product_cuda_kernel(const double *arg, std::size_t n, - cudaStream_t stream, int device_id); - -// foreach(i) result += arg[i] -int sum_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float sum_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double sum_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); - -// foreach(i) result = max(result, arg[i]) -int max_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float max_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double max_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); - -// foreach(i) result = min(result, arg[i]) -int min_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float min_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double min_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); - -// foreach(i) result = max(result, abs(arg[i])) -int absmax_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float absmax_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double absmax_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); - -// foreach(i) result = min(result, abs(arg[i])) -int absmin_cuda_kernel(const int *arg, std::size_t n, cudaStream_t stream, - int device_id); - -float absmin_cuda_kernel(const float *arg, std::size_t n, cudaStream_t stream, - int device_id); - -double absmin_cuda_kernel(const double *arg, std::size_t n, cudaStream_t stream, - int device_id); - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_CUDA_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h b/src/TiledArray/cuda/kernel/reduce_kernel_impl.h deleted file mode 100644 index 12a8aa1e19..0000000000 --- a/src/TiledArray/cuda/kernel/reduce_kernel_impl.h +++ 
/dev/null @@ -1,130 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * Apir 11, 2018 - * - */ - -#ifndef TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED - -#include - -#include -#include -#include -#include -#include -#include - -namespace TiledArray { - -namespace detail { - -template -struct absolute_value : public thrust::unary_function { - __host__ __device__ T operator()(const T &x) const { - return x < T(0) ? -x : x; - } -}; - -} // namespace detail - -/// T = reduce(T* arg) -template -T reduce_cuda_kernel_impl(ReduceOp &&op, const T *arg, std::size_t n, T init, - cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); - - auto arg_p = thrust::device_pointer_cast(arg); - - auto result = thrust::reduce(thrust::cuda::par.on(stream), arg_p, arg_p + n, - init, std::forward(op)); - - return result; -} - -template -T product_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(1); - thrust::multiplies mul_op; - return reduce_cuda_kernel_impl(mul_op, arg, n, init, stream, device_id); -} - -template -T sum_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { - T init(0); - thrust::plus plus_op; - return reduce_cuda_kernel_impl(plus_op, arg, n, init, stream, device_id); -} - -template -T max_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { - T init = std::numeric_limits::lowest(); - thrust::maximum max_op; - return reduce_cuda_kernel_impl(max_op, arg, n, init, stream, device_id); -} - -template -T min_reduce_cuda_kernel_impl(const T *arg, std::size_t n, cudaStream_t stream, - int device_id) { - T init = std::numeric_limits::max(); - thrust::minimum min_op; - return reduce_cuda_kernel_impl(min_op, arg, n, init, stream, device_id); -} - -template -T absmax_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::maximum max_op; - detail::absolute_value abs_op; - - CudaSafeCall(cudaSetDevice(device_id)); - - auto arg_p = thrust::device_pointer_cast(arg); - - auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p, - arg_p + n, abs_op, init, max_op); - - return result; -} - -template -T absmin_reduce_cuda_kernel_impl(const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - T init(0); - thrust::minimum min_op; - detail::absolute_value abs_op; - - CudaSafeCall(cudaSetDevice(device_id)); - - auto arg_p = thrust::device_pointer_cast(arg); - - auto result = thrust::transform_reduce(thrust::cuda::par.on(stream), arg_p, - arg_p + n, abs_op, init, min_op); - return result; -} - -} // namespace TiledArray - -#endif // TILEDARRAY_CUDA_REDUCE_KERNEL_IMPL_H__INCLUDED diff --git 
a/src/TiledArray/cuda/um_allocator.h b/src/TiledArray/cuda/um_allocator.h deleted file mode 100644 index 99b281dc51..0000000000 --- a/src/TiledArray/cuda/um_allocator.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include - -#include -#include - -namespace TiledArray { - -/// pooled thread-safe CUDA UM allocator -template -class cuda_um_allocator_impl - : public umpire_allocator_impl> { - public: - using base_type = umpire_allocator_impl>; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - cuda_um_allocator_impl() noexcept - : base_type(&cudaEnv::instance()->um_allocator()) {} - - template - cuda_um_allocator_impl(const cuda_um_allocator_impl& rhs) noexcept - : base_type(static_cast&>(rhs)) {} - - template - friend bool operator==(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept; -}; // class cuda_um_allocator - -template -bool operator==(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=(const cuda_um_allocator_impl& lhs, - const cuda_um_allocator_impl& rhs) noexcept { - return !(lhs == rhs); -} - -template -using cuda_um_allocator = default_init_allocator>; - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED diff --git a/src/TiledArray/cuda/um_storage.h b/src/TiledArray/cuda/um_storage.h deleted file mode 100644 index 4b3781185c..0000000000 --- a/src/TiledArray/cuda/um_storage.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
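One detail of the um_allocator.h deleted above is worth spelling out: cuda_um_allocator is the pooled Umpire-backed UM allocator wrapped in default_init_allocator, which turns value-initialization into default-initialization so that constructing or resizing a container does not zero-fill a (possibly huge) unified-memory buffer that is about to be overwritten anyway. A minimal generic sketch of such an adaptor (standard C++, no CUDA required; default_init_alloc is an illustrative name):

    #include <memory>
    #include <new>
    #include <utility>

    template <typename T, typename A = std::allocator<T>>
    struct default_init_alloc : A {
      using A::A;
      template <typename U>
      struct rebind {
        using other = default_init_alloc<
            U, typename std::allocator_traits<A>::template rebind_alloc<U>>;
      };
      // intercept value-initializing construct() calls
      template <typename U>
      void construct(U* p) {
        ::new (static_cast<void*>(p)) U;  // default-init, not U()
      }
      template <typename U, typename... Args>
      void construct(U* p, Args&&... args) {
        std::allocator_traits<A>::construct(static_cast<A&>(*this), p,
                                            std::forward<Args>(args)...);
      }
    };
    // e.g. std::vector<double, default_init_alloc<double>> v(1 << 20);
    // allocates ~8 MiB but leaves it uninitialized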
- * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Feb 6, 2018 - * - */ - -#ifndef TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED -#define TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED - -#include -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include - -#include -#include - -#include - -namespace TiledArray { - -template -using cuda_um_thrust_vector = - thrust::device_vector>; - -/// @return true if @c dev_vec is present in space @space -template -bool in_memory_space(const Storage& vec) noexcept { - return overlap(MemorySpace::CUDA_UM, Space); -} - -/** - * @tparam Space - * @tparam Storage the Storage type of the vector, such as cuda_um_btas_varray - */ -template -void to_execution_space(Storage& vec, cudaStream_t stream = 0) { - switch (Space) { - case ExecutionSpace::CPU: { - using std::data; - using std::size; - using value_type = typename Storage::value_type; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaMemPrefetchAsync(data(vec), - size(vec) * sizeof(value_type), - cudaCpuDeviceId, stream)); - } - break; - } - case ExecutionSpace::CUDA: { - using std::data; - using std::size; - using value_type = typename Storage::value_type; - int device = -1; - if (cudaEnv::instance()->concurrent_managed_access()) { - CudaSafeCall(cudaGetDevice(&device)); - CudaSafeCall(cudaMemPrefetchAsync( - data(vec), size(vec) * sizeof(value_type), device, stream)); - } - break; - } - default: - throw std::runtime_error("invalid execution space"); - } -} - -/** - * create UM storage and prefetch it to device - * - * @param storage UM Storage type object - * @param n size of um storage object - * @param stream cuda stream used to perform prefetch - */ -template -void make_device_storage(Storage& storage, std::size_t n, - const cudaStream_t& stream = 0) { - storage = Storage(n); - TiledArray::to_execution_space(storage, - stream); -} - -/** - * return the device pointer for UM storage object - * - * @param storage UM Storage type object - * @return data pointer of UM Storage object - */ -template -typename Storage::value_type* device_data(Storage& storage) { - return storage.data(); -} - -/** - * return the const pointer for UM storage object - * - * @param storage UM Storage type object - * @return const data pointer of UM Storage object - */ -template -const typename Storage::value_type* device_data(const Storage& storage) { - return storage.data(); -} - -} // namespace TiledArray - -namespace madness { -namespace archive { - -template -struct ArchiveLoadImpl> { - static inline void load(const Archive& ar, - TiledArray::cuda_um_thrust_vector& x) { - typename thrust::device_vector< - T, TiledArray::cuda_um_allocator>::size_type n(0); - ar& n; - x.resize(n); - for (auto& xi : x) ar& xi; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void store(const Archive& ar, - const TiledArray::cuda_um_thrust_vector& x) { - ar& x.size(); - for (const auto& xi : x) ar& xi; - } -}; - -template -struct ArchiveLoadImpl> { - static inline void load(const Archive& ar, - TiledArray::cuda_um_btas_varray& x) { - typename TiledArray::cuda_um_btas_varray::size_type n(0); - ar& n; - x.resize(n); - for (auto& xi : x) ar& xi; - } -}; - -template -struct ArchiveStoreImpl> { - static inline void store(const Archive& ar, - const TiledArray::cuda_um_btas_varray& x) { - ar& x.size(); - for (const auto& xi : x) ar& xi; - } -}; - -} // namespace archive -} // namespace madness - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_CUDA_UM_VECTOR_H__INCLUDED diff --git 
a/src/TiledArray/device/blas.cpp b/src/TiledArray/device/blas.cpp
new file mode 100644
index 0000000000..cedd694241
--- /dev/null
+++ b/src/TiledArray/device/blas.cpp
@@ -0,0 +1,60 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2018 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Eduard Valeyev
+ * Department of Chemistry, Virginia Tech
+ * Sept 19, 2023
+ *
+ */
+
+#include
+
+namespace TiledArray {
+
+std::vector> BLASQueuePool::queues_;
+
+bool BLASQueuePool::initialized() { return !queues_.empty(); }
+
+void BLASQueuePool::initialize() {
+  if (initialized()) return;
+  queues_.reserve(deviceEnv::instance()->num_streams_total());
+  for (std::size_t sidx = 0; sidx != deviceEnv::instance()->num_streams_total();
+       ++sidx) {
+    auto q = deviceEnv::instance()->stream(
+        sidx);  // blaspp for some reason wants a non-const lvalue ref to the stream
+    queues_.emplace_back(std::make_unique(q.device, q.stream));
+  }
+}
+
+void BLASQueuePool::finalize() { queues_.clear(); }
+
+blas::Queue& BLASQueuePool::queue(std::size_t ordinal) {
+  TA_ASSERT(initialized());
+  TA_ASSERT(ordinal < deviceEnv::instance()->num_streams_total());
+  return *(queues_[ordinal]);
+}
+
+blas::Queue& BLASQueuePool::queue(device::Stream const& stream) {
+  TA_ASSERT(initialized());
+  for (auto&& q : queues_) {
+    if (q->device() == stream.device && q->stream() == stream.stream) return *q;
+  }
+  throw TiledArray::Exception(
+      "no matching device stream found in the BLAS queue pool");
+}
+
+}  // namespace TiledArray
diff --git a/src/TiledArray/device/blas.h b/src/TiledArray/device/blas.h
new file mode 100644
index 0000000000..bd905a528e
--- /dev/null
+++ b/src/TiledArray/device/blas.h
@@ -0,0 +1,79 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2018 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
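A sketch of how the pool defined above is meant to be used once initialize() has run (device_axpy is a hypothetical helper; blas::axpy and Queue::sync are blaspp calls; x and y are assumed to be device-accessible buffers):

    #include <TiledArray/device/blas.h>
    #include <blas.hh>  // blaspp
    #include <cstddef>
    #include <cstdint>

    // enqueue y += a*x on the queue tied to stream ordinal `ord`;
    // assumes BLASQueuePool::initialize() has already run
    void device_axpy(std::int64_t n, double a, const double* x, double* y,
                     std::size_t ord) {
      blas::Queue& q = TiledArray::BLASQueuePool::queue(ord);
      blas::axpy(n, a, x, 1, y, 1, q);  // asynchronous w.r.t. the host
      q.sync();  // block until the stream drains (illustrative only)
    }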
+ *
+ * Chong Peng
+ * Department of Chemistry, Virginia Tech
+ * July 23, 2018
+ *
+ */
+
+#ifndef TILEDARRAY_DEVICE_BLAS_H__INCLUDED
+#define TILEDARRAY_DEVICE_BLAS_H__INCLUDED
+
+#include
+
+#ifdef TILEDARRAY_HAS_DEVICE
+
+#include
+#include
+#include
+#include
+
+namespace TiledArray {
+
+/**
+ * BLASQueuePool is a singleton controlling a pool of blas::Queue objects:
+ * - queues map to streams 1-to-1; do not call Queue::set_stream, so that
+ *   this invariant is maintained
+ * - queues can be accessed via the ordinal of the corresponding stream, a la
+ *   deviceEnv::stream()
+ */
+struct BLASQueuePool {
+  static bool initialized();
+  static void initialize();
+  static void finalize();
+
+  static blas::Queue &queue(std::size_t ordinal = 0);
+  static blas::Queue &queue(const device::Stream &s);
+
+ private:
+  static std::vector> queues_;
+};
+
+/// maps a (tile) Range to a blas::Queue; if the current task has already
+/// pushed work into a device::Stream (as indicated by
+/// madness_task_current_stream()) the queue matching that stream is
+/// returned instead
+/// @param[in] range the Range that determines the device::Stream on which
+/// work associated with this Range object will be computed
+/// @return the blas::Queue to use for creating tasks generating work
+/// associated with Range \p range
+template
+blas::Queue &blasqueue_for(const Range &range) {
+  auto stream_opt = device::madness_task_current_stream();
+  if (!stream_opt) {
+    auto stream_ord =
+        range.offset() % device::Env::instance()->num_streams_total();
+    return BLASQueuePool::queue(stream_ord);
+  } else
+    return BLASQueuePool::queue(*stream_opt);
+}
+
+}  // namespace TiledArray
+
+#endif  // TILEDARRAY_HAS_DEVICE
+
+#endif  // TILEDARRAY_DEVICE_BLAS_H__INCLUDED
diff --git a/src/TiledArray/device/btas.h b/src/TiledArray/device/btas.h
new file mode 100644
index 0000000000..b30fdd4edd
--- /dev/null
+++ b/src/TiledArray/device/btas.h
@@ -0,0 +1,554 @@
+/*
+ * This file is a part of TiledArray.
+ * Copyright (C) 2018 Virginia Tech
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
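The device gemm added in the new btas.h below passes the operands in (right, left) order with dimensions (n, m, k) to a ColMajor blas::gemm. That is the standard trick for driving a column-major BLAS with row-major tiles: a row-major buffer reinterpreted as column-major is its own transpose, and C = A*B is equivalent to C^T = B^T * A^T. A self-contained sketch with a reference column-major gemm (gemm_colmajor is a hypothetical helper, not TiledArray code):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Reference column-major GEMM: C(m x n) = A(m x k) * B(k x n), all
    // matrices stored column-major with the given leading dimensions.
    void gemm_colmajor(std::size_t m, std::size_t n, std::size_t k,
                       const double* A, std::size_t lda,
                       const double* B, std::size_t ldb,
                       double* C, std::size_t ldc) {
      for (std::size_t j = 0; j < n; ++j)
        for (std::size_t i = 0; i < m; ++i) {
          double cij = 0;
          for (std::size_t l = 0; l < k; ++l)
            cij += A[i + l * lda] * B[l + j * ldb];
          C[i + j * ldc] = cij;
        }
    }

    int main() {
      // row-major A (2x3) and B (3x2)
      std::vector<double> A{1, 2, 3, 4, 5, 6}, B{1, 0, 0, 1, 1, 1};
      std::vector<double> C(4);  // will hold row-major C = A*B (2x2)
      // swapped call: column-major C^T = B^T * A^T, dims (n, m, k) = (2, 2, 3),
      // with B^T's ld = n = 2, A^T's ld = k = 3, C^T's ld = n = 2
      gemm_colmajor(2, 2, 3, B.data(), 2, A.data(), 3, C.data(), 2);
      assert(C[0] == 4 && C[1] == 5 && C[2] == 10 && C[3] == 11);
      return 0;
    }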
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 24, 2018 + * + */ + +#ifndef TILEDARRAY_DEVICE_BTAS_H__INCLUDED +#define TILEDARRAY_DEVICE_BTAS_H__INCLUDED + +#include + +#include + +#ifdef TILEDARRAY_HAS_DEVICE + +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace TiledArray { + +namespace device { + +namespace btas { + +template >> +::btas::Tensor gemm( + const ::btas::Tensor &left, + const ::btas::Tensor &right, Scalar factor, + const TiledArray::math::GemmHelper &gemm_helper) { + // Check that the arguments are not empty and have the correct ranks + TA_ASSERT(!left.empty()); + TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); + TA_ASSERT(!right.empty()); + TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + + // Check that the inner dimensions of left and right match + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()), + std::cbegin(right.range().lobound()))); + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()), + std::cbegin(right.range().upbound()))); + TA_ASSERT(gemm_helper.left_right_congruent( + std::cbegin(left.range().extent()), std::cbegin(right.range().extent()))); + + // Compute gemm dimensions + using TiledArray::math::blas::integer; + integer m = 1, n = 1, k = 1; + gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); + + // Get the leading dimension for left and right matrices. + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); + + T factor_t = T(factor); + T zero(0); + + // typedef typename Tensor::storage_type storage_type; + auto result_range = + gemm_helper.make_result_range(left.range(), right.range()); + + auto &queue = blasqueue_for(result_range); + const auto device = queue.device(); + const auto str = queue.stream(); + const device::Stream stream(device, str); + DeviceSafeCall(device::setDevice(device)); + + // the result Tensor type + typedef ::btas::Tensor Tensor; + Tensor result; + + if (true) { + Storage result_storage; + make_device_storage(result_storage, result_range.area(), stream); + result = Tensor(std::move(result_range), std::move(result_storage)); + + // prefetch data + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + + static_assert(::btas::boxrange_iteration_order::value == + ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); + blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), + gemm_helper.left_op(), n, m, k, factor_t, + device_data(right.storage()), ldb, device_data(left.storage()), + lda, zero, device_data(result.storage()), ldc, queue); + + device::sync_madness_task_with(stream); + } + + return result; +} + +template >> +void gemm(::btas::Tensor &result, + const ::btas::Tensor &left, + const ::btas::Tensor &right, Scalar factor, + const TiledArray::math::GemmHelper &gemm_helper) { + // Check that the result is not empty and has the correct rank + TA_ASSERT(!result.empty()); + TA_ASSERT(result.range().rank() == gemm_helper.result_rank()); + + // Check that the arguments are not empty and have the correct ranks + TA_ASSERT(!left.empty()); + TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); + 
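// ---------------------------------------------------------------------------
// Why the gemm() above swaps the operands when calling blas::gemm with
// Layout::ColMajor: a row-major C(m x n) buffer read column-major is exactly
// C^T(n x m), and C^T = (A*B)^T = B^T * A^T, so passing (right, left) with
// dimensions (n, m, k) produces the row-major product with zero data
// movement. A host-side demonstration of the same trick (hypothetical; plain
// CBLAS on contiguous buffers):
//
//   #include <cblas.h>
//   void rowmajor_gemm_demo() {
//     // row-major A(2x3), B(3x2); C = A*B should be {58, 64, 139, 154}
//     const double A[] = {1, 2, 3, 4, 5, 6};
//     const double B[] = {7, 8, 9, 10, 11, 12};
//     double C[4] = {};
//     cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
//                 /*M=n*/ 2, /*N=m*/ 2, /*K=*/ 3,
//                 1.0, /*first operand = B*/ B, /*lda=n*/ 2,
//                 /*second operand = A*/ A, /*ldb=k*/ 3,
//                 0.0, C, /*ldc=n*/ 2);
//   }
// ---------------------------------------------------------------------------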
TA_ASSERT(!right.empty()); + TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + + // Check that the outer dimensions of left match the corresponding + // dimensions in result + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_result_congruent(std::cbegin(left.range().lobound()), + std::cbegin(result.range().lobound()))); + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_result_congruent(std::cbegin(left.range().upbound()), + std::cbegin(result.range().upbound()))); + TA_ASSERT( + gemm_helper.left_result_congruent(std::cbegin(left.range().extent()), + std::cbegin(result.range().extent()))); + + // Check that the outer dimensions of right match the corresponding + // dimensions in result + TA_ASSERT(ignore_tile_position() || + gemm_helper.right_result_congruent( + std::cbegin(right.range().lobound()), + std::cbegin(result.range().lobound()))); + TA_ASSERT(ignore_tile_position() || + gemm_helper.right_result_congruent( + std::cbegin(right.range().upbound()), + std::cbegin(result.range().upbound()))); + TA_ASSERT( + gemm_helper.right_result_congruent(std::cbegin(right.range().extent()), + std::cbegin(result.range().extent()))); + + // Check that the inner dimensions of left and right match + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_right_congruent(std::cbegin(left.range().lobound()), + std::cbegin(right.range().lobound()))); + TA_ASSERT( + ignore_tile_position() || + gemm_helper.left_right_congruent(std::cbegin(left.range().upbound()), + std::cbegin(right.range().upbound()))); + TA_ASSERT(gemm_helper.left_right_congruent( + std::cbegin(left.range().extent()), std::cbegin(right.range().extent()))); + + // Compute gemm dimensions + using TiledArray::math::blas::integer; + integer m, n, k; + gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); + + // Get the leading dimension for left and right matrices. + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ?
n : k)); + + auto &queue = blasqueue_for(result.range()); + const auto stream = device::Stream(queue.device(), queue.stream()); + DeviceSafeCall(device::setDevice(stream.device)); + + T factor_t = T(factor); + T one(1); + if (true) { + // prefetch all data + TiledArray::to_execution_space( + left.storage(), stream); + TiledArray::to_execution_space( + right.storage(), stream); + TiledArray::to_execution_space( + result.storage(), stream); + + static_assert(::btas::boxrange_iteration_order::value == + ::btas::boxrange_iteration_order::row_major); + const integer ldc = std::max(integer{1}, n); + blas::gemm(blas::Layout::ColMajor, gemm_helper.right_op(), + gemm_helper.left_op(), n, m, k, factor_t, + device_data(right.storage()), ldb, device_data(left.storage()), + lda, one, device_data(result.storage()), ldc, queue); + device::sync_madness_task_with(stream); + } +} + +/// result[i] = arg[i] +template +::btas::Tensor clone( + const ::btas::Tensor &arg) { + Storage result_storage; + auto result_range = arg.range(); + auto &queue = blasqueue_for(result_range); + const auto stream = Stream{queue.device(), queue.stream()}; + + make_device_storage(result_storage, arg.size(), stream); + ::btas::Tensor result(std::move(result_range), + std::move(result_storage)); + + blas::copy(result.size(), device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); + + device::sync_madness_task_with(stream); + return result; +} + +/// result[i] = a * arg[i] +template >> +::btas::Tensor scale( + const ::btas::Tensor &arg, const Scalar a) { + auto &queue = blasqueue_for(arg.range()); + const device::Stream stream(queue.device(), queue.stream()); + + auto result = clone(arg); + + if constexpr (TiledArray::detail::is_blas_numeric_v || + std::is_arithmetic_v) { + blas::scal(result.size(), a, device_data(result.storage()), 1, queue); + } else { + if constexpr (TiledArray::detail::is_complex_v) { + abort(); // fused conjugation requires custom kernels, not yet supported + } else { + if constexpr (std::is_same_v< + Scalar, TiledArray::detail::ComplexConjugate>) { + } else if constexpr (std::is_same_v< + Scalar, + TiledArray::detail::ComplexConjugate< + TiledArray::detail::ComplexNegTag>>) { + blas::scal(result.size(), static_cast(-1), + device_data(result.storage()), 1, queue); + } + } + } + + device::sync_madness_task_with(stream); + + return result; +} + +/// result[i] *= a +template >> +void scale_to(::btas::Tensor &result, const Scalar a) { + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); + + if constexpr (TiledArray::detail::is_blas_numeric_v || + std::is_arithmetic_v) { + blas::scal(result.size(), a, device_data(result.storage()), 1, queue); + } else { + if constexpr (TiledArray::detail::is_complex_v) { + abort(); // fused conjugation requires custom kernels, not yet supported + } else { + if constexpr (std::is_same_v< + Scalar, TiledArray::detail::ComplexConjugate>) { + } else if constexpr (std::is_same_v< + Scalar, + TiledArray::detail::ComplexConjugate< + TiledArray::detail::ComplexNegTag>>) { + blas::scal(result.size(), static_cast(-1), + device_data(result.storage()), 1, queue); + } + } + } + + device::sync_madness_task_with(stream); +} + +/// result[i] = arg1[i] - a * arg2[i] +template >> +::btas::Tensor subt( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2, const Scalar a) { + auto result = clone(arg1); + + // revert the sign of a + auto b = -a; + + auto &queue = blasqueue_for(result.range()); + const device::Stream 
stream(queue.device(), queue.stream()); + + if (in_memory_space(result.storage())) { + blas::axpy(result.size(), b, device_data(arg2.storage()), 1, + device_data(result.storage()), 1, queue); + } else { + TA_ASSERT(false); + } + + device::sync_madness_task_with(stream); + return result; +} + +/// result[i] -= a * arg1[i] +template >> +void subt_to(::btas::Tensor &result, + const ::btas::Tensor &arg1, const Scalar a) { + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); + + // revert the sign of a + auto b = -a; + + blas::axpy(result.size(), b, device_data(arg1.storage()), 1, + device_data(result.storage()), 1, queue); + device::sync_madness_task_with(stream); +} + +/// result[i] = arg1[i] + a * arg2[i] +template >> +::btas::Tensor add( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2, const Scalar a) { + auto result = clone(arg1); + + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); + + blas::axpy(result.size(), a, device_data(arg2.storage()), 1, + device_data(result.storage()), 1, queue); + + device::sync_madness_task_with(stream); + return result; +} + +/// result[i] += a * arg[i] +template >> +void add_to(::btas::Tensor &result, + const ::btas::Tensor &arg, const Scalar a) { + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); + + // TiledArray::to_execution_space(result.storage(),stream); + // TiledArray::to_execution_space(arg.storage(),stream); + + blas::axpy(result.size(), a, device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); + + device::sync_madness_task_with(stream); +} + +/// result[i] = result[i] * arg[i] +template +void mult_to(::btas::Tensor &result, + const ::btas::Tensor &arg) { + auto &queue = blasqueue_for(result.range()); + const device::Stream stream(queue.device(), queue.stream()); + + std::size_t n = result.size(); + + TA_ASSERT(n == arg.size()); + + device::mult_to_kernel(result.data(), arg.data(), n, stream); + device::sync_madness_task_with(stream); +} + +/// result[i] = arg1[i] * arg2[i] +template +::btas::Tensor mult( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2) { + std::size_t n = arg1.size(); + + TA_ASSERT(arg2.size() == n); + + auto stream = stream_for(arg1.range()); + + Storage result_storage; + make_device_storage(result_storage, n, stream); + ::btas::Tensor result(arg1.range(), + std::move(result_storage)); + + device::mult_kernel(result.data(), arg1.data(), arg2.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +// foreach(i) result += arg[i] * arg[i] +template +typename ::btas::Tensor::value_type squared_norm( + const ::btas::Tensor &arg) { + auto &queue = blasqueue_for(arg.range()); + const device::Stream stream(queue.device(), queue.stream()); + + auto &storage = arg.storage(); + using TiledArray::math::blas::integer; + integer size = storage.size(); + T result = 0; + if (in_memory_space(storage)) { + blas::dot(size, device_data(storage), 1, device_data(storage), 1, &result, + queue); + } else { + TA_ASSERT(false); + // result = TiledArray::math::dot(size, storage.data(), storage.data()); + } + device::sync_madness_task_with(stream); + return result; +} + +// foreach(i) result += arg1[i] * arg2[i] +template +typename ::btas::Tensor::value_type dot( + const ::btas::Tensor &arg1, + const ::btas::Tensor &arg2) { + auto &queue = blasqueue_for(arg1.range()); + const device::Stream stream(queue.device(), 
queue.stream()); + + using TiledArray::math::blas::integer; + integer size = arg1.storage().size(); + + TA_ASSERT(size == arg2.storage().size()); + + T result = 0; + if (in_memory_space(arg1.storage()) && + in_memory_space(arg2.storage())) { + blas::dot(size, device_data(arg1.storage()), 1, device_data(arg2.storage()), + 1, &result, queue); + } else { + TA_ASSERT(false); + // result = TiledArray::math::dot(size, storage.data(), storage.data()); + } + device::sync_madness_task_with(stream); + return result; +} + +template +T sum(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::sum_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +template +T product(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::product_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +template +T min(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::min_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +template +T max(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::max_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +template +T absmin(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::absmin_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +template +T absmax(const ::btas::Tensor &arg) { + auto stream = device::stream_for(arg.range()); + + auto &storage = arg.storage(); + auto n = storage.size(); + + auto result = device::absmax_kernel(arg.data(), n, stream); + + device::sync_madness_task_with(stream); + return result; +} + +} // namespace btas + +} // namespace device + +} // namespace TiledArray + +#endif // TILEDARRAY_HAS_DEVICE + +#endif // TILEDARRAY_DEVICE_BTAS_H__INCLUDED diff --git a/src/TiledArray/device/btas_um_tensor.cpp b/src/TiledArray/device/btas_um_tensor.cpp new file mode 100644 index 0000000000..a4d2167812 --- /dev/null +++ b/src/TiledArray/device/btas_um_tensor.cpp @@ -0,0 +1,51 @@ +// +// Created by Chong Peng on 7/24/18. 
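// ---------------------------------------------------------------------------
// btas_um_tensor.cpp, which starts here, holds the explicit instantiations
// that pair with the "extern template" declarations near the end of
// btas_um_tensor.h later in this diff: each UM tensor specialization is
// compiled once in this translation unit instead of in every client. The
// idiom, reduced to a minimal sketch (hypothetical type, not from this
// changeset):
//
//   // um_store.h
//   template <typename T> struct UMStore { T *data; std::size_t n; };
//   extern template struct UMStore<double>;  // suppress implicit instantiation
//
//   // um_store.cpp
//   #include "um_store.h"
//   template struct UMStore<double>;         // the single definition point
// ---------------------------------------------------------------------------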
+// + +#include + +#ifdef TILEDARRAY_HAS_DEVICE + +#include + +template class btas::varray>; +template class btas::varray>; +template class btas::varray< + std::complex, + TiledArray::device_um_allocator>>; +template class btas::varray< + std::complex, TiledArray::device_um_allocator>>; +template class btas::varray>; +template class btas::varray>; + +template class btas::Tensor>; +template class btas::Tensor>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +template class btas::Tensor>; +template class btas::Tensor>; + +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +template class TiledArray::Tile>>; +template class TiledArray::Tile>>; + +#endif // TILEDARRAY_HAS_DEVICE diff --git a/src/TiledArray/cuda/btas_um_tensor.h b/src/TiledArray/device/btas_um_tensor.h similarity index 68% rename from src/TiledArray/cuda/btas_um_tensor.h rename to src/TiledArray/device/btas_um_tensor.h index 7bddc4a178..dec80dcaf1 100644 --- a/src/TiledArray/cuda/btas_um_tensor.h +++ b/src/TiledArray/device/btas_um_tensor.h @@ -21,17 +21,20 @@ * */ -#ifndef TILEDARRAY_CUDA_CUDA_UM_TENSOR_H -#define TILEDARRAY_CUDA_CUDA_UM_TENSOR_H +#ifndef TILEDARRAY_DEVICE_BTAS_UM_TENSOR_H +#define TILEDARRAY_DEVICE_BTAS_UM_TENSOR_H #include +#include #include +#include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include -#include +#include +#include +#include #include #include @@ -39,15 +42,14 @@ namespace TiledArray { namespace detail { template -struct is_cuda_tile< - ::btas::Tensor>> +struct is_device_tile< + ::btas::Tensor>> : public std::true_type {}; template -void to_cuda(const TiledArray::btasUMTensorVarray &tile) { - cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); - auto &stream = TiledArray::detail::get_stream_based_on_range(tile.range()); - TiledArray::to_execution_space( +void to_device(const TiledArray::btasUMTensorVarray &tile) { + auto stream = device::stream_for(tile.range()); + TiledArray::to_execution_space( tile.storage(), stream); } @@ -64,12 +66,12 @@ struct ArchiveLoadImpl> { static inline void load(const Archive &ar, TiledArray::btasUMTensorVarray &t) { TiledArray::Range range{}; - TiledArray::cuda_um_btas_varray store{}; - ar &range &store; + TiledArray::device_um_btas_varray store{}; + ar & range & store; t = TiledArray::btasUMTensorVarray(std::move(range), std::move(store)); - // cudaSetDevice(TiledArray::cudaEnv::instance()->current_cuda_device_id()); - // auto &stream = TiledArray::detail::get_stream_based_on_range(range); - // TiledArray::to_execution_space(t.storage(), + // device::setDevice(TiledArray::deviceEnv::instance()->default_device_id()); + // auto &stream = device::stream_for(range); + // TiledArray::to_execution_space(t.storage(), // stream); } }; @@ -78,12 +80,10 @@ template struct ArchiveStoreImpl> { static inline void store(const Archive &ar, const TiledArray::btasUMTensorVarray &t) { - CudaSafeCall(cudaSetDevice( - TiledArray::cudaEnv::instance()->current_cuda_device_id())); - auto &stream = TiledArray::detail::get_stream_based_on_range(t.range()); - TiledArray::to_execution_space(t.storage(), - stream); - ar &t.range() & t.storage(); + auto stream 
= TiledArray::device::stream_for(t.range()); + TiledArray::to_execution_space( + t.storage(), stream); + ar & t.range() & t.storage(); } }; @@ -101,7 +101,7 @@ btasUMTensorVarray gemm( const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(left, right, factor, gemm_helper); + return device::btas::gemm(left, right, factor, gemm_helper); } template &result, const btasUMTensorVarray &left, const btasUMTensorVarray &right, Scalar factor, const TiledArray::math::GemmHelper &gemm_helper) { - return btas_tensor_gemm_cuda_impl(result, left, right, factor, gemm_helper); + return device::btas::gemm(result, left, right, factor, gemm_helper); } /// @@ -121,7 +121,7 @@ template btasUMTensorVarray clone(const btasUMTensorVarray &arg) { // TODO how to copy Unified Memory? from CPU or GPU? currently // always copy on GPU, but need to investigate - return btas_tensor_clone_cuda_impl(arg); + return device::btas::clone(arg); } /// @@ -135,25 +135,20 @@ btasUMTensorVarray shift(const btasUMTensorVarray &arg, // shift the range result_range.inplace_shift(range_shift); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - // @important select the stream using the shifted range - auto &cuda_stream = detail::get_stream_based_on_range(result_range); + auto &queue = blasqueue_for(result_range); + const auto stream = device::Stream(queue.device(), queue.stream()); typename btasUMTensorVarray::storage_type result_storage; - make_device_storage(result_storage, result_range.volume(), cuda_stream); + make_device_storage(result_storage, result_range.volume(), stream); btasUMTensorVarray result(std::move(result_range), std::move(result_storage)); - // call cublasCopy - const auto &handle = cuBLASHandlePool::handle(); - CublasSafeCall(cublasSetStream(handle, cuda_stream)); - - CublasSafeCall(cublasCopy(handle, result.size(), device_data(arg.storage()), - 1, device_data(result.storage()), 1)); + blas::copy(result.size(), device_data(arg.storage()), 1, + device_data(result.storage()), 1, queue); - synchronize_stream(&cuda_stream); + device::sync_madness_task_with(stream); return result; } @@ -176,10 +171,8 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, const TiledArray::Permutation &perm) { // compute result range auto result_range = perm * arg.range(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - // compute the stream to use - auto &stream = detail::get_stream_based_on_range(result_range); + auto stream = device::stream_for(result_range); // allocate result memory typename btasUMTensorVarray::storage_type storage; @@ -192,11 +185,22 @@ btasUMTensorVarray permute(const btasUMTensorVarray &arg, librett_permute(const_cast(device_data(arg.storage())), device_data(result.storage()), arg.range(), perm, stream); - synchronize_stream(&stream); + device::sync_madness_task_with(stream); return result; } +// WARNING omitting this overload dispatches to the base CPU implementation in +// external/btas.h + +template +btasUMTensorVarray permute( + const btasUMTensorVarray &arg, + const TiledArray::BipartitePermutation &perm) { + TA_ASSERT(inner_size(perm) == 0); // this must be a plain permutation + return permute(arg, outer(perm)); +} + /// /// scale /// @@ -205,16 +209,16 @@ template >> btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); - return btas_tensor_scale_cuda_impl(arg, factor); + 
detail::to_device(arg); + return device::btas::scale(arg, factor); } template >> btasUMTensorVarray &scale_to(btasUMTensorVarray &arg, const Scalar factor) { - detail::to_cuda(arg); - btas_tensor_scale_to_cuda_impl(arg, factor); + detail::to_device(arg); + device::btas::scale_to(arg, factor); return arg; } @@ -226,10 +230,6 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, const Scalar factor, const Perm &perm) { auto result = scale(arg, factor); - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -239,8 +239,8 @@ btasUMTensorVarray scale(const btasUMTensorVarray &arg, template btasUMTensorVarray neg(const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_scale_cuda_impl(arg, T(-1.0)); + detail::to_device(arg); + return device::btas::scale(arg, T(-1.0)); } template < @@ -250,17 +250,13 @@ btasUMTensorVarray neg(const btasUMTensorVarray &arg, const Perm &perm) { auto result = neg(arg); - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } template btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { - detail::to_cuda(arg); - btas_tensor_scale_to_cuda_impl(arg, T(-1.0)); + detail::to_device(arg); + device::btas::scale_to(arg, T(-1.0)); return arg; } @@ -271,9 +267,9 @@ btasUMTensorVarray &neg_to(btasUMTensorVarray &arg) { template btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); - return btas_tensor_subt_cuda_impl(arg1, arg2, T(1.0)); + detail::to_device(arg1); + detail::to_device(arg2); + return device::btas::subt(arg1, arg2, T(1.0)); } template subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = subt(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -293,11 +289,6 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = subt(arg1, arg2); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -309,11 +300,6 @@ btasUMTensorVarray subt(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = subt(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -325,9 +311,9 @@ template btasUMTensorVarray &subt_to( btasUMTensorVarray &result, const btasUMTensorVarray &arg1) { - detail::to_cuda(result); - detail::to_cuda(arg1); - btas_tensor_subt_to_cuda_impl(result, arg1, T(1.0)); + detail::to_device(result); + detail::to_device(arg1); + device::btas::subt_to(result, arg1, T(1.0)); return result; } @@ -337,7 +323,7 @@ btasUMTensorVarray &subt_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg1, const Scalar factor) { subt_to(result, arg1); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -348,9 +334,9 @@ btasUMTensorVarray &subt_to(btasUMTensorVarray &result, template btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); - return 
btas_tensor_add_cuda_impl(arg1, arg2, T(1.0)); + detail::to_device(arg1); + detail::to_device(arg2); + return device::btas::add(arg1, arg2, T(1.0)); } template add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = add(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -371,11 +357,6 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = add(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -386,11 +367,6 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = add(arg1, arg2); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -401,9 +377,9 @@ btasUMTensorVarray add(const btasUMTensorVarray &arg1, template btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); - btas_tensor_add_to_cuda_impl(result, arg, T(1.0)); + detail::to_device(result); + detail::to_device(arg); + device::btas::add_to(result, arg, T(1.0)); return result; } @@ -413,7 +389,7 @@ btasUMTensorVarray &add_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg, const Scalar factor) { add_to(result, arg); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -424,9 +400,9 @@ template typename btasUMTensorVarray::value_type dot( const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); - return btas_tensor_dot_cuda_impl(arg1, arg2); + detail::to_device(arg1); + detail::to_device(arg2); + return device::btas::dot(arg1, arg2); } /// @@ -435,9 +411,9 @@ typename btasUMTensorVarray::value_type dot( template btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2) { - detail::to_cuda(arg1); - detail::to_cuda(arg2); - return btas_tensor_mult_cuda_impl(arg1, arg2); + detail::to_device(arg1); + detail::to_device(arg2); + return device::btas::mult(arg1, arg2); } template mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor) { auto result = mult(arg1, arg2); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -457,11 +433,6 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Perm &perm) { auto result = mult(arg1, arg2); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -473,11 +444,6 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, const btasUMTensorVarray &arg2, const Scalar factor, const Perm &perm) { auto result = mult(arg1, arg2, factor); - - // wait to finish before switch stream - auto stream = tls_cudastream_accessor(); - cudaStreamSynchronize(*stream); - return permute(result, perm); } @@ -487,9 +453,9 @@ btasUMTensorVarray mult(const btasUMTensorVarray &arg1, template btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg) { - detail::to_cuda(result); - detail::to_cuda(arg); - 
btas_tensor_mult_to_cuda_impl(result, arg); + detail::to_device(result); + detail::to_device(arg); + device::btas::mult_to(result, arg); return result; } @@ -499,7 +465,7 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, const btasUMTensorVarray &arg, const Scalar factor) { mult_to(result, arg); - btas_tensor_scale_to_cuda_impl(result, factor); + device::btas::scale_to(result, factor); return result; } @@ -514,8 +480,8 @@ btasUMTensorVarray &mult_to(btasUMTensorVarray &result, template typename btasUMTensorVarray::value_type squared_norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_squared_norm_cuda_impl(arg); + detail::to_device(arg); + return device::btas::squared_norm(arg); } /// @@ -525,8 +491,8 @@ typename btasUMTensorVarray::value_type squared_norm( template typename btasUMTensorVarray::value_type norm( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return std::sqrt(btas_tensor_squared_norm_cuda_impl(arg)); + detail::to_device(arg); + return std::sqrt(device::btas::squared_norm(arg)); } /// @@ -544,8 +510,8 @@ typename btasUMTensorVarray::value_type trace( template typename btasUMTensorVarray::value_type sum( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_sum_cuda_impl(arg); + detail::to_device(arg); + return device::btas::sum(arg); } /// @@ -554,8 +520,8 @@ typename btasUMTensorVarray::value_type sum( template typename btasUMTensorVarray::value_type product( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_product_cuda_impl(arg); + detail::to_device(arg); + return device::btas::product(arg); } /// @@ -564,8 +530,8 @@ typename btasUMTensorVarray::value_type product( template typename btasUMTensorVarray::value_type max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_max_cuda_impl(arg); + detail::to_device(arg); + return device::btas::max(arg); } /// @@ -574,8 +540,8 @@ typename btasUMTensorVarray::value_type max( template typename btasUMTensorVarray::value_type abs_max( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_absmax_cuda_impl(arg); + detail::to_device(arg); + return device::btas::absmax(arg); } /// @@ -584,8 +550,8 @@ typename btasUMTensorVarray::value_type abs_max( template typename btasUMTensorVarray::value_type min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_min_cuda_impl(arg); + detail::to_device(arg); + return device::btas::min(arg); } /// @@ -594,8 +560,8 @@ typename btasUMTensorVarray::value_type min( template typename btasUMTensorVarray::value_type abs_min( const btasUMTensorVarray &arg) { - detail::to_cuda(arg); - return btas_tensor_absmin_cuda_impl(arg); + detail::to_device(arg); + return device::btas::absmin(arg); } /// to host for UM Array @@ -603,10 +569,9 @@ template void to_host( TiledArray::DistArray, Policy> &um_array) { auto to_host = [](TiledArray::Tile &tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &stream = detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -622,7 +587,7 @@ void to_host( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(device::deviceSynchronize()); }; /// to device for UM Array @@ -630,10 +595,9 @@ template void to_device( TiledArray::DistArray, Policy> &um_array) { auto to_device = [](TiledArray::Tile 
&tile) { - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &stream = detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); }; @@ -649,7 +613,7 @@ void to_device( } world.gop.fence(); - CudaSafeCall(cudaDeviceSynchronize()); + DeviceSafeCall(device::deviceSynchronize()); }; /// convert array from UMTensor to TiledArray::Tensor @@ -661,12 +625,12 @@ um_tensor_to_ta_tensor( const auto convert_tile_memcpy = [](const UMTensor &tile) { TATensor result(tile.tensor().range()); - auto &stream = cudaEnv::instance()->cuda_stream_d2h(); - CudaSafeCall( - cudaMemcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename TATensor::value_type), - cudaMemcpyDefault, stream)); - synchronize_stream(&stream); + auto stream = device::stream_for(result.range()); + DeviceSafeCall( + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename TATensor::value_type), + device::MemcpyDefault, stream)); + device::sync_madness_task_with(stream); return result; }; @@ -676,10 +640,9 @@ um_tensor_to_ta_tensor( using std::begin; const auto n = tile.tensor().size(); - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - auto &stream = detail::get_stream_based_on_range(tile.range()); + auto stream = device::stream_for(tile.range()); - TiledArray::to_execution_space( + TiledArray::to_execution_space( tile.tensor().storage(), stream); std::copy_n(tile.data(), n, result.data()); @@ -688,7 +651,7 @@ um_tensor_to_ta_tensor( }; const char *use_legacy_conversion = - std::getenv("TA_CUDA_LEGACY_UM_CONVERSION"); + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); auto ta_array = use_legacy_conversion ? 
to_new_tile_type(um_array, convert_tile_um) : to_new_tile_type(um_array, convert_tile_memcpy); @@ -711,32 +674,15 @@ template typename std::enable_if::value, TiledArray::DistArray>::type ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { - auto convert_tile_memcpy = [](const TATensor &tile) { - /// UMTensor must be wrapped into TA::Tile - - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - - using Tensor = typename UMTensor::tensor_type; - - auto &stream = cudaEnv::instance()->cuda_stream_h2d(); - typename Tensor::storage_type storage; - make_device_storage(storage, tile.range().area(), stream); - Tensor result(tile.range(), std::move(storage)); - - CudaSafeCall( - cudaMemcpyAsync(result.data(), tile.data(), - tile.size() * sizeof(typename Tensor::value_type), - cudaMemcpyDefault, stream)); - - synchronize_stream(&stream); - return TiledArray::Tile(std::move(result)); - }; + using inT = typename TATensor::value_type; + using outT = typename UMTensor::value_type; + // check if element conversion is necessary + constexpr bool T_conversion = !std::is_same_v; + // this is safe even when need to convert element types, but less efficient auto convert_tile_um = [](const TATensor &tile) { /// UMTensor must be wrapped into TA::Tile - CudaSafeCall(cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - using Tensor = typename UMTensor::tensor_type; typename Tensor::storage_type storage(tile.range().area()); @@ -746,20 +692,53 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { std::copy_n(tile.data(), n, result.data()); - auto &stream = detail::get_stream_based_on_range(result.range()); + auto stream = device::stream_for(result.range()); // prefetch data to GPU - TiledArray::to_execution_space( + TiledArray::to_execution_space( result.storage(), stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host return TiledArray::Tile(std::move(result)); }; - const char *use_legacy_conversion = - std::getenv("TA_CUDA_LEGACY_UM_CONVERSION"); - auto um_array = use_legacy_conversion - ? to_new_tile_type(array, convert_tile_um) - : to_new_tile_type(array, convert_tile_memcpy); + TiledArray::DistArray um_array; + if constexpr (T_conversion) { + um_array = to_new_tile_type(array, convert_tile_um); + } else { + // this is more efficient for copying: + // - avoids copy on host followed by UM transfer, instead uses direct copy + // - replaced unneeded copy (which also caused D-to-H transfer due to + // calling UM allocator construct() on the host) by move + // This eliminates all spurious UM traffic in (T) W3 contractions + auto convert_tile_memcpy = [](const TATensor &tile) { + /// UMTensor must be wrapped into TA::Tile + + using Tensor = typename UMTensor::tensor_type; + + auto stream = device::stream_for(tile.range()); + typename Tensor::storage_type storage; + make_device_storage(storage, tile.range().area(), stream); + Tensor result(tile.range(), std::move(storage)); + + DeviceSafeCall( + device::memcpyAsync(result.data(), tile.data(), + tile.size() * sizeof(typename Tensor::value_type), + device::MemcpyDefault, stream)); + + device::sync_madness_task_with(stream); + // N.B. move! without it have D-to-H transfer due to calling UM + // allocator construct() on the host + return TiledArray::Tile(std::move(result)); + }; + + const char *use_legacy_conversion = + std::getenv("TA_DEVICE_LEGACY_UM_CONVERSION"); + um_array = use_legacy_conversion + ? 
to_new_tile_type(array, convert_tile_um) + : to_new_tile_type(array, convert_tile_memcpy); + } array.world().gop.fence(); return um_array; @@ -778,31 +757,49 @@ ta_tensor_to_um_tensor(const TiledArray::DistArray &array) { #ifndef TILEDARRAY_HEADER_ONLY extern template class btas::varray>; -extern template class btas::varray>; -extern template class btas::varray>; -extern template class btas::varray>; + TiledArray::device_um_allocator>; +extern template class btas::varray>; +extern template class btas::varray< + std::complex, + TiledArray::device_um_allocator>>; +extern template class btas::varray< + std::complex, TiledArray::device_um_allocator>>; +extern template class btas::varray>; +extern template class btas::varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; +extern template class btas::Tensor< + std::complex, TiledArray::Range, + TiledArray::device_um_btas_varray>>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class btas::Tensor>; + TiledArray::device_um_btas_varray>; extern template class TiledArray::Tile>>; + double, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile>>; + float, TiledArray::Range, TiledArray::device_um_btas_varray>>; extern template class TiledArray::Tile< - btas::Tensor>>; + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +extern template class TiledArray::Tile< + btas::Tensor, TiledArray::Range, + TiledArray::device_um_btas_varray>>>; +extern template class TiledArray::Tile>>; extern template class TiledArray::Tile>>; + long, TiledArray::Range, TiledArray::device_um_btas_varray>>; #endif // TILEDARRAY_HEADER_ONLY -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE -#endif // TILEDARRAY_CUDA_CUDA_UM_TENSOR_H +#endif // TILEDARRAY_DEVICE_BTAS_UM_TENSOR_H diff --git a/src/TiledArray/cuda/cpu_cuda_vector.cu b/src/TiledArray/device/cpu_cuda_vector.cu similarity index 70% rename from src/TiledArray/cuda/cpu_cuda_vector.cu rename to src/TiledArray/device/cpu_cuda_vector.cu index 34dd405807..639cc56acc 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.cu +++ b/src/TiledArray/device/cpu_cuda_vector.cu @@ -1,5 +1,5 @@ -#include +#include namespace thrust { @@ -15,11 +15,25 @@ void resize>( size_t size) { dev_vec.resize(size); } +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + dev_vec.resize(size); +} +template<> +void resize,thrust::device_allocator>>( + thrust::device_vector, thrust::device_allocator>>& dev_vec, + size_t size) { + dev_vec.resize(size); +} } namespace TiledArray { template class cpu_cuda_vector; template class cpu_cuda_vector; +template class cpu_cuda_vector>; +template class cpu_cuda_vector>; } // Thrust included in CUDA 9+ seems to generate uninstantiated CUB calls @@ -35,6 +49,12 @@ auto force_missing_copy_instantiations_double() { auto force_missing_copy_instantiations_float() { return force_missing_copy_instantiations(); } +auto force_missing_copy_instantiations_zdouble() { + return force_missing_copy_instantiations>(); +} +auto force_missing_copy_instantiations_zfloat() { + return force_missing_copy_instantiations>(); +} auto force_missing_copy_instantiations_unsigned_long() { return force_missing_copy_instantiations(); } @@ 
-65,4 +85,3 @@ auto force_missing_copy_n_instantiations_long_long(){ } #endif // __CUDACC_VER_MAJOR__ >= 9 - diff --git a/src/TiledArray/cuda/cpu_cuda_vector.h b/src/TiledArray/device/cpu_cuda_vector.h similarity index 91% rename from src/TiledArray/cuda/cpu_cuda_vector.h rename to src/TiledArray/device/cpu_cuda_vector.h index 7370eeaa2e..d7a9ad1422 100644 --- a/src/TiledArray/cuda/cpu_cuda_vector.h +++ b/src/TiledArray/device/cpu_cuda_vector.h @@ -1,11 +1,11 @@ -#ifndef TILEDARRAY_CUDA_CPU_CUDA_VECTOR_H__INCLUDED -#define TILEDARRAY_CUDA_CPU_CUDA_VECTOR_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_CPU_CUDA_VECTOR_H__INCLUDED +#define TILEDARRAY_DEVICE_CPU_CUDA_VECTOR_H__INCLUDED #include -#include -#include +#include +#include #include @@ -158,13 +158,15 @@ class cpu_cuda_vector { extern template class cpu_cuda_vector; extern template class cpu_cuda_vector; +extern template class cpu_cuda_vector>; +extern template class cpu_cuda_vector>; template bool in_memory_space( const cpu_cuda_vector& vec) noexcept { - return (vec.on_host() && overlap(MemorySpace::CPU, Space)) || - (vec.on_device() && overlap(MemorySpace::CUDA, Space)); + return (vec.on_host() && overlap(MemorySpace::Host, Space)) || + (vec.on_device() && overlap(MemorySpace::Device, Space)); } template & vec, cudaStream_t stream = 0) { switch (Space) { - case ExecutionSpace::CPU: { + case ExecutionSpace::Host: { vec.to_host(); break; } - case ExecutionSpace::CUDA: { + case ExecutionSpace::Device: { vec.to_device(); break; } @@ -229,4 +231,4 @@ struct ArchiveStoreImpl> { } // namespace archive } // namespace madness -#endif // TILEDARRAY_CUDA_CPU_CUDA_VECTOR_H__INCLUDED +#endif // TILEDARRAY_DEVICE_CPU_CUDA_VECTOR_H__INCLUDED diff --git a/src/TiledArray/cuda/cuda_task_fn.h b/src/TiledArray/device/device_task_fn.h similarity index 69% rename from src/TiledArray/cuda/cuda_task_fn.h rename to src/TiledArray/device/device_task_fn.h index 8de133b3bd..e376cc39e6 100644 --- a/src/TiledArray/cuda/cuda_task_fn.h +++ b/src/TiledArray/device/device_task_fn.h @@ -2,28 +2,27 @@ // Created by Chong Peng on 2019-03-20. 
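// ---------------------------------------------------------------------------
// in_memory_space() in cpu_cuda_vector.h above treats MemorySpace as a set of
// flags: as the overlap() test suggests, a dual-resident (host+device) vector
// reports membership in both the Host and Device spaces. A hypothetical usage
// sketch (assumes a CUDA device is available, a size constructor, and that a
// fresh vector starts host-side):
//
//   #include <cassert>
//   #include <TiledArray/device/cpu_cuda_vector.h>
//   void memory_space_demo() {
//     TiledArray::cpu_cuda_vector<double> v(1024);  // host-side storage
//     assert(TiledArray::in_memory_space<TiledArray::MemorySpace::Host>(v));
//     TiledArray::to_execution_space<TiledArray::ExecutionSpace::Device>(v);
//     assert(TiledArray::in_memory_space<TiledArray::MemorySpace::Device>(v));
//   }
// ---------------------------------------------------------------------------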
// -#ifndef TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED -#define TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED +#define TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include -#include #include namespace TiledArray { namespace detail { template -std::atomic& cuda_callback_duration_ns() { +std::atomic& device_callback_duration_ns() { static std::atomic value{0}; return value; } -inline std::atomic& cuda_taskfn_callback_duration_ns() { +inline std::atomic& device_taskfn_callback_duration_ns() { static std::atomic value{0}; return value; } @@ -34,9 +33,9 @@ inline std::atomic& cuda_taskfn_callback_duration_ns() { namespace madness { /// -/// cudaTaskFn class -/// represent a task that calls an async cuda kernel -/// the task must call synchronize_stream function to tell which stream it +/// deviceTaskFn class +/// represent a task that calls an async device kernel +/// the task must call sync_madness_task_with function to tell which stream it /// used /// @@ -44,55 +43,55 @@ template -struct cudaTaskFn : public TaskInterface { - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg1T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg2T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg3T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg4T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg5T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg6T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg7T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg8T cannot be a const " - "or reference type"); - static_assert(not(std::is_const::value || - std::is_reference::value), - "improper instantiation of cudaTaskFn, arg9T cannot be a const " - "or reference type"); +struct deviceTaskFn : public TaskInterface { + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg1T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg2T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg3T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg4T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg5T cannot be a const " + "or reference type"); + static_assert( + 
not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg6T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg7T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg8T cannot be a const " + "or reference type"); + static_assert( + not(std::is_const::value || std::is_reference::value), + "improper instantiation of deviceTaskFn, arg9T cannot be a const " + "or reference type"); private: /// This class type - typedef cudaTaskFn - cudaTaskFn_; + typedef deviceTaskFn + deviceTaskFn_; friend class AsyncTaskInterface; - /// internal Task structure that wraps the Async cuda function + /// internal Task structure that wraps the Async device function struct AsyncTaskInterface : public madness::TaskInterface { - AsyncTaskInterface(cudaTaskFn_* task, int ndepend = 0, + AsyncTaskInterface(deviceTaskFn_* task, int ndepend = 0, const TaskAttributes attr = TaskAttributes()) : TaskInterface(ndepend, attr), task_(task) {} @@ -100,45 +99,55 @@ struct cudaTaskFn : public TaskInterface { protected: void run(const TaskThreadEnv& env) override { + TA_ASSERT(!stream_); + TA_ASSERT( + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + // tell the task to report stream to be synced with to this->stream_ + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() = + &this->stream_; + // run the async function, the function must call synchronize_stream() to // set the stream it used!! task_->run_async(); - // get the stream used by async function - auto stream = TiledArray::tls_cudastream_accessor(); - - // TA_ASSERT(stream != nullptr); + // clear ptr to stream_ + TiledArray::device::detail::madness_task_stream_opt_ptr_accessor() = + nullptr; // WARNING, need to handle NoOp - if (stream == nullptr) { + if (!stream_) { task_->notify(); } else { - // TODO should we use cuda callback or cuda events?? - // insert cuda callback - cudaLaunchHostFunc(*stream, cuda_callback, task_); - // reset stream to nullptr - TiledArray::synchronize_stream(nullptr); + // TODO should we use device callback or device events?? 
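// ---------------------------------------------------------------------------
// The callback path chosen below relies on the stream-ordered host-function
// facility of the device runtime: the enqueued host function fires only after
// all work previously submitted to the stream has completed, which is what
// lets a deviceTaskFn notify its dependents without blocking a worker thread.
// Stripped to a bare CUDA analogue (hypothetical, for illustration only):
//
//   #include <cuda_runtime.h>
//   struct Task { void notify(); /* ... */ };
//   void CUDART_CB on_done(void *userData) {
//     static_cast<Task *>(userData)->notify();  // runs on a CUDA runtime thread
//   }
//   // after enqueueing the task's kernels on `stream`:
//   //   cudaLaunchHostFunc(stream, on_done, task_ptr);
// ---------------------------------------------------------------------------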
+ // insert device callback + DeviceSafeCall(TiledArray::device::launchHostFunc( + *stream_, device_callback, task_)); + // processed sync, clear state + stream_ = {}; } } private: - static void CUDART_CB cuda_callback(void* userData) { + static void DEVICERT_CB device_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); // convert void * to AsyncTaskInterface* - auto* callback = static_cast(userData); + auto* callback = static_cast(userData); // std::stringstream address; // address << (void*) callback; - // std::string message = "callback on cudaTaskFn: " + address.str() + + // std::string message = "callback on deviceTaskFn: " + address.str() + // + // '\n'; std::cout << message; callback->notify(); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_taskfn_callback_duration_ns() += + TiledArray::detail::device_taskfn_callback_duration_ns() += TiledArray::duration_in_ns(t0, t1); } - cudaTaskFn_* task_; + deviceTaskFn_* task_; + std::optional stream_; // stream to sync with }; public: @@ -160,7 +169,7 @@ struct cudaTaskFn : public TaskInterface { futureT result_; ///< The task Future result const functionT func_; ///< The task function TaskInterface* async_task_; ///< The internal AsyncTaskInterface that wraps - ///< the async cuda function + ///< the async device function futureT async_result_; ///< the future returned from the async task // If the value of the argument is known at the time the @@ -258,7 +267,7 @@ struct cudaTaskFn : public TaskInterface { /// Check dependencies and register callbacks where necessary void check_dependencies() { - this->inc(); // the current cudaTaskFn depends on the internal + this->inc(); // the current deviceTaskFn depends on the internal // AsyncTaskInterface, dependency = 1 check_dependency(arg1_); check_dependency(arg2_); @@ -272,13 +281,14 @@ struct cudaTaskFn : public TaskInterface { } // Copies are not allowed. 
- cudaTaskFn(const cudaTaskFn_&); - cudaTaskFn_ operator=(cudaTaskFn_&); + deviceTaskFn(const deviceTaskFn_&); + deviceTaskFn_ operator=(deviceTaskFn_&); public: #if MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -298,8 +308,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -319,8 +329,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -340,8 +350,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -361,8 +371,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -383,8 +393,8 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -405,8 +415,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -427,9 +438,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -450,9 +461,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -474,9 +485,9 @@ struct cudaTaskFn : public TaskInterface { template - 
cudaTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, - a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, - a9T&& a9, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, a1T&& a1, a2T&& a2, + a3T&& a3, a4T&& a4, a5T&& a5, a6T&& a6, a7T&& a7, a8T&& a8, + a9T&& a9, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -495,8 +506,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -514,7 +526,8 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } #else // MADNESS_TASKQ_VARIADICS - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -534,8 +547,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -555,8 +568,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const TaskAttributes& attr = TaskAttributes()) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const TaskAttributes& attr = TaskAttributes()) : TaskInterface(attr), result_(result), func_(func), @@ -576,8 +589,8 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -597,9 +610,9 @@ struct cudaTaskFn : public TaskInterface { } template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -620,9 +633,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -643,9 +656,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const TaskAttributes& attr) : TaskInterface(attr), result_(result), 
func_(func), @@ -666,9 +679,9 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -689,10 +702,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -714,10 +727,10 @@ struct cudaTaskFn : public TaskInterface { template - cudaTaskFn(const futureT& result, functionT func, const a1T& a1, - const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, - const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, - const TaskAttributes& attr) + deviceTaskFn(const futureT& result, functionT func, const a1T& a1, + const a2T& a2, const a3T& a3, const a4T& a4, const a5T& a5, + const a6T& a6, const a7T& a7, const a8T& a8, const a9T& a9, + const TaskAttributes& attr) : TaskInterface(attr), result_(result), func_(func), @@ -736,8 +749,9 @@ struct cudaTaskFn : public TaskInterface { check_dependencies(); } - cudaTaskFn(const futureT& result, functionT func, const TaskAttributes& attr, - archive::BufferInputArchive& input_arch) + deviceTaskFn(const futureT& result, functionT func, + const TaskAttributes& attr, + archive::BufferInputArchive& input_arch) : TaskInterface(attr), result_(result), func_(func), @@ -757,7 +771,7 @@ struct cudaTaskFn : public TaskInterface { #endif // MADNESS_TASKQ_VARIADICS // no need to delete async_task_, as it will be deleted by the TaskQueue - virtual ~cudaTaskFn() = default; + virtual ~deviceTaskFn() = default; const futureT& result() const { return result_; } @@ -770,16 +784,16 @@ struct cudaTaskFn : public TaskInterface { } #else protected: - /// when this cudaTaskFn gets run, it means the AsyncTaskInterface is done + /// when this deviceTaskFn gets run, it means the AsyncTaskInterface is done /// set the result with async_result_, which is finished void run(const TaskThreadEnv& env) override { result_.set(std::move(async_result_)); } #endif // HAVE_INTEL_TBB -}; // class cudaTaskFn +}; // class deviceTaskFn -/// add a cudaTaskFn object to World +/// add a deviceTaskFn object to World /// \tparam fnT A function pointer or functor /// \tparam a1T Type of argument 1. /// \tparam a2T Type of argument 2. @@ -794,15 +808,15 @@ struct cudaTaskFn : public TaskInterface { /// \return Description needed. 
template -typename cudaTaskFn::futureT -add_cuda_taskfn( +typename deviceTaskFn::futureT +add_device_taskfn( madness::World& world, - cudaTaskFn* t) { - typename cudaTaskFn::futureT - res(t->result()); - // add the internal async task in cuda task as well + deviceTaskFn* t) { - typename deviceTaskFn::futureT res(t->result()); + typename deviceTaskFn::futureT res(t->result()); + // add the internal async task in device task as well world.taskq.add(t->async_task()); - // add the cuda task + // add the device task world.taskq.add(static_cast(t)); return res; } @@ -815,13 +829,13 @@ template < typename fnT, typename... argsT, typename = std::enable_if_t::value>> typename detail::function_enabler...)>::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = - cudaTaskFn, - std::remove_const_t>...>; + deviceTaskFn, + std::remove_const_t>...>; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)..., TaskAttributes())); } @@ -835,13 +849,13 @@ template < typename fnT, typename... argsT, typename = std::enable_if_t::value>> typename meta::drop_last_arg_and_apply_callable< detail::function_enabler, fnT, future_to_ref_t...>::type::type -add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { - /// type of cudaTaskFn object +add_device_task(madness::World& world, fnT&& fn, argsT&&... args) { + /// type of deviceTaskFn object using taskT = typename meta::drop_last_arg_and_apply< - cudaTaskFn, std::decay_t, + deviceTaskFn, std::decay_t, std::remove_const_t>...>::type; - return add_cuda_taskfn( + return add_device_taskfn( world, new taskT(typename taskT::futureT(), std::forward(fn), std::forward(args)...)); } @@ -852,14 +866,14 @@ add_cuda_task(madness::World& world, fnT&& fn, argsT&&... args) { /// \tparam argsT variadic template for arguments /// \return A future to the result template -typename detail::memfunc_enabler::type add_cuda_task( +typename detail::memfunc_enabler::type add_device_task( madness::World& world, objT&& obj, memfnT memfn, argsT&&... args) { - return add_cuda_task(world, - detail::wrap_mem_fn(std::forward(obj), memfn), - std::forward(args)...); + return add_device_task(world, + detail::wrap_mem_fn(std::forward(obj), memfn), + std::forward(args)...); } } // namespace madness -#endif // TILDARRAY_HAS_CUDA -#endif // TILEDARRAY_CUDA_CUDA_TASK_FN_H__INCLUDED +#endif // TILEDARRAY_HAS_DEVICE +#endif // TILEDARRAY_DEVICE_DEVICE_TASK_FN_H__INCLUDED diff --git a/src/TiledArray/device/kernel/mult_kernel.h b/src/TiledArray/device/kernel/mult_kernel.h new file mode 100644 index 0000000000..38a854000a --- /dev/null +++ b/src/TiledArray/device/kernel/mult_kernel.h @@ -0,0 +1,76 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see .
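For orientation, a minimal usage sketch of the renamed submission entry point. The callable `scale_tile`, its arguments, and the `submit` wrapper are invented for illustration; only `madness::add_device_task` and the returned future come from the declarations above, which enqueue two tasks (the internal async device task plus the completion task that sets the future):

```cpp
// Hypothetical usage of madness::add_device_task; names below are invented.
#include <cstddef>
#include <madness/world/world.h>

double scale_tile(double* data, std::size_t n, double factor);  // assumed callable

void submit(madness::World& world, double* data, std::size_t n) {
  auto fut = madness::add_device_task(world, scale_tile, data, n, 2.0);
  const double result = fut.get();  // waits for the device and completion tasks
  (void)result;
}
```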
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * Aug 21, 2018 + * + */ + +#ifndef TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED + +#include + +#ifdef TILEDARRAY_HAS_DEVICE + +#include + +#include + +namespace TiledArray::device { + +/// result[i] = result[i] * arg[i] +void mult_to_kernel(int *result, const int *arg, std::size_t n, + const Stream &stream); + +void mult_to_kernel(float *result, const float *arg, std::size_t n, + const Stream &stream); + +void mult_to_kernel(double *result, const double *arg, std::size_t n, + const Stream &stream); + +void mult_to_kernel(std::complex *result, const std::complex *arg, + std::size_t n, const Stream &stream); + +void mult_to_kernel(std::complex *result, + const std::complex *arg, std::size_t n, + const Stream &stream); + +/// result[i] = arg1[i] * arg2[i] +void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, + const Stream &stream); + +void mult_kernel(float *result, const float *arg1, const float *arg2, + std::size_t n, const Stream &stream); + +void mult_kernel(double *result, const double *arg1, const double *arg2, + std::size_t n, const Stream &stream); + +void mult_kernel(std::complex *result, const std::complex *arg1, + const std::complex *arg2, std::size_t n, + const Stream &stream); + +void mult_kernel(std::complex *result, const std::complex *arg1, + const std::complex *arg2, std::size_t n, + const Stream &stream); + +} // namespace TiledArray::device + +#endif // TILEDARRAY_HAS_DEVICE + +#endif // TILEDARRAY_DEVICE_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/reduce_kernel.h b/src/TiledArray/device/kernel/reduce_kernel.h new file mode 100644 index 0000000000..5af88c58be --- /dev/null +++ b/src/TiledArray/device/kernel/reduce_kernel.h @@ -0,0 +1,107 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
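A hedged sketch of the calling convention the `mult_kernel.h` declarations above imply; the device buffers and the `Stream` handle are assumptions supplied by the caller, since this header only declares the launches:

```cpp
// Sketch: d_out, d_a, d_b are assumed device-accessible buffers of length n,
// and s is a live TiledArray::device::Stream owned by the caller.
#include <cstddef>

void hadamard(double* d_out, const double* d_a, const double* d_b,
              std::size_t n, const TiledArray::device::Stream& s) {
  TiledArray::device::mult_kernel(d_out, d_a, d_b, n, s);  // out[i] = a[i] * b[i]
  TiledArray::device::mult_to_kernel(d_out, d_a, n, s);    // out[i] *= a[i]
}
```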
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * May 08, 2019 + * + */ + +#ifndef TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED + +#include + +#ifdef TILEDARRAY_HAS_DEVICE + +#include + +#include + +namespace TiledArray::device { + +// foreach(i) result *= arg[i] +int product_kernel(const int* arg, std::size_t n, const Stream& stream); + +float product_kernel(const float* arg, std::size_t n, const Stream& stream); + +double product_kernel(const double* arg, std::size_t n, const Stream& stream); + +std::complex product_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); + +std::complex product_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); + +// foreach(i) result += arg[i] +int sum_kernel(const int* arg, std::size_t n, const Stream& stream); + +float sum_kernel(const float* arg, std::size_t n, const Stream& stream); + +double sum_kernel(const double* arg, std::size_t n, const Stream& stream); + +std::complex sum_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); + +std::complex sum_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); + +// foreach(i) result = max(result, arg[i]) +int max_kernel(const int* arg, std::size_t n, const Stream& stream); + +float max_kernel(const float* arg, std::size_t n, const Stream& stream); + +double max_kernel(const double* arg, std::size_t n, const Stream& stream); + +// foreach(i) result = min(result, arg[i]) +int min_kernel(const int* arg, std::size_t n, const Stream& stream); + +float min_kernel(const float* arg, std::size_t n, const Stream& stream); + +double min_kernel(const double* arg, std::size_t n, const Stream& stream); + +// foreach(i) result = max(result, abs(arg[i])) +int absmax_kernel(const int* arg, std::size_t n, const Stream& stream); + +float absmax_kernel(const float* arg, std::size_t n, const Stream& stream); + +double absmax_kernel(const double* arg, std::size_t n, const Stream& stream); + +std::complex absmax_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); + +std::complex absmax_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); + +// foreach(i) result = min(result, abs(arg[i])) +int absmin_kernel(const int* arg, std::size_t n, const Stream& stream); + +float absmin_kernel(const float* arg, std::size_t n, const Stream& stream); + +double absmin_kernel(const double* arg, std::size_t n, const Stream& stream); + +std::complex absmin_kernel(const std::complex* arg, std::size_t n, + const Stream& stream); + +std::complex absmin_kernel(const std::complex* arg, + std::size_t n, const Stream& stream); + +} // namespace TiledArray::device + +#endif // TILEDARRAY_HAS_DEVICE + +#endif // TILEDARRAY_DEVICE_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.cu b/src/TiledArray/device/kernel/thrust/mult_kernel.cu new file mode 100644 index 0000000000..e28ccd757a --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.cu @@ -0,0 +1,81 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
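The reductions declared above mirror that convention but return the reduced scalar to the host; a small sketch under the same assumptions (caller-owned device buffer and stream):

```cpp
// Sketch: reduce n device-resident values; d_arg and s come from the caller.
#include <cstddef>

double spread(const double* d_arg, std::size_t n,
              const TiledArray::device::Stream& s) {
  const double hi = TiledArray::device::max_kernel(d_arg, n, s);
  const double lo = TiledArray::device::min_kernel(d_arg, n, s);
  return hi - lo;  // illustrative combination only
}
```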
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * Aug 21, 2018 + * + */ + +#include +#include + +namespace TiledArray::device { + +/// result[i] = result[i] * arg[i] +void mult_to_kernel(int *result, const int *arg, std::size_t n, + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); +} + +void mult_to_kernel(float *result, const float *arg, std::size_t n, + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); +} + +void mult_to_kernel(double *result, const double *arg, std::size_t n, + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); +} + +void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); +} + +void mult_to_kernel(std::complex *result, const std::complex *arg, std::size_t n, + const Stream& stream) { + mult_to_kernel_thrust(result, arg, n, stream); +} + +/// result[i] = arg1[i] * arg2[i] +void mult_kernel(int *result, const int *arg1, const int *arg2, std::size_t n, + const Stream& stream) { + mult_kernel_thrust(result, arg1, arg2, n, stream); +} + +void mult_kernel(float *result, const float *arg1, const float *arg2, std::size_t n, + const Stream& stream) { + mult_kernel_thrust(result, arg1, arg2, n, stream); +} + +void mult_kernel(double *result, const double *arg1, const double *arg2, std::size_t n, + const Stream& stream) { + mult_kernel_thrust(result, arg1, arg2, n, stream); +} + +void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + const Stream& stream) { + mult_kernel_thrust(result, arg1, arg2, n, stream); +} + +void mult_kernel(std::complex *result, const std::complex *arg1, const std::complex *arg2, std::size_t n, + const Stream& stream) { + mult_kernel_thrust(result, arg1, arg2, n, stream); +} + +} // namespace TiledArray::device diff --git a/src/TiledArray/cuda/kernel/mult_kernel_impl.h b/src/TiledArray/device/kernel/thrust/mult_kernel.h similarity index 63% rename from src/TiledArray/cuda/kernel/mult_kernel_impl.h rename to src/TiledArray/device/kernel/thrust/mult_kernel.h index b237dfab1e..8a48493cf0 100644 --- a/src/TiledArray/cuda/kernel/mult_kernel_impl.h +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.h @@ -21,41 +21,42 @@ * */ -#ifndef TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED -#define TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED -#include +#include +#include #include #include -namespace TiledArray { +namespace TiledArray::device { /// result[i] = result[i] * arg[i] template -void mult_to_cuda_kernel_impl(T *result, const T *arg, std::size_t n, - cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); +void mult_to_kernel_thrust(T *result, const T *arg, std::size_t n, + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); thrust::multiplies mul_op; thrust::transform( - thrust::cuda::par.on(stream), thrust::device_pointer_cast(arg), + thrust_system::par.on(s.stream), thrust::device_pointer_cast(arg),
thrust::device_pointer_cast(arg) + n, thrust::device_pointer_cast(result), thrust::device_pointer_cast(result), mul_op); } /// result[i] = arg1[i] * arg2[i] template -void mult_cuda_kernel_impl(T *result, const T *arg1, const T *arg2, - std::size_t n, cudaStream_t stream, int device_id) { - CudaSafeCall(cudaSetDevice(device_id)); +void mult_kernel_thrust(T *result, const T *arg1, const T *arg2, std::size_t n, + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); thrust::multiplies mul_op; thrust::transform( - thrust::cuda::par.on(stream), thrust::device_pointer_cast(arg1), + thrust_system::par.on(s.stream), thrust::device_pointer_cast(arg1), thrust::device_pointer_cast(arg1) + n, thrust::device_pointer_cast(arg2), thrust::device_pointer_cast(result), mul_op); } -} // namespace TiledArray +} // namespace TiledArray::device -#endif // TILEDARRAY_CUDA_MULT_KERNEL_IMPL_H__INCLUDED +#endif // TILEDARRAY_DEVICE_KERNEL_THRUST_MULT_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/mult_kernel.hip b/src/TiledArray/device/kernel/thrust/mult_kernel.hip new file mode 100644 index 0000000000..f0788eb5e2 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/mult_kernel.hip @@ -0,0 +1 @@ +#include diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.cu b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu new file mode 100644 index 0000000000..08145ef0b4 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.cu @@ -0,0 +1,141 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
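The implementation above is the standard Thrust transform idiom: wrap the raw device pointers, select the stream-bound execution policy, and apply a binary functor. A self-contained CUDA sketch of the same pattern, using a plain `cudaStream_t` instead of the TiledArray `Stream` wrapper:

```cpp
#include <cuda_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/transform.h>

// result[i] *= arg[i], asynchronously on `stream` (cf. mult_to_kernel_thrust).
void mult_to(double* result, const double* arg, std::size_t n,
             cudaStream_t stream) {
  thrust::transform(thrust::cuda::par.on(stream),
                    thrust::device_pointer_cast(arg),
                    thrust::device_pointer_cast(arg) + n,
                    thrust::device_pointer_cast(result),  // second input range
                    thrust::device_pointer_cast(result),  // output range
                    thrust::multiplies<double>{});
}
```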
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * May 8, 2019 + * + */ + +#include +#include + +namespace TiledArray::device { + +// foreach(i) result *= arg[i] +int product_kernel(const int *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, stream); +} + +float product_kernel(const float *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, stream); +} + +double product_kernel(const double *arg, std::size_t n, const Stream& stream){ + + return product_reduce_kernel_thrust(arg, n, stream); +} + +std::complex product_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return product_reduce_kernel_thrust(arg, n, stream); +} + +std::complex product_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + + return product_reduce_kernel_thrust(arg, n, stream); +} + +// foreach(i) result += arg[i] +int sum_kernel(const int *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); +} + +float sum_kernel(const float *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); +} + +double sum_kernel(const double *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); +} + +std::complex sum_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); +} + +std::complex sum_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return sum_reduce_kernel_thrust(arg, n, stream); +} + +// foreach(i) result = max(result, arg[i]) +int max_kernel(const int *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); +} + +float max_kernel(const float *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); +} + +double max_kernel(const double *arg, std::size_t n, const Stream& stream){ + return max_reduce_kernel_thrust(arg, n, stream); +} + +// foreach(i) result = min(result, arg[i]) +int min_kernel(const int *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); +} + +float min_kernel(const float *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); +} + +double min_kernel(const double *arg, std::size_t n, const Stream& stream){ + return min_reduce_kernel_thrust(arg, n, stream); +} + +// foreach(i) result = max(result, abs(arg[i])) +int absmax_kernel(const int *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); +} + +float absmax_kernel(const float *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); +} + +double absmax_kernel(const double *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); +} + +std::complex absmax_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); +} + +std::complex absmax_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmax_reduce_kernel_thrust(arg, n, stream); +} + +// foreach(i) result = min(result, abs(arg[i])) +int absmin_kernel(const int *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); +} + +float absmin_kernel(const float *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); +} + +double 
absmin_kernel(const double *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } + +std::complex absmin_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } + +std::complex absmin_kernel(const std::complex *arg, std::size_t n, const Stream& stream){ + return absmin_reduce_kernel_thrust(arg, n, stream); } + +} // namespace TiledArray::device diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.h b/src/TiledArray/device/kernel/thrust/reduce_kernel.h new file mode 100644 index 0000000000..e5137ffb21 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.h @@ -0,0 +1,138 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Chong Peng + * Department of Chemistry, Virginia Tech + * April 11, 2018 + * + */ + +#ifndef TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED +#define TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace TiledArray::device { + +namespace detail { +template +struct absolute_value + : public thrust::unary_function> { + __host__ __device__ TiledArray::detail::scalar_t operator()( + const T &x) const { + using RT = TiledArray::detail::scalar_t; + if constexpr (!TiledArray::detail::is_complex_v) { + return x < RT(0) ?
-x : x; + } else + return std::sqrt(x.real() * x.real() + x.imag() * x.imag()); + } +}; + +} // namespace detail + +/// T = reduce(T* arg) +template +T reduce_kernel_thrust(ReduceOp &&op, const T *arg, std::size_t n, T init, + const Stream &s) { + DeviceSafeCall(device::setDevice(s.device)); + + auto arg_p = thrust::device_pointer_cast(arg); + + auto result = thrust::reduce(thrust_system::par.on(s.stream), arg_p, + arg_p + n, init, std::forward(op)); + + return result; +} + +template +T product_reduce_kernel_thrust(const T *arg, std::size_t n, + const Stream &stream) { + T init(1); + thrust::multiplies mul_op; + return reduce_kernel_thrust(mul_op, arg, n, init, stream); +} + +template +T sum_reduce_kernel_thrust(const T *arg, std::size_t n, const Stream &stream) { + T init(0); + thrust::plus plus_op; + return reduce_kernel_thrust(plus_op, arg, n, init, stream); +} + +template +T max_reduce_kernel_thrust(const T *arg, std::size_t n, const Stream &stream) { + T init = std::numeric_limits::lowest(); + thrust::maximum max_op; + return reduce_kernel_thrust(max_op, arg, n, init, stream); +} + +template +T min_reduce_kernel_thrust(const T *arg, std::size_t n, const Stream &stream) { + T init = std::numeric_limits::max(); + thrust::minimum min_op; + return reduce_kernel_thrust(min_op, arg, n, init, stream); +} + +template +TiledArray::detail::scalar_t absmax_reduce_kernel_thrust(const T *arg, + std::size_t n, + const Stream &s) { + using TR = TiledArray::detail::scalar_t; + TR init(0); + thrust::maximum max_op; + detail::absolute_value abs_op; + + DeviceSafeCall(device::setDevice(s.device)); + + auto arg_p = thrust::device_pointer_cast(arg); + + auto result = thrust::transform_reduce(thrust_system::par.on(s.stream), arg_p, + arg_p + n, abs_op, init, max_op); + + return result; +} + +template +TiledArray::detail::scalar_t absmin_reduce_kernel_thrust(const T *arg, + std::size_t n, + const Stream &s) { + using TR = TiledArray::detail::scalar_t; + TR init = std::numeric_limits::max(); + thrust::minimum min_op; + detail::absolute_value abs_op; + + DeviceSafeCall(device::setDevice(s.device)); + + auto arg_p = thrust::device_pointer_cast(arg); + + auto result = thrust::transform_reduce(thrust_system::par.on(s.stream), arg_p, + arg_p + n, abs_op, init, min_op); + return result; +} + +} // namespace TiledArray::device + +#endif // TILEDARRAY_DEVICE_THRUST_REDUCE_KERNEL_H__INCLUDED diff --git a/src/TiledArray/device/kernel/thrust/reduce_kernel.hip b/src/TiledArray/device/kernel/thrust/reduce_kernel.hip new file mode 100644 index 0000000000..5be5002c84 --- /dev/null +++ b/src/TiledArray/device/kernel/thrust/reduce_kernel.hip @@ -0,0 +1 @@ +#include diff --git a/src/TiledArray/cuda/platform.h b/src/TiledArray/device/platform.h similarity index 87% rename from src/TiledArray/cuda/platform.h rename to src/TiledArray/device/platform.h index f94226b39e..d30a204fb4 100644 --- a/src/TiledArray/cuda/platform.h +++ b/src/TiledArray/device/platform.h @@ -21,8 +21,8 @@ * */ -#ifndef TILEDARRAY_CUDA_PLATFORM_H__INCLUDED -#define TILEDARRAY_CUDA_PLATFORM_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED +#define TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED namespace TiledArray { @@ -31,9 +31,9 @@ enum class MemorySpace { // MemorySpace is represented as a bitfield to compute unions and // intersections easier Null = 0b00, - CPU = 0b01, - CUDA = 0b10, - CUDA_UM = CPU | CUDA // union of CPU and CUDA spaces + Host = 0b01, + Device = 0b10, + Device_UM = Host | Device // union of host and device spaces }; // customization 
point: in_memory_space(O) -> bool @@ -55,11 +55,11 @@ constexpr bool overlap(MemorySpace space1, MemorySpace space2) { } /// enumerates the execution spaces -enum class ExecutionSpace { CPU, CUDA }; +enum class ExecutionSpace { Host, Device }; // customization point: to_execution_space(O) -> void // "moves" O to execution space S } // namespace TiledArray -#endif // TILEDARRAY_CUDA_PLATFORM_H__INCLUDED +#endif // TILEDARRAY_DEVICE_PLATFORM_H__INCLUDED diff --git a/src/TiledArray/cuda/thrust.h b/src/TiledArray/device/thrust.h similarity index 70% rename from src/TiledArray/cuda/thrust.h rename to src/TiledArray/device/thrust.h index fe9d02c529..2de7a5b8bb 100644 --- a/src/TiledArray/cuda/thrust.h +++ b/src/TiledArray/device/thrust.h @@ -21,14 +21,26 @@ * */ -#ifndef TILEDARRAY_CUDA_THRUST_H__INCLUDED -#define TILEDARRAY_CUDA_THRUST_H__INCLUDED +#ifndef TILEDARRAY_DEVICE_THRUST_H__INCLUDED +#define TILEDARRAY_DEVICE_THRUST_H__INCLUDED #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE +#ifdef TILEDARRAY_HAS_CUDA #include +#endif + +// rocthrust headers rely on THRUST_DEVICE_SYSTEM being defined, which is only +// defined by the HIP-specific compilers; to make these headers usable with a +// host compiler, define it here explicitly +#ifdef TILEDARRAY_HAS_HIP +#ifndef THRUST_DEVICE_SYSTEM +#define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP +#endif +#endif + #include #include @@ -38,10 +50,12 @@ namespace thrust { // thrust::device_malloc_allocator name changed to device_allocator after // version 10 +#ifdef TILEDARRAY_HAS_CUDA #if CUDART_VERSION < 10000 template using device_allocator = thrust::device_malloc_allocator; #endif +#endif // TILEDARRAY_HAS_CUDA template const T* data(const thrust::device_vector& dev_vec) { @@ -57,6 +71,16 @@ template void resize(thrust::device_vector& dev_vec, size_t size); } // namespace thrust -#endif // TILEDARRAY_HAS_CUDA +namespace TiledArray::device { + +#ifdef TILEDARRAY_HAS_CUDA +namespace thrust_system = thrust::cuda; +#elif TILEDARRAY_HAS_HIP +namespace thrust_system = thrust::hip; +#endif + +} // namespace TiledArray::device + +#endif // TILEDARRAY_HAS_DEVICE -#endif // TILEDARRAY_CUDA_THRUST_H__INCLUDED +#endif // TILEDARRAY_DEVICE_THRUST_H__INCLUDED diff --git a/src/TiledArray/cuda/um_storage.cu b/src/TiledArray/device/um_storage.cu similarity index 66% rename from src/TiledArray/cuda/um_storage.cu rename to src/TiledArray/device/um_storage.cu index 3462f7d7c1..8879c246f8 100644 --- a/src/TiledArray/cuda/um_storage.cu +++ b/src/TiledArray/device/um_storage.cu @@ -22,29 +22,29 @@ */ -#include -#include +#include +#include #ifdef TILEDARRAY_HAS_CUDA namespace thrust { template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } template<> -void resize>( - thrust::device_vector>& dev_vec, +void resize>( + thrust::device_vector>& dev_vec, size_t size) { dev_vec.resize(size); } } namespace thrust { -template class device_vector>; -template class device_vector>; +template class device_vector>; +template class device_vector>; } #endif //TILEDARRAY_HAS_CUDA diff --git a/src/TiledArray/device/um_storage.h b/src/TiledArray/device/um_storage.h new file mode 100644 index 0000000000..d91c032312 --- /dev/null +++ b/src/TiledArray/device/um_storage.h @@ -0,0 +1,122 @@ +/* + * This file is a part of TiledArray.
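Since `MemorySpace` is a bitfield, the renamed enumerators keep the key invariant that unified memory belongs to both spaces; a few compile-time checks make the intended algebra explicit (a sketch, assuming only the enumerator values shown in `platform.h` above):

```cpp
// Sketch of the bitfield semantics of the renamed MemorySpace enumerators.
using TiledArray::MemorySpace;

static_assert((int(MemorySpace::Device_UM) & int(MemorySpace::Host)) != 0,
              "unified memory is addressable from the host");
static_assert((int(MemorySpace::Device_UM) & int(MemorySpace::Device)) != 0,
              "unified memory is addressable from the device");
static_assert((int(MemorySpace::Host) & int(MemorySpace::Device)) == 0,
              "plain host and device spaces do not overlap");
```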
+ * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * Feb 6, 2018 + * + */ + +#ifndef TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED +#define TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED + +#include + +#ifdef TILEDARRAY_HAS_DEVICE + +#include +#include + +#include +#include + +#include + +namespace TiledArray { + +/// @return true if @c vec is present in space @c Space +template +bool in_memory_space(const Storage& vec) noexcept { + return overlap(MemorySpace::Device_UM, Space); +} + +/** + * @tparam Space + * @tparam Storage the Storage type of the vector, such as + * device_um_btas_varray + */ +template +void to_execution_space(Storage& vec, const device::Stream& s) { + switch (Space) { + case ExecutionSpace::Host: { + using std::data; + using std::size; + using value_type = typename Storage::value_type; + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(device::memPrefetchAsync(data(vec), + size(vec) * sizeof(value_type), + device::CpuDeviceId, s.stream)); + } + break; + } + case ExecutionSpace::Device: { + using std::data; + using std::size; + using value_type = typename Storage::value_type; + if (deviceEnv::instance()->concurrent_managed_access()) { + DeviceSafeCall(device::memPrefetchAsync( + data(vec), size(vec) * sizeof(value_type), s.device, s.stream)); + } + break; + } + default: + throw std::runtime_error("invalid execution space"); + } +} + +/** + * create UM storage and prefetch it to device + * + * @param storage UM Storage type object + * @param n size of um storage object + * @param s device stream used to perform prefetch + */ +template +void make_device_storage(Storage& storage, std::size_t n, + const device::Stream& s) { + storage = Storage(n); + TiledArray::to_execution_space(storage, + s); +} + +/** + * return the device pointer for UM storage object + * + * @param storage UM Storage type object + * @return data pointer of UM Storage object + */ +template +typename Storage::value_type* device_data(Storage& storage) { + return storage.data(); +} + +/** + * return the const pointer for UM storage object + * + * @param storage UM Storage type object + * @return const data pointer of UM Storage object + */ +template +const typename Storage::value_type* device_data(const Storage& storage) { + return storage.data(); +} + +} // namespace TiledArray + +#endif // TILEDARRAY_HAS_DEVICE + +#endif // TILEDARRAY_DEVICE_UM_VECTOR_H__INCLUDED diff --git a/src/TiledArray/dist_array.h b/src/TiledArray/dist_array.h index ea6a066441..cb9d094f34 100644 --- a/src/TiledArray/dist_array.h +++ b/src/TiledArray/dist_array.h @@ -136,73 +136,33 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::is_same_v, Future> || std::is_same_v, value_type>; + /// compute type of DistArray with different Policy and/or Tile + template + using rebind_t = DistArray; + private: - pimpl_type
pimpl_; ///< managed ptr to Array implementation + template + struct rebind_numeric; + template + struct rebind_numeric< + Numeric, std::enable_if_t>> { + using type = + DistArray, Policy>; + }; + + public: + /// compute type of DistArray with Tile's rebound numeric type + /// @note this is SFINAE-disabled if `Tile::rebind_numeric_t` is not + /// defined + template + using rebind_numeric_t = typename rebind_numeric::type; + + private: + pimpl_type pimpl_ = {}; ///< managed ptr to Array implementation bool defer_deleter_to_next_fence_ = false; ///< if true, the impl object is scheduled to be destroyed in the ///< next fence - static madness::AtomicInt cleanup_counter_; - - /// Array deleter function - - /// This function schedules a task for lazy cleanup. Array objects are - /// deleted only after the object has been deleted in all processes. - /// \param pimpl The implementation pointer to be deleted. - static void lazy_deleter(const impl_type* const pimpl) { - if (pimpl) { - if (madness::initialized()) { - World& world = pimpl->world(); - const madness::uniqueidT id = pimpl->id(); - cleanup_counter_++; - - // wait for all DelayedSet's to vanish - world.await([&]() { return (pimpl->num_live_ds() == 0); }, true); - - try { - world.gop.lazy_sync(id, [pimpl]() { - delete pimpl; - DistArray::cleanup_counter_--; - }); - } catch (madness::MadnessException& e) { - fprintf(stderr, - "!! ERROR TiledArray: madness::MadnessException thrown in " - "Array::lazy_deleter().\n" - "%s\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - e.what(), world.rank()); - - cleanup_counter_--; - delete pimpl; - } catch (std::exception& e) { - fprintf(stderr, - "!! ERROR TiledArray: std::exception thrown in " - "Array::lazy_deleter().\n" - "%s\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - e.what(), world.rank()); - - cleanup_counter_--; - delete pimpl; - } catch (...) { - fprintf(stderr, - "!! ERROR TiledArray: An unknown exception was thrown in " - "Array::lazy_deleter().\n" - "!! ERROR TiledArray: The exception has been absorbed.\n" - "!! ERROR TiledArray: rank=%i\n", - world.rank()); - - cleanup_counter_--; - delete pimpl; - } - } else { - delete pimpl; - } - } - } - /// Sparse array initialization /// \param world The world where the array will live. 
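What the new rebinding aliases buy, in one sketch: deriving the real-valued counterpart of a complex-valued array type without respelling the tile. The concrete `Tensor`/`DensePolicy` names are used for illustration, and, per the SFINAE note above, the alias assumes the tile exposes `rebind_numeric_t`:

```cpp
// Sketch: rebind the numeric type of an array type (cf. rebind_numeric_t).
#include <complex>
#include <type_traits>

using ComplexArray =
    TiledArray::DistArray<TiledArray::Tensor<std::complex<double>>,
                          TiledArray::DensePolicy>;
using RealArray = ComplexArray::rebind_numeric_t<double>;  // same Policy

static_assert(std::is_same_v<
    RealArray, TiledArray::DistArray<TiledArray::Tensor<double>,
                                     TiledArray::DensePolicy>>);
```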
@@ -218,34 +178,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { if (!pmap) { // Construct a default process map pmap = Policy::default_pmap(world, trange.tiles_range().volume()); - } else { - // Validate the process map - TA_ASSERT(pmap->size() == trange.tiles_range().volume() && - "TiledArray::DistArray::DistArray() -- The size of the process " - "map is not " - "equal to the number of tiles in the TiledRange object."); - TA_ASSERT(pmap->rank() == - typename pmap_interface::size_type(world.rank()) && - "TiledArray::DistArray::DistArray() -- The rank of the process " - "map is not equal to that " - "of the world object."); - TA_ASSERT(pmap->procs() == - typename pmap_interface::size_type(world.size()) && - "TiledArray::DistArray::DistArray() -- The number of processes " - "in the process map is not " - "equal to that of the world object."); } - // Validate the shape - TA_ASSERT( - !shape.empty() && - "TiledArray::DistArray::DistArray() -- The shape is not initialized."); - TA_ASSERT(shape.validate(trange.tiles_range()) && - "TiledArray::DistArray::DistArray() -- The range of the shape is " - "not equal to " - "the tiles range."); - - return pimpl_type(new impl_type(world, trange, shape, pmap), lazy_deleter); + return pimpl_type(new impl_type(world, trange, shape, pmap), + impl_type::lazy_deleter); } public: @@ -256,7 +192,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// array is uninitialized, but these arrays may be assigned via a tensor /// expression assignment or the copy construction. - DistArray() : pimpl_() {} + DistArray() = default; /// Copy constructor @@ -277,6 +213,19 @@ class DistArray : public madness::archive::ParallelSerializableObject { const std::shared_ptr& pmap = {}) : pimpl_(init(world, trange, shape_type(1, trange), pmap)) {} + /// Dense array constructor + + /// Constructs an array with the given meta data in the default World. + /// This constructor only initializes the array meta data; + /// the array tiles are empty and must be assigned by the user. + /// \param trange The tiled range object that will be used to set the array + /// tiling. + /// \param pmap The tile index -> process map + explicit DistArray(const trange_type& trange, + const std::shared_ptr& pmap = {}) + : pimpl_(init(get_default_world(), trange, shape_type(1, trange), pmap)) { + } + /// Sparse array constructor /// Constructs an array with the given meta data. This constructor only @@ -291,6 +240,19 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::shared_ptr()) : pimpl_(init(world, trange, shape, pmap)) {} + /// Sparse array constructor + + /// Constructs an array with the given meta data in the default World. + /// This constructor only initializes the array meta data; the array tiles + /// are empty and must be assigned by the user. + /// \param trange The tiled range object that will be used to set the array + /// tiling. \param shape The array shape that defines zero and non-zero tiles + /// \param pmap The tile index -> process map + DistArray(const trange_type& trange, const shape_type& shape, + const std::shared_ptr& pmap = + std::shared_ptr()) + : pimpl_(init(get_default_world(), trange, shape, pmap)) {} + /// \name Initializer list constructors /// \brief Creates a new tensor containing the elements in the provided /// `std::initializer_list`.
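The practical effect of the new default-World overloads is metadata-only construction without threading a `World` argument through. A hedged sketch, assuming an initialized runtime and TiledArray's conventional `TArrayD` alias for `DistArray<Tensor<double>, DensePolicy>`:

```cpp
// Sketch: the new constructors fall back to get_default_world().
TiledArray::TiledRange trange{{0, 10, 20}, {0, 10, 20}};

TiledArray::TArrayD a(trange);  // dense array living in the default World
a.fill_local(0.0);              // tiles stay unset until the user assigns them
```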
@@ -353,6 +315,41 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::initializer_list>>>>> il) : DistArray(array_from_il(world, il)) {} + + template + explicit DistArray(std::initializer_list il) // N.B. clang does not like + // detail::vector_il here + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray(std::initializer_list> il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list>> il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray(std::initializer_list>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list>>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} + + template + explicit DistArray( + std::initializer_list< + std::initializer_list>>>>> + il) + : DistArray(array_from_il(get_default_world(), il)) {} ///@} /// \name Tiling initializer list constructors @@ -419,8 +416,57 @@ class DistArray : public madness::archive::ParallelSerializableObject { std::initializer_list>>>>> il) : DistArray(array_from_il(world, trange, il)) {} + + template + DistArray(const trange_type& trange, std::initializer_list il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list> il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray( + const trange_type& trange, + std::initializer_list>> il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray(const trange_type& trange, + std::initializer_list>>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} + + template + DistArray( + const trange_type& trange, + std::initializer_list< + std::initializer_list>>>>> + il) + : DistArray(array_from_il(get_default_world(), trange, il)) {} /// @} + /// "copy" constructor that replaces the TiledRange + + /// This constructor remaps the data of \p other according to \p new_trange , + /// with \p new_value_fill used to fill the new elements, if any + DistArray(const DistArray& other, const trange_type& new_trange, + element_type new_value_fill = element_type{}) + : pimpl_( + make_with_new_trange(other.pimpl(), new_trange, new_value_fill)) { + this->truncate(); + } + /// converting copy constructor /// This constructor uses the meta data of `other` to initialize the meta @@ -428,7 +474,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// initialized using TiledArray::Cast /// \param other The array to be copied template > - explicit DistArray(const DistArray& other) : pimpl_() { + DistArray(const DistArray& other) : pimpl_() { *this = foreach(other, [](Tile& result, const OtherTile& source) { result = TiledArray::Cast{}(source); }); @@ -514,7 +560,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// Checks if this is a unique handle to the implementation object /// \return true if this is a unique handle to the implementation object - bool is_unique() const { return pimpl_.unique(); } + bool is_unique() const { return pimpl_.use_count() == 1; } /// Wait for lazy tile cleanup @@ -527,10 +573,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \throw 
madness::MadnessException When timeout has been exceeded. static void wait_for_lazy_cleanup(World& world, const double = 60.0) { try { - world.await([&]() { return (cleanup_counter_ == 0); }, true); + world.await([&]() { return (impl_type::cleanup_counter_ == 0); }, true); } catch (...) { printf("%i: Array lazy cleanup timeout with %i pending cleanup(s)\n", - world.rank(), int(cleanup_counter_)); + world.rank(), int(impl_type::cleanup_counter_)); throw; } } @@ -860,23 +906,29 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill_local(const element_type& value = element_type(), - bool skip_set = false) { - init_tiles( + template + std::int64_t fill_local(const element_type& value = element_type(), + bool skip_set = false) { + return init_tiles( [value](const range_type& range) { return value_type(range, value); }, skip_set); } /// Fill all local tiles with the specified value + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \param[in] value What each local tile should be filled with. /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is uninitialized. Strong throw /// guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already set. Weak throw guarantee. - void fill(const element_type& value = numeric_type(), bool skip_set = false) { - fill_local(value, skip_set); + template + std::int64_t fill(const element_type& value = numeric_type(), + bool skip_set = false) { + return fill_local(value, skip_set); } /// Fill all local tiles with random values @@ -888,18 +940,21 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// generate random values of type T this function will be disabled via SFINAE /// and attempting to use it will lead to a compile-time error. /// + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam T The type of random value to generate. Defaults to /// element_type. /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarantee. /// \throw TiledArray::Exception if skip_set is false and a local tile is /// already initialized. Weak throw guarantee. template > - void fill_random(bool skip_set = false) { - init_elements( + std::int64_t fill_random(bool skip_set = false) { + return init_elements( [](const auto&) { return detail::MakeRandom::generate_value(); }); } @@ -932,6 +987,8 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// return tile; /// }); /// \endcode + /// \tparam fence If Fence::No, the operation will return early, + /// before the tasks have completed /// \tparam Op The type of the functor/function /// \param[in] op The operation used to generate tiles /// \param[in] skip_set If false, will throw if any tiles are already set @@ -939,36 +996,11 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// guarantee. /// \throw TiledArray::Exception if a tile is already set and skip_set is /// false. Weak throw guarantee. 
- template - void init_tiles(Op&& op, bool skip_set = false) { - // lifetime management of op depends on whether it is a lvalue ref (i.e. has - // an external owner) or an rvalue ref - // - if op is an lvalue ref: pass op to tasks - // - if op is an rvalue ref pass make_shared_function(op) to tasks - auto op_shared_handle = make_op_shared_handle(std::forward(op)); - - auto it = impl_ref().pmap()->begin(); - const auto end = pimpl_->pmap()->end(); - for (; it != end; ++it) { - const auto& index = *it; - if (!pimpl_->is_zero(index)) { - if (skip_set) { - auto fut = find_local(index); - if (fut.probe()) continue; - } - if constexpr (Exec == HostExecutor::MADWorld) { - Future tile = pimpl_->world().taskq.add( - [pimpl = pimpl_, index = ordinal_type(index), - op_shared_handle]() -> value_type { - return op_shared_handle(pimpl->trange().make_tile_range(index)); - }); - set(index, std::move(tile)); - } else { - static_assert(Exec == HostExecutor::Thread); - set(index, op_shared_handle(trange().make_tile_range(index))); - } - } - } + template + std::int64_t init_tiles(Op&& op, bool skip_set = false) { + return impl_ref().template init_tiles(std::forward(op), + skip_set); } /// Initialize elements of local, non-zero tiles with a user provided functor @@ -990,15 +1022,17 @@ class DistArray : public madness::archive::ParallelSerializableObject { /// \tparam Op Type of the function/functor which will generate the elements. /// \param[in] op The operation used to generate elements /// \param[in] skip_set If false, will throw if any tiles are already set + /// \return the total number of tiles that have been (or will be) initialized /// \throw TiledArray::Exception if the PIMPL is not initialized. Strong /// throw guarantee. /// \throw TiledArray::Exception if skip_set is false and a local, non-zero /// tile is already initialized. Weak throw /// guarantee. - template - void init_elements(Op&& op, bool skip_set = false) { + template + std::int64_t init_elements(Op&& op, bool skip_set = false) { auto op_shared_handle = make_op_shared_handle(std::forward(op)); - init_tiles( + return init_tiles( [op = std::move(op_shared_handle)]( const TiledArray::Range& range) -> value_type { // Initialize the tile with the given range object @@ -1104,6 +1138,32 @@ class DistArray : public madness::archive::ParallelSerializableObject { return TiledArray::expressions::TsrExpr(*this, vars); } + /// Create a tensor expression from an annotation (possibly free of + /// inner-tensor sub-annotation). + + /// \brief This method creates a tensor expression but does not require the + /// annotation to be bipartite (outer and inner tensor annotations). + /// \param vars A string with a comma-separated list of variables. + /// \note Only use for unary evaluations when the indexing of the inner + /// tensors is not significant, e.g., norm computation. + /// + auto make_tsrexpr(const std::string& vars) { + return TiledArray::expressions::TsrExpr(*this, vars); + } + + /// Create a tensor expression from an annotation (possibly free of + /// inner-tensor sub-annotation). + + /// \brief This method creates a tensor expression but does not require the + /// annotation to be bipartite (outer and inner tensor annotations). + /// \param vars A string with a comma-separated list of variables. + /// \note Only use for unary evaluations when the indexing of the inner + /// tensors is not significant, e.g., norm computation.
+ /// + auto make_tsrexpr(const std::string& vars) const { + return TiledArray::expressions::TsrExpr(*this, vars); + } + /// \deprecated use DistArray::world() [[deprecated]] World& get_world() const { return world(); } @@ -1339,7 +1399,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { shape() & typeid(pmap().get()).hash_code(); int64_t count = 0; for (auto it = begin(); it != end(); ++it) ++count; - ar& count; + ar & count; for (auto it = begin(); it != end(); ++it) ar & it->get(); } @@ -1356,14 +1416,14 @@ class DistArray : public madness::archive::ParallelSerializableObject { auto& world = TiledArray::get_default_world(); std::size_t typeid_hash = 0l; - ar& typeid_hash; + ar & typeid_hash; if (typeid_hash != typeid(*this).hash_code()) TA_EXCEPTION( "DistArray::serialize: source DistArray type != this DistArray type"); ProcessID world_size = -1; ProcessID world_rank = -1; - ar& world_size& world_rank; + ar & world_size & world_rank; if (world_size != world.size() || world_rank != world.rank()) TA_EXCEPTION( "DistArray::serialize: source DistArray world != this DistArray " @@ -1371,13 +1431,13 @@ class DistArray : public madness::archive::ParallelSerializableObject { trange_type trange; shape_type shape; - ar& trange& shape; + ar & trange & shape; // use default pmap, ensure it's the same pmap used to serialize auto volume = trange.tiles_range().volume(); auto pmap = detail::policy_t::default_pmap(world, volume); size_t pmap_hash_code = 0; - ar& pmap_hash_code; + ar & pmap_hash_code; if (pmap_hash_code != typeid(pmap.get()).hash_code()) TA_EXCEPTION( "DistArray::serialize: source DistArray pmap != this DistArray pmap"); @@ -1385,10 +1445,10 @@ class DistArray : public madness::archive::ParallelSerializableObject { new impl_type(world, std::move(trange), std::move(shape), pmap)); int64_t count = 0; - ar& count; + ar & count; for (auto it = begin(); it != end(); ++it, --count) { Tile tile; - ar& tile; + ar & tile; this->set(it.ordinal(), std::move(tile)); } if (count != 0) @@ -1421,27 +1481,27 @@ class DistArray : public madness::archive::ParallelSerializableObject { // make sure source data matches the expected type // TODO would be nice to be able to convert the data upon reading std::size_t typeid_hash = 0l; - localar& typeid_hash; + localar & typeid_hash; if (typeid_hash != typeid(*this).hash_code()) TA_EXCEPTION( "DistArray::load: source DistArray type != this DistArray type"); // make sure same number of clients for every I/O node int num_io_clients = 0; - localar& num_io_clients; + localar & num_io_clients; if (num_io_clients != ar.num_io_clients()) TA_EXCEPTION("DistArray::load: invalid parallel archive"); trange_type trange; shape_type shape; - localar& trange& shape; + localar & trange & shape; // send trange and shape to every client for (ProcessID p = 0; p < world.size(); ++p) { if (p != me && ar.io_node(p) == me) { world.mpi.Send(int(1), p, tag); // Tell client to expect the data madness::archive::MPIOutputArchive dest(world, p); - dest& trange& shape; + dest & trange & shape; dest.flush(); } } @@ -1453,13 +1513,13 @@ class DistArray : public madness::archive::ParallelSerializableObject { new impl_type(world, std::move(trange), std::move(shape), pmap)); int64_t count = 0; - localar& count; + localar & count; for (size_t ord = 0; ord != volume; ++ord) { if (!is_zero(ord)) { auto owner_rank = pmap->owner(ord); if (ar.io_node(owner_rank) == me) { Tile tile; - localar& tile; + localar & tile; this->set(ord, std::move(tile)); --count; } @@ -1478,7 +1538,7 
@@ class DistArray : public madness::archive::ParallelSerializableObject { world.mpi.Recv(flag, p, tag); TA_ASSERT(flag == 1); madness::archive::MPIInputArchive source(world, p); - source& trange& shape; + source & trange & shape; // use default pmap auto volume = trange.tiles_range().volume(); @@ -1523,7 +1583,7 @@ class DistArray : public madness::archive::ParallelSerializableObject { } } } - localar& count; + localar & count; for (size_t ord = 0; ord != volume; ++ord) { if (!is_zero(ord)) { auto owner_rank = pmap()->owner(ord); @@ -1681,9 +1741,6 @@ class DistArray : public madness::archive::ParallelSerializableObject { }; // class DistArray -template -madness::AtomicInt DistArray::cleanup_counter_; - #ifndef TILEDARRAY_HEADER_ONLY extern template class DistArray, DensePolicy>; @@ -1737,42 +1794,83 @@ auto rank(const DistArray& a) { return a.trange().tiles_range().rank(); } +/// Checks if for every tile `i` its range matches the tile range produced by +/// `a.trange()` + +/// @return `a.get(i)->range() == a.trange().make_tile_range(i)` for every tile +/// `i` template -size_t volume(const DistArray& a) { - // this is the number of tiles - if (a.size() > 0) // assuming dense shape - return a.trange().elements_range().volume(); - return 0; +bool tile_ranges_match_trange(const DistArray& a) { + auto end = a.end(); + for (auto it = a.begin(); it != end; ++it) { + if (it->is_local() && !a.is_zero(it.index())) + if ((*it).get().range() != a.trange().make_tile_range(it.index())) + return false; + } + return true; +} + +/// +/// \brief Get the total elements in the non-zero tiles of an array. +/// For tensor-of-tensor tiles, the total is the sum of the number of +/// elements in the inner tensors of non-zero tiles. +/// +template +size_t volume(const DistArray& array) { + std::atomic vol = 0; + + auto local_vol = [&vol](Tile const& in_tile) { + if constexpr (detail::is_tensor_of_tensor_v) { + auto reduce_op = [](size_t& MADNESS_RESTRICT result, auto&& arg) { + result += arg->total_size(); + }; + auto join_op = [](auto& MADNESS_RESTRICT result, size_t count) { + result += count; + }; + vol += in_tile.reduce(reduce_op, join_op, size_t{0}); + } else + vol += in_tile.total_size(); + }; + + for (auto&& local_tile_future : array) + array.world().taskq.add(local_vol, local_tile_future.get()); + + array.world().gop.fence(); + + size_t vol_ = vol; + array.world().gop.sum(&vol_, 1); + + return vol_; } template auto abs_min(const DistArray& a) { - return a(detail::dummy_annotation(rank(a))).abs_min(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).abs_min(); } template auto abs_max(const DistArray& a) { - return a(detail::dummy_annotation(rank(a))).abs_max(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).abs_max(); } template auto dot(const DistArray& a, const DistArray& b) { - return (a(detail::dummy_annotation(rank(a))) - .dot(b(detail::dummy_annotation(rank(b))))) - .get(); + auto&& expr_a = a.make_tsrexpr(detail::dummy_annotation(rank(a))); + auto&& expr_b = b.make_tsrexpr(detail::dummy_annotation(rank(b))); + return expr_a.dot(expr_b).get(); } template auto inner_product(const DistArray& a, const DistArray& b) { - return (a(detail::dummy_annotation(rank(a))) - .inner_product(b(detail::dummy_annotation(rank(b))))) - .get(); + auto&& expr_a = a.make_tsrexpr(detail::dummy_annotation(rank(a))); + auto&& expr_b = b.make_tsrexpr(detail::dummy_annotation(rank(b))); + return expr_a.inner_product(expr_b).get(); } template auto squared_norm(const DistArray& a) { - return 
a(detail::dummy_annotation(rank(a))).squared_norm(); + return a.make_tsrexpr(detail::dummy_annotation(rank(a))).squared_norm(); } template @@ -1832,12 +1930,28 @@ DistArray replicated(const DistArray& a) { // Put the replicator pointer in the deferred cleanup object so it will // be deleted at the end of the next fence. - TA_ASSERT(replicator.unique()); // Required for deferred_cleanup + TA_ASSERT(replicator.use_count() == 1); // Required for deferred_cleanup madness::detail::deferred_cleanup(world, replicator); return result; } +namespace detail { + +template +struct real_t_impl> { + using type = typename DistArray::template rebind_numeric_t< + typename Tile::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename DistArray::template rebind_numeric_t< + std::complex>; +}; + +} // namespace detail + } // namespace TiledArray // serialization @@ -1866,13 +1980,13 @@ template void save(const TiledArray::DistArray& x, const std::string name) { archive::ParallelOutputArchive<> ar2(x.world(), name.c_str(), 1); - ar2& x; + ar2 & x; } template void load(TiledArray::DistArray& x, const std::string name) { archive::ParallelInputArchive<> ar2(x.world(), name.c_str(), 1); - ar2& x; + ar2 & x; } } // namespace madness diff --git a/src/TiledArray/dist_eval/array_eval.h b/src/TiledArray/dist_eval/array_eval.h index c9f3daf195..2eaad01a9b 100644 --- a/src/TiledArray/dist_eval/array_eval.h +++ b/src/TiledArray/dist_eval/array_eval.h @@ -59,7 +59,7 @@ class LazyArrayTile { (!Op::is_consumable) && consume_ ? op_->consume(tile_) : (*op_)(tile_))); ///< conversion_type -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE // TODO need a better design on how to manage the lifetime of converted Tile mutable conversion_result_type conversion_tile_; #endif @@ -69,7 +69,7 @@ class LazyArrayTile { : tile_(), op_(), consume_(false) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -83,7 +83,7 @@ class LazyArrayTile { : tile_(other.tile_), op_(other.op_), consume_(other.consume_) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -100,7 +100,7 @@ class LazyArrayTile { : tile_(tile), op_(op), consume_(consume) -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE , conversion_tile_() #endif @@ -114,7 +114,7 @@ class LazyArrayTile { tile_ = other.tile_; op_ = other.op_; consume_ = other.consume_; -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE conversion_tile_ = other.conversion_tile_; #endif return *this; @@ -126,7 +126,7 @@ class LazyArrayTile { bool is_consumable() const { return consume_ || op_->permutation(); } /// Convert tile to evaluation type using the op object -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE explicit operator conversion_result_type&() const { conversion_tile_ = @@ -198,6 +198,26 @@ class ArrayEvalImpl std::shared_ptr op_; ///< The tile operation BlockRange block_range_; ///< Sub-block range +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // tracing artifacts + using pending_counter_t = std::atomic[]; // 1 counter per rank + mutable std::shared_ptr + ntiles_pending_; // number of pending tiles from each rank + mutable std::shared_ptr + ntasks_pending_; // number of pending tasks using data from each rank + + struct AtomicCounterDecreaser : public madness::CallbackInterface { + std::shared_ptr> counter; + + AtomicCounterDecreaser(std::shared_ptr> counter) + : counter(std::move(counter)) {} + void notify() override { + --(*counter); + delete this; + } + }; +#endif 
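+  // AtomicCounterDecreaser (defined above) is a one-shot callback: it is
+  // heap-allocated, registered on a Future, and its notify() decrements the
+  // target counter and then deletes the callback itself; the aliasing
+  // std::shared_ptr constructor lets it point at a single element of the
+  // per-rank counter array while sharing ownership of the whole array.
+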
+ public: /// Construct with full array range @@ -208,16 +228,39 @@ class ArrayEvalImpl /// \param pmap The process map for the result tensor tiles /// \param perm The permutation that is applied to the tile coordinate index /// \param op The operation that will be used to evaluate the tiles of array - template >> + template >>> ArrayEvalImpl(const array_type& array, World& world, const trange_type& trange, const shape_type& shape, - const std::shared_ptr& pmap, - const Perm& perm, const op_type& op) - : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), + const std::shared_ptr& pmap, Perm&& perm, + const op_type& op) + : DistEvalImpl_(world, trange, shape, pmap, + outer(std::forward(perm))), array_(array), op_(std::make_shared(op)), - block_range_() {} + block_range_() +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ntiles_pending_(new std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#if 0 + std::stringstream ss; + ss << "ArrayEvalImpl: id=" << this->id(); + if (array_) ss << " array.id()=" << array_.id(); + ss << "\n"; + std::cout << ss.str(); +#endif + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Constructor with sub-block range @@ -232,25 +275,59 @@ class ArrayEvalImpl /// \param op The operation that will be used to evaluate the tiles of array /// \param lower_bound The sub-block lower bound /// \param upper_bound The sub-block upper bound - template && - TiledArray::detail::is_integral_range_v && - TiledArray::detail::is_permutation_v>> + template < + typename Index1, typename Index2, typename Perm, + typename = std::enable_if_t< + TiledArray::detail::is_integral_range_v && + TiledArray::detail::is_integral_range_v && + TiledArray::detail::is_permutation_v>>> ArrayEvalImpl(const array_type& array, World& world, const trange_type& trange, const shape_type& shape, - const std::shared_ptr& pmap, - const Perm& perm, const op_type& op, const Index1& lower_bound, + const std::shared_ptr& pmap, Perm&& perm, + const op_type& op, const Index1& lower_bound, const Index2& upper_bound) - : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), + : DistEvalImpl_(world, trange, shape, pmap, + outer(std::forward(perm))), array_(array), op_(std::make_shared(op)), - block_range_(array.trange().tiles_range(), lower_bound, upper_bound) {} + block_range_(array.trange().tiles_range(), lower_bound, upper_bound) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ntiles_pending_(new std::atomic[world.size()]), + ntasks_pending_(new std::atomic[world.size()]) +#endif + { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ntiles_pending_[rank] = 0; + ntasks_pending_[rank] = 0; + } +#endif + } /// Virtual destructor - virtual ~ArrayEvalImpl() {} + virtual ~ArrayEvalImpl() { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + if (std::find_if(ntiles_pending_.get(), + ntiles_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntiles_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tiles at destruction! (id=", this->id(), ")"); + abort(); + } + if (std::find_if(ntasks_pending_.get(), + ntasks_pending_.get() + this->world().size(), + [](const auto& v) { return v != 0; }) != + ntasks_pending_.get() + this->world().size()) { + madness::print_error( + "ArrayEvalImpl: pending tasks at destruction! 
(id=", this->id(), ")"); + abort(); + } +#endif + } - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { // Get the array index that corresponds to the target index auto array_index = DistEvalImpl_::perm_index_to_source(i); @@ -258,19 +335,49 @@ class ArrayEvalImpl // index to the correct location. if (block_range_.rank()) array_index = block_range_.ordinal(array_index); - // Get the tile from array_, which may be located on a remote node. - Future tile = array_.find(array_index); - - const bool consumable_tile = !array_.is_local(array_index); + const bool arg_tile_is_remote = !array_.is_local(array_index); + const ProcessID arg_tile_owner = array_.owner(array_index); - return eval_tile(tile, consumable_tile); + Future result; + bool task_created = false; + if (arg_tile_is_remote) { + TA_ASSERT(arg_tile_owner != array_.world().rank()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntiles_pending_[arg_tile_owner]++; +#endif + auto arg_tile = array_.find(array_index); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + arg_tile.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntiles_pending_, ntiles_pending_.get() + arg_tile_owner))); +#endif + std::tie(result, task_created) = + eval_tile(arg_tile, /* consumable_tile = */ true +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } else { + TA_ASSERT(arg_tile_owner == array_.world().rank()); + std::tie(result, task_created) = eval_tile(array_.find_local(array_index), + /* consumable_tile = */ false +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + arg_tile_owner +#endif + ); + } +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(ntiles_pending_[this->world().rank()] == 0); + // even if data is local we may have created a task to evaluate it + // TA_ASSERT(ntasks_pending_[this->world().rank()] == 0); +#endif + return result; } - /// Discard a tile that is not needed - - /// This function handles the cleanup for tiles that are not needed in - /// subsequent computation. - virtual void discard_tile(ordinal_type) const { + void discard_tile(ordinal_type i) const override { + TA_ASSERT(this->is_local(i)); const_cast(this)->notify(); } @@ -281,23 +388,36 @@ class ArrayEvalImpl } /// Evaluate a single LazyArrayTile - madness::Future eval_tile( + /// @return A pair of the future to the tile and a boolean indicating whether + /// a task was created to produce the tile + [[nodiscard]] std::pair, bool> eval_tile( const madness::Future& tile, - const bool consumable_tile) const { + const bool consumable_tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + const ProcessID tile_owner +#endif + ) const { // Insert the tile into this evaluator for subsequent processing if (tile.probe()) { // Skip the task since the tile is ready Future result; result.set(make_tile(tile, consumable_tile)); const_cast(this)->notify(); - return result; + return {result, false}; } else { // Spawn a task to set the tile when the input tile is not ready. 
Future result = TensorImpl_::world().taskq.add( shared_from_this(), &ArrayEvalImpl_::make_tile, tile, consumable_tile, madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ntasks_pending_[tile_owner]++; + result.register_callback( + new AtomicCounterDecreaser(std::shared_ptr>( + ntasks_pending_, ntasks_pending_.get() + tile_owner))); +#endif result.register_callback(const_cast(this)); - return result; + return {result, true}; } } /// Evaluate the tiles of this tensor @@ -305,28 +425,24 @@ class ArrayEvalImpl /// This function will evaluate the children of this distributed evaluator /// and evaluate the tiles for this distributed evaluator. /// \return The number of tiles that will be set by this process - virtual int internal_eval() { - // Counter for the number of tasks submitted by this object - int task_count = 0; - - // Get a count of the number of local tiles. - if (TensorImpl_::shape().is_dense()) { - task_count = TensorImpl_::pmap()->local_size(); - } else { - // Create iterator to tiles that are local for this evaluator. - typename array_type::pmap_interface::const_iterator it = - TensorImpl_::pmap()->begin(); - const typename array_type::pmap_interface::const_iterator end = - TensorImpl_::pmap()->end(); - - for (; it != end; ++it) { - if (!TensorImpl_::is_zero(*it)) ++task_count; - } + int internal_eval() override { return TensorImpl_::local_nnz(); } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + std::string status() const override { + std::stringstream ss; + ss << "ArrayEvalImpl: array.id()=" << array_.id(); + ss << " ntiles_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntiles_pending_[rank]; } - - return task_count; + ss << "] ntasks_pending=["; + for (auto rank = 0; rank != this->world().size(); ++rank) { + ss << " " << ntasks_pending_[rank]; + } + ss << "]\n"; + return ss.str(); } - +#endif }; // class ArrayEvalImpl } // namespace detail diff --git a/src/TiledArray/dist_eval/binary_eval.h b/src/TiledArray/dist_eval/binary_eval.h index a4c203d3dd..87cce91656 100644 --- a/src/TiledArray/dist_eval/binary_eval.h +++ b/src/TiledArray/dist_eval/binary_eval.h @@ -68,6 +68,16 @@ class BinaryEvalImpl : public DistEvalImpl, right_type right_; ///< Right argument op_type op_; ///< binary element operator +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // artifacts of tracing + mutable ordinal_type left_ntiles_used_; // # of tiles used from left_ + mutable ordinal_type right_ntiles_used_; // # of tiles used from right_ + mutable ordinal_type + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable ordinal_type + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif + public: /// Construct a binary evaluator @@ -88,8 +98,19 @@ class BinaryEvalImpl : public DistEvalImpl, : DistEvalImpl_(world, trange, shape, pmap, outer(perm)), left_(left), right_(right), - op_(op) { - TA_ASSERT(left.trange() == right.trange()); + op_(op) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { + TA_ASSERT(ignore_tile_position() + ? left.trange().elements_range().extent() == + right.trange().elements_range().extent() + : left.trange() == right.trange()); } virtual ~BinaryEvalImpl() {} @@ -100,14 +121,14 @@ class BinaryEvalImpl : public DistEvalImpl, /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. 
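+  /// \note the result tile is delivered via a point-to-point group receive
+  /// from the rank that owns the corresponding source tiles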
/// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); const auto source_index = DistEvalImpl_::perm_index_to_source(i); - const ProcessID source = - left_.owner(source_index); // Left and right - // should have the same owner + const ProcessID source = left_.owner(source_index); + // Left and right should have the same owner + TA_ASSERT(source == right_.owner(source_index)); const madness::DistributedID key(DistEvalImpl_::id(), i); return TensorImpl_::world().gop.template recv(source, key); @@ -118,17 +139,17 @@ class BinaryEvalImpl : public DistEvalImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Task function for evaluating tiles -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// \param i The tile index /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { DistEvalImpl_::set_tile(i, op_(left, right)); } @@ -137,11 +158,11 @@ class BinaryEvalImpl : public DistEvalImpl, /// \param left The left-hand tile /// \param right The right-hand tile template - std::enable_if_t, void> eval_tile( + std::enable_if_t, void> eval_tile( const ordinal_type i, L left, R right) { // TODO avoid copy the Op object auto result_tile = - madness::add_cuda_task(DistEvalImpl_::world(), op_, left, right); + madness::add_device_task(DistEvalImpl_::world(), op_, left, right); DistEvalImpl_::set_tile(i, result_tile); } #else @@ -160,7 +181,7 @@ class BinaryEvalImpl : public DistEvalImpl, /// until the tasks for the children are evaluated (not for the tasks of /// this object). 
/// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { // Evaluate child tensors left_.eval(); right_.eval(); @@ -195,6 +216,12 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(source_index), right_.get(source_index)); + TA_ASSERT(left_.is_local(source_index)); + TA_ASSERT(right_.is_local(source_index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif ++task_count; } @@ -213,32 +240,64 @@ class BinaryEvalImpl : public DistEvalImpl, &BinaryEvalImpl_::template eval_tile, target_index, ZeroTensor(), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_used_++; +#endif } else if (right_.is_zero(index)) { TensorImpl_::world().taskq.add( self, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), ZeroTensor()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; +#endif } else { + TA_ASSERT(!left_.is_zero(index) && !right_.is_zero(index)); TensorImpl_::world().taskq.add( self, &BinaryEvalImpl_::template eval_tile, target_index, left_.get(index), right_.get(index)); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_used_++; + right_ntiles_used_++; +#endif } ++task_count; } else { // Cleanup unused tiles - if (!left_.is_zero(index)) left_.discard(index); - if (!right_.is_zero(index)) right_.discard(index); + if (!left_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + left_ntiles_discarded_++; +#endif + left_.discard(index); + } + if (!right_.is_zero(index)) { +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + right_ntiles_discarded_++; +#endif + right_.discard(index); + } } } } // Wait for child tensors to be evaluated, and process tasks while waiting. 
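+    // Tracing invariant: by this point every local nonzero tile of each child
+    // evaluator must have been either consumed by an eval_tile task or
+    // explicitly discarded, so used + discarded equals the local nonzero count.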
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + left_ntiles_discarded_); + TA_ASSERT(right_.local_nnz() == + right_ntiles_used_ + right_ntiles_discarded_); +#endif left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // for some evaluators like SUMMA real task counts are not available even + // after wait() TA_ASSERT(left_.task_count() >= left_ntiles_used_ + + // left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= + // right_ntiles_used_ + right_ntiles_discarded_); +#endif return task_count; } diff --git a/src/TiledArray/dist_eval/contraction_eval.h b/src/TiledArray/dist_eval/contraction_eval.h index ae3b456bc0..a747c0748b 100644 --- a/src/TiledArray/dist_eval/contraction_eval.h +++ b/src/TiledArray/dist_eval/contraction_eval.h @@ -31,11 +31,11 @@ #include -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 -//#define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_STEP 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_BCAST 1 +// #define TILEDARRAY_ENABLE_SUMMA_TRACE_FINALIZE 1 namespace TiledArray { namespace detail { @@ -118,6 +118,7 @@ class Summa typedef std::pair col_datum; ///< Datum element type for a left-hand argument column + // various tracing/debugging artifacts static constexpr const bool trace_tasks = #ifdef TILEDARRAY_ENABLE_TASK_DEBUG_TRACE true @@ -125,6 +126,16 @@ class Summa false #endif ; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + mutable std::atomic + left_ntiles_used_; // # of tiles used from left_ + mutable std::atomic + right_ntiles_used_; // # of tiles used from right_ + mutable std::atomic + left_ntiles_discarded_; // # of tiles discarded from left_ + mutable std::atomic + right_ntiles_discarded_; // # of tiles discarded from right_ +#endif protected: // Import base class functions @@ -478,8 +489,8 @@ class Summa template static typename std::enable_if< is_lazy_tile::value -#ifdef TILEDARRAY_HAS_CUDA - && !detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !detail::is_device_tile_v #endif , Future>::type @@ -490,7 +501,7 @@ class Summa madness::TaskAttributes::hipri()); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Conversion function /// This function spawns a task that will convert a lazy tile from the @@ -502,13 +513,14 @@ class Summa template static typename std::enable_if< is_lazy_tile::value && - detail::is_cuda_tile_v, + detail::is_device_tile_v, Future>::type get_tile(Arg& arg, const typename Arg::ordinal_type index) { auto convert_tile_fn = &Summa_::template convert_tile; - return madness::add_cuda_task(arg.world(), convert_tile_fn, arg.get(index), - madness::TaskAttributes::hipri()); + return madness::add_device_task(arg.world(), convert_tile_fn, + arg.get(index), + madness::TaskAttributes::hipri()); } #endif @@ -704,11 +716,17 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index); auto tile = get_tile(left_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, row_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++left_ntiles_discarded_; +#endif left_.discard(index); 
} } @@ -747,12 +765,18 @@ class Summa if (do_broadcast) { // Broadcast the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_used_; +#endif const madness::DistributedID key(DistEvalImpl_::id(), index + left_.size()); auto tile = get_tile(right_, index); TensorImpl_::world().gop.bcast(key, tile, group_root, col_group); } else { // Discard the tile +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ++right_ntiles_discarded_; +#endif right_.discard(index); } } @@ -865,45 +889,63 @@ class Summa /// Initialize reduce tasks and construct broadcast groups ordinal_type initialize(const DenseShape&) { - // Construct static broadcast groups for dense arguments - const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); - col_group_ = proc_grid_.make_col_group(col_did); - const madness::DistributedID row_did(DistEvalImpl_::id(), k_); - row_group_ = proc_grid_.make_row_group(row_did); + // if contraction is over zero-volume range just initialize tiles to zero + if (k_ == 0) { + ordinal_type tile_count = 0; + const auto& tiles_range = this->trange().tiles_range(); + for (auto&& tile_idx : tiles_range) { + auto tile_ord = tiles_range.ordinal(tile_idx); + if (this->is_local(tile_ord)) { + this->world().taskq.add([this, tile_ord, tile_idx]() { + this->set_tile(tile_ord, + value_type(this->trange().tile(tile_idx), + typename value_type::value_type{})); + }); + ++tile_count; + } + } + return tile_count; + } else { + // Construct static broadcast groups for dense arguments + const madness::DistributedID col_did(DistEvalImpl_::id(), 0ul); + col_group_ = proc_grid_.make_col_group(col_did); + const madness::DistributedID row_did(DistEvalImpl_::id(), k_); + row_group_ = proc_grid_.make_row_group(row_did); #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - std::stringstream ss; - ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" - << col_did.first << ", " << col_did.second << ") { "; - for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) - ss << col_group_.world_rank(gproc) << " "; - ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second - << ") { "; - for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) - ss << row_group_.world_rank(gproc) << " "; - ss << "}\n"; - printf(ss.str().c_str()); + std::stringstream ss; + ss << "init: rank=" << TensorImpl_::world().rank() << "\n col_group_=(" + << col_did.first << ", " << col_did.second << ") { "; + for (ProcessID gproc = 0ul; gproc < col_group_.size(); ++gproc) + ss << col_group_.world_rank(gproc) << " "; + ss << "}\n row_group_=(" << row_did.first << ", " << row_did.second + << ") { "; + for (ProcessID gproc = 0ul; gproc < row_group_.size(); ++gproc) + ss << row_group_.world_rank(gproc) << " "; + ss << "}\n"; + printf(ss.str().c_str()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - // Allocate memory for the reduce pair tasks. - std::allocator> alloc; - reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); + // Allocate memory for the reduce pair tasks. 
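+      // (the storage is raw: one ReducePairTask per local result tile is
+      // constructed in place below via placement new)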
+ std::allocator> alloc; + reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); - // Iterate over all local tiles - const ordinal_type n = proc_grid_.local_size(); - for (ordinal_type t = 0ul; t < n; ++t) { - // Initialize the reduction task - ReducePairTask* MADNESS_RESTRICT const reduce_task = - reduce_tasks_ + t; - new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ + // Iterate over all local tiles + const ordinal_type n = proc_grid_.local_size(); + for (ordinal_type t = 0ul; t < n; ++t) { + // Initialize the reduction task + ReducePairTask* MADNESS_RESTRICT const reduce_task = + reduce_tasks_ + t; + new (reduce_task) ReducePairTask(TensorImpl_::world(), op_ #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - , - nullptr, t + , + nullptr, t #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE - ); - } + ); + } - return proc_grid_.local_size(); + return proc_grid_.local_size(); + } } /// Initialize reduce tasks @@ -914,6 +956,9 @@ class Summa ss << " initialize rank=" << TensorImpl_::world().rank() << " tiles={ "; #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_INITIALIZE + // fast return if there is no work to do + if (k_ == 0) return 0; + // Allocate memory for the reduce pair tasks. std::allocator> alloc; reduce_tasks_ = alloc.allocate(proc_grid_.local_size()); @@ -1323,7 +1368,6 @@ class Summa template void make_next_step_tasks(Derived* task, ordinal_type depth) { - TA_ASSERT(depth > 0); // Set the depth to be no greater than the maximum number steps if (depth > owner_->k_) depth = owner_->k_; @@ -1549,7 +1593,16 @@ class Summa left_stride_(k), left_stride_local_(proc_grid.proc_rows() * k), right_stride_(1ul), - right_stride_local_(proc_grid.proc_cols()) {} + right_stride_local_(proc_grid.proc_cols()) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + left_ntiles_used_(0), + right_ntiles_used_(0), + left_ntiles_discarded_(0), + right_ntiles_discarded_(0) +#endif + { + } virtual ~Summa() {} @@ -1559,7 +1612,7 @@ class Summa /// \return A \c Future to the tile at index i /// \throw TiledArray::Exception When tile \c i is owned by a remote node. /// \throw TiledArray::Exception When tile \c i a zero tile. - virtual Future get_tile(ordinal_type i) const { + Future get_tile(ordinal_type i) const override { TA_ASSERT(TensorImpl_::is_local(i)); TA_ASSERT(!TensorImpl_::is_zero(i)); @@ -1583,7 +1636,7 @@ class Summa /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. /// \param i The index of the tile - virtual void discard_tile(ordinal_type i) const { get_tile(i); } + void discard_tile(ordinal_type i) const override { get_tile(i); } private: /// Adjust iteration depth based on memory constraints @@ -1646,7 +1699,7 @@ class Summa /// until the tasks for the children are evaluated (not for the tasks of /// this object). /// \return The number of tiles that will be set by this process - virtual int internal_eval() { + int internal_eval() override { #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start eval children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL @@ -1673,60 +1726,92 @@ class Summa std::max(ProcGrid::size_type(2), std::min(proc_grid_.proc_rows(), proc_grid_.proc_cols())); - // Construct the first SUMMA iteration task - if (TensorImpl_::shape().is_dense()) { - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; - - // Modify the number of concurrent iterations based on the available - // memory. 
- depth = mem_bound_depth(depth, 0.0f, 0.0f); - - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); - - TensorImpl_::world().taskq.add( - new DenseStepTask(shared_from_this(), depth)); - } else { - // Increase the depth based on the amount of sparsity in an iteration. + // watch out for the corner case: contraction over zero-volume range + // producing nonzero-volume result ... in that case there is nothing to do + // the appropriate initialization was performed in the initialize() method + if (k_ != 0) { + // Construct the first SUMMA iteration task + if (TensorImpl_::shape().is_dense()) { + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; + + // Modify the number of concurrent iterations based on the available + // memory. + depth = mem_bound_depth(depth, 0.0f, 0.0f); + + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); + + TensorImpl_::world().taskq.add( + new DenseStepTask(shared_from_this(), depth)); + } else { + // Increase the depth based on the amount of sparsity in an iteration. - // Get the sparsity fractions for the left- and right-hand arguments. - const float left_sparsity = left_.shape().sparsity(); - const float right_sparsity = right_.shape().sparsity(); + // Get the sparsity fractions for the left- and right-hand arguments. + const float left_sparsity = left_.shape().sparsity(); + const float right_sparsity = right_.shape().sparsity(); - // Compute the fraction of non-zero result tiles in a single SUMMA - // iteration. - const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * - (1.0f - std::min(right_sparsity, 0.9f)); + // Compute the fraction of non-zero result tiles in a single SUMMA + // iteration. + const float frac_non_zero = (1.0f - std::min(left_sparsity, 0.9f)) * + (1.0f - std::min(right_sparsity, 0.9f)); - // Compute the new depth based on sparsity of the arguments - depth = - float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + 0.5f; + // Compute the new depth based on sparsity of the arguments + depth = float(depth) * (1.0f - 1.35638f * std::log2(frac_non_zero)) + + 0.5f; - // We cannot have more iterations than there are blocks in the k - // dimension - if (depth > k_) depth = k_; + // We cannot have more iterations than there are blocks in the k + // dimension + if (depth > k_) depth = k_; - // Modify the number of concurrent iterations based on the available - // memory and sparsity of the argument tensors. - depth = mem_bound_depth(depth, left_sparsity, right_sparsity); + // Modify the number of concurrent iterations based on the available + // memory and sparsity of the argument tensors. 
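+        // (mem_bound_depth, defined above, caps the number of concurrent
+        // SUMMA iterations based on the available memory)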
+ depth = mem_bound_depth(depth, left_sparsity, right_sparsity); - // Enforce user defined depth bound - if (max_depth_) depth = std::min(depth, max_depth_); + // Enforce user defined depth bound + if (max_depth_) depth = std::min(depth, max_depth_); - TensorImpl_::world().taskq.add( - new SparseStepTask(shared_from_this(), depth)); - } + TensorImpl_::world().taskq.add( + new SparseStepTask(shared_from_this(), depth)); + } + } // k_ != 0 } #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: start wait children rank=%i\n", TensorImpl_::world().rank()); #endif // TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL + // corner case: if left or right are zero-volume no tasks were scheduled, so + // need to discard all of their tiles manually + if (left_.range().volume() == 0) { + for (auto&& tile_idx : right_.range()) { + auto tile_ord = right_.range().ordinal(tile_idx); + if (right_.is_local(tile_ord) && !right_.is_zero(tile_ord)) + right_.discard(tile_ord); + } + } + if (right_.range().volume() == 0) { + for (auto&& tile_idx : left_.range()) { + auto tile_ord = left_.range().ordinal(tile_idx); + if (left_.is_local(tile_ord) && !left_.is_zero(tile_ord)) + left_.discard(tile_ord); + } + } + // Wait for child tensors to be evaluated, and process tasks while waiting. left_.wait(); right_.wait(); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + // values of left_ntiles_used_ etc. are not available until all broadcasts + // have been completed ... +// TA_ASSERT(left_.local_nnz() == left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.local_nnz() == +// right_ntiles_used_ + right_ntiles_discarded_); +// TA_ASSERT(left_.task_count() >= left_ntiles_used_ + +// left_ntiles_discarded_); TA_ASSERT(right_.task_count() >= +// right_ntiles_used_ + right_ntiles_discarded_); +#endif #ifdef TILEDARRAY_ENABLE_SUMMA_TRACE_EVAL printf("eval: finished wait children rank=%i\n", diff --git a/src/TiledArray/dist_eval/dist_eval.h b/src/TiledArray/dist_eval/dist_eval.h index b1056e0ac1..9e0157cb8b 100644 --- a/src/TiledArray/dist_eval/dist_eval.h +++ b/src/TiledArray/dist_eval/dist_eval.h @@ -25,9 +25,9 @@ #include #include #include -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #endif namespace TiledArray { @@ -110,7 +110,7 @@ class DistEvalImpl : public TensorImpl, const std::shared_ptr& pmap, const Permutation& perm) : TensorImpl_(world, trange, shape, pmap), - id_(world.unique_obj_id()), + id_(world.make_unique_obj_id()), source_to_target_(), target_to_source_(), task_count_(-1), @@ -123,6 +123,28 @@ class DistEvalImpl : public TensorImpl, source_to_target_ = PermIndex(source_range, perm); target_to_source_ = PermIndex(trange.tiles_range(), inv_perm); } + +#if 0 + { + // print out expected number of tiles on each rank + std::vector ntiles_per_rank(world.size(), 0); + for (auto& i : trange.tiles_range()) { + if (!TensorImpl_::is_zero(i)) { + ntiles_per_rank[TensorImpl_::owner(i)]++; + } + } + std::stringstream ss; + ss << "DistEvalImpl: id=" << id_; + if (perm) + ss << " perm=" << perm; + ss << " ntiles=["; + for (auto& i : ntiles_per_rank) { + ss << i << " "; + } + ss << "]"; + std::cout << ss.str() << std::endl; + } +#endif } virtual ~DistEvalImpl() {} @@ -142,7 +164,8 @@ class DistEvalImpl : public TensorImpl, /// This function handles the cleanup for tiles that are not needed in /// subsequent computation. 
-  /// \param i The index of the tile
+  /// \param i The index of the local tile to discard
+  /// \pre `this->is_local(i)`
   virtual void discard_tile(ordinal_type i) const = 0;

   /// Set tensor value
@@ -176,7 +199,7 @@ class DistEvalImpl : public TensorImpl,
   }

   /// Tile set notification
-  virtual void notify() { set_counter_++; }
+  void notify() override { set_counter_++; }

   /// Wait for all tiles to be assigned
   void wait() const {
@@ -234,13 +257,36 @@ class DistEvalImpl : public TensorImpl,
     TA_ASSERT(task_count_ >= 0);
   }

+  /// \return The number of tasks spawned on this rank (after invoking eval()
+  /// this should be equal to local_nnz() for simple evaluators like
+  /// unary/binary, or greater than that for more complex evaluators like
+  /// SUMMA)
+  ordinal_type task_count() const {
+    if (task_count_ == -1)
+      return 0;
+    else
+      return task_count_;
+  }
+
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+  /// Reports evaluator status
+
+  /// Intended for debugging purposes.
+  /// @return string containing a log of the current status of the evaluator
+  /// (empty string, unless overridden in the specialization)
+  [[nodiscard]] virtual std::string status() const { return {}; }
+#endif
 };  // class DistEvalImpl

-/// Tensor expression object
+/// Tensor expression evaluator wrapper

-/// This object holds a tensor expression. It is used to store various type
-/// of tensor expressions that depend on the pimpl used to construct the
-/// expression.
+/// This object holds a tensor expression evaluator (DistEvalImpl).
+///
+/// \note Tensor expression evaluators (DistEval and DistEvalImpl)
+/// are similar to DistArray in that they have a tensorial structure
+/// (TensorImpl), with shape and policy, but their semantics differ
+/// from those of DistArray (e.g., data is not stored
+/// persistently).
+///
 /// \tparam Tile The output tile type
 /// \tparam Policy The tensor policy class
 template
@@ -333,7 +379,7 @@ class DistEval {
     return pimpl_->pmap();
   }

-  /// Query the density of the tensor
+  /// Query if the tensor is dense

   /// \return \c true if the tensor is dense, otherwise false
   bool is_dense() const { return pimpl_->is_dense(); }
@@ -348,7 +394,7 @@ class DistEval {
   /// \return The tiled range of the tensor
   const trange_type& trange() const { return pimpl_->trange(); }

-  /// Tile move
+  /// Tile accessor

   /// Tile is removed after it is set.
   /// \param i The tile index
@@ -359,8 +405,12 @@ class DistEval {

   /// This function handles the cleanup for tiles that are not needed in
   /// subsequent computation.
-  /// \param i The index of the tile
-  virtual void discard(ordinal_type i) const { pimpl_->discard_tile(i); }
+  /// \param i The index of a local tile to discard
+  /// \pre `this->is_local(i)`
+  virtual void discard(ordinal_type i) const {
+    TA_ASSERT(this->is_local(i));
+    pimpl_->discard_tile(i);
+  }

   /// World object accessor
@@ -372,9 +422,35 @@ class DistEval {

   /// \return The unique id for this object
   madness::uniqueidT id() const { return pimpl_->id(); }

+  /// \return Number of nonzero tiles on this rank
+  /// \sa TensorImpl::local_nnz()
+  ordinal_type local_nnz() const { return pimpl_->local_nnz(); }
+
+  /// \return The number of tasks spawned on this rank (after invoking eval()
+  /// this should be the same as the value returned by local_nnz(), if
+  /// everything is well)
+  ordinal_type task_count() const { return pimpl_->task_count(); }
+
   /// Wait for all local tiles to be evaluated
   void wait() const { pimpl_->wait(); }

+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+  /// Reports evaluator status
+
+  /// Intended for debugging purposes.
+  /// @return string containing a log of the current status of the evaluator
+  /// (empty string, unless overridden in the specialization)
+  std::string status() const {
+    std::ostringstream oss;
+    oss << "DistEval status: id=" << id()
+        << " impl_type_name=" << typeid(*(pimpl_.get())).name()
+        << " ";
+    oss << pimpl_->status();
+    oss << "\n";
+    return oss.str();
+  }
+#endif
+
 };  // class DistEval

 }  // namespace detail
 }  // namespace TiledArray

diff --git a/src/TiledArray/dist_eval/unary_eval.h b/src/TiledArray/dist_eval/unary_eval.h
index b3707b92c2..66ab742ada 100644
--- a/src/TiledArray/dist_eval/unary_eval.h
+++ b/src/TiledArray/dist_eval/unary_eval.h
@@ -74,7 +74,13 @@ class UnaryEvalImpl
                 const Perm& perm, const op_type& op)
       : DistEvalImpl_(world, trange, shape, pmap, outer(perm)),
         arg_(arg),
-        op_(op) {}
+        op_(op)
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+        ,
+        arg_ntiles_used_(0)
+#endif
+  {
+  }

   /// Virtual destructor
   virtual ~UnaryEvalImpl() {}
@@ -85,7 +91,7 @@ class UnaryEvalImpl
   /// \return A \c Future to the tile at index i
   /// \throw TiledArray::Exception When tile \c i is owned by a remote node.
   /// \throw TiledArray::Exception When tile \c i a zero tile.
-  virtual Future get_tile(ordinal_type i) const {
+  Future get_tile(ordinal_type i) const override {
     TA_ASSERT(TensorImpl_::is_local(i));
     TA_ASSERT(!TensorImpl_::is_zero(i));
     const auto source = arg_.owner(DistEvalImpl_::perm_index_to_source(i));
@@ -98,7 +104,7 @@ class UnaryEvalImpl

   /// This function handles the cleanup for tiles that are not needed in
   /// subsequent computation.
   /// \param i The index of the tile
-  virtual void discard_tile(ordinal_type i) const { get_tile(i); }
+  void discard_tile(ordinal_type i) const override { get_tile(i); }

  private:
   /// Input tile argument type
@@ -111,22 +117,22 @@ class UnaryEvalImpl

   /// Task function for evaluating tiles

-#ifdef TILEDARRAY_HAS_CUDA
+#ifdef TILEDARRAY_HAS_DEVICE
   /// \param i The tile index
   /// \param tile The tile to be evaluated
   template
-  std::enable_if_t, void> eval_tile(
+  std::enable_if_t, void> eval_tile(
       const ordinal_type i, tile_argument_type tile) {
     // TODO avoid copy Op object
     auto result_tile =
-        madness::add_cuda_task(DistEvalImpl_::world(), op_, tile);
+        madness::add_device_task(DistEvalImpl_::world(), op_, tile);
     DistEvalImpl_::set_tile(i, result_tile);
   }

   /// \param i The tile index
   /// \param tile The tile to be evaluated
   template
-  std::enable_if_t, void> eval_tile(
+  std::enable_if_t, void> eval_tile(
       const ordinal_type i, tile_argument_type tile) {
     DistEvalImpl_::set_tile(i, op_(tile));
   }
@@ -144,7 +150,7 @@ class UnaryEvalImpl
   /// until the tasks for the children are evaluated (not for the tasks of
   /// this object).
   /// \return The number of tiles that will be set by this process
-  virtual int internal_eval() {
+  int internal_eval() override {
     // Convert pimpl to this object type so it can be used in tasks
     std::shared_ptr self =
         std::enable_shared_from_this::shared_from_this();
@@ -152,10 +158,12 @@ class UnaryEvalImpl
     // Evaluate argument
     arg_.eval();

-    // Counter for the number of tasks submitted by this object
+    // Counter for the number of tasks that will use local tiles of arg_
    ordinal_type task_count = 0ul;

-    // Make sure all local tiles are present.
+    // Now create tasks that will produce result tiles and push them to the
+    // destination. N.B. data is pushed, rather than pulled, to be able to
+    // manage the lifetime of the argument.
     const typename pmap_interface::const_iterator end = arg_.pmap()->end();
     typename pmap_interface::const_iterator it = arg_.pmap()->begin();
     for (; it != end; ++it) {
@@ -165,9 +173,11 @@ class UnaryEvalImpl
       if (!arg_.is_zero(index)) {
         // Get target tile index
         const auto target_index = DistEvalImpl_::perm_index_to_target(index);
+        TA_ASSERT(!this->is_zero(target_index));

         // Schedule tile evaluation task
-#ifdef TILEDARRAY_HAS_CUDA
+        TA_ASSERT(arg_.is_local(index));
+#ifdef TILEDARRAY_HAS_DEVICE
         TensorImpl_::world().taskq.add(self,
                                        &UnaryEvalImpl_::template eval_tile<>,
                                        target_index, arg_.get(index));
@@ -175,12 +185,18 @@ class UnaryEvalImpl
         TensorImpl_::world().taskq.add(self, &UnaryEvalImpl_::eval_tile,
                                        target_index, arg_.get(index));
 #endif
-
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+        arg_ntiles_used_++;
+#endif
         ++task_count;
       }
     }

     // Wait for local tiles of argument to be evaluated
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+    TA_ASSERT(arg_.local_nnz() == arg_ntiles_used_);
+    TA_ASSERT(arg_.task_count() >= arg_ntiles_used_);
+#endif
     // arg_.wait();

     return task_count;
@@ -188,7 +204,14 @@ class UnaryEvalImpl

   arg_type arg_;  ///< Argument
   op_type op_;    ///< The unary tile operation
-};  // class UnaryEvalImpl
+
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+  // artifacts of tracing/debugging
+  mutable ordinal_type arg_ntiles_used_;  // # of tiles used from arg_; N.B.
+                                          // no tiles are discarded!
+#endif
+
+};  // class UnaryEvalImpl

 }  // namespace detail
 }  // namespace TiledArray

diff --git a/src/TiledArray/distributed_storage.h b/src/TiledArray/distributed_storage.h
index 27c2885dcd..60eb715c34 100644
--- a/src/TiledArray/distributed_storage.h
+++ b/src/TiledArray/distributed_storage.h
@@ -23,6 +23,17 @@
 #include

 namespace TiledArray {
+
+/// Describes how to get remote data
+enum class RemoteDataGetPolicy {
+  /// no caching: each get will trigger a data fetch
+  nocache,
+  /// aggregate gets until data arrives, subsequent gets will trigger new gets
+  aggregate,
+  /// get once, read forever
+  cache
+};
+
 namespace detail {

 /// Distributed storage container.
@@ -41,7 +52,7 @@ namespace detail {
 /// thread. DO NOT construct world objects within tasks where the order of
 /// execution is nondeterministic.
 template
-class DistributedStorage : public madness::WorldObject > {
+class DistributedStorage : public madness::WorldObject> {
  public:
  typedef DistributedStorage DistributedStorage_;  ///< This object type
  typedef madness::WorldObject
@@ -64,8 +75,22 @@ class DistributedStorage : public madness::WorldObject > {
       ///< stored by this container
   std::shared_ptr pmap_;
       ///< The process map that defines the element distribution
-  mutable container_type data_;     ///< The local data container
-  madness::AtomicInt num_live_ds_;  ///< Number of live DelayedSet objects
+  mutable container_type data_;  ///< The local data container
+
+  // tracing/defensive-driving artifacts
+  mutable std::atomic
+      num_live_ds_;  ///< Number of live DelayedSet objects
+  mutable std::atomic
+      num_live_df_;  ///< Number of live DelayedForward objects
+#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE
+  mutable std::vector>
+      ngets_served_per_rank_;  ///< Counts # of gets served to remote ranks
+  mutable std::vector>
+      ngets_sent_per_rank_;  ///< Counts # of gets sent to remote ranks
+  mutable std::vector>
+      ngets_received_per_rank_;  ///< Counts # of gets received from remote
+                                 ///< ranks
+#endif

   // not allowed
   DistributedStorage(const DistributedStorage_&);
@@ -120,6 +145,124 @@ class DistributedStorage : public madness::WorldObject > {
   };  // struct DelayedSet
   friend struct DelayedSet;

+  /// Tile cache works just like madness::detail::DistCache (and in fact is
+  /// based on it) in that it implements a local cache for asynchronous data
+  /// pulls. Unlike madness::detail::DistCache:
+  /// - this is unidirectional, i.e. there is no need to manually push data
+  ///   into the cache (a task sending data will be posted).
+  /// - depending on the get policy, data will either stay in the cache
+  ///   forever or will be discarded upon arrival; subsequent gets will need
+  ///   to fetch the data again (may make this user-controllable in the
+  ///   future)
+  mutable container_type remote_data_cache_;
+
+  /// Get the cache value associated with \c key
+
+  /// This returns a future for the value associated with \c key. If
+  /// the cache element does not exist, a task requesting the data will be
+  /// sent to the owner, and a future referring to the result will be inserted
+  /// in the cache so that the subsequent gets will receive the same data.
+  /// After data arrival the future will be removed from the cache, thus
+  /// subsequent gets will need to fetch the data again.
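+  /// \par
+  /// A hypothetical usage sketch (\c ds denotes a DistributedStorage
+  /// instance; names are illustrative only):
+  /// \code
+  /// auto f1 = ds.get_cached(key);  // posts a request task to owner(key)
+  /// auto f2 = ds.get_cached(key);  // joins the pending future, no new
+  ///                                // message is sent
+  /// // once the datum arrives both futures are set; since keep_in_cache was
+  /// // false the cache entry is then erased
+  /// \endcode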
\param[in] key The target key \return A + /// future that holds/will hold the cache value + future get_cached(const key_type& key, bool keep_in_cache = false) const { + // Retrieve the cached future + typename container_type::const_accessor acc; + if (remote_data_cache_.insert( + acc, key)) { // no future in cache yet, create a task + static_assert(std::is_signed_v); + const ProcessID rank = this->get_world().rank(); + ProcessID rank_w_persistence = keep_in_cache ? rank : -(rank + 1); + WorldObject_::task(owner(key), &DistributedStorage_::get_cached_handler, + key, rank_w_persistence, + madness::TaskAttributes::hipri()); +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_sent_per_rank_.at(owner(key))++; +#endif + } + return acc->second; + } + + /// used to forward data that were unassigned at the time of request arrival + struct DelayedForward : public madness::CallbackInterface { + public: + DelayedForward(const DistributedStorage_& ds, key_type key, + ProcessID destination_rank, bool keep_in_cache) + : ds(ds), + key(key), + destination_rank(destination_rank), + keep_in_cache(keep_in_cache) {} + + void notify() override { + auto& data_fut = ds.get_local(key); + TA_ASSERT( + data_fut.probe()); // must be ready, otherwise why is this invoked? + if (keep_in_cache) { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + ds.task(destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + delete this; + } + + private: + const DistributedStorage_& ds; + key_type key; + ProcessID destination_rank; + bool keep_in_cache; + }; + + void get_cached_handler(const size_type key, + ProcessID destination_rank_w_persistence) const { + const bool keep_in_cache = destination_rank_w_persistence >= 0; + const ProcessID destination_rank = + destination_rank_w_persistence < 0 + ? (-destination_rank_w_persistence - 1) + : destination_rank_w_persistence; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_served_per_rank_.at(destination_rank)++; +#endif + auto& data_fut = get_local(key); + if (data_fut.probe()) { + if (keep_in_cache) { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } else { + WorldObject_::task( + destination_rank, + &DistributedStorage_::template set_cached_handler, key, + data_fut, madness::TaskAttributes::hipri()); + } + } else { // data not ready yet, defer send to a callback (maybe task??) 
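+      // (DelayedForward::notify() fires once the local future is assigned;
+      // it then posts a set_cached_handler task to the requesting rank and
+      // deletes itself)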
+ const_cast(data_fut).register_callback( + new DelayedForward(*this, key, destination_rank, keep_in_cache)); + } + } + + template + void set_cached_handler(const size_type key, const value_type& datum) const { + // assign the future first, then remove from the cache + typename container_type::accessor acc; + [[maybe_unused]] const bool inserted = remote_data_cache_.insert(acc, key); + // future must be in cache + TA_ASSERT(!inserted); + // assign it + acc->second.set(datum); + // remove it from the cache + if constexpr (!KeepInCache) remote_data_cache_.erase(acc); + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + ngets_received_per_rank_.at(this->owner(key))++; +#endif + } + public: /// Makes an initialized, empty container with default data distribution (no /// communication) @@ -136,23 +279,47 @@ class DistributedStorage : public madness::WorldObject > { : WorldObject_(world), max_size_(max_size), pmap_(pmap), - data_((max_size / world.size()) + 11) { + data_((max_size / world.size()) + 11) +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + , + ngets_served_per_rank_(world.size()), + ngets_sent_per_rank_(world.size()), + ngets_received_per_rank_(world.size()) +#endif + { // Check that the process map is appropriate for this storage object TA_ASSERT(pmap_); TA_ASSERT(pmap_->size() == max_size); TA_ASSERT(pmap_->rank() == pmap_interface::size_type(world.rank())); TA_ASSERT(pmap_->procs() == pmap_interface::size_type(world.size())); num_live_ds_ = 0; + num_live_df_ = 0; +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + for (auto rank = 0; rank != world.size(); ++rank) { + ngets_served_per_rank_[rank] = 0; + ngets_sent_per_rank_[rank] = 0; + ngets_received_per_rank_[rank] = 0; + } +#endif WorldObject_::process_pending(); } virtual ~DistributedStorage() { if (num_live_ds_ != 0) { - madness::print_error( - "DistributedStorage (object id=", this->id(), - ") destroyed while " - "outstanding tasks exist. Add a fence() to extend the lifetime of " - "this object."); + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending tasks that set its data exist. Add a " + "fence() to extend the lifetime of " + "this object."); + abort(); + } + if (num_live_df_ != 0) { + madness::print_error("DistributedStorage (object id=", this->id(), + ") destroyed while " + "pending callbacks that forward its data to other " + "ranks exist. This may indicate a bug in your " + "program or you may need to extend the lifetime of " + "this object."); abort(); } } @@ -207,18 +374,21 @@ class DistributedStorage : public madness::WorldObject > { /// \return A future to element \c i /// \throw TiledArray::Exception If \c i is greater than or equal to \c /// max_size() . - future get(size_type i) const { + future get(size_type i, + RemoteDataGetPolicy policy = RemoteDataGetPolicy::nocache) const { TA_ASSERT(i < max_size_); if (is_local(i)) { return get_local(i); } else { - // Send a request to the owner of i for the element. - future result; - WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, - result.remote_ref(get_world()), - madness::TaskAttributes::hipri()); - - return result; + if (policy == RemoteDataGetPolicy::nocache) { + // Send a request to the owner of i for the element. 
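+        // (the future is created locally and its remote reference travels
+        // with the request; the owner's get_handler assigns it once the
+        // datum is available)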
+ future result; + WorldObject_::task(owner(i), &DistributedStorage_::get_handler, i, + result.remote_ref(get_world()), + madness::TaskAttributes::hipri()); + return result; + } else + return get_cached(i, policy == RemoteDataGetPolicy::cache); } } @@ -234,6 +404,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. const_accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -249,6 +426,13 @@ class DistributedStorage : public madness::WorldObject > { // Return the local element. accessor acc; [[maybe_unused]] const bool inserted = data_.insert(acc, i); +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + if (inserted) { + auto& f_nonconst_ref = + const_castsecond)>&>(acc->second); + this->trace(f_nonconst_ref); + } +#endif return acc->second; } @@ -308,6 +492,14 @@ class DistributedStorage : public madness::WorldObject > { // Set the future existing_f.set(f); } +#ifdef MADNESS_WORLDOBJECT_FUTURE_TRACE + else { + auto& f_nonconst_ref = + const_castsecond)>&>( + acc->second); + this->trace(f_nonconst_ref); + } +#endif } else { if (f.probe()) { set_remote(i, f); @@ -321,7 +513,25 @@ class DistributedStorage : public madness::WorldObject > { /// Reports the number of live DelayedSet requests /// @return const reference to the atomic counter of live DelayedSet requests - const madness::AtomicInt& num_live_ds() const { return num_live_ds_; } + const std::atomic& num_live_ds() const { return num_live_ds_; } + + /// Reports the number of live DelayedForward requests + + /// @return const reference to the atomic counter of live DelayedForward + /// requests + const std::atomic& num_live_df() const { return num_live_df_; } + +#ifdef TILEDARRAY_ENABLE_GLOBAL_COMM_STATS_TRACE + const std::vector>& ngets_served_per_rank() const { + return ngets_served_per_rank_; + } + const std::vector>& ngets_sent_per_rank() const { + return ngets_sent_per_rank_; + } + const std::vector>& ngets_received_per_rank() const { + return ngets_received_per_rank_; + } +#endif }; // class DistributedStorage } // namespace detail diff --git a/src/TiledArray/einsum/index.h b/src/TiledArray/einsum/index.h index 58c378704b..67e9d6c1a0 100644 --- a/src/TiledArray/einsum/index.h +++ b/src/TiledArray/einsum/index.h @@ -3,10 +3,10 @@ #include "TiledArray/expressions/fwd.h" +#include #include #include #include -#include #include #include @@ -29,10 +29,11 @@ class Index { public: using container_type = small_vector; using value_type = typename container_type::value_type; + using iterator = typename container_type::iterator; Index() = default; Index(const container_type &s) : data_(s) {} - Index(const std::initializer_list &s) : data_(s) {} + explicit Index(const std::initializer_list &s) : data_(s) {} template Index(const S &s) { @@ -45,18 +46,14 @@ class Index { Index(const char (&s)[N]) : Index(std::string(s)) {} template - explicit Index(const char* &s) : Index(std::string(s)) {} + explicit Index(const char *&s) : Index(std::string(s)) {} template explicit Index(const std::string &s) { - static_assert( - std::is_same_v || - std::is_same_v - ); - if constexpr (std::is_same_v) { + static_assert(std::is_same_v || std::is_same_v); + if constexpr (std::is_same_v) { data_ = index::tokenize(s); - } - else { + } else { using std::begin; using std::end; data_.assign(begin(s), end(s)); @@ -78,8 +75,11 @@ 
class Index { size_t size() const { return data_.size(); } - auto begin() const { return data_.begin(); } - auto end() const { return data_.end(); } + auto begin() const { return data_.cbegin(); } + auto end() const { return data_.cend(); } + + auto begin() { return data_.begin(); } + auto end() { return data_.end(); } auto find(const T &v) const { return std::find(this->begin(), this->end(), v); @@ -209,11 +209,8 @@ auto permute(const Permutation &p, const Index &s, if (!p) return s; using R = typename Index::container_type; R r(p.size()); - TiledArray::detail::permute_n( - p.size(), - p.begin(), s.begin(), r.begin(), - std::bool_constant{} - ); + TiledArray::detail::permute_n(p.size(), p.begin(), s.begin(), r.begin(), + std::bool_constant{}); return Index{r}; } @@ -306,8 +303,8 @@ IndexMap operator|(const IndexMap &a, const IndexMap &b) { } // namespace Einsum::index namespace Einsum { - using index::Index; - using index::IndexMap; -} // namespace TiledArray::Einsum +using index::Index; +using index::IndexMap; +} // namespace Einsum #endif /* TILEDARRAY_EINSUM_INDEX_H__INCLUDED */ diff --git a/src/TiledArray/einsum/range.h b/src/TiledArray/einsum/range.h index 32eb669588..79b409e64d 100644 --- a/src/TiledArray/einsum/range.h +++ b/src/TiledArray/einsum/range.h @@ -14,7 +14,8 @@ using small_vector = TiledArray::container::svector; struct Range { using value_type = int64_t; using iterator = boost::counting_iterator; - template + template , bool> = true> explicit Range(Pair &&pair) : Range(pair.first, pair.second) {} Range(value_type begin, value_type end) : begin_(begin), end_(end) {} auto begin() const { return iterator(begin_); } diff --git a/src/TiledArray/einsum/string.h b/src/TiledArray/einsum/string.h index 7647aed63b..d2dc6048ab 100644 --- a/src/TiledArray/einsum/string.h +++ b/src/TiledArray/einsum/string.h @@ -1,50 +1,50 @@ #ifndef TILEDARRAY_EINSUM_STRING_H #define TILEDARRAY_EINSUM_STRING_H +#include #include #include -#include +#include #include #include namespace Einsum::string { namespace { - // Split delimiter must match completely - template - std::pair split2(const std::string& s, const std::string &d) { - auto pos = s.find(d); - if (pos == s.npos) return { T(s), U("") }; - return { T(s.substr(0,pos)), U(s.substr(pos+d.size())) }; - } +// Split delimiter must match completely +template +std::pair split2(const std::string& s, const std::string& d) { + auto pos = s.find(d); + if (pos == s.npos) return {T(s), U("")}; + return {T(s.substr(0, pos)), U(s.substr(pos + d.size()))}; +} - // Split delimiter must match completely - std::vector split(const std::string& s, char d) { - std::vector res; - return boost::split(res, s, [&d](char c) { return c == d; } /*boost::is_any_of(d)*/); - } +// Split delimiter must match completely +std::vector split(const std::string& s, char d) { + std::vector res; + return boost::split(res, s, + [&d](char c) { return c == d; } /*boost::is_any_of(d)*/); +} - std::string trim(const std::string& s) { - return boost::trim_copy(s); - } +std::string trim(const std::string& s) { return boost::trim_copy(s); } - template - std::string str(const T& obj) { - std::stringstream ss; - ss << obj; - return ss.str(); - } +template +std::string str(const T& obj) { + std::stringstream ss; + ss << obj; + return ss.str(); +} - template - std::string join(const T &s, const U& j = U("")) { - std::vector strings; - for (auto e : s) { - strings.push_back(str(e)); - } - return boost::join(strings, j); +template +std::string join(const T& s, const U& j = U("")) { + 
std::vector<std::string> strings; + for (auto e : s) { + strings.push_back(str(e)); } - -} + return boost::join(strings, j); } } -#endif //TILEDARRAY_EINSUM_STRING_H +} // namespace +} // namespace Einsum::string + +#endif // TILEDARRAY_EINSUM_STRING_H diff --git a/src/TiledArray/einsum/tiledarray.h b/src/TiledArray/einsum/tiledarray.h index c248956066..ace7caa15a 100644 --- a/src/TiledArray/einsum/tiledarray.h +++ b/src/TiledArray/einsum/tiledarray.h @@ -9,6 +9,10 @@ #include "TiledArray/tiled_range.h" #include "TiledArray/tiled_range1.h" +namespace TiledArray { +enum struct DeNest { True, False }; +} + namespace TiledArray::Einsum { using ::Einsum::index::small_vector; @@ -22,6 +26,102 @@ using ::Einsum::index::IndexMap; using ::Einsum::index::Permutation; using ::Einsum::index::permutation; +/// +/// \tparam T A type that parameterizes ::Einsum::Index. +/// +/// This class makes it easier to work with indices involved in a binary +/// tensor multiplication. Also defines a canonical order of the indices. +/// +/// Consider an arbitrary binary tensor multiplication annotated as: +/// A(a_1,...,a_m) * B(b_1,...,b_n) -> C(c_1,...,c_l) +/// Note that {c_1,...,c_l} is a subset of ({a_1,...,a_m} union {b_1,...,b_n}). +/// +/// We define the following index types: +/// * Hadamard index: An index that annotates A, B, and C. +/// * Contracted index: An index that annotates A and B but not C. +/// * External index of A: An index that annotates A and C but not B. +/// * External index of B: An index that annotates B and C but not A. +/// +/// Definition of canonical index ordering: +/// * Hadamard indices are canonically ordered if they appear in the same +/// order in A's annotation. +/// * Contracted indices are canonically ordered if they appear in the same +/// order in A's annotation. +/// * External indices of A are canonically ordered if they appear in the +/// same order in A's annotation. +/// * External indices of B are canonically ordered if they appear in the +/// same order in B's annotation. +/// * Tensor A's indices are canonically ordered if Hadamard, external +/// indices of A, and contracted indices appear in that order and all +/// three index groups are themselves canonically ordered. +/// * Tensor B's indices are canonically ordered if Hadamard, external +/// indices of B, and contracted indices appear in that order and all +/// three index groups are themselves canonically ordered. +/// * Tensor C's indices are canonically ordered if Hadamard, external +/// indices of A and external indices of B appear in that order and all +/// three index groups are themselves canonically ordered. +/// +/// Example: Consider the evaluation A(i,j,p,a,b) * B(j,i,q,b,a) -> C(i,p,j,q). +/// - Hadamard indices: {i,j} +/// - External indices of A: {p} +/// - External indices of B: {q} +/// - Contracted indices: {a, b} +/// All index groups above are canonically ordered. +/// Writing C's indices in canonical order would give: {i,j,p,q}.
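+/// (Illustration derived from the ordering rules above: A's indices in +/// canonical order would be {i,j,p,a,b}, and B's would be {i,j,q,a,b}, i.e. +/// Hadamard, then external, then contracted indices.)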
+/// +template +class TensorOpIndices { + public: + using index_t = ::Einsum::Index; + + TensorOpIndices(index_t const &ixA, index_t const &ixB, index_t const &ixC) + : orig_indices_({ixA, ixB, ixC}) { + hadamard_ = ixA & ixB & ixC; + contracted_ = (ixA & ixB) - ixC; + external_A_ = (ixA - ixB) & ixC; + external_B_ = (ixB - ixA) & ixC; + } + + [[nodiscard]] index_t const &ix_A() const { return orig_indices_[A]; } + [[nodiscard]] index_t const &ix_B() const { return orig_indices_[B]; } + [[nodiscard]] index_t const &ix_C() const { return orig_indices_[C]; } + + [[nodiscard]] index_t ix_A_canon() const { + return hadamard() + external_A() + contracted(); + } + + [[nodiscard]] index_t ix_B_canon() const { + return hadamard() + external_B() + contracted(); + } + + [[nodiscard]] index_t ix_C_canon() const { + return hadamard() + external_A() + external_B(); + } + + [[nodiscard]] index_t const &hadamard() const { return hadamard_; } + [[nodiscard]] index_t const &contracted() const { return contracted_; } + [[nodiscard]] index_t const &external_A() const { return external_A_; } + [[nodiscard]] index_t const &external_B() const { return external_B_; } + + [[nodiscard]] Permutation to_canon_A() const { + return ::Einsum::index::permutation(ix_A(), ix_A_canon()); + } + + [[nodiscard]] Permutation to_canon_B() const { + return ::Einsum::index::permutation(ix_B(), ix_B_canon()); + } + + [[nodiscard]] Permutation to_canon_C() const { + return ::Einsum::index::permutation(ix_C(), ix_C_canon()); + } + + private: + enum { A, B, C, ABC }; + std::array orig_indices_; + + index_t hadamard_, contracted_, external_A_, external_B_; +}; + /// converts the annotation of an expression to an Index template auto idx(const std::string &s) { @@ -64,13 +164,264 @@ struct ArrayTerm { } }; -template -auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, +namespace { +template +constexpr bool IsArrayT = detail::is_tensor_v; + +template +constexpr bool IsArrayToT = + detail::is_tensor_of_tensor_v; + +template +constexpr bool AreArrayT = IsArrayT && IsArrayT; + +template +constexpr bool AreArrayToT = IsArrayToT && IsArrayToT; + +template +constexpr bool AreArraySame = + AreArrayT || AreArrayToT; + +template +using DeNestedArray = DistArray; + +template +using MaxNestedArray = std::conditional_t<(detail::nested_rank > + detail::nested_rank), + Array2, Array1>; + +} // namespace + +namespace { + +/// +/// \brief This function replicates a tensor B into a tensor A such that +/// A(a_1,...a_k,i_1,...,i_l) = B(i_1,...,i_l). Evidently, the +/// extents of i_n modes must match in both A and B. +/// +/// \tparam Tensor TiledArray::Tensor type. +/// \param to The target tensor. +/// \param from The source tensor that will be replicated into \c to. +/// +template >> +void replicate_tensor(Tensor &to, Tensor const &from) { + // assert that corresponding modes have the same extents + TA_ASSERT(std::equal(from.range().extent().rbegin(), + from.range().extent().rend(), + to.range().extent().rbegin())); + + // number of elements to be copied + // (same as the number of elements in @c from) + auto const N = from.range().volume(); + for (auto i = 0; i < to.range().volume(); i += N) + std::copy(from.begin(), from.end(), to.data() + i); +} + +/// +/// \brief This function is the @c DistArray counterpart of the function +/// @c replicate_tensor(TA::Tensor&, TA::Tensor const&). +/// +/// \tparam Array +/// \param from The DistArray to be by-rank replicated. 
+/// \param prepend_trng TiledRange1s in this argument will be prepended to the +/// `TiledRange` of the argument array. +/// \return An array whose rank is increased by `prepend_trng.rank()`. +/// \see `replicate_tensor` +/// +template >> +auto replicate_array(Array from, TiledRange const &prepend_trng) { + auto const result_rank = prepend_trng.rank() + rank(from); + container::svector<TiledRange1> tr1s; + tr1s.reserve(result_rank); + for (auto const &r : prepend_trng) tr1s.emplace_back(r); + for (auto const &r : from.trange()) tr1s.emplace_back(r); + auto const result_trange = TiledRange(tr1s); + + from.make_replicated(); + auto &world = from.world(); + world.gop.fence(); + + auto result = make_array<Array>( + world, result_trange, + [from, res_tr = result_trange, delta_rank = prepend_trng.rank()]( + auto &tile, auto const &res_rng) { + using std::begin; + using std::end; + using std::next; + + typename Array::value_type repped(res_rng); + auto res_coord_ix = res_tr.element_to_tile(res_rng.lobound()); + auto from_coord_ix = decltype(res_coord_ix)( + next(begin(res_coord_ix), delta_rank), end(res_coord_ix)); + if (from.is_zero(from_coord_ix)) return typename Array::scalar_type{0}; + replicate_tensor(repped, from.find_local(from_coord_ix).get(false)); + tile = repped; + return tile.norm(); + }); + + if constexpr (std::is_same_v) + result.truncate(); + + return result; +} + +/// +/// Given a rank-N tensor and a ∂-rank such that ∂ in [0,N), returns a new +/// rank-N' tensor (where N' = N - ∂) by summing over the ∂ modes from the +/// end of the input tensor's range. For example, reduce_modes(A, 2) where +/// A.range().rank() == 5 will result in a new tensor (B) of rank-3 such that +/// B(i,j,k) = Σ_l Σ_m A(i,j,k,l,m). +/// +/// \param orig Input Tensor. +/// \param drank Reduce this many modes from the end as implied in the +/// range of the input tensor. +/// \return Tensor with reduced rank. +/// +template <typename Tensor> +auto reduce_modes(Tensor const &orig, size_t drank) { + if (drank == 0) return orig; + TA_ASSERT(orig.nbatch() == 1); + auto const orig_rng = orig.range(); + TA_ASSERT(orig_rng.rank() > drank); + + auto const result_rng = [orig_rng, drank]() { + container::vector r1s; + for (auto i = 0; i < orig_rng.rank() - drank; ++i) + r1s.emplace_back(orig_rng.dim(i)); + return TA::Range(r1s); + }(); + + auto const delta_rng = [orig_rng, drank]() { + container::vector r1s; + for (auto i = orig_rng.rank() - drank; i < orig_rng.rank(); ++i) + r1s.emplace_back(orig_rng.dim(i)); + return TA::Range(r1s); + }(); + + auto const delta_vol = delta_rng.volume(); + + auto reducer = [orig, delta_vol, delta_rng](auto const &ix) { + auto orig_ix = ix; + std::copy(delta_rng.lobound().begin(), // + delta_rng.lobound().end(), // + std::back_inserter(orig_ix)); + + auto beg = orig.data() + orig.range().ordinal(orig_ix); + auto end = beg + delta_vol; + + // cannot get it done this way: return std::reduce(beg, end); + + typename std::iterator_traits<decltype(beg)>::value_type sum{}; + for (; beg != end; ++beg) sum += *beg; + return sum; + }; + + return Tensor(result_rng, reducer); +} + +/// +/// \param orig Input DistArray. +/// \param drank Reduce this many modes from the end as implied in the +/// tiled range of the input array. +/// \return Array with reduced rank.
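+/// \note A sketch of the algorithm implemented below: the array is made +/// replicated, each local tile is reduced with the Tensor-level reduce_modes +/// above, and the per-tile partial sums are accumulated into the result tile.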
+/// \see reduce_modes(Tensor, size_t) +/// +template +auto reduce_modes(TA::DistArray orig, size_t drank) { + TA_ASSERT(orig.trange().rank() > drank); + if (drank == 0) return orig; + + auto const result_trange = [orig, drank]() { + container::svector tr1s; + for (auto i = 0; i < (orig.trange().rank() - drank); ++i) + tr1s.emplace_back(orig.trange().at(i)); + return TiledRange(tr1s); + }(); + + auto const delta_trange = [orig, drank]() { + container::svector tr1s; + for (auto i = orig.trange().rank() - drank; i < orig.trange().rank(); ++i) + tr1s.emplace_back(orig.trange().at(i)); + return TiledRange(tr1s); + }(); + + orig.make_replicated(); + orig.world().gop.fence(); + + auto make_tile = [orig, delta_trange, drank](auto &tile, auto const &rng) { + using tile_type = std::remove_reference_t; + + tile_type res(rng, typename tile_type::value_type{}); + + bool all_summed_tiles_zeros{true}; + for (auto &&r : delta_trange.tiles_range()) { + container::svector ix1s = rng.lobound(); + + { + auto d = delta_trange.make_tile_range(r); + auto dlo = d.lobound(); + std::copy(dlo.begin(), dlo.end(), std::back_inserter(ix1s)); + } + + auto tix = orig.trange().element_to_tile(ix1s); + if constexpr (std::is_same_v) + if (orig.is_zero(tix)) continue; + auto got = orig.find_local(tix).get(false); + + res += reduce_modes(got, drank); + all_summed_tiles_zeros = false; + } + + if (all_summed_tiles_zeros) + return typename std::remove_reference_t::scalar_type{0}; + + tile = res; + return res.norm(); + }; + + auto result = + make_array>(orig.world(), result_trange, make_tile); + if constexpr (std::is_same_v) result.truncate(); + + return result; +} + +/// +/// \tparam Ixs Iterable of indices. +/// \param map A map from the index type of \c Ixs to TiledRange1. +/// \param ixs Iterable of indices. +/// \return TiledRange object. 
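+/// Illustration (hypothetical values): if map sends 'i' to a TiledRange1 with +/// tile boundaries {0,4,8} and 'j' to one with boundaries {0,2}, then +/// make_trange(map, Index<char>{'i', 'j'}) returns TiledRange{{0,4,8}, {0,2}}.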
+/// +template +TiledRange make_trange(RangeMap const &map, Ixs const &ixs) { + container::svector tr1s; + tr1s.reserve(ixs.size()); + for (auto &&i : ixs) tr1s.emplace_back(map[i]); + return TiledRange(tr1s); +} + +} // namespace + +template +auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, std::tuple, Indices...> cs, World &world) { - using Array = std::remove_cv_t; - using Tensor = typename Array::value_type; - using Shape = typename Array::shape_type; + using ArrayA = std::remove_cv_t; + using ArrayB = std::remove_cv_t; + + using ArrayC = + std::conditional_t, + MaxNestedArray>; + + using ResultTensor = typename ArrayC::value_type; + using ResultShape = typename ArrayC::shape_type; + + auto const& tnsrExprA = A; + auto const& tnsrExprB = B; auto a = std::get<0>(Einsum::idx(A)); auto b = std::get<0>(Einsum::idx(B)); @@ -78,209 +429,394 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, struct { std::string a, b, c; + // Hadamard, external, internal indices for inner tensor + Einsum::Index A, B, C, h, e, i; } inner; - if constexpr (std::tuple_size::value == 2) { + + if constexpr (IsArrayToT) { inner.a = ";" + (std::string)std::get<1>(Einsum::idx(A)); + inner.A = std::get<1>(Einsum::idx(A)); + } + + if constexpr (IsArrayToT) { inner.b = ";" + (std::string)std::get<1>(Einsum::idx(B)); + inner.B = std::get<1>(Einsum::idx(B)); + } + + if constexpr (std::tuple_size::value == 2) { + static_assert(IsArrayToT); inner.c = ";" + (std::string)std::get<1>(cs); + inner.C = std::get<1>(cs); } - // these are "Hadamard" (fused) indices - auto h = a & b & c; + { + inner.h = inner.A & inner.B & inner.C; + inner.e = (inner.A ^ inner.B); + inner.i = (inner.A & inner.B) - inner.h; + if constexpr (IsArrayToT) + TA_ASSERT(!(inner.h && (inner.i || inner.e)) && + "General product between inner tensors not supported"); + } - // no Hadamard indices => standard contraction (or even outer product) - // same a, b, and c => pure Hadamard - if (!h || (!(a ^ b) && !(b ^ c))) { - Array C; - C(std::string(c) + inner.c) = A * B; + if constexpr (DeNestFlag == DeNest::True) { + static_assert(detail::nested_rank == detail::nested_rank && + detail::nested_rank == 2); + + TA_ASSERT(!inner.C && + "Denested result cannot have inner-tensor annotation"); + + TA_ASSERT(inner.i.size() == inner.A.size() && + inner.i.size() == inner.B.size() && + "Nested-rank-reduction only supported when the inner tensor " + "ranks match on the arguments"); + + // + // Illustration of steps by an example. + // + // Consider the evaluation: A(ijpab;xy) * B(jiqba;yx) -> C(ipjq). + // + // Note for the outer indices: + // - Hadamard: 'ij' + // - External A: 'p' + // - External B: 'q' + // - Contracted: 'ab' + // + // Now C is evaluated in the following steps. 
+ // Step I: A(ijpab;xy) * B(jiqba;yx) -> C0(ijpqab;xy) + // Step II: C0(ijpqab;xy) -> C1(ijpqab) + // Step III: C1(ijpqab) -> C2(ijpq) + // Step IV: C2(ijpq) -> C(ipjq) + + auto sum_tot_2_tos = [](auto const &tot) { + using tot_t = std::remove_reference_t; + typename tot_t::value_type result( + tot.range(), [tot](auto &&ix) { + if (!tot(ix).empty()) + return tot(ix).sum(); + else return typename tot_t::numeric_type{}; + }); + return result; + }; + + auto const oixs = TensorOpIndices(a, b, c); + + struct { + std::string C0, C1, C2; + } const Cn_annot{ + std::string(oixs.ix_C_canon() + oixs.contracted()) + inner.a, + {oixs.ix_C_canon() + oixs.contracted()}, + {oixs.ix_C_canon()}}; + + // Step I: A(ijpab;xy) * B(jiqba;yx) -> C0(ijpqab;xy) + auto C0 = einsum(A, B, Cn_annot.C0); + + // Step II: C0(ijpqab;xy) -> C1(ijpqab) + auto C1 = TA::foreach( + C0, [sum_tot_2_tos](auto &out_tile, auto const &in_tile) { + out_tile = sum_tot_2_tos(in_tile); + }); + + // Step III: C1(ijpqab) -> C2(ijpq) + auto C2 = reduce_modes(C1, oixs.contracted().size()); + + // Step IV: C2(ijpq) -> C(ipjq) + ArrayC C; + C(c) = C2(Cn_annot.C2); return C; - } - auto e = (a ^ b); - // contracted indices - auto i = (a & b) - h; + } else { + // these are "Hadamard" (fused) indices + auto h = a & b & c; + + // external indices + auto e = (a ^ b); + + // contracted indices + auto i = (a & b) - h; + + // + // *) Pure Hadamard indices: (h && !(i || e)) is true implies + // the evaluation can be delegated to the expression layer + // for distarrays of both nested and non-nested tensor tiles. + // *) If no Hadamard indices are present (!h) the evaluation + // can be delegated to the expression layer. + // + if ((h && !(i || e)) // pure Hadamard + || !h) // no Hadamard + { + ArrayC C; + C(std::string(c) + inner.c) = A * B; + return C; + } + + TA_ASSERT(e || h); + + auto range_map = + (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + + // special Hadamard + if (h.size() == a.size() || h.size() == b.size()) { + TA_ASSERT(!i && e); + bool const small_a = h.size() == a.size(); + auto const delta_trng = make_trange(range_map, e); + std::string target_layout = std::string(c) + inner.c; + ArrayC C; + if (small_a) { + auto temp = replicate_array(A.array(), delta_trng); + std::string temp_layout = std::string(e) + "," + A.annotation(); + C(target_layout) = temp(temp_layout) * B; + } else { + auto temp = replicate_array(B.array(), delta_trng); + std::string temp_layout = std::string(e) + "," + B.annotation(); + C(target_layout) = A * temp(temp_layout); + } - TA_ASSERT(e || h); + return C; + } - auto range_map = - (RangeMap(a, A.array().trange()) | RangeMap(b, B.array().trange())); + using ::Einsum::index::permutation; + using TiledArray::Permutation; + + std::tuple, ArrayTerm> AB{{A.array(), a}, + {B.array(), b}}; - using ::Einsum::index::permutation; - using TiledArray::Permutation; + auto update_perm_and_indices = [&e = std::as_const(e), + &i = std::as_const(i), + &h = std::as_const(h)](auto &term) { + auto ei = (e + i & term.idx); + if (term.idx != h + ei) { + term.permutation = permutation(term.idx, h + ei); + } + term.expr = ei; + }; - ArrayTerm AB[2] = {{A.array(), a}, {B.array(), b}}; + std::invoke(update_perm_and_indices, std::get<0>(AB)); + std::invoke(update_perm_and_indices, std::get<1>(AB)); - for (auto &term : AB) { - auto ei = (e + i & term.idx); - if (term.idx != h + ei) { - term.permutation = permutation(term.idx, h + ei); + ArrayTerm C = {ArrayC(world, TiledRange(range_map[c])), c}; + for (auto idx : e) { 
+ C.tiles *= Range(range_map[idx].tiles_range()); } - term.expr = ei; - } + if (C.idx != h + e) { + C.permutation = permutation(h + e, C.idx); + } + C.expr = e; - ArrayTerm C = {Array(world, TiledRange(range_map[c])), c}; - for (auto idx : e) { - C.tiles *= Range(range_map[idx].tiles_range()); - } - if (C.idx != h + e) { - C.permutation = permutation(h + e, C.idx); - } - C.expr = e; + std::get<0>(AB).expr += inner.a; + std::get<1>(AB).expr += inner.b; - AB[0].expr += inner.a; - AB[1].expr += inner.b; - C.expr += inner.c; + C.expr += inner.c; - struct { - RangeProduct tiles; - std::vector> batch; - } H; + struct { + RangeProduct tiles; + std::vector> batch; + } H; - for (auto idx : h) { - H.tiles *= Range(range_map[idx].tiles_range()); - H.batch.push_back({}); - for (auto r : range_map[idx]) { - H.batch.back().push_back(Range{r}.size()); + for (auto idx : h) { + H.tiles *= Range(range_map[idx].tiles_range()); + H.batch.push_back({}); + for (auto r : range_map[idx]) { + H.batch.back().push_back(Range{r}.size()); + } } - } - using Index = Einsum::Index; + using Index = Einsum::Index; - if constexpr (std::tuple_size::value > 1) { - TA_ASSERT(e); - } else if (!e) { // hadamard reduction - auto &[A, B] = AB; - TiledRange trange(range_map[i]); - RangeProduct tiles; - for (auto idx : i) { - tiles *= Range(range_map[idx].tiles_range()); - } - auto pa = A.permutation; - auto pb = B.permutation; - for (Index h : H.tiles) { - if (!C.array.is_local(h)) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); + if (!e) { // hadamard reduction + auto &[A, B] = AB; + TiledRange trange(range_map[i]); + RangeProduct tiles; + for (auto idx : i) { + tiles *= Range(range_map[idx].tiles_range()); } - Tensor tile(TiledArray::Range{batch}, typename Tensor::value_type(0)); - for (Index i : tiles) { - // skip this unless both input tiles exist - const auto pahi_inv = apply_inverse(pa, h + i); - const auto pbhi_inv = apply_inverse(pb, h + i); - if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; - - auto ai = A.array.find(pahi_inv).get(); - auto bi = B.array.find(pbhi_inv).get(); - if (pa) ai = ai.permute(pa); - if (pb) bi = bi.permute(pb); - auto shape = trange.tile(i); - ai = ai.reshape(shape, batch); - bi = bi.reshape(shape, batch); - for (size_t k = 0; k < batch; ++k) { - auto hk = ai.batch(k).dot(bi.batch(k)); - tile({k}) += hk; + auto pa = A.permutation; + auto pb = B.permutation; + for (Index h : H.tiles) { + if (!C.array.is_local(h)) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); } + ResultTensor tile(TiledArray::Range{batch}, + typename ResultTensor::value_type{}); + for (Index i : tiles) { + // skip this unless both input tiles exist + const auto pahi_inv = apply_inverse(pa, h + i); + const auto pbhi_inv = apply_inverse(pb, h + i); + if (A.array.is_zero(pahi_inv) || B.array.is_zero(pbhi_inv)) continue; + + auto ai = A.array.find(pahi_inv).get(); + auto bi = B.array.find(pbhi_inv).get(); + if (pa) ai = ai.permute(pa); + if (pb) bi = bi.permute(pb); + auto shape = trange.tile(i); + ai = ai.reshape(shape, batch); + bi = bi.reshape(shape, batch); + for (size_t k = 0; k < batch; ++k) { + using Ix = ::Einsum::Index; + if constexpr (AreArrayToT) { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + using TensorT = std::remove_reference_t; + + auto mult_op = [&inner](auto const &l, auto const &r) -> 
TensorT { + if (l.empty() || r.empty()) return TensorT{}; + return inner.h ? TA::detail::tensor_hadamard(l, inner.A, r, + inner.B, inner.C) + : TA::detail::tensor_contract(l, inner.A, r, + inner.B, inner.C); + }; + + for (auto i = 0; i < vol; ++i) + el.add_to(mult_op(aik.data()[i], bik.data()[i])); + + } else if constexpr (!AreArraySame) { + auto aik = ai.batch(k); + auto bik = bi.batch(k); + auto vol = aik.total_size(); + TA_ASSERT(vol == bik.total_size()); + + auto &el = tile({k}); + + for (auto i = 0; i < vol; ++i) + if constexpr (IsArrayToT) { + el.add_to(aik.data()[i].scale(bik.data()[i])); + } else { + el.add_to(bik.data()[i].scale(aik.data()[i])); + } + + } else { + auto hk = ai.batch(k).dot(bi.batch(k)); + tile({k}) += hk; + } + } + } + auto pc = C.permutation; + auto shape = apply_inverse(pc, C.array.trange().tile(h)); + tile = tile.reshape(shape); + if (pc) tile = tile.permute(pc); + C.array.set(h, tile); } - auto pc = C.permutation; - auto shape = apply_inverse(pc, C.array.trange().tile(h)); - tile = tile.reshape(shape); - if (pc) tile = tile.permute(pc); - C.array.set(h, tile); + return C.array; } - return C.array; - } - // generalized contraction - - for (auto &term : AB) { - auto ei = (e + i & term.idx); - term.ei_tiled_range = TiledRange(range_map[ei]); - for (auto idx : ei) { - term.tiles *= Range(range_map[idx].tiles_range()); + // generalized contraction + + if constexpr (IsArrayToT) { + if (inner.C != inner.h + inner.e) { + // when inner tensor permutation is non-trivial (could be potentially + // elided by extending this function (@c einsum) to take into account + // of inner tensor's permutations) + auto temp_annot = std::string(c) + ";" + std::string(inner.h + inner.e); + ArrayC temp = einsum(tnsrExprA, tnsrExprB, + Einsum::idx(temp_annot), world); + ArrayC result; + result(std::string(c) + inner.c) = temp(temp_annot); + return result; + } } - } - std::vector> worlds; - std::vector> local_tiles; + auto update_tr = [&e = std::as_const(e), &i = std::as_const(i), + &range_map = std::as_const(range_map)](auto &term) { + auto ei = (e + i & term.idx); + term.ei_tiled_range = TiledRange(range_map[ei]); + for (auto idx : ei) { + term.tiles *= Range(range_map[idx].tiles_range()); + } + }; - // iterates over tiles of hadamard indices - for (Index h : H.tiles) { - auto &[A, B] = AB; - auto own = A.own(h) || B.own(h); - auto comm = world.mpi.comm().Split(own, world.rank()); - worlds.push_back(std::make_unique(comm)); - auto &owners = worlds.back(); - if (!own) continue; - size_t batch = 1; - for (size_t i = 0; i < h.size(); ++i) { - batch *= H.batch[i].at(h[i]); - } - for (auto &term : AB) { - term.local_tiles.clear(); - const Permutation &P = term.permutation; + std::invoke(update_tr, std::get<0>(AB)); + std::invoke(update_tr, std::get<1>(AB)); - for (Index ei : term.tiles) { - auto idx = apply_inverse(P, h + ei); - if (!term.array.is_local(idx)) continue; - if (term.array.is_zero(idx)) continue; + std::vector> worlds; + std::vector> local_tiles; + + // iterates over tiles of hadamard indices + for (Index h : H.tiles) { + auto &[A, B] = AB; + auto own = A.own(h) || B.own(h); + auto comm = world.mpi.comm().Split(own, world.rank()); + worlds.push_back(std::make_unique(comm)); + auto &owners = worlds.back(); + if (!own) continue; + size_t batch = 1; + for (size_t i = 0; i < h.size(); ++i) { + batch *= H.batch[i].at(h[i]); + } + + auto retile = [&owners, &h = std::as_const(h), batch](auto &term) { + term.local_tiles.clear(); + const Permutation &P = term.permutation; + + for (Index 
ei : term.tiles) { + auto idx = apply_inverse(P, h + ei); + if (!term.array.is_local(idx)) continue; + if (term.array.is_zero(idx)) continue; + // TODO no need for immediate evaluation + auto tile = term.array.find_local(idx).get(); + if (P) tile = tile.permute(P); + auto shape = term.ei_tiled_range.tile(ei); + tile = tile.reshape(shape, batch); + term.local_tiles.push_back({ei, tile}); + } + bool replicated = term.array.pmap()->is_replicated(); + term.ei = TiledArray::make_array( + *owners, term.ei_tiled_range, term.local_tiles.begin(), + term.local_tiles.end(), replicated); + }; + std::invoke(retile, std::get<0>(AB)); + std::invoke(retile, std::get<1>(AB)); + + C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); + A.ei.defer_deleter_to_next_fence(); + B.ei.defer_deleter_to_next_fence(); + A.ei = ArrayA(); + B.ei = ArrayB(); + // why omitting this fence leads to deadlock? + owners->gop.fence(); + for (Index e : C.tiles) { + if (!C.ei.is_local(e)) continue; + if (C.ei.is_zero(e)) continue; // TODO no need for immediate evaluation - auto tile = term.array.find_local(idx).get(); + auto tile = C.ei.find_local(e).get(); + assert(tile.nbatch() == batch); + const Permutation &P = C.permutation; + auto c = apply(P, h + e); + auto shape = C.array.trange().tile(c); + shape = apply_inverse(P, shape); + tile = tile.reshape(shape); if (P) tile = tile.permute(P); - auto shape = term.ei_tiled_range.tile(ei); - tile = tile.reshape(shape, batch); - term.local_tiles.push_back({ei, tile}); + local_tiles.push_back({c, tile}); } - bool replicated = term.array.pmap()->is_replicated(); - term.ei = TiledArray::make_array( - *owners, term.ei_tiled_range, term.local_tiles.begin(), - term.local_tiles.end(), replicated); + // mark for lazy deletion + C.ei = ArrayC(); } - C.ei(C.expr) = (A.ei(A.expr) * B.ei(B.expr)).set_world(*owners); - A.ei.defer_deleter_to_next_fence(); - B.ei.defer_deleter_to_next_fence(); - A.ei = Array(); - B.ei = Array(); - // why omitting this fence leads to deadlock? 
- owners->gop.fence(); - for (Index e : C.tiles) { - if (!C.ei.is_local(e)) continue; - if (C.ei.is_zero(e)) continue; - // TODO no need for immediate evaluation - auto tile = C.ei.find_local(e).get(); - assert(tile.batch_size() == batch); - const Permutation &P = C.permutation; - auto c = apply(P, h + e); - auto shape = C.array.trange().tile(c); - shape = apply_inverse(P, shape); - tile = tile.reshape(shape); - if (P) tile = tile.permute(P); - local_tiles.push_back({c, tile}); + + if constexpr (!ResultShape::is_dense()) { + TiledRange tiled_range = TiledRange(range_map[c]); + std::vector> tile_norms; + for (auto &[index, tile] : local_tiles) { + tile_norms.push_back({index, tile.norm()}); + } + ResultShape shape(world, tile_norms, tiled_range); + C.array = ArrayC(world, TiledRange(range_map[c]), shape); } - // mark for lazy deletion - C.ei = Array(); - } - if constexpr (!Shape::is_dense()) { - TiledRange tiled_range = TiledRange(range_map[c]); - std::vector> tile_norms; for (auto &[index, tile] : local_tiles) { - tile_norms.push_back({index, tile.norm()}); + if (C.array.is_zero(index)) continue; + C.array.set(index, tile); } - Shape shape(world, tile_norms, tiled_range); - C.array = Array(world, TiledRange(range_map[c]), shape); - } - for (auto &[index, tile] : local_tiles) { - if (C.array.is_zero(index)) continue; - C.array.set(index, tile); - } + for (auto &w : worlds) { + w->gop.fence(); + } - for (auto &w : worlds) { - w->gop.fence(); + return C.array; } - - return C.array; } /// Computes ternary tensor product whose result @@ -419,12 +955,19 @@ auto einsum(expressions::TsrExpr A, expressions::TsrExpr B) { /// @param[in] r result indices /// @warning just as in the plain expression code, reductions are a special /// case; use Expr::reduce() -template +template auto einsum(expressions::TsrExpr A, expressions::TsrExpr B, const std::string &cs, World &world = get_default_world()) { - static_assert(std::is_same::value); - using E = expressions::TsrExpr; - return Einsum::einsum(E(A), E(B), Einsum::idx(cs), world); + using ECT = expressions::TsrExpr; + using ECU = expressions::TsrExpr; + + using ResultExprT = + std::conditional_t, + Einsum::MaxNestedArray>; + + return Einsum::einsum(ECT(A), ECU(B), + Einsum::idx(cs), world); } template @@ -443,14 +986,44 @@ namespace TiledArray { using expressions::dot; using expressions::einsum; -template -auto einsum(const std::string &expr, const DistArray &A, - const DistArray &B, World &world = get_default_world()) { - namespace string = ::Einsum::string; - auto [lhs, rhs] = string::split2(expr, "->"); - auto [a, b] = string::split2(lhs, ","); - return einsum(A(string::join(a, ",")), B(string::join(b, ",")), - string::join(rhs, ","), world); +template +auto einsum(const std::string &expr, const DistArray &A, + const DistArray &B, World &world = get_default_world()) { + using ::Einsum::string::join; + using ::Einsum::string::split2; + + struct { + std::string A, B, C; + } annot; + + { + struct { + std::string A, B, C; + } outer; + + struct { + std::string A, B, C; + } inner; + + auto [ab, aC] = split2(expr, "->"); + std::tie(outer.C, inner.C) = split2(aC, ";"); + + auto [aA, aB] = split2(ab, ","); + std::tie(outer.A, inner.A) = split2(aA, ";"); + std::tie(outer.B, inner.B) = split2(aB, ";"); + + auto combine = [](auto const &outer, auto const &inner) { + return inner.empty() ? 
join(outer, ",") + : (join(outer, ",") + ";" + join(inner, ",")); + }; + + annot.A = combine(outer.A, inner.A); + annot.B = combine(outer.B, inner.B); + annot.C = combine(outer.C, inner.C); + } + + return einsum(A(annot.A), B(annot.B), annot.C, world); } /// Computes ternary tensor product whose result diff --git a/src/TiledArray/expressions/add_engine.h b/src/TiledArray/expressions/add_engine.h index 9421f6ffb2..f4a879365a 100644 --- a/src/TiledArray/expressions/add_engine.h +++ b/src/TiledArray/expressions/add_engine.h @@ -195,10 +195,11 @@ class AddEngine : public BinaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + return op_type(op_base_type(), std::forward<Perm>(perm)); } /// Expression identification tag @@ -296,10 +297,11 @@ class ScalAddEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(factor_), std::forward<Perm>(perm)); } /// Scaling factor accessor diff --git a/src/TiledArray/expressions/binary_engine.h b/src/TiledArray/expressions/binary_engine.h index 4758ab0069..486c5421a1 100644 --- a/src/TiledArray/expressions/binary_engine.h +++ b/src/TiledArray/expressions/binary_engine.h @@ -75,9 +75,10 @@ class BinaryEngine : public ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -96,14 +97,14 @@ class BinaryEngine : public ExprEngine { PermutationType right_inner_permtype_ = PermutationType::general; ///< Right-hand permutation type - template + template void init_indices_(const BipartiteIndexList& target_indices = {}) { - static_assert(ProductType == TensorProduct::Contraction || - ProductType == TensorProduct::Hadamard); + static_assert(OuterProductType == TensorProduct::Contraction || + OuterProductType == TensorProduct::Hadamard); // prefer to permute the arg with fewest leaves to try to minimize the // number of possible permutations using permopt_type = - std::conditional_t; @@ -146,34 +147,29 @@ class BinaryEngine : public ExprEngine { TiledArray::detail::is_tensor_of_tensor_v; constexpr bool right_tile_is_tot = TiledArray::detail::is_tensor_of_tensor_v; - static_assert(!(left_tile_is_tot ^ right_tile_is_tot), - "ContEngine can only handle tensors of same nested-ness " - "(both plain or both ToT)"); constexpr bool args_are_plain_tensors = !left_tile_is_tot && !right_tile_is_tot; - if (args_are_plain_tensors && - (left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity)) { - left_.permute_tiles(false); + constexpr bool args_are_mixed_tensors = + left_tile_is_tot ^ right_tile_is_tot; + // implicit_permute_{outer,inner}() denotes whether permutations will be + // fused into the consuming operation + if (left_outer_permtype_ == PermutationType::matrix_transpose || + left_outer_permtype_ == PermutationType::identity) { + left_.implicit_permute_outer(true); } - if (!args_are_plain_tensors && -
((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (left_inner_permtype_ == PermutationType::matrix_transpose || - left_inner_permtype_ == PermutationType::identity))) { - left_.permute_tiles(false); + if (left_tile_is_tot && + (left_inner_permtype_ == PermutationType::matrix_transpose || + left_inner_permtype_ == PermutationType::identity)) { + left_.implicit_permute_inner(true); } - if (args_are_plain_tensors && - (right_outer_permtype_ == PermutationType::matrix_transpose || - right_outer_permtype_ == PermutationType::identity)) { - right_.permute_tiles(false); + if (right_outer_permtype_ == PermutationType::matrix_transpose || + right_outer_permtype_ == PermutationType::identity) { + right_.implicit_permute_outer(true); } - if (!args_are_plain_tensors && - ((left_outer_permtype_ == PermutationType::matrix_transpose || - left_outer_permtype_ == PermutationType::identity) || - (right_inner_permtype_ == PermutationType::matrix_transpose || - right_inner_permtype_ == PermutationType::identity))) { - right_.permute_tiles(false); + if (right_tile_is_tot && + (right_inner_permtype_ == PermutationType::matrix_transpose || + right_inner_permtype_ == PermutationType::identity)) { + right_.implicit_permute_inner(true); } } @@ -190,14 +186,18 @@ class BinaryEngine : public ExprEngine { /// result of this expression will be permuted to match \c target_indices. /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { - if (permute_tiles_) { - TA_ASSERT(left_.indices().size() == target_indices.size()); - TA_ASSERT(right_.indices().size() == target_indices.size()); + if (!this->implicit_permute()) { + TA_ASSERT( + left_.indices().size() == target_indices.size() || + (left_.indices().second().size() ^ target_indices.second().size())); + TA_ASSERT( + right_.indices().size() == target_indices.size() || + (right_.indices().second().size() ^ target_indices.second().size())); init_indices_(target_indices); - TA_ASSERT(right_outer_permtype_ == PermutationType::general || - right_inner_permtype_ == PermutationType::general); + TA_ASSERT(left_outer_permtype_ == PermutationType::general && + right_outer_permtype_ == PermutationType::general); if (left_.indices() != left_indices_) left_.perm_indices(left_indices_); if (right_.indices() != right_indices_) @@ -235,18 +235,30 @@ class BinaryEngine : public ExprEngine { left_.init_struct(left_indices_); right_.init_struct(right_indices_); #ifndef NDEBUG - if (left_.trange() != right_.trange()) { + if (ignore_tile_position()) { + if (!is_congruent(left_.trange(), right_.trange())) { + if (TiledArray::get_default_world().rank() == 0) { + TA_USER_ERROR_MESSAGE( + "The TiledRanges of the left- and right-hand arguments of the " + "binary " + "expression are not congruent:" + << "\n left = " << left_.trange() + << "\n right = " << right_.trange()); + } + TA_EXCEPTION( + "The TiledRange objects of a binary expression are not congruent."); + } + } else if (left_.trange() != right_.trange()) { if (TiledArray::get_default_world().rank() == 0) { TA_USER_ERROR_MESSAGE( - "The TiledRanges of the left- and right-hand arguments of the " - "binary operation are not equal:" + "The TiledRanges of the left- and right-hand arguments of the binary " + "expression are not equal:" << "\n left = " << left_.trange() << "\n right = " << right_.trange()); } TA_EXCEPTION( - "The TiledRanges of the left- and right-hand arguments " - "of the binary operation
are not equal."); + "The TiledRange objects of a binary expression are not equal."); } #endif // NDEBUG ExprEngine_::init_struct(target_indices); diff --git a/src/TiledArray/expressions/blk_tsr_engine.h b/src/TiledArray/expressions/blk_tsr_engine.h index 2d16172dbe..9b6e750bb5 100644 --- a/src/TiledArray/expressions/blk_tsr_engine.h +++ b/src/TiledArray/expressions/blk_tsr_engine.h @@ -147,9 +147,10 @@ class BlkTsrEngineBase : public LeafEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -157,22 +158,29 @@ class BlkTsrEngineBase : public LeafEngine { using LeafEngine_::array_; container::svector - lower_bound_; ///< Lower bound of the tile block + lower_bound_; ///< Tile coordinates of the lower bound of the tile block + ///< in the host array container::svector - upper_bound_; ///< Upper bound of the tile block + upper_bound_; ///< Tile coordinates of the upper bound of the tile block + ///< in the host array + std::optional + trange_lobound_; ///< Lobound of the result trange, modulo permutation + ///< (i.e. referring to the modes of the host array) public: template BlkTsrEngineBase(const BlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} template BlkTsrEngineBase(const ScalBlkTsrExpr& expr) : LeafEngine_(expr), lower_bound_(expr.lower_bound()), - upper_bound_(expr.upper_bound()) {} + upper_bound_(expr.upper_bound()), + trange_lobound_(expr.trange_lobound()) {} /// Non-permuting tiled range factory function @@ -194,16 +202,22 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_d = lower[d]; const auto upper_d = upper[d]; - // Copy and shift the tiling for the block - auto i = lower_d; - const auto base_d = trange[d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_d; ++i) - trange1_data.emplace_back(trange[d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + // Copy and shift the tiling for the block, if nonempty + if (lower_d != upper_d) { + auto i = lower_d; + const auto base_d = trange[d].tile(i).first; + const auto trange1_lobound = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + trange1_data.emplace_back(trange1_lobound); + for (; i < upper_d; ++i) + trange1_data.emplace_back(trange[d].tile(i).extent() + + trange1_data.back()); + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else { + trange_data.emplace_back(); + } } return TiledRange(trange_data.begin(), trange_data.end()); @@ -233,16 +247,22 @@ class BlkTsrEngineBase : public LeafEngine { const auto lower_i = lower[inv_perm_d]; const auto upper_i = upper[inv_perm_d]; - // Copy, shift, and permute the tiling of the block - auto i = lower_i; - const auto base_d = trange[inv_perm_d].tile(i).first; - trange1_data.emplace_back(0ul); - for (; i < upper_i; ++i) - trange1_data.emplace_back(trange[inv_perm_d].tile(i).second - base_d); - - // Add the trange1 to the tiled range data - trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); - trange1_data.resize(0ul); + if (lower_i != upper_i) { + // Copy, shift, and permute the tiling of the block + auto i = lower_i; + const auto base_d = trange[inv_perm_d].tile(i).first; + const auto trange1_lobound = + trange_lobound_ ? (*trange_lobound_)[inv_perm_d] : 0ul; + trange1_data.emplace_back(trange1_lobound); + for (; i < upper_i; ++i) + trange1_data.emplace_back(trange[inv_perm_d].tile(i).extent() + + trange1_data.back()); + + // Add the trange1 to the tiled range data + trange_data.emplace_back(trange1_data.begin(), trange1_data.end()); + trange1_data.resize(0ul); + } else + trange_data.emplace_back(); } return TiledRange(trange_data.begin(), trange_data.end()); @@ -334,10 +354,12 @@ class BlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -376,12 +398,22 @@ class BlkTsrEngine // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the range shift vector for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + // element lobound of the block in the host + const auto base_d = trange[d].tile(lower_d).first; + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? 
(*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); + } else { + range_shift.emplace_back(0l); + } } return op_type(op_base_type(range_shift)); @@ -391,9 +423,10 @@ /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { const unsigned int rank = trange_.tiles_range().rank(); // Construct and allocate memory for the shift range @@ -402,6 +435,7 @@ // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -409,11 +443,17 @@ for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + // element lobound of the block in the host + const auto base_d = trange[d].tile(lower_d).first; + // element lobound of the target of this expression + const auto target_base_d = trange_lobound_ ? (*trange_lobound_)[d] : 0; + range_shift[perm_d] = target_base_d - base_d; + } } - return op_type(op_base_type(range_shift), perm); + return op_type(op_base_type(range_shift), std::forward<Perm>(perm)); } /// Expression identification tag @@ -477,10 +517,12 @@ class ScalBlkTsrEngine protected: // Import base class variables to this scope using BlkTsrEngineBase_::lower_bound_; + using BlkTsrEngineBase_::trange_lobound_; using BlkTsrEngineBase_::upper_bound_; + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -522,12 +564,21 @@ // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Construct the inverse permutation for (unsigned int d = 0u; d < rank; ++d) { const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift.emplace_back(-base_d); + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + // element lobound of the block in the host + const auto base_d = trange[d].tile(lower_d).first; + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ?
(*trange_lobound_)[d] : 0ul; + range_shift.emplace_back(target_base_d - base_d); + } else + range_shift.emplace_back(0); } return op_type(op_base_type(range_shift, factor_)); @@ -537,9 +588,10 @@ /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { const unsigned int rank = trange_.tiles_range().rank(); // Construct and allocate memory for the shift range @@ -548,6 +600,7 @@ // Get temporary data pointers const auto* MADNESS_RESTRICT const trange = array_.trange().data().data(); const auto* MADNESS_RESTRICT const lower = lower_bound_.data(); + const auto* MADNESS_RESTRICT const upper = upper_bound_.data(); // Initialize the permuted range shift vector auto outer_perm = outer(perm); @@ -555,11 +608,19 @@ for (unsigned int d = 0u; d < rank; ++d) { const auto perm_d = outer_perm[d]; const auto lower_d = lower[d]; - const auto base_d = trange[d].tile(lower_d).first; - range_shift[perm_d] = -base_d; + const auto upper_d = upper[d]; + if (lower_d != upper_d) { + // element lobound of the block in the host + const auto base_d = trange[d].tile(lower_d).first; + // element lobound of the target of this expression + const auto target_base_d = + trange_lobound_ ? (*trange_lobound_)[d] : 0ul; + range_shift[perm_d] = target_base_d - base_d; + } } - return op_type(op_base_type(range_shift, factor_), perm); + return op_type(op_base_type(range_shift, factor_), + std::forward<Perm>(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/blk_tsr_expr.h b/src/TiledArray/expressions/blk_tsr_expr.h index d32603b58f..661e2ff666 100644 --- a/src/TiledArray/expressions/blk_tsr_expr.h +++ b/src/TiledArray/expressions/blk_tsr_expr.h @@ -32,6 +32,8 @@ #include #include "blk_tsr_engine.h" +#include + namespace TiledArray { namespace expressions { @@ -118,6 +120,10 @@ class BlkTsrExprBase : public Expr { lower_bound_; ///< Lower bound of the tile block container::svector upper_bound_; ///< Upper bound of the tile block + /// If non-null, element lobound of the expression trange (else zeros will be + /// used). Fusing permutation does not affect this (i.e. this refers to the + /// modes of the host array).
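+  /// Illustrative use (a sketch; the array name is hypothetical): given a +  /// TArrayD a, the expression a("i,j").block({1,1}, {3,3}).preserve_lobound() +  /// keeps the element lobound of block {1,1} of a instead of zeroing it.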
+ std::optional trange_lobound_; void check_valid() const { TA_ASSERT(array_); @@ -179,7 +185,7 @@ class BlkTsrExprBase : public Expr { const bool lower_upper_bound_check = std::equal(std::begin(lower_bound_), std::end(lower_bound_), std::begin(upper_bound_), - [](std::size_t l, std::size_t r) { return l < r; }); + [](std::size_t l, std::size_t r) { return l <= r; }); if (!lower_upper_bound_check) { if (TiledArray::get_default_world().rank() == 0) { using TiledArray::operator<<; @@ -285,6 +291,36 @@ class BlkTsrExprBase : public Expr { /// \return The block upper bound const auto& upper_bound() const { return upper_bound_; } + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(const Index1& trange_lobound) { + trange_lobound_.emplace(std::begin(trange_lobound), + std::end(trange_lobound)); + return static_cast(*this); + } + + /// Sets result trange lobound + /// @param[in] trange_lobound The result trange lobound + template >> + Derived& set_trange_lobound(std::initializer_list trange_lobound) { + return this->set_trange_lobound>( + trange_lobound); + } + + /// Sets result trange lobound such that the tile lobounds are not changed + Derived& preserve_lobound() { + return set_trange_lobound( + array_.trange().make_tile_range(lower_bound()).lobound()); + } + + /// @return optional to result trange lobound; if null, the result trange + /// lobound is zero + const auto& trange_lobound() const { return trange_lobound_; } + }; // class BlkTsrExprBase /// Block expression diff --git a/src/TiledArray/expressions/cont_engine.h b/src/TiledArray/expressions/cont_engine.h index 35c2f34199..3d0ef11c10 100644 --- a/src/TiledArray/expressions/cont_engine.h +++ b/src/TiledArray/expressions/cont_engine.h @@ -94,9 +94,10 @@ class ContEngine : public BinaryEngine { using BinaryEngine_::right_indices_; using BinaryEngine_::right_inner_permtype_; using BinaryEngine_::right_outer_permtype_; + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -107,15 +108,26 @@ class ContEngine : public BinaryEngine { protected: op_type op_; ///< Tile operation - using tile_element_type = typename value_type::value_type; - std::function - inner_tile_nonreturn_op_; ///< Tile element operation (only non-null for - ///< nested tensor expressions) - std::function - inner_tile_return_op_; ///< Same as inner_tile_nonreturn_op_ but returns - ///< the result + + // tile types of the result and (after evaluation) left and right arguments + using result_tile_type = value_type; + using left_tile_type = typename EngineTrait::eval_type; + using right_tile_type = typename EngineTrait::eval_type; + + // tile element types of the result and (after evaluation) left and right + // arguments + using result_tile_element_type = typename result_tile_type::value_type; + using left_tile_element_type = typename left_tile_type::value_type; + using right_tile_element_type = typename right_tile_type::value_type; + + std::function + element_nonreturn_op_; ///< Tile element operation (only non-null for + ///< nested tensor expressions) + std::function + element_return_op_; ///< Same as element_nonreturn_op_ but returns + ///< the result TiledArray::detail::ProcGrid proc_grid_; ///< Process grid for the contraction size_type K_ = 1; ///< Inner dimension size @@ 
-147,9 +159,10 @@ class ContEngine : public BinaryEngine { TensorProduct inner_product_type() const { TA_ASSERT(inner_product_type_ != TensorProduct::Invalid); // init_indices() must initialize this - /// only Hadamard and contraction are supported now + /// only Hadamard, contraction, and scale are supported now TA_ASSERT(inner_product_type_ == TensorProduct::Hadamard || - inner_product_type_ == TensorProduct::Contraction); + inner_product_type_ == TensorProduct::Contraction || + inner_product_type_ == TensorProduct::Scale); return inner_product_type_; } @@ -190,7 +203,7 @@ class ContEngine : public BinaryEngine { void perm_indices(const BipartiteIndexList& target_indices) { // assert that init_indices has been called TA_ASSERT(left_.indices() && right_.indices()); - if (permute_tiles_) { + if (!this->implicit_permute()) { this->template init_indices_(target_indices); // propagate the indices down the tree, if needed @@ -239,8 +252,8 @@ class ContEngine : public BinaryEngine { // precondition checks // 1. if ToT inner tile op has been initialized if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - TA_ASSERT(inner_tile_nonreturn_op_); - TA_ASSERT(inner_tile_return_op_); + TA_ASSERT(element_nonreturn_op_); + TA_ASSERT(element_return_op_); } // Initialize children @@ -250,41 +263,78 @@ class ContEngine : public BinaryEngine { // Initialize the tile operation in this function because it is used to // evaluate the tiled range and shape. - const math::blas::Op left_op = - (left_outer_permtype_ == PermutationType::matrix_transpose - ? math::blas::Transpose - : math::blas::NoTranspose); - const math::blas::Op right_op = - (right_outer_permtype_ == PermutationType::matrix_transpose - ? math::blas::Transpose - : math::blas::NoTranspose); + const auto left_op = to_cblas_op(left_outer_permtype_); + const auto right_op = to_cblas_op(right_outer_permtype_); + // initialize perm_ + this->init_perm(target_indices); + + // initialize op_, trange_, and shape_ which only refer to the outer modes if (outer(target_indices) != outer(indices_)) { + const auto outer_perm = outer(perm_); // Initialize permuted structure - perm_ = ExprEngine_::make_perm(target_indices); if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { - op_ = op_type(left_op, right_op, factor_, outer_size(indices_), - outer_size(left_indices_), outer_size(right_indices_), - (permute_tiles_ ? perm_ : BipartitePermutation{})); + op_ = op_type( + left_op, right_op, factor_, outer_size(indices_), + outer_size(left_indices_), outer_size(right_indices_), + (!implicit_permute_outer_ ? std::move(outer_perm) : Permutation{})); } else { + + auto make_total_perm = [this]() -> BipartitePermutation { + if (this->product_type() != TensorProduct::Contraction + || this->implicit_permute_inner_) + return this->implicit_permute_outer_ + ? BipartitePermutation() + : BipartitePermutation(outer(this->perm_)); + + // Here, + // this->product_type() is Tensor::Contraction, and, + // this->implicit_permute_inner_ is false + + return this->inner_product_type() == TensorProduct::Scale + ? BipartitePermutation(outer(this->perm_)) + : this->perm_; + }; + + auto total_perm = make_total_perm(); + // factor_ is absorbed into inner_tile_nonreturn_op_ - op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), - outer_size(left_indices_), outer_size(right_indices_), - (permute_tiles_ ? 
perm_ : BipartitePermutation{}), - this->inner_tile_nonreturn_op_); + op_ = op_type( + left_op, right_op, scalar_type(1), outer_size(indices_), + outer_size(left_indices_), outer_size(right_indices_), + total_perm, + this->element_nonreturn_op_); } - trange_ = ContEngine_::make_trange(outer(perm_)); - shape_ = ContEngine_::make_shape(outer(perm_)); + trange_ = ContEngine_::make_trange(outer_perm); + shape_ = ContEngine_::make_shape(outer_perm); } else { // Initialize non-permuted structure + if constexpr (!TiledArray::detail::is_tensor_of_tensor_v) { op_ = op_type(left_op, right_op, factor_, outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_)); } else { + + auto make_total_perm = [this]() -> BipartitePermutation { + if (this->product_type() != TensorProduct::Contraction + || this->implicit_permute_inner_) + return {}; + + // Here, + // this->product_type() is TensorProduct::Contraction, and + // this->implicit_permute_inner_ is false + + return this->inner_product_type() == TensorProduct::Scale + ? BipartitePermutation(outer(this->perm_)) + : this->perm_; + }; + + auto total_perm = make_total_perm(); + // factor_ is absorbed into inner_tile_nonreturn_op_ op_ = op_type(left_op, right_op, scalar_type(1), outer_size(indices_), outer_size(left_indices_), outer_size(right_indices_), - BipartitePermutation{}, this->inner_tile_nonreturn_op_); + total_perm, this->element_nonreturn_op_); } trange_ = ContEngine_::make_trange(); shape_ = ContEngine_::make_shape(); @@ -331,16 +381,26 @@ class ContEngine : public BinaryEngine { n *= right_element_size[i]; } - // Construct the process grid. - proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); - - // Initialize children - left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); - right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); - - // Initialize the process map in not already defined - if (!pmap) pmap = proc_grid_.make_pmap(); - ExprEngine_::init_distribution(world, pmap); + // corner case: zero-volume result ... easier to skip proc_grid_ + // construction altogether + if (M == 0 || N == 0) { + left_.init_distribution(world, {}); + right_.init_distribution(world, {}); + ExprEngine_::init_distribution( + world, (pmap ? pmap : policy::default_pmap(*world, M * N))); + } else { // M!=0 && N!=0 + + // Construct the process grid.
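+      // (ProcGrid maps the M x N grid of result tiles onto a 2-d process +      // grid; the row/col phase pmaps below place the K_ inner-dimension +      // tiles of the left/right arguments accordingly, presumably a +      // SUMMA-style layout)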
+ proc_grid_ = TiledArray::detail::ProcGrid(*world, M, N, m, n); + + // Initialize children + left_.init_distribution(world, proc_grid_.make_row_phase_pmap(K_)); + right_.init_distribution(world, proc_grid_.make_col_phase_pmap(K_)); + + // Initialize the process map if not already defined + if (!pmap) pmap = proc_grid_.make_pmap(); + ExprEngine_::init_distribution(world, pmap); + } } /// Tiled range factory function @@ -457,120 +517,189 @@ class ContEngine : public BinaryEngine { protected: void init_inner_tile_op(const IndexList& inner_target_indices) { - if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { - using inner_tile_type = typename value_type::value_type; + if constexpr (TiledArray::detail::is_tensor_of_tensor_v) { + constexpr bool tot_x_tot = TiledArray::detail::is_tensor_of_tensor_v< + result_tile_type, left_tile_type, right_tile_type>; const auto inner_prod = this->inner_product_type(); TA_ASSERT(inner_prod == TensorProduct::Contraction || - inner_prod == TensorProduct::Hadamard); + inner_prod == TensorProduct::Hadamard || + inner_prod == TensorProduct::Scale); if (inner_prod == TensorProduct::Contraction) { - using inner_tile_type = typename value_type::value_type; - using contract_inner_tile_type = - TiledArray::detail::ContractReduce; - // factor_ is absorbed into inner_tile_nonreturn_op_ - auto contrreduce_op = - (inner_target_indices != inner(this->indices_)) - ? contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_), - (this->permute_tiles_ ? inner(this->perm_) - : Permutation{})) - : contract_inner_tile_type( - to_cblas_op(this->left_inner_permtype_), - to_cblas_op(this->right_inner_permtype_), this->factor_, - inner_size(this->indices_), - inner_size(this->left_indices_), - inner_size(this->right_indices_)); - this->inner_tile_nonreturn_op_ = [contrreduce_op]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - contrreduce_op(result, left, right); - }; + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + using op_type = TiledArray::detail::ContractReduce< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type>; + // factor_ is absorbed into inner_tile_nonreturn_op_ + auto contrreduce_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_), + (!this->implicit_permute_inner_ ? inner(this->perm_) + : Permutation{})) + : op_type(to_cblas_op(this->left_inner_permtype_), + to_cblas_op(this->right_inner_permtype_), + this->factor_, inner_size(this->indices_), + inner_size(this->left_indices_), + inner_size(this->right_indices_)); + this->element_nonreturn_op_ = + [contrreduce_op, permute_inner = this->product_type() != + TensorProduct::Contraction]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + contrreduce_op(result, left, right); + // permutations of result are applied as "postprocessing" + if (permute_inner && !TA::empty(result)) + result = contrreduce_op(result); + }; + } // ToT x ToT } else if (inner_prod == TensorProduct::Hadamard) { - // inner tile op depends on the outer op ... e.g. 
if outer op - // is contract then inner must implement (ternary) multiply-add; - // if the outer is hadamard then the inner is binary multiply - const auto outer_prod = this->product_type(); - if (this->factor_ == 1) { - using base_op_type = - TiledArray::detail::Mult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(), this->permute_tiles_ - ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type()); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } - }; - } else { - using base_op_type = - TiledArray::detail::ScalMult; - using op_type = TiledArray::detail::BinaryWrapper< - base_op_type>; // can't consume inputs if they are used multiple - // times, e.g. when outer op is gemm - auto mult_op = (inner_target_indices != inner(this->indices_)) - ? op_type(base_op_type(this->factor_), - this->permute_tiles_ ? inner(this->perm_) - : Permutation{}) - : op_type(base_op_type(this->factor_)); - this->inner_tile_nonreturn_op_ = [mult_op, outer_prod]( - inner_tile_type& result, - const inner_tile_type& left, - const inner_tile_type& right) { - TA_ASSERT(outer_prod == TensorProduct::Hadamard || - outer_prod == TensorProduct::Contraction); - if (outer_prod == TensorProduct::Hadamard) - result = mult_op(left, right); - else { - // there is currently no fused MultAdd ternary Op, only Add and - // Mult thus implement this as 2 separate steps - // TODO optimize by implementing (ternary) MultAdd - if (empty(result)) - result = mult_op(left, right); - else { - auto result_increment = mult_op(left, right); - add_to(result, result_increment); - } - } + TA_ASSERT(tot_x_tot); + if constexpr (tot_x_tot) { + // inner tile op depends on the outer op ... e.g. if outer op + // is contract then inner must implement (ternary) multiply-add; + // if the outer is hadamard then the inner is binary multiply + const auto outer_prod = this->product_type(); + if (this->factor_ == scalar_type{1}) { + using base_op_type = + TiledArray::detail::Mult; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = + (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(), !this->implicit_permute_inner_ + ? 
inner(this->perm_) + : Permutation{}) + : op_type(base_op_type()); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } else { + using base_op_type = TiledArray::detail::ScalMult< + result_tile_element_type, left_tile_element_type, + right_tile_element_type, scalar_type, false, false>; + using op_type = TiledArray::detail::BinaryWrapper< + base_op_type>; // can't consume inputs if they are used + // multiple times, e.g. when outer op is gemm + auto mult_op = (inner_target_indices != inner(this->indices_)) + ? op_type(base_op_type(this->factor_), + !this->implicit_permute_inner_ + ? inner(this->perm_) + : Permutation{}) + : op_type(base_op_type(this->factor_)); + this->element_nonreturn_op_ = + [mult_op, outer_prod](result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + TA_ASSERT(outer_prod == TensorProduct::Hadamard || + outer_prod == TensorProduct::Contraction); + if (outer_prod == TensorProduct::Hadamard) + result = mult_op(left, right); + else { + // there is currently no fused MultAdd ternary Op, only Add + // and Mult thus implement this as 2 separate steps + // TODO optimize by implementing (ternary) MultAdd + if (empty(result)) + result = mult_op(left, right); + else { + auto result_increment = mult_op(left, right); + add_to(result, result_increment); + } + } + }; + } + } // ToT x T or T x ToT + } else if (inner_prod == TensorProduct::Scale) { + TA_ASSERT(!tot_x_tot); + constexpr bool tot_x_t = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + constexpr bool t_x_tot = + TiledArray::detail::is_tensor_of_tensor_v && + TiledArray::detail::is_tensor_v; + if constexpr (tot_x_t || t_x_tot) { + using arg_tile_element_type = + std::conditional_t; + using scalar_type = + std::conditional_t; + + auto scal_op = [perm = !this->implicit_permute_inner_ + ? 
inner(this->perm_) + : Permutation{}]( + const left_tile_element_type& left, + const right_tile_element_type& right) + -> result_tile_element_type { + using TiledArray::scale; + if constexpr (tot_x_t) { + if (perm) + return scale(left, right, perm); + else + return scale(left, right); + } else if constexpr (t_x_tot) { + if (perm) + return scale(right, left, perm); + else + return scale(right, left); + } else + abort(); // unreachable }; + this->element_nonreturn_op_ = + [scal_op, outer_prod = (this->product_type())]( + result_tile_element_type& result, + const left_tile_element_type& left, + const right_tile_element_type& right) { + if (outer_prod == TensorProduct::Contraction) { + if (empty(result)) + result = scal_op(left, right); + else { + auto result_increment = scal_op(left, right); + add_to(result, result_increment); + } + // result += scal_op(left, right); + } else { + result = scal_op(left, right); + } + }; } } else abort(); // unsupported TensorProduct type - TA_ASSERT(inner_tile_nonreturn_op_); - this->inner_tile_return_op_ = - [inner_tile_nonreturn_op = this->inner_tile_nonreturn_op_]( - const inner_tile_type& left, const inner_tile_type& right) { - inner_tile_type result; - inner_tile_nonreturn_op(result, left, right); - return result; - }; + TA_ASSERT(element_nonreturn_op_); + this->element_return_op_ = [inner_tile_nonreturn_op = + this->element_nonreturn_op_]( + const left_tile_element_type& left, + const right_tile_element_type& right) { + result_tile_element_type result; + inner_tile_nonreturn_op(result, left, right); + return result; + }; } } diff --git a/src/TiledArray/expressions/expr.h b/src/TiledArray/expressions/expr.h index bcc65cb412..3b1e9f43be 100644 --- a/src/TiledArray/expressions/expr.h +++ b/src/TiledArray/expressions/expr.h @@ -40,13 +40,17 @@ #include "TiledArray/tile.h" #include "TiledArray/tile_interface/trace.h" #include "expr_engine.h" -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #endif #include +#include +#include +#include + namespace TiledArray::expressions { template @@ -186,8 +190,8 @@ class Expr { typename A, typename I, typename T, typename std::enable_if::value && is_lazy_tile::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { @@ -195,7 +199,7 @@ class Expr { TiledArray::Cast(), tile)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Set an array tile with a lazy tile /// Spawn a task to evaluate a lazy tile and set the \a array tile at @@ -210,9 +214,9 @@ class Expr { typename std::enable_if< !std::is_same::value && is_lazy_tile::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I& index, const Future& tile) const { - array.set(index, madness::add_cuda_task( + array.set(index, madness::add_device_task( array.world(), TiledArray::Cast(), tile)); } @@ -246,22 +250,22 @@ class Expr { template < typename A, typename I, typename T, typename Op, typename std::enable_if::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template 
eval_tile, - Op>; - array.set(index, array.world().taskq.add( - eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + array.world().taskq.add( + eval_tile_fn, tile, + TiledArray::Cast(), op)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Set an array tile with a lazy tile /// Spawn a task to evaluate a lazy tile and set the \a array tile at @@ -275,16 +279,16 @@ class Expr { template ::value && - ::TiledArray::detail::is_cuda_tile_v>::type* = nullptr> + ::TiledArray::detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { - auto eval_tile_fn = - &Expr_::template eval_tile, - Op>; - array.set(index, madness::add_cuda_task( - array.world(), eval_tile_fn, tile, - TiledArray::Cast(), op)); + auto eval_tile_fn = &Expr_::template eval_tile< + typename A::value_type, const T&, + TiledArray::Cast, Op>; + array.set(index, + madness::add_device_task( + array.world(), eval_tile_fn, tile, + TiledArray::Cast(), op)); } #endif @@ -303,8 +307,8 @@ class Expr { template < typename A, typename I, typename T, typename Op, typename std::enable_if::value -#ifdef TILEDARRAY_HAS_CUDA - && !::TiledArray::detail::is_cuda_tile_v +#ifdef TILEDARRAY_HAS_DEVICE + && !::TiledArray::detail::is_device_tile_v #endif >::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, @@ -317,7 +321,7 @@ class Expr { array.set(index, array.world().taskq.add(eval_tile_fn_ptr, tile, op)); } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// Spawn a task to evaluate a lazy tile and set the \a array tile at /// \c index with the result. @@ -332,7 +336,7 @@ class Expr { template ::value&& ::TiledArray:: - detail::is_cuda_tile_v>::type* = nullptr> + detail::is_device_tile_v>::type* = nullptr> void set_tile(A& array, const I index, const Future& tile, const std::shared_ptr& op) const { auto eval_tile_fn_ptr = &Expr_::template eval_tile; @@ -340,8 +344,8 @@ class Expr { static_assert(madness::detail::function_traits&)>::value, "ouch"); - array.set(index, madness::add_cuda_task(array.world(), eval_tile_fn_ptr, - tile, op)); + array.set(index, madness::add_device_task(array.world(), eval_tile_fn_ptr, + tile, op)); } #endif @@ -420,6 +424,10 @@ class Expr { dist_eval.wait(); // Swap the new array with the result array object. result.swap(tsr.array()); + +#if 0 + std::cout << "array.id()=" << tsr.array().id() << " evaluated using dist_eval.id=" << dist_eval.id() << std::endl; +#endif } /// Evaluate this object and assign it to \c tsr @@ -457,6 +465,16 @@ class Expr { // set even though this is a requirement. #endif // NDEBUG + // Assignment to block expression uses trange of the array it is bounded to + // Assert that the user did not try to override the trange by accident using + // set_trange_lobound or at least that it matches tsr.array's trange + TA_ASSERT(!tsr.trange_lobound().has_value() || + (ranges::equal(tsr.trange_lobound().value(), + tsr.array() + .trange() + .make_tile_range(tsr.lower_bound()) + .lobound()))); + // Get the target world. World& world = tsr.array().world(); @@ -500,10 +518,19 @@ class Expr { // Move the data from dist_eval into the sub-block of result array. // This step may involve communication when the tiles are moved from the // sub-block distribution to the array distribution. - { + // N.B. 
handle the corner case of a zero-volume host array, in which case no data needs
+    // to be moved
+    if (tsr.array().trange().tiles_range().volume() != 0) {
       // N.B. must deep copy
-      const container::svector<long> shift =
-          tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound();
+      TA_ASSERT(tsr.array().trange().tiles_range().includes(tsr.lower_bound()));
+      // N.B. this expression's range,
+      // dist_eval.trange().elements_range().lobound(), may not be zero!
+      const auto shift =
+          ranges::views::zip_with(
+              [](auto a, auto b) { return a - b; },
+              tsr.array().trange().make_tile_range(tsr.lower_bound()).lobound(),
+              dist_eval.trange().elements_range().lobound()) |
+          ranges::to<container::svector<long>>();

       std::shared_ptr<shift_op_type> shift_op =
           std::make_shared<shift_op_type>(shift_op_type(shift));
@@ -637,7 +664,20 @@ class Expr {
     right_dist_eval.eval();

 #ifndef NDEBUG
-    if (left_dist_eval.trange() != right_dist_eval.trange()) {
+    if (ignore_tile_position()) {
+      if (!is_congruent(left_dist_eval.trange(), right_dist_eval.trange())) {
+        if (TiledArray::get_default_world().rank() == 0) {
+          TA_USER_ERROR_MESSAGE(
+              "The TiledRanges of the left- and right-hand arguments of the "
+              "binary "
+              "reduction are not congruent:"
+              << "\n left = " << left_dist_eval.trange()
+              << "\n right = " << right_dist_eval.trange());
+        }
+        TA_EXCEPTION(
+            "The TiledRange objects of a binary reduction are not congruent.");
+      }
+    } else if (left_dist_eval.trange() != right_dist_eval.trange()) {
       if (TiledArray::get_default_world().rank() == 0) {
         TA_USER_ERROR_MESSAGE(
             "The TiledRanges of the left- and right-hand arguments the binary "
@@ -647,7 +687,7 @@ class Expr {
       }

       TA_EXCEPTION(
-          "The TiledRange objects of a binary expression are not equal.");
+          "The TiledRange objects of a binary reduction are not equal.");
     }
 #endif  // NDEBUG

diff --git a/src/TiledArray/expressions/expr_engine.h b/src/TiledArray/expressions/expr_engine.h
index bd4dbd9ccd..a502857af9 100644
--- a/src/TiledArray/expressions/expr_engine.h
+++ b/src/TiledArray/expressions/expr_engine.h
@@ -54,6 +54,8 @@ class ExprEngine : private NO_DEFAULTS {
   typename EngineTrait<Derived>::op_type op_type;  ///< Tile operation type
   typedef typename EngineTrait<Derived>::policy policy;  ///< The result policy type
+  typedef typename EngineTrait<Derived>::eval_type
+      eval_type;  ///< Evaluation tile type
   typedef typename EngineTrait<Derived>::dist_eval_type
       dist_eval_type;  ///< This expression's distributed evaluator type
@@ -73,10 +75,13 @@ class ExprEngine : private NO_DEFAULTS {
   World* world_;  ///< The world where this expression will be evaluated
   BipartiteIndexList
       indices_;  ///< The index list of this expression; bipartite due to need
-                 ///< to support recursive tensors (i.e. Tensor-of-Tensor)
+                 ///< to support nested tensors (e.g. tensors of tensors)
+  bool implicit_permute_outer_ = false;  ///< If true, result tiles' outer
+                                         ///< modes will not need to be permuted
+  bool implicit_permute_inner_ = false;  ///< If true, result tiles' inner
+                                         ///< modes will not need to be permuted
+  /// The permutation that will be applied to the result tensor (or tensor of
+  /// tensors)
   BipartitePermutation perm_;
   trange_type trange_;  ///< The tiled range of the result tensor
   shape_type shape_;    ///< The shape of the result tensor
@@ -93,7 +98,6 @@ class ExprEngine : private NO_DEFAULTS {
   ExprEngine(const Expr<Derived>& expr)
       : world_(NULL),
         indices_(),
-        permute_tiles_(true),
         perm_(),
         trange_(),
         shape_(),
@@ -141,7 +145,7 @@ class ExprEngine : private NO_DEFAULTS {
   /// This function will initialize the permutation, tiled range, and shape
   /// for the result tensor. These members are initialized with the
-  /// make_perm(), \c make_trange(), and make_shape() functions.
+  /// \c init_perm(), \c make_trange(), and make_shape() functions.
   /// Derived classes may customize the structure initialization by
   /// providing their own implementation of this function or any of the
   /// above initialization.

   /// \param target_indices The target index list for the result tensor
   void init_struct(const BipartiteIndexList& target_indices) {
     if (target_indices != indices_) {
-      perm_ = derived().make_perm(target_indices);
+      if (!perm_) perm_ = make_perm(target_indices);
       trange_ = derived().make_trange(outer(perm_));
       shape_ = derived().make_shape(outer(perm_));
     } else {
@@ -187,20 +191,41 @@ class ExprEngine : private NO_DEFAULTS {
   /// providing their own implementation it.
   BipartitePermutation make_perm(
       const BipartiteIndexList& target_indices) const {
+    TA_ASSERT(target_indices != indices_);
     return target_indices.permutation(indices_);
   }

+  void init_perm(const BipartiteIndexList& target_indices) {
+    if (!perm_ && target_indices != indices_) perm_ = make_perm(target_indices);
+  }
+
   /// Tile operation factory function

   /// This function will generate the tile operations by calling
   /// \c make_tile_op(). The permuting or non-permuting version of the tile
-  /// operation will be selected based on permute_tiles(). Derived classes
-  /// may customize this function by providing their own implementation it.
+  /// operation will be selected based on implicit_permute_outer(). Derived
+  /// classes may customize this function by providing their own implementation
+  /// of it.
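+  /// \note modes whose permutation was declared implicit via
+  /// implicit_permute_outer() or implicit_permute_inner() are excluded from
+  /// the explicit permutation that this function passes to the tile operation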
  op_type make_op() const {
-    if (perm_ && permute_tiles_)
-      // permutation can only be applied to the tile, not to its element (if
-      // tile = tensor-of-tensors)
-      return derived().make_tile_op(perm_);
+    // figure out which permutations (of outer or inner modes) must be enacted
+    // explicitly
+    BipartitePermutation explicit_perm;
+    if (implicit_permute_outer_) {
+      if (!implicit_permute_inner_) {
+        explicit_perm = BipartitePermutation(Permutation{}, inner(perm_));
+      }
+    } else {
+      if (implicit_permute_inner_) {
+        explicit_perm = BipartitePermutation(outer(perm_), Permutation{});
+      } else {
+        explicit_perm = perm_;
+      }
+    }
+    const bool explicit_perm_is_nontrivial =
+        !(explicit_perm.first().is_identity() &&
+          explicit_perm.second().is_identity());
+    if (explicit_perm && explicit_perm_is_nontrivial)
+      return derived().make_tile_op(explicit_perm);
     else
       return derived().make_tile_op();
   }
@@ -243,11 +268,47 @@ class ExprEngine : private NO_DEFAULTS {
   /// \return A const reference to the process map
   const std::shared_ptr& pmap() const { return pmap_; }

-  /// Set the permute tiles flag
+  /// Set the flag that controls whether tiles' permutation will be implicit
+
+  /// Some consuming operations (like GEMM) can perform some permutation types
+  /// implicitly. Setting this to true indicates that the result tiles' outer
+  /// modes do not need to be permuted and permutation will
+  /// be performed implicitly by the consuming operation
+  /// \param status The new value for the implicit permute flag
+  /// (true => will not permute outer modes of result tiles;
+  /// false => will permute outer modes of result tiles if needed)
+  /// \note for plain tensors, i.e., tensor-of-scalars, any mode is
+  /// outer
+  void implicit_permute_outer(const bool status) {
+    implicit_permute_outer_ = status;
+  }
+
+  /// Set the flag that controls whether tiles' permutation will be implicit
+
+  /// Some consuming operations (like GEMM) can perform some permutation types
+  /// implicitly. Setting this to true indicates that the result tiles' inner
+  /// modes do not need to be permuted and permutation will
+  /// be performed implicitly by the consuming operation
+  /// \param status The new value for the implicit permute flag
+  /// (true => will not permute inner modes of result tiles;
+  /// false => will permute inner modes of result tiles if needed)
+  /// \note for plain tensors, i.e., tensor-of-scalars, there are no
+  /// inner modes and this should not be used
+  void implicit_permute_inner(const bool status) {
+    TA_ASSERT(TiledArray::detail::is_tensor_of_tensor_v<eval_type>);
+    implicit_permute_inner_ = status;
+  }

-  /// \param status The new status for permute tiles (true == permute result
-  /// tiles)
-  void permute_tiles(const bool status) { permute_tiles_ = status; }
+  /// Reports whether permutation of the result tiles will be implicit, i.e.
+  /// will be fused into the consuming operation
+
+  /// \return true if the result tiles will not be permuted; false indicates
+  /// that the result tiles will be permuted if needed
+  bool implicit_permute() const {
+    constexpr bool is_tot =
+        TiledArray::detail::is_tensor_of_tensor_v<eval_type>;
+    return (implicit_permute_outer_ || (is_tot && implicit_permute_inner_));
+  }

   /// Expression print

   /// \param os The output stream
   /// \param target_indices The target index list for this expression
   void print(ExprOStream& os, const BipartiteIndexList& target_indices) const {
     if (perm_) {
-      os << "[P " << target_indices << "]"
-         << (permute_tiles_ ?
" " : " [no permute tiles] ") - << derived().make_tag() << indices_ << "\n"; + os << "[P " << target_indices << "]"; + if (implicit_permute_outer_ || implicit_permute_inner_) { + os << " [implicit "; + constexpr bool is_tot = + TiledArray::detail::is_tensor_of_tensor_v; + if constexpr (is_tot) { + if (implicit_permute_outer_ && implicit_permute_inner_) { + os << "outer&inner "; + } else if (implicit_permute_outer_) { + os << "outer "; + } else + os << "inner "; + } + os << "permute ] "; + } else + os << " "; + os << derived().make_tag() << indices_ << "\n"; } else { os << derived().make_tag() << indices_ << "\n"; } diff --git a/src/TiledArray/expressions/fwd.h b/src/TiledArray/expressions/fwd.h index 7960baf648..e56dea8b83 100644 --- a/src/TiledArray/expressions/fwd.h +++ b/src/TiledArray/expressions/fwd.h @@ -28,7 +28,6 @@ #include - namespace TiledArray::expressions { template @@ -68,6 +67,6 @@ class ScalTsrExpr; template class ScalTsrEngine; -} // namespace TiledArray +} // namespace TiledArray::expressions #endif // TILEDARRAY_EXPRESSIONS_FWD_H__INCLUDED diff --git a/src/TiledArray/expressions/leaf_engine.h b/src/TiledArray/expressions/leaf_engine.h index 5e273fb5dc..8804989d6f 100644 --- a/src/TiledArray/expressions/leaf_engine.h +++ b/src/TiledArray/expressions/leaf_engine.h @@ -70,9 +70,10 @@ class LeafEngine : public ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; diff --git a/src/TiledArray/expressions/mult_engine.h b/src/TiledArray/expressions/mult_engine.h index 19788505fd..84d11bd4c0 100644 --- a/src/TiledArray/expressions/mult_engine.h +++ b/src/TiledArray/expressions/mult_engine.h @@ -189,12 +189,14 @@ struct EngineTrait> { /// Multiplication expression engine /// This implements any expression encoded with the multiplication operator. -/// This includes Hadamard product, e.g. \code (c("i,j")=)a("i,j")*b("i,j") -/// \endcode , and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") -/// \endcode . \internal mixed Hadamard-contraction case, e.g. \code -/// c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , is not supported since -/// this requires that the result labels are assigned by user (currently they -/// are computed by this engine) +/// This includes Hadamard product, e.g. +/// \code (c("i,j")=)a("i,j")*b("i,j") \endcode , +/// and pure contractions, e.g. \code (c("i,j")=)a("i,k")*b("k,j") \endcode . +/// \internal mixed Hadamard-contraction case, e.g. +/// \code c("i,j,l")=a("i,l,k")*b("j,l,k") \endcode , +/// is not supported since +/// this requires that the result labels are assigned by user (currently they +/// are computed by this engine) /// \tparam Left The left-hand engine type /// \tparam Right The right-hand engine type /// \tparam Result The result tile type @@ -297,7 +299,6 @@ class MultEngine : public ContEngine> { // the tile op; the type of the tile op does not need to match the type of // the operation on the outer indices if (this->product_type() == TensorProduct::Hadamard) { - // assumes inner op is also Hadamard BinaryEngine_::perm_indices(target_indices); } else { auto children_initialized = true; @@ -333,6 +334,9 @@ class MultEngine : public ContEngine> { /// for the result tensor. 
/// \param target_indices The target index list for the result tensor void init_struct(const BipartiteIndexList& target_indices) { + this->init_perm(target_indices); + + // for ContEngine_::init_struct need to initialize element op first this->init_inner_tile_op(inner(target_indices)); if (this->product_type() == TensorProduct::Contraction) ContEngine_::init_struct(target_indices); @@ -404,7 +408,9 @@ class MultEngine : public ContEngine> { // dimensions as well return op_type(op_base_type()); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_)); + return op_type(op_base_type(this->element_return_op_)); + } else if (inner_prod == TensorProduct::Scale) { + return op_type(op_base_type()); } else abort(); } else { // plain tensors @@ -417,9 +423,10 @@ class MultEngine : public ContEngine> { /// \param perm The permutation to be applied to the result /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { + template >>> + op_type make_tile_op(Perm&& perm) const { if constexpr (TiledArray::detail::is_tensor_of_tensor_v< value_type>) { // nested tensors const auto inner_prod = this->inner_product_type(); @@ -427,13 +434,21 @@ class MultEngine : public ContEngine> { TA_ASSERT(this->product_type() == inner_prod); // Hadamard automatically works for inner // dimensions as well - return op_type(op_base_type(), perm); + return op_type(op_base_type(), std::forward(perm)); } else if (inner_prod == TensorProduct::Contraction) { - return op_type(op_base_type(this->inner_tile_return_op_), perm); + // inner permutation, if needed, was fused into inner op, do not apply + // inner part of the perm again + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); + } else if (inner_prod == TensorProduct::Scale) { + // inner permutation, if needed, was fused into inner op, do not apply + // inner part of the perm again + return op_type(op_base_type(this->element_return_op_), + outer(std::forward(perm))); } else abort(); } else { // plain tensor - return op_type(op_base_type(), perm); + return op_type(op_base_type(), std::forward(perm)); } abort(); // unreachable } @@ -587,6 +602,9 @@ class ScalMultEngine /// for the result tensor. /// \param target_indices The target index list for the result tensor void init_struct(const BipartiteIndexList& target_indices) { + this->init_perm(target_indices); + + // for ContEngine_::init_struct need to initialize element op first this->init_inner_tile_op(inner(target_indices)); if (this->product_type() == TensorProduct::Contraction) ContEngine_::init_struct(target_indices); @@ -667,10 +685,12 @@ class ScalMultEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(ContEngine_::factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(ContEngine_::factor_), + std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/permopt.cpp b/src/TiledArray/expressions/permopt.cpp new file mode 100644 index 0000000000..9b125fdc04 --- /dev/null +++ b/src/TiledArray/expressions/permopt.cpp @@ -0,0 +1,32 @@ +/* + * This file is a part of TiledArray. 
+ * Copyright (C) 2020 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * permopt.cpp + * Nov 21, 2023 + * + */ + +#include + +namespace TiledArray::expressions { + +IndexList ScalePermutationOptimizer::null_indices_; + +} // namespace TiledArray::expressions diff --git a/src/TiledArray/expressions/permopt.h b/src/TiledArray/expressions/permopt.h index 21d4a0ec39..291604faa8 100644 --- a/src/TiledArray/expressions/permopt.h +++ b/src/TiledArray/expressions/permopt.h @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -44,13 +45,66 @@ namespace expressions { enum class PermutationType { identity = 1, matrix_transpose = 2, general = 3 }; inline blas::Op to_cblas_op(PermutationType permtype) { - TA_ASSERT(permtype == PermutationType::matrix_transpose || - permtype == PermutationType::identity); + // N.B. 3 cases: + // - permtype == identity : no transpose needed + // - permtype == matrix_transpose : transpose needed + // - permtype == general : the argument will be explicitly permuted to be in a + // layout which does not require permutation hence no need for a switch ... return permtype == PermutationType::matrix_transpose ? 
math::blas::Transpose : math::blas::NoTranspose; } +/// Optimizer of permutations for a unary operation +class UnaryOpPermutationOptimizer { + public: + /// construct using initial indices for the argument + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& argument_indices) + : argument_indices_(argument_indices) {} + + /// construct using initial indices for the argument, + /// and the desired result indices + /// \param result_indices the desired result index list + /// \param argument_indices the initial argument index list + UnaryOpPermutationOptimizer(const IndexList& result_indices, + const IndexList& argument_indices) + : result_indices_(result_indices), argument_indices_(argument_indices) { + TA_ASSERT(argument_indices_.is_permutation(argument_indices_)); + target_result_indices_ = argument_indices_; + } + + UnaryOpPermutationOptimizer() = delete; + UnaryOpPermutationOptimizer(const UnaryOpPermutationOptimizer&) = default; + UnaryOpPermutationOptimizer& operator=(const UnaryOpPermutationOptimizer&) = + default; + virtual ~UnaryOpPermutationOptimizer() = default; + + /// \return the desired result indices + const IndexList& result_indices() const { + TA_ASSERT(result_indices_); + return result_indices_; + } + /// \return initial argument indices + const IndexList& argument_indices() const { return argument_indices_; } + + /// \return the proposed argument index list + const IndexList& target_argument_indices() const { + return target_result_indices_; + } + /// \return the proposed result index list (not necessarily same as that + /// returned by result_indices()) + const IndexList& target_result_indices() const { + return target_result_indices_; + } + /// \return the type of permutation bringing the initial left index list to + /// the target left index list + PermutationType argument_permtype() const { return PermutationType::general; } + + private: + IndexList result_indices_, argument_indices_, target_result_indices_; +}; + /// Abstract optimizer of permutations for a binary operation class BinaryOpPermutationOptimizer { public: @@ -479,6 +533,61 @@ class HadamardPermutationOptimizer : public BinaryOpPermutationOptimizer { IndexList target_result_indices_; }; +// clang-format off +/// Implements BinaryOpPermutationOptimizer interface for a scale operation viewed as a binary tensor product, i.e. +/// a tensor product between an order-0 tensor and an arbitrary tensor +// clang-format on +class ScalePermutationOptimizer : public BinaryOpPermutationOptimizer { + public: + ScalePermutationOptimizer(const ScalePermutationOptimizer&) = default; + ScalePermutationOptimizer& operator=(const ScalePermutationOptimizer&) = + default; + ~ScalePermutationOptimizer() = default; + + ScalePermutationOptimizer(const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(left_indices, right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices), + target_result_indices_(left_argument_is_scalar_ ? right_indices + : left_indices) {} + + ScalePermutationOptimizer(const IndexList& result_indices, + const IndexList& left_indices, + const IndexList& right_indices) + : BinaryOpPermutationOptimizer(result_indices, left_indices, + right_indices, + left_indices ? true : false), + left_argument_is_scalar_(!left_indices) { + const auto& arg_indices = + left_argument_is_scalar_ ? 
right_indices : left_indices; + TA_ASSERT(arg_indices.is_permutation(result_indices)); + target_result_indices_ = arg_indices; + } + + const IndexList& target_left_indices() const override final { + return !left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_right_indices() const override final { + return left_argument_is_scalar_ ? target_result_indices_ : null_indices_; + } + const IndexList& target_result_indices() const override final { + return target_result_indices_; + } + PermutationType left_permtype() const override final { + return PermutationType::general; + } + PermutationType right_permtype() const override final { + return PermutationType::general; + } + TensorProduct op_type() const override final { return TensorProduct::Scale; } + + private: + bool left_argument_is_scalar_; + IndexList target_result_indices_; + static IndexList null_indices_; +}; + class NullBinaryOpPermutationOptimizer : public BinaryOpPermutationOptimizer { public: NullBinaryOpPermutationOptimizer(const NullBinaryOpPermutationOptimizer&) = @@ -540,6 +649,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared(left_indices, + right_indices); default: abort(); } @@ -559,6 +671,9 @@ inline std::shared_ptr make_permutation_optimizer( case TensorProduct::Invalid: return std::make_shared( target_indices, left_indices, right_indices, prefer_to_permute_left); + case TensorProduct::Scale: + return std::make_shared( + target_indices, left_indices, right_indices); default: abort(); } diff --git a/src/TiledArray/expressions/product.h b/src/TiledArray/expressions/product.h index d364764964..df2867a360 100644 --- a/src/TiledArray/expressions/product.h +++ b/src/TiledArray/expressions/product.h @@ -39,6 +39,9 @@ enum class TensorProduct { Contraction, /// free, fused, and contracted indices General, + /// no indices on one, free indices on the other; only used for inner index + /// products in mixed nested products (ToT x T) + Scale, /// invalid Invalid = -1 }; @@ -57,6 +60,9 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, result = TensorProduct::Hadamard; else result = TensorProduct::Contraction; + } else if ((left_indices && !right_indices) || + (!left_indices && right_indices)) { // used for ToT*T or T*ToT + result = TensorProduct::Scale; } return result; } @@ -67,8 +73,10 @@ inline TensorProduct compute_product_type(const IndexList& left_indices, const IndexList& right_indices, const IndexList& target_indices) { auto result = compute_product_type(left_indices, right_indices); - if (result == TensorProduct::Hadamard) + if (result == TensorProduct::Hadamard) { TA_ASSERT(left_indices.is_permutation(target_indices)); + TA_ASSERT(right_indices.is_permutation(target_indices)); + } return result; } diff --git a/src/TiledArray/expressions/scal_engine.h b/src/TiledArray/expressions/scal_engine.h index a2312fccb7..2c0d33bf33 100644 --- a/src/TiledArray/expressions/scal_engine.h +++ b/src/TiledArray/expressions/scal_engine.h @@ -146,10 +146,11 @@ class ScalEngine : public UnaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(perm, factor_); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(std::forward(perm), factor_); } /// Expression 
identification tag diff --git a/src/TiledArray/expressions/scal_tsr_engine.h b/src/TiledArray/expressions/scal_tsr_engine.h index 8dfcc596d9..8b38362740 100644 --- a/src/TiledArray/expressions/scal_tsr_engine.h +++ b/src/TiledArray/expressions/scal_tsr_engine.h @@ -140,10 +140,11 @@ class ScalTsrEngine : public LeafEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(factor_), std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/subt_engine.h b/src/TiledArray/expressions/subt_engine.h index ab93dde1ea..3750a199c5 100644 --- a/src/TiledArray/expressions/subt_engine.h +++ b/src/TiledArray/expressions/subt_engine.h @@ -195,10 +195,11 @@ class SubtEngine : public BinaryEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + return op_type(op_base_type(), std::forward(perm)); } /// Expression identification tag @@ -296,10 +297,11 @@ class ScalSubtEngine /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - op_type make_tile_op(const Perm& perm) const { - return op_type(op_base_type(factor_), perm); + template >>> + op_type make_tile_op(Perm&& perm) const { + return op_type(op_base_type(factor_), std::forward(perm)); } /// Expression identification tag diff --git a/src/TiledArray/expressions/tsr_engine.h b/src/TiledArray/expressions/tsr_engine.h index 5219af37ca..20b893ead3 100644 --- a/src/TiledArray/expressions/tsr_engine.h +++ b/src/TiledArray/expressions/tsr_engine.h @@ -126,10 +126,11 @@ class TsrEngine : public LeafEngine> { /// \param perm The permutation to be applied to tiles /// \return The tile operation - template >> - static op_type make_tile_op(const Perm& perm) { - return op_type(op_base_type(), perm); + template >>> + static op_type make_tile_op(Perm&& perm) { + return op_type(op_base_type(), std::forward(perm)); } }; // class TsrEngine diff --git a/src/TiledArray/expressions/tsr_expr.h b/src/TiledArray/expressions/tsr_expr.h index a17fa65cbc..e17ee2ddfa 100644 --- a/src/TiledArray/expressions/tsr_expr.h +++ b/src/TiledArray/expressions/tsr_expr.h @@ -112,8 +112,14 @@ class TsrExpr : public Expr> { /// Expression assignment operator /// \param other The expression that will be assigned to the array - array_type& operator=(TsrExpr_& other) { - other.eval_to(*this); + array_type& operator=(const TsrExpr_& other) { + // N.B. corner case: whether A("i,j") = B("i,j") is deep or shallow copy + // depends on whether the copy semantics of tiles ... 
to be sure use clone + if (IndexList(this->annotation()) == IndexList(other.annotation())) { + array_ = other.array().clone(); + } else { + other.eval_to(*this); + } return array_; } @@ -191,7 +197,7 @@ class TsrExpr : public Expr> { return TsrExpr(array(), annotation_); } - /// immutable Block expression factory + /// makes an immutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -207,7 +213,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -223,7 +248,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -235,7 +279,22 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// immutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes an immutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -246,7 +305,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes an immutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral range type /// \tparam Index2 An integral range type @@ -262,7 +335,26 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression 
factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index1 An integral type /// \tparam Index2 An integral type @@ -278,7 +370,25 @@ class TsrExpr : public Expr> { upper_bound); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index1 An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam PairRange Type representing a range of generalized pairs (see /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of @@ -290,7 +400,21 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } - /// mutable Block expression factory + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// makes a mutable Block expression /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block @@ -301,6 +425,20 @@ class TsrExpr : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// makes a mutable Block expression that preserves the underlying tensor's + /// trange lobound + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object @@ -385,6 +523,24 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 An integral range type + /// \tparam Index2 An integral range type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + TiledArray::detail::is_integral_range_v>> + BlkTsrExpr block(const Index1& lower_bound, + const Index2& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index1 An integral type /// \tparam Index2 An integral type /// \param lower_bound The lower_bound of the block @@ -401,8 +557,27 @@ class TsrExpr : public Expr> { /// Block expression + /// \tparam Index1 
An integral type + /// \tparam Index2 An integral type + /// \param lower_bound The lower_bound of the block + /// \param upper_bound The upper_bound of the block + template && + std::is_integral_v>> + BlkTsrExpr block( + const std::initializer_list& lower_bound, + const std::initializer_list& upper_bound, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, lower_bound, + upper_bound) + .preserve_lobound(); + } + + /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see - /// TiledArray::detail::is_gpair_v ) \param bounds The {lower,upper} bounds of + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of /// the block template : public Expr> { /// Block expression + /// \tparam PairRange Type representing a range of generalized pairs (see + /// TiledArray::detail::is_gpair_v ) + /// \param bounds The {lower,upper} bounds of + /// the block + template >> + BlkTsrExpr block(const PairRange& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + + /// Block expression + /// \tparam Index An integral type /// \param bounds The {lower,upper} bounds of the block template : public Expr> { return BlkTsrExpr(array_, annotation_, bounds); } + /// Block expression + + /// \tparam Index An integral type + /// \param bounds The {lower,upper} bounds of the block + template >> + BlkTsrExpr block( + const std::initializer_list>& bounds, + preserve_lobound_t) const { + return BlkTsrExpr(array_, annotation_, bounds) + .preserve_lobound(); + } + /// Conjugated-tensor expression factor /// \return A conjugated expression object diff --git a/src/TiledArray/expressions/unary_engine.h b/src/TiledArray/expressions/unary_engine.h index 621c4a71b3..631fca8fed 100644 --- a/src/TiledArray/expressions/unary_engine.h +++ b/src/TiledArray/expressions/unary_engine.h @@ -70,9 +70,10 @@ class UnaryEngine : ExprEngine { protected: // Import base class variables to this scope + using ExprEngine_::implicit_permute_inner_; + using ExprEngine_::implicit_permute_outer_; using ExprEngine_::indices_; using ExprEngine_::perm_; - using ExprEngine_::permute_tiles_; using ExprEngine_::pmap_; using ExprEngine_::shape_; using ExprEngine_::trange_; @@ -99,7 +100,7 @@ class UnaryEngine : ExprEngine { /// children such that the number of permutations is minimized. /// \param target_indices The target index list for this expression void perm_indices(const BipartiteIndexList& target_indices) { - TA_ASSERT(permute_tiles_); + TA_ASSERT(!this->implicit_permute()); indices_ = target_indices; if (arg_.indices() != target_indices) arg_.perm_indices(target_indices); diff --git a/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c new file mode 100644 index 0000000000..f3706ef1fa --- /dev/null +++ b/src/TiledArray/external/agnerfog/intel_cpu_feature_patch.c @@ -0,0 +1,48 @@ +/*********************** intel_cpu_feature_patch.c ************************** + * Author: Agner Fog + * Date created: 2014-07-30 + * Last modified: 2019-12-29 + * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip + * Language: C or C++ + * + * Description: + * Patch for Intel compiler version 13.0 and later, including the general + * libraries, LIBM and SVML, but not MKL and VML. + * + * Example of how to patch Intel's CPU feature dispatcher in order to improve + * compatibility of generated code with non-Intel processors. 
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * libraries (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_cpu_patch();
+ * before any call to the library functions.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to Intel libraries
+extern int64_t __intel_cpu_feature_indicator;    // CPU feature bits
+extern int64_t __intel_cpu_feature_indicator_x;  // CPU feature bits
+void __intel_cpu_features_init();    // unfair dispatcher: checks CPU features for
+                                     // Intel CPU's only
+void __intel_cpu_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_cpu_patch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_cpu_feature_indicator = 0;
+  __intel_cpu_feature_indicator_x = 0;
+  __intel_cpu_features_init_x();
+  __intel_cpu_feature_indicator = __intel_cpu_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
new file mode 100644
index 0000000000..b88a1807f7
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/intel_mkl_cpuid_patch.c
@@ -0,0 +1,61 @@
+/*********************** intel_mkl_cpuid_patch.c **************************
+ * Author: Agner Fog
+ * Date created: 2019-12-29
+ * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language: C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to override Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ *
+ * Include this code in your C or C++ program and make sure it is linked before
+ * any Intel libraries. You may need to include intel_mkl_feature_patch.c as
+ * well.
+ *
+ * Copyright (c) 2019.
BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// detect if Intel CPU
+int mkl_serv_intel_cpu() { return 1; }
+
+// detect if Intel CPU
+int mkl_serv_intel_cpu_true() { return 1; }
+
+int mkl_serv_cpuhaspnr_true() { return 1; }
+
+int mkl_serv_cpuhaspnr() { return 1; }
+
+int mkl_serv_cpuhasnhm() { return 1; }
+
+int mkl_serv_cpuisbulldozer() { return 0; }
+
+int mkl_serv_cpuiszen() { return 0; }
+
+int mkl_serv_cpuisatomsse4_2() { return 0; }
+
+int mkl_serv_cpuisatomssse3() { return 0; }
+
+int mkl_serv_cpuisitbarcelona() { return 0; }
+
+int mkl_serv_cpuisskl() { return 0; }
+
+int mkl_serv_cpuisknm() { return 0; }
+
+int mkl_serv_cpuisclx() { return 0; }
+
+int mkl_serv_get_microarchitecture() {
+  // I don't know what this number means
+  return 33;
+}
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
diff --git a/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
new file mode 100644
index 0000000000..4844f2621d
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/intel_mkl_feature_patch.c
@@ -0,0 +1,49 @@
+/*********************** intel_mkl_feature_patch.c **************************
+ * Author: Agner Fog
+ * Date created: 2014-07-30
+ * Last modified: 2019-12-29
+ * Source URL: https://www.agner.org/optimize/intel_dispatch_patch.zip
+ * Language: C or C++
+ *
+ * Description:
+ * Patch for Intel Math Kernel Library (MKL) version 14.0 and later, except
+ * the Vector Math Library (VML).
+ *
+ * Example of how to patch Intel's CPU feature dispatcher in order to improve
+ * compatibility of Intel function libraries with non-Intel processors.
+ * In Windows: Use the static link libraries (*.lib), not the dynamic link
+ * libraries (*.DLL).
+ * In Linux and Mac: use static linking (*.a) or dynamic linking (*.so).
+ *
+ * Include this code in your C or C++ program and call intel_mkl_patch();
+ * before any call to the MKL functions. You may need to include
+ * intel_mkl_cpuid_patch.c as well.
+ *
+ * Copyright (c) 2014-2019. BSD License 2.0
+ ******************************************************************************/
+#include <stdint.h>
+
+#ifdef __cplusplus  // use C-style linking
+extern "C" {
+#endif
+
+// link to MKL libraries
+extern int64_t __intel_mkl_feature_indicator;    // CPU feature bits
+extern int64_t __intel_mkl_feature_indicator_x;  // CPU feature bits
+void __intel_mkl_features_init();    // unfair dispatcher: checks CPU features for
+                                     // Intel CPU's only
+void __intel_mkl_features_init_x();  // fair dispatcher: checks CPU features
+                                     // without discriminating by CPU brand
+
+#ifdef __cplusplus
+}  // end of extern "C"
+#endif
+
+void intel_mkl_use_fair_dispatch() {
+  // force a re-evaluation of the CPU features without discriminating by CPU
+  // brand
+  __intel_mkl_feature_indicator = 0;
+  __intel_mkl_feature_indicator_x = 0;
+  __intel_mkl_features_init_x();
+  __intel_mkl_feature_indicator = __intel_mkl_feature_indicator_x;
+}
diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt
new file mode 100644
index 0000000000..0f891c9ed3
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/readme.txt
@@ -0,0 +1,84 @@
+ intel_dispatch_patch.zip
+ ========================
+
+By Agner Fog, Technical University of Denmark, 2019.
diff --git a/src/TiledArray/external/agnerfog/readme.txt b/src/TiledArray/external/agnerfog/readme.txt
new file mode 100644
index 0000000000..0f891c9ed3
--- /dev/null
+++ b/src/TiledArray/external/agnerfog/readme.txt
@@ -0,0 +1,84 @@
+ intel_dispatch_patch.zip
+ ========================
+
+By Agner Fog, Technical University of Denmark, 2019.
+
+Intel's compilers generate code that will run slower than necessary when
+the code is executed on a CPU that is not produced by Intel. This has been
+observed with Intel C, C++, and Fortran compilers.
+
+The same happens when certain function libraries produced by Intel are used,
+even if the code is compiled with another compiler, such as the Microsoft,
+GNU, or Clang compilers.
+
+This problem affects several commonly used software programs such as
+Matlab, because they are using Intel software libraries.
+
+The library code and the code generated by an Intel compiler may contain
+multiple versions, each optimized for a particular instruction set extension.
+A so-called CPU dispatcher chooses the optimal version of the code at
+runtime, based on which CPU it is running on.
+
+CPU dispatchers can be fair or unfair. A fair CPU dispatcher chooses the
+optimal code based only on which instruction set extensions are supported
+by the CPU. An unfair dispatcher first checks the CPU brand. If the brand
+is not Intel, then the unfair dispatcher will choose the "generic" version
+of the code, i.e. the slowest version that is compatible with old CPUs
+without the relevant instruction set extensions.
+
+The CPU dispatchers in many Intel function libraries have two versions, a
+fair and an unfair one. It is not clear when the fair dispatcher is used
+and when the unfair dispatcher is used. My observations about fair and
+unfair CPU dispatching are as follows:
+
+* Code compiled with an Intel compiler will usually have unfair CPU dispatching.
+
+* The SVML (Short Vector Math Library) and IPP (Intel Performance Primitives)
+  function libraries from Intel are using the fair CPU dispatcher when used
+  with a non-Intel compiler.
+
+* The MKL (Math Kernel Library) library contains both fair and unfair
+  dispatchers. It is not clear which dispatcher is used for each function.
+
+The code examples contained herein may be used for circumventing unfair CPU
+dispatching in order to improve compatibility with non-Intel CPUs.
+
+The following files are contained:
+
+intel_cpu_feature_patch.c
+-------------------------
+This code makes sure the fair dispatcher is called instead of the unfair
+one for code generated with an Intel compiler and for general Intel
+function libraries.
+
+intel_mkl_feature_patch.c
+-------------------------
+This does the same for the Intel MKL library.
+
+intel_mkl_cpuid_patch.c
+-----------------------
+This code example overrides the CPU detection functions in Intel's MKL
+function library. The mkl_serv_intel_cpu() function in MKL returns
+1 when running on an Intel CPU and 0 when running on any other brand of
+CPU. You may include this code to replace this function in MKL with a
+function that returns 1 regardless of CPU brand.
+
+It may be necessary to use both intel_mkl_feature_patch.c and
+intel_mkl_cpuid_patch.c when using the MKL library in software that
+may run on any brand of CPU.
+
+An alternative method is to set the environment variable
+  MKL_DEBUG_CPU_TYPE=5
+when running on an AMD processor. This may be useful when you do not have
+access to the source code, for example when running Matlab software.
+
+The patches provided here are based on undocumented features in Intel
+function libraries. Use them at your own risk, and make sure to test your
+code properly to make sure it works as intended.
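The environment-variable route described above can also be taken from inside a program rather than from the shell, provided the variable is set before the first MKL call. A hedged sketch follows; it uses POSIX setenv, and whether MKL honors MKL_DEBUG_CPU_TYPE at all depends on the MKL version (recent versions are reported to ignore it), so treat this as illustrative only:

    // hypothetical alternative to the patch files: request a wider code
    // path via the undocumented MKL_DEBUG_CPU_TYPE variable
    #include <cstdlib>

    int main() {
      // must happen before the first MKL call; POSIX-only (Windows would
      // need _putenv_s); has no effect on MKL versions that dropped the
      // variable
      setenv("MKL_DEBUG_CPU_TYPE", "5", /*overwrite=*/1);
      // ... MKL calls ...
      return 0;
    }
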
+ +The most reliable solution is, of course, to avoid Intel compilers and +Intel function libraries in code that may run on other CPU brands such +as AMD and VIA. You may find other function libraries on the web, or +you may make your own functions. My vector class library (VCL) is useful +for making mathematical functions that process multiple data in parallel, +using the vector processing features of modern CPUs. diff --git a/src/TiledArray/external/btas.h b/src/TiledArray/external/btas.h index 483be905df..c22afd3813 100644 --- a/src/TiledArray/external/btas.h +++ b/src/TiledArray/external/btas.h @@ -62,6 +62,13 @@ class boxrange_iteration_order { static constexpr int value = row_major; }; +template +class is_tensor> : public std::true_type {}; + +template +class is_tensor> + : public std::true_type {}; + } // namespace btas namespace TiledArray { @@ -109,6 +116,34 @@ inline bool is_congruent(const btas::RangeNd& r1, r2.extent_data()); } +/// Test if a BTAS range and a TA range are congruent + +/// This function tests that the rank and extent of +/// \c r1 are equal to those of \c r2. +/// \param r1 The first Range to compare +/// \param r2 The second Range to compare +template +inline bool is_congruent(const btas::RangeNd& r1, + const TiledArray::Range& r2) { + return (r1.rank() == r2.rank()) && + std::equal(r1.extent_data(), r1.extent_data() + r1.rank(), + r2.extent_data()); +} + +/// Test if a TA range and a BTAS range are congruent + +/// This function tests that the rank and extent of +/// \c r1 are equal to those of \c r2. +/// \param r1 The first Range to compare +/// \param r2 The second Range to compare +template +inline bool is_congruent(const TiledArray::Range& r1, + const btas::RangeNd& r2) { + return (r1.rank() == r2.rank()) && + std::equal(r1.extent_data(), r1.extent_data() + r1.rank(), + r2.extent_data()); +} + template decltype(auto) make_ti(const btas::Tensor& arg) { return TiledArray::detail::TensorInterface gemm( gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(0), result.data(), n); + ldb, T(0), result.data(), ldc); return result; } @@ -708,16 +746,19 @@ inline void gemm(btas::Tensor& result, gemm_helper.compute_matrix_sizes(m, n, k, left.range(), right.range()); // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? n : k); + const integer lda = std::max( + integer{1}, + (gemm_helper.left_op() == TiledArray::math::blas::Op::NoTrans ? k : m)); + const integer ldb = std::max( + integer{1}, + (gemm_helper.right_op() == TiledArray::math::blas::Op::NoTrans ? 
n : k)); T factor_t(factor); + const integer ldc = std::max(integer{1}, n); TiledArray::math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, factor_t, left.data(), lda, right.data(), - ldb, T(1), result.data(), n); + ldb, T(1), result.data(), ldc); } // sum of the hyperdiagonal elements @@ -841,6 +882,20 @@ struct ordinal_traits> { : OrdinalType::ColMajor; }; +template +struct real_t_impl> { + using type = + typename btas::Tensor::template rebind_numeric_t< + typename btas::Tensor::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = + typename btas::Tensor::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/external/cuda.h b/src/TiledArray/external/cuda.h index 26424fa9f6..7a6e4d50e1 100644 --- a/src/TiledArray/external/cuda.h +++ b/src/TiledArray/external/cuda.h @@ -1,472 +1,3 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Chong Peng - * Department of Chemistry, Virginia Tech - * July 23, 2018 - * - */ - -#ifndef TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED -#define TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED - -#include -#include -#include - -#include - -#ifdef TILEDARRAY_HAS_CUDA - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -#include - -#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__) -#define CudaSafeCallNoThrow(err) __cudaSafeCallNoThrow(err, __FILE__, __LINE__) -#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) - -inline void __cudaSafeCall(cudaError err, const char* file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaSafeCall() failed at: " << file << ":" << line; - std::string what = ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -#endif -} - -inline void __cudaSafeCallNoThrow(cudaError err, const char* file, - const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR - if (cudaSuccess != err) { - madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); - } -#endif -} - -inline void __cudaCheckError(const char* file, const int line) { -#ifdef TILEDARRAY_CHECK_CUDA_ERROR - cudaError err = cudaGetLastError(); - if (cudaSuccess != err) { - std::stringstream ss; - ss << "cudaCheckError() failed at: " << file << ":" << line; - std::string what = ss.str(); - throw thrust::system_error(err, thrust::cuda_category(), what); - } -#endif -} - -namespace TiledArray { - -namespace detail { - -inline std::pair mpi_local_rank_size(World& world) { - auto host_comm = - world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0); - return std::make_pair(host_comm.Get_rank(), host_comm.Get_size()); -} - -inline int num_cuda_streams() { - int num_streams = -1; - char* num_stream_char = 
std::getenv("TA_CUDA_NUM_STREAMS"); - /// default num of streams is 3 - if (num_stream_char) { - num_streams = std::atoi(num_stream_char); - } else { - num_streams = 3; - } - return num_streams; -} - -inline int num_cuda_devices() { - int num_devices = -1; - CudaSafeCall(cudaGetDeviceCount(&num_devices)); - return num_devices; -} - -inline int current_cuda_device_id(World& world) { - int mpi_local_size = -1; - int mpi_local_rank = -1; - std::tie(mpi_local_rank, mpi_local_size) = mpi_local_rank_size(world); - - int num_devices = detail::num_cuda_devices(); - - int cuda_device_id = -1; - // devices may already be pre-mapped - // if mpi_local_size <= num_devices : all ranks are in same resource set, map - // round robin - if (mpi_local_size <= num_devices) { - cuda_device_id = mpi_local_rank % num_devices; - } else { // mpi_local_size > num_devices - char* cvd_cstr = std::getenv("CUDA_VISIBLE_DEVICES"); - if (cvd_cstr) { // CUDA_VISIBLE_DEVICES is set, assume that pre-mapped - // make sure that there is only 1 device available here - if (num_devices != 1) { - throw std::runtime_error( - std::string( - "CUDA_VISIBLE_DEVICES environment variable is set, hence using " - "the provided device-to-rank mapping; BUT TiledArray found ") + - std::to_string(num_devices) + - " CUDA devices; only 1 CUDA device / MPI process is supported"); - } - cuda_device_id = 0; - } else { // not enough devices + devices are not pre-mapped - throw std::runtime_error( - std::string("TiledArray found ") + std::to_string(mpi_local_size) + - " MPI ranks on a node with " + std::to_string(num_devices) + - " CUDA devices; only 1 MPI process / CUDA device model is currently " - "supported"); - } - } - - return cuda_device_id; -} - -inline void CUDART_CB cuda_readyflag_callback(void* userData) { - // convert void * to std::atomic - std::atomic* flag = static_cast*>(userData); - // set the flag to be true - flag->store(true); -} - -struct ProbeFlag { - ProbeFlag(std::atomic* f) : flag(f) {} - - bool operator()() const { return flag->load(); } - - std::atomic* flag; -}; - -inline void thread_wait_cuda_stream(const cudaStream_t& stream) { - std::atomic* flag = new std::atomic(false); - - CudaSafeCall( - cudaLaunchHostFunc(stream, detail::cuda_readyflag_callback, flag)); - - detail::ProbeFlag probe(flag); - - // wait with sleep and do not do work - madness::ThreadPool::await(probe, false, true); - // madness::ThreadPool::await(probe, true, true); - - delete flag; -} - -} // namespace detail - -inline const cudaStream_t*& tls_cudastream_accessor() { - static thread_local const cudaStream_t* thread_local_stream_ptr{nullptr}; - return thread_local_stream_ptr; -} - -inline void synchronize_stream(const cudaStream_t* stream) { - tls_cudastream_accessor() = stream; -} - -/** - * cudaEnv set up global environment - * - * Singleton class - */ - -class cudaEnv { - public: - ~cudaEnv() { - // destroy cuda streams on current device - for (auto& stream : cuda_streams_) { - CudaSafeCallNoThrow(cudaStreamDestroy(stream)); - } - } - - cudaEnv(const cudaEnv&) = delete; - cudaEnv(cudaEnv&&) = delete; - cudaEnv& operator=(const cudaEnv&) = delete; - cudaEnv& operator=(cudaEnv&&) = delete; - - /// access to static member - static std::unique_ptr& instance() { - static std::unique_ptr instance_{nullptr}; - if (!instance_) { - initialize(instance_, TiledArray::get_default_world()); - } - return instance_; - } - - /// initialize static member - static void initialize(std::unique_ptr& instance, World& world) { - // initialize only when not initialized - 
if (instance == nullptr) { - int num_streams = detail::num_cuda_streams(); - int num_devices = detail::num_cuda_devices(); - int device_id = detail::current_cuda_device_id(world); - // set device for current MPI process .. will be set in the ctor as well - CudaSafeCall(cudaSetDevice(device_id)); - CudaSafeCall(cudaDeviceSetCacheConfig(cudaFuncCachePreferShared)); - - // uncomment to debug umpire ops - // - // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( - // umpire::util::message::Debug); - - // make Thread Safe UM Dynamic POOL - - auto& rm = umpire::ResourceManager::getInstance(); - - auto mem_total_free = cudaEnv::memory_total_and_free_device(); - - // turn off Umpire introspection for non-Debug builds -#ifndef NDEBUG - constexpr auto introspect = true; -#else - constexpr auto introspect = false; -#endif - - // allocate all free memory for UM pool - // subsequent allocs will use 1/10 of the total device memory - auto alloc_grain = mem_total_free.second / 10; - auto um_dynamic_pool = - rm.makeAllocator( - "UMDynamicPool", rm.getAllocator("UM"), mem_total_free.second, - alloc_grain); - - // allocate zero memory for device pool, same grain for subsequent allocs - auto dev_size_limited_alloc = - rm.makeAllocator( - "size_limited_alloc", rm.getAllocator("DEVICE"), - mem_total_free.first); - auto dev_dynamic_pool = - rm.makeAllocator( - "CUDADynamicPool", dev_size_limited_alloc, 0, alloc_grain); - - auto cuda_env = std::unique_ptr( - new cudaEnv(world, num_devices, device_id, num_streams, - um_dynamic_pool, dev_dynamic_pool)); - instance = std::move(cuda_env); - } - } - - World& world() const { return *world_; } - - int num_cuda_devices() const { return num_cuda_devices_; } - - int current_cuda_device_id() const { return current_cuda_device_id_; } - - int num_cuda_streams() const { return num_cuda_streams_; } - - bool concurrent_managed_access() const { - return cuda_device_concurrent_managed_access_; - } - - size_t stream_id(const cudaStream_t& stream) const { - auto it = std::find(cuda_streams_.begin(), cuda_streams_.end(), stream); - if (it == cuda_streams_.end()) abort(); - return it - cuda_streams_.begin(); - } - - /// @return the total size of all and free device memory on the current device - static std::pair memory_total_and_free_device() { - std::pair result; - // N.B. 
cudaMemGetInfo returns {free,total} - CudaSafeCall(cudaMemGetInfo(&result.second, &result.first)); - return result; - } - - /// Collective call to probe CUDA {total,free} memory - - /// @return the total size of all and free device memory on every rank's - /// device - std::vector> memory_total_and_free() const { - auto world_size = world_->size(); - std::vector total_memory(world_size, 0), free_memory(world_size, 0); - auto rank = world_->rank(); - std::tie(total_memory.at(rank), free_memory.at(rank)) = - cudaEnv::memory_total_and_free_device(); - world_->gop.sum(total_memory.data(), total_memory.size()); - world_->gop.sum(free_memory.data(), free_memory.size()); - std::vector> result(world_size); - for (int r = 0; r != world_size; ++r) { - result.at(r) = {total_memory.at(r), free_memory.at(r)}; - } - return result; - } - - const cudaStream_t& cuda_stream(std::size_t i) const { - return cuda_streams_.at(i); - } - - const cudaStream_t& cuda_stream_h2d() const { - return cuda_streams_[num_cuda_streams_]; - } - - const cudaStream_t& cuda_stream_d2h() const { - return cuda_streams_[num_cuda_streams_ + 1]; - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA UM - umpire::Allocator& um_allocator() { return um_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()` - // clang-format on - std::size_t um_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - um_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - um_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - /// @return a (non-thread-safe) Umpire allocator for CUDA device memory - umpire::Allocator& device_allocator() { return device_allocator_; } - - // clang-format off - /// @return the max actual amount of memory held by um_allocator() - /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` - /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()` - // clang-format on - std::size_t device_allocator_getActualHighWatermark() { - TA_ASSERT(dynamic_cast( - device_allocator_.getAllocationStrategy()) != nullptr); - return dynamic_cast( - device_allocator_.getAllocationStrategy()) - ->getActualHighwaterMark(); - } - - protected: - cudaEnv(World& world, int num_devices, int device_id, int num_streams, - umpire::Allocator um_alloc, umpire::Allocator device_alloc) - : world_(&world), - um_allocator_(um_alloc), - device_allocator_(device_alloc), - num_cuda_devices_(num_devices), - current_cuda_device_id_(device_id), - num_cuda_streams_(num_streams) { - if (num_devices <= 0) { - throw std::runtime_error("No CUDA-Enabled GPUs Found!\n"); - } - - // set device for current MPI process - CudaSafeCall(cudaSetDevice(current_cuda_device_id_)); - - /// check the capability of CUDA device - cudaDeviceProp prop; - CudaSafeCall(cudaGetDeviceProperties(&prop, device_id)); - if (!prop.managedMemory) { - throw std::runtime_error("CUDA Device doesn't support managedMemory\n"); - } - int concurrent_managed_access; - CudaSafeCall(cudaDeviceGetAttribute(&concurrent_managed_access, - 
cudaDevAttrConcurrentManagedAccess, - device_id)); - cuda_device_concurrent_managed_access_ = concurrent_managed_access; - if (!cuda_device_concurrent_managed_access_) { - std::cout << "\nWarning: CUDA Device doesn't support " - "ConcurrentManagedAccess!\n\n"; - } - - // creates cuda streams on current device - cuda_streams_.resize(num_cuda_streams_ + 2); - for (auto& stream : cuda_streams_) { - CudaSafeCall(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - } - std::cout << "created " << num_cuda_streams_ - << " CUDA streams + 2 I/O streams" << std::endl; - } - - private: - // the world used to initialize this - World* world_; - - /// allocator backed by a (non-thread-safe) dynamically-sized pool for CUDA UM - umpire::Allocator um_allocator_; - /// allocator backed by a (non-thread-safe) dynamically-sized pool for device - /// memory - umpire::Allocator device_allocator_; - - int num_cuda_devices_; - int current_cuda_device_id_; - bool cuda_device_concurrent_managed_access_; - - int num_cuda_streams_; - std::vector cuda_streams_; -}; - -namespace detail { - -template -const cudaStream_t& get_stream_based_on_range(const Range& range) { - // TODO better way to get stream based on the id of tensor - auto stream_id = range.offset() % cudaEnv::instance()->num_cuda_streams(); - auto& stream = cudaEnv::instance()->cuda_stream(stream_id); - return stream; -} - -} // namespace detail - -namespace nvidia { - -// Color definitions for nvtxcalls -enum class argbColor : uint32_t { - red = 0xFFFF0000, - blue = 0xFF0000FF, - green = 0xFF008000, - yellow = 0xFFFFFF00, - cyan = 0xFF00FFFF, - magenta = 0xFFFF00FF, - gray = 0xFF808080, - purple = 0xFF800080 -}; - -/// enter a profiling range by calling nvtxRangePushEx -/// \param[in] range_title a char string containing the range title -/// \param[in] range_color the range color -inline void range_push(const char* range_title, argbColor range_color) { - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.colorType = NVTX_COLOR_ARGB; - eventAttrib.color = static_cast(range_color); - eventAttrib.message.ascii = range_title; - nvtxRangePushEx(&eventAttrib); -} - -/// exits the current profiling range by calling nvtxRangePopEx -inline void range_pop() { nvtxRangePop(); } - -} // namespace nvidia - -} // namespace TiledArray - -#endif // TILEDARRAY_HAS_CUDA - -#endif // TILEDARRAY_EXTERNAL_CUDA_H__INCLUDED +#warning \ + "This header is deprecated. Please use TiledArray/external/device.h instead." +#include diff --git a/src/TiledArray/external/device.h b/src/TiledArray/external/device.h new file mode 100644 index 0000000000..76d769b472 --- /dev/null +++ b/src/TiledArray/external/device.h @@ -0,0 +1,959 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2018 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ * + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 + * + */ + +#ifndef TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED +#define TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED + +#include +#include +#include +#include + +#include + +#if defined(TILEDARRAY_HAS_HIP) +#include +#elif defined(TILEDARRAY_HAS_CUDA) +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include + +#include +#include + +#include + +namespace TiledArray::detail { + +struct get_um_allocator { + inline umpire::Allocator& operator()(); +}; + +struct get_pinned_allocator { + inline umpire::Allocator& operator()(); +}; + +} // namespace TiledArray::detail + +#if defined(TILEDARRAY_HAS_CUDA) + +inline void __DeviceSafeCall(cudaError err, const char* file, const int line) { + if (cudaSuccess != err) { + std::stringstream ss; + ss << "DeviceSafeCall() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +inline void __cudaSafeCallNoThrow(cudaError err, const char* file, + const int line) { + if (cudaSuccess != err) { + madness::print_error("cudaSafeCallNoThrow() failed at: ", file, ":", line); + } +} + +inline void __cudaCheckError(const char* file, const int line) { + cudaError err = cudaGetLastError(); + if (cudaSuccess != err) { + std::stringstream ss; + ss << "cudaCheckError() failed at: " << file << ":" << line; + std::string what = ss.str(); + throw thrust::system_error(err, thrust::cuda_category(), what); + } +} + +#define DeviceSafeCall(err) __DeviceSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) \ + __cudaSafeCallNoThrow(err, __FILE__, __LINE__) +#define DeviceCheckError() __cudaCheckError(__FILE__, __LINE__) + +#elif defined(TILEDARRAY_HAS_HIP) + +inline void __hipSafeCall(hipError_t err, const char* file, const int line) { + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipSafeCall() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw std::runtime_error(ss.str()); + } +} + +inline void __hipSafeCallNoThrow(hipError_t err, const char* file, + const int line) { + if (hipSuccess != err) { + madness::print_error("hipSafeCallNoThrow() failed at: ", file, ":", line, + ": ", hipGetErrorString(err)); + } +} + +inline void __hipCheckError(const char* file, const int line) { + auto err = hipGetLastError(); + if (hipSuccess != err) { + std::stringstream ss; + ss << "hipCheckError() failed at: " << file << ":" << line << ": "; + ss << hipGetErrorString(err); + throw std::runtime_error(ss.str()); + } +} + +#define DeviceSafeCall(err) __hipSafeCall(err, __FILE__, __LINE__) +#define DeviceSafeCallNoThrow(err) __hipSafeCallNoThrow(err, __FILE__, __LINE__) +#define DeviceCheckError() __hipCheckError(__FILE__, __LINE__) + +#endif + +namespace TiledArray { +namespace device { + +#if defined(TILEDARRAY_HAS_CUDA) +inline namespace cuda { +using stream_t = cudaStream_t; +using error_t = cudaError_t; +using hostFn_t = cudaHostFn_t; +using deviceProp_t = cudaDeviceProp; +using deviceAttr_t = cudaDeviceAttr; +#define DeviceAttributeConcurrentManagedAccess \ + cudaDevAttrConcurrentManagedAccess +#define DEVICERT_CB CUDART_CB + +const inline auto Success = cudaSuccess; + +enum DeviceId { + CpuDeviceId = cudaCpuDeviceId, + InvalidDeviceId = cudaInvalidDeviceId +}; + +enum MemAttach { + MemAttachGlobal = cudaMemAttachGlobal, + MemAttachHost = cudaMemAttachHost, + MemAttachSingle = cudaMemAttachSingle +}; + +enum MemcpyKind { + 
MemcpyHostToHost = cudaMemcpyHostToHost, + MemcpyHostToDevice = cudaMemcpyHostToDevice, + MemcpyDeviceToHost = cudaMemcpyDeviceToHost, + MemcpyDeviceToDevice = cudaMemcpyDeviceToDevice, + MemcpyDefault = cudaMemcpyDefault +}; + +enum FuncCache { + FuncCachePreferNone = cudaFuncCachePreferNone, + FuncCachePreferShared = cudaFuncCachePreferShared, + FuncCachePreferL1 = cudaFuncCachePreferL1, + FuncCachePreferEqual = cudaFuncCachePreferEqual +}; + +enum StreamCreateFlags { + StreamDefault = cudaStreamDefault, + StreamNonBlocking = cudaStreamNonBlocking +}; + +constexpr inline auto DevAttrUnifiedAddressing = cudaDevAttrUnifiedAddressing; +constexpr inline auto DevAttrConcurrentManagedAccess = + cudaDevAttrConcurrentManagedAccess; + +inline error_t driverVersion(int* v) { return cudaDriverGetVersion(v); } + +inline error_t runtimeVersion(int* v) { return cudaRuntimeGetVersion(v); } + +inline error_t setDevice(int device) { return cudaSetDevice(device); } + +inline error_t getDevice(int* device) { return cudaGetDevice(device); } + +inline error_t getDeviceCount(int* num_devices) { + return cudaGetDeviceCount(num_devices); +} + +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return cudaDeviceSetCacheConfig(static_cast(cache_config)); +} + +inline error_t memGetInfo(size_t* free, size_t* total) { + return cudaMemGetInfo(free, total); +} + +inline error_t getDeviceProperties(deviceProp_t* prop, int device) { + return cudaGetDeviceProperties(prop, device); +} + +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return cudaDeviceGetAttribute(value, attr, device); +} + +inline error_t streamCreate(stream_t* pStream) { + return cudaStreamCreate(pStream); +} + +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return cudaStreamCreateWithFlags(pStream, flags); +} + +inline error_t deviceSynchronize() { return cudaDeviceSynchronize(); } +inline error_t streamSynchronize(stream_t stream) { + return cudaStreamSynchronize(stream); +} + +template +inline error_t malloc(T** devPtr, size_t size) { + return cudaMalloc(devPtr, size); +} + +template +inline error_t mallocHost(T** devPtr, size_t size) { + return cudaMallocHost(devPtr, size); +} + +template +inline error_t mallocManaged(T** devPtr, size_t size, + unsigned int flag = MemAttachGlobal) { + return cudaMallocManaged(devPtr, size, flag); +} + +template +error_t free(T* devPtr) { + return cudaFree(devPtr); +} + +template +error_t memcpy(T* dst, const T* src, size_t count, MemcpyKind kind) { + return cudaMemcpy(dst, src, count, static_cast(kind)); +} + +template +error_t memcpyAsync(T* dst, const T* src, size_t count, MemcpyKind kind, + stream_t stream = 0) { + return cudaMemcpyAsync(dst, src, count, static_cast(kind), + stream); +} + +template +error_t memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, + stream_t stream = 0) { + return cudaMemPrefetchAsync(devPtr, count, dstDevice, stream); +} + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return cudaLaunchHostFunc(stream, fn, userData); +} + +inline error_t streamDestroy(stream_t stream) { + return cudaStreamDestroy(stream); +} + +} // namespace cuda +#elif defined(TILEDARRAY_HAS_HIP) +inline namespace hip { +using stream_t = hipStream_t; +using error_t = hipError_t; +using hostFn_t = hipHostFn_t; +using deviceProp_t = hipDeviceProp_t; +using deviceAttr_t = hipDeviceAttribute_t; +#define DeviceAttributeConcurrentManagedAccess \ + hipDeviceAttributeConcurrentManagedAccess 
+#define DEVICERT_CB + +const inline auto Success = hipSuccess; + +enum DeviceId { + CpuDeviceId = hipCpuDeviceId, + InvalidDeviceId = hipInvalidDeviceId +}; + +enum MemcpyKind { + MemcpyHostToHost = hipMemcpyHostToHost, + MemcpyHostToDevice = hipMemcpyHostToDevice, + MemcpyDeviceToHost = hipMemcpyDeviceToHost, + MemcpyDeviceToDevice = hipMemcpyDeviceToDevice, + MemcpyDefault = hipMemcpyDefault +}; + +enum MemAttach { + MemAttachGlobal = hipMemAttachGlobal, + MemAttachHost = hipMemAttachHost, + MemAttachSingle = hipMemAttachSingle +}; + +enum FuncCache { + FuncCachePreferNone = hipFuncCachePreferNone, + FuncCachePreferShared = hipFuncCachePreferShared, + FuncCachePreferL1 = hipFuncCachePreferL1, + FuncCachePreferEqual = hipFuncCachePreferEqual +}; + +enum StreamCreateFlags { + StreamDefault = hipStreamDefault, + StreamNonBlocking = hipStreamNonBlocking +}; + +constexpr inline auto DevAttrUnifiedAddressing = + hipDeviceAttributeUnifiedAddressing; +constexpr inline auto DevAttrConcurrentManagedAccess = + hipDeviceAttributeConcurrentManagedAccess; + +inline error_t driverVersion(int* v) { return hipDriverGetVersion(v); } + +inline error_t runtimeVersion(int* v) { return hipRuntimeGetVersion(v); } + +inline error_t setDevice(int device) { return hipSetDevice(device); } + +inline error_t getDevice(int* device) { return hipGetDevice(device); } + +inline error_t getDeviceCount(int* num_devices) { + return hipGetDeviceCount(num_devices); +} + +inline error_t deviceSetCacheConfig(FuncCache cache_config) { + return hipDeviceSetCacheConfig(static_cast(cache_config)); +} + +inline error_t memGetInfo(size_t* free, size_t* total) { + return hipMemGetInfo(free, total); +} + +inline error_t getDeviceProperties(deviceProp_t* prop, int device) { + return hipGetDeviceProperties(prop, device); +} + +inline error_t deviceGetAttribute(int* value, deviceAttr_t attr, int device) { + return hipDeviceGetAttribute(value, attr, device); +} + +inline error_t streamCreate(stream_t* pStream) { + return hipStreamCreate(pStream); +} + +inline error_t streamCreateWithFlags(stream_t* pStream, + StreamCreateFlags flags) { + return hipStreamCreateWithFlags(pStream, flags); +} + +inline error_t deviceSynchronize() { return hipDeviceSynchronize(); } + +inline error_t streamSynchronize(stream_t stream) { + return hipStreamSynchronize(stream); +} + +template +inline error_t malloc(T** devPtr, size_t size) { + return hipMalloc(devPtr, size); +} + +template +inline error_t mallocHost(T** devPtr, size_t size) { + return hipMallocHost(devPtr, size); +} + +template +inline error_t mallocManaged(T** devPtr, size_t size, + unsigned int flag = MemAttachGlobal) { + return hipMallocManaged(devPtr, size, flag); +} + +template +error_t free(T* devPtr) { + return hipFree(devPtr); +} + +template +error_t memcpy(T* dst, const T* src, size_t count, MemcpyKind kind) { + return hipMemcpy(dst, src, count, static_cast(kind)); +} + +template +error_t memcpyAsync(T* dst, const T* src, size_t count, MemcpyKind kind, + stream_t stream = 0) { + return hipMemcpyAsync(dst, src, count, static_cast(kind), + stream); +} + +template +error_t memPrefetchAsync(const T* devPtr, size_t count, int dstDevice, + stream_t stream = 0) { + return hipMemPrefetchAsync(devPtr, count, dstDevice, stream); +} + +inline error_t launchHostFunc(stream_t stream, hostFn_t fn, void* userData) { + return hipLaunchHostFunc(stream, fn, userData); +} + +inline error_t streamDestroy(stream_t stream) { + return hipStreamDestroy(stream); +} + +} // namespace hip +#endif + +#ifdef 
TILEDARRAY_HAS_DEVICE + +inline int num_streams_per_device() { + int num_streams = -1; + char* num_stream_char = std::getenv("TA_DEVICE_NUM_STREAMS"); + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { +#if defined(TILEDARRAY_HAS_CUDA) + char* num_stream_char = std::getenv("TA_CUDA_NUM_STREAMS"); +#elif defined(TILEDARRAY_HAS_HIP) + char* num_stream_char = std::getenv("TA_HIP_NUM_STREAMS"); +#endif + if (num_stream_char) { + num_streams = std::atoi(num_stream_char); + } else { + /// default num of streams is 3 + num_streams = 3; + } + } + return num_streams; +} + +inline void DEVICERT_CB readyflag_callback(void* userData) { + // convert void * to std::atomic + std::atomic* flag = static_cast*>(userData); + // set the flag to be true + flag->store(true); +} + +struct ProbeFlag { + ProbeFlag(std::atomic* f) : flag(f) {} + + bool operator()() const { return flag->load(); } + + std::atomic* flag; +}; + +inline void thread_wait_stream(const stream_t& stream) { + std::atomic* flag = new std::atomic(false); + + DeviceSafeCall(launchHostFunc(stream, readyflag_callback, flag)); + + ProbeFlag probe(flag); + + // wait with sleep and do not do work + madness::ThreadPool::await(probe, false, true); + // madness::ThreadPool::await(probe, true, true); + + delete flag; +} + +/// Stream is a `{device, stream_t}` pair, i.e. the analog of blas::Queue. +/// It exists as a syntactic sugar around stream_t, and to avoid the need +/// to deduce the device from stream +/// \internal did not name it queue to avoid naming dichotomies +/// all over the place +struct Stream { + int device; + stream_t stream; + Stream(int device, stream_t stream) : device(device), stream(stream) {} + + /// Stream is implicitly convertible to stream + operator stream_t() const { return stream; } +}; + +/** + * Env maintains the device-related part of the runtime environment, + * such as specialized data structures like device streams and memory allocators + * + * \note this is a Singleton + */ +class Env { + public: + ~Env() { + // destroy streams on current device + for (auto& [device, stream] : streams_) { + DeviceSafeCallNoThrow(streamDestroy(stream)); + } + } + + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; + + /// access the singleton instance; if not initialized will be + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { + if (!instance_accessor()) { + initialize(); + } + return instance_accessor(); + } + + // clang-format off + /// initialize the instance using explicit params + /// \param world the world to use for initialization + /// \param page_size memory added to the pools supporting `this->um_allocator()`, `this->device_allocator()`, and `this->pinned_allocator()` in chunks of at least + /// this size (bytes) [default=2^25] + /// \param pinned_alloc_limit the maximum total amount of memory (in bytes) that + /// allocator returned by `this->pinned_allocator()` can allocate [default=2^40] + // clang-format on + static void initialize(World& world = TiledArray::get_default_world(), + const std::uint64_t page_size = (1ul << 25), + const std::uint64_t pinned_alloc_limit = (1ul << 40)) { + static std::mutex mtx; // to make initialize() reentrant + std::scoped_lock lock{mtx}; + // only the winner of the lock race gets to initialize + if (instance_accessor() == nullptr) { + int num_streams_per_device = device::num_streams_per_device(); + const int num_visible_devices = []() { 
+ int num_visible_devices = -1; + DeviceSafeCall(getDeviceCount(&num_visible_devices)); + return num_visible_devices; + }(); + const auto compute_devices = [num_visible_devices](World& world) { + std::vector compute_devices; + static const std::tuple local_rank_size = + TiledArray::detail::mpi_local_rank_size(world); + const auto& [mpi_local_rank, mpi_local_size] = local_rank_size; + // map ranks to default device round robin + int device_id = mpi_local_rank % num_visible_devices; + while (device_id < num_visible_devices) { + compute_devices.push_back(device_id); + device_id += mpi_local_size; + } + + return compute_devices; + }(world); + + // configure devices for this rank + for (auto device : compute_devices) { + DeviceSafeCall(setDevice(device)); + DeviceSafeCall(deviceSetCacheConfig(FuncCachePreferShared)); + } + // use the first device as default: + DeviceSafeCall(setDevice(compute_devices[0])); + + // uncomment to debug umpire ops + // + // umpire::util::Logger::getActiveLogger()->setLoggingMsgLevel( + // umpire::util::message::Debug); + + // make Thread Safe UM Dynamic POOL + + auto& rm = umpire::ResourceManager::getInstance(); + + auto mem_total_free = Env::memory_total_and_free_device(); + + // turn off Umpire introspection for non-Debug builds +#ifndef NDEBUG + constexpr auto introspect = true; +#else + constexpr auto introspect = false; +#endif + + // allocate all currently-free memory for UM pool + auto um_dynamic_pool = + rm.makeAllocator( + "UMDynamicPool", rm.getAllocator("UM"), + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); + + // allocate zero memory for device pool + auto dev_size_limited_alloc = + rm.makeAllocator( + "size_limited_alloc", rm.getAllocator("DEVICE"), + mem_total_free.first); + auto dev_dynamic_pool = + rm.makeAllocator( + "DEVICEDynamicPool", dev_size_limited_alloc, + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size); + + // allocate pinned_alloc_limit in pinned memory + auto pinned_size_limited_alloc = + rm.makeAllocator( + "SizeLimited_PINNED", rm.getAllocator("PINNED"), + pinned_alloc_limit); + auto pinned_dynamic_pool = + rm.makeAllocator( + "QuickPool_SizeLimited_PINNED", pinned_size_limited_alloc, + /* first_minimum_pool_allocation_size = */ 0, + /* next_minimum_pool_allocation_size = */ page_size, + /* alignment */ TILEDARRAY_ALIGN_SIZE); + + auto env = std::unique_ptr(new Env( + world, num_visible_devices, compute_devices, num_streams_per_device, + um_dynamic_pool, dev_dynamic_pool, pinned_dynamic_pool)); + instance_accessor() = std::move(env); + } + } + + World& world() const { return *world_; } + + /// @return the number of devices visible to this rank + int num_visible_devices() const { return num_devices_visible_; } + + /// @return the number of compute devices assigned to this rank + int num_compute_devices() const { return compute_devices_.size(); } + + /// @return the device pointed to by the currently-active device runtime + /// context + int current_device_id() const { + TA_ASSERT(num_compute_devices() > 0); + int current_device = -1; + DeviceSafeCall(getDevice(¤t_device)); + return current_device; + } + + /// @return the total number of compute streams (for all devices) + /// visible to this rank + int num_streams_total() const { return streams_.size(); } + + bool concurrent_managed_access() const { + return device_concurrent_managed_access_; + } + + size_t stream_id(const stream_t& stream) const { + auto it = 
std::find(streams_.begin(), streams_.end(), stream); + if (it == streams_.end()) abort(); + return it - streams_.begin(); + } + + /// @return the total size of all and free device memory on the current device + static std::pair memory_total_and_free_device() { + std::pair result; + // N.B. *MemGetInfo returns {free,total} + DeviceSafeCall(memGetInfo(&result.second, &result.first)); + return result; + } + + /// Collective call to probe device {total,free} memory + + /// @return the total size of all and free device memory on every rank's + /// device + std::vector> memory_total_and_free() const { + auto world_size = world_->size(); + std::vector total_memory(world_size, 0), free_memory(world_size, 0); + auto rank = world_->rank(); + std::tie(total_memory.at(rank), free_memory.at(rank)) = + Env::memory_total_and_free_device(); + world_->gop.sum(total_memory.data(), total_memory.size()); + world_->gop.sum(free_memory.data(), free_memory.size()); + std::vector> result(world_size); + for (int r = 0; r != world_size; ++r) { + result.at(r) = {total_memory.at(r), free_memory.at(r)}; + } + return result; + } + + /// @param[in] i compute stream ordinal + /// @pre `inum_streams_total()); + return streams_[i]; + } + + /// @return a (non-thread-safe) Umpire allocator for device UM + umpire::Allocator& um_allocator() { return um_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by um_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using UM memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("UM").getHighWatermark()` + // clang-format on + std::size_t um_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + um_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + um_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + /// @return a (non-thread-safe) Umpire allocator for device memory + umpire::Allocator& device_allocator() { return device_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by um_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using DEVICE memory should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("DEVICE").getHighWatermark()` + // clang-format on + std::size_t device_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + device_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + device_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + /// @return an Umpire allocator that allocates from a + /// pinned memory pool + /// @warning this is not a thread-safe allocator, should be only used when + /// wrapped into umpire_based_allocator_impl + umpire::Allocator& pinned_allocator() { return pinned_allocator_; } + + // clang-format off + /// @return the max actual amount of memory held by pinned_allocator() + /// @details returns the value provided by `umpire::strategy::QuickPool::getHighWatermark()` + /// @note if there is only 1 Umpire allocator using PINNED memory this should be identical to the value returned by `umpire::ResourceManager::getInstance().getAllocator("PINNED").getHighWatermark()` + // clang-format on + std::size_t pinned_allocator_getActualHighWatermark() { + TA_ASSERT(dynamic_cast( + 
pinned_allocator_.getAllocationStrategy()) != nullptr); + return dynamic_cast( + pinned_allocator_.getAllocationStrategy()) + ->getActualHighwaterMark(); + } + + protected: + Env(World& world, int num_visible_devices, std::vector compute_devices, + int num_streams_per_device, umpire::Allocator um_alloc, + umpire::Allocator device_alloc, umpire::Allocator pinned_alloc) + : world_(&world), + um_allocator_(um_alloc), + device_allocator_(device_alloc), + pinned_allocator_(pinned_alloc), + num_devices_visible_(num_visible_devices), + compute_devices_(std::move(compute_devices)), + num_streams_per_device_(num_streams_per_device) { + if (compute_devices_.size() <= 0) { + throw std::runtime_error("No " TILEDARRAY_DEVICE_RUNTIME_STR + " compute devices found!\n"); + } + + streams_.reserve(num_streams_per_device_ * compute_devices_.size()); + + /// ensure the desired capabilities of each device + for (auto device : compute_devices_) { + deviceProp_t prop; + DeviceSafeCall(getDeviceProperties(&prop, device)); + if (!prop.managedMemory) { + throw std::runtime_error(TILEDARRAY_DEVICE_RUNTIME_STR + "device doesn't support managedMemory\n"); + } + int concurrent_managed_access; + DeviceSafeCall(deviceGetAttribute(&concurrent_managed_access, + DeviceAttributeConcurrentManagedAccess, + device)); + device_concurrent_managed_access_ = + device_concurrent_managed_access_ && concurrent_managed_access; + if (!initialized_to_be_quiet() && !device_concurrent_managed_access_) { + std::cout << "\nWarning: " TILEDARRAY_DEVICE_RUNTIME_STR + " device doesn't support " + "ConcurrentManagedAccess!\n\n"; + } + + // creates streams on current device + DeviceSafeCall(setDevice(device)); + for (int s = 0; s != num_streams_per_device_; ++s) { + stream_t stream; + DeviceSafeCall(streamCreateWithFlags(&stream, StreamNonBlocking)); + streams_.emplace_back(device, stream); + } + } + + if (!initialized_to_be_quiet() && world.rank() == 0) { + auto nstreams = streams_.size(); + std::cout << "created " << nstreams + << " " TILEDARRAY_DEVICE_RUNTIME_STR " stream" + << (nstreams == 1 ? "" : "s") << std::endl; + } + + // lastly, set default device for current MPI process's (main) thread + DeviceSafeCall(setDevice(compute_devices_.front())); + } + + private: + // the world used to initialize this + World* world_; + + /// allocator backed by a (non-thread-safe) dynamically-sized pool for UM + umpire::Allocator um_allocator_; + /// allocator backed by a (non-thread-safe) dynamically-sized pool for device + /// memory + umpire::Allocator device_allocator_; + // allocates from a dynamic, size-limited pinned memory pool + // N.B. 
not thread safe, so must be wrapped into umpire_based_allocator_impl + umpire::Allocator pinned_allocator_; + + int num_devices_visible_; // total number of devices visible to this rank + std::vector + compute_devices_; // list of devices assigned to this rank, + // compute_devices_.size()<=num_devices_visible_ + bool device_concurrent_managed_access_ = true; + + int num_streams_per_device_; + std::vector streams_; // streams_.size() == (num_streams_per_device_) + // * compute_devices_.size() + + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; + return instance_; + } +}; // class Env + +namespace detail { + +// in a madness device task point to its local optional stream to use by +// madness_task_stream_opt; set to nullptr after task callable finished +inline std::optional*& madness_task_stream_opt_ptr_accessor() { + static thread_local std::optional* stream_opt_ptr = nullptr; + return stream_opt_ptr; +} + +inline std::optional& tls_stream_opt_accessor() { + static thread_local std::optional stream_opt = + + device::Env::instance()->stream(0); + return stream_opt; +} + +inline std::optional& madness_task_stream_opt_accessor() { + if (madness_task_stream_opt_ptr_accessor() != nullptr) // in a device task? + return *madness_task_stream_opt_ptr_accessor(); + else + return tls_stream_opt_accessor(); +} +} // namespace detail + +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p s +/// before task completion +/// \param s the stream to synchronize this task with +inline void sync_madness_task_with(const Stream& s) { + if (!detail::madness_task_stream_opt_accessor()) + detail::madness_task_stream_opt_accessor() = s; + else { + TA_ASSERT(*detail::madness_task_stream_opt_accessor() == s); + } +} + +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p stream associated with device \p device +/// on the *current* device before task completion +/// \param device the device associated with \p stream +/// \param stream the stream to synchronize this task with +inline void sync_madness_task_with(int device, stream_t stream) { + sync_madness_task_with(Stream{device, stream}); +} + +/// must call this before exiting the device task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to synchronize with \p stream on the *current* device +/// before task completion +/// \param stream the stream to synchronize this task with +inline void sync_madness_task_with(stream_t stream) { + TA_ASSERT(stream != nullptr); + int current_device = -1; + DeviceSafeCall(getDevice(¤t_device)); + sync_madness_task_with(current_device, stream); +} + +/// @return the optional Stream with which this task will be synced +inline std::optional madness_task_current_stream() { + return detail::madness_task_stream_opt_accessor(); +} + +/// should call this within a task submitted to +/// the MADNESS runtime via madness::add_device_task +/// to cancel the previous calls to sync_madness_task_with() +/// if, e.g., it synchronized with any work performed +/// before exiting +inline void cancel_madness_task_sync() { + detail::madness_task_stream_opt_accessor() = {}; +} + +/// maps a (tile) Range to device::Stream; if had already pushed work into a +/// device::Stream (as indicated by madness_task_current_stream() ) +/// will return that Stream instead +/// @param[in] range will determine 
the device::Stream to compute an object +/// associated with this Range object +/// @return the device::Stream to use for creating tasks generating work +/// associated with Range \p range +template +device::Stream stream_for(const Range& range) { + const auto stream_opt = madness_task_current_stream(); + if (!stream_opt) { + auto stream_ord = + range.offset() % device::Env::instance()->num_streams_total(); + return device::Env::instance()->stream(stream_ord); + } else + return *stream_opt; +} + +} // namespace device + +namespace detail { + +inline umpire::Allocator& get_um_allocator::operator()() { + return deviceEnv::instance()->um_allocator(); +} + +inline umpire::Allocator& get_pinned_allocator::operator()() { + return deviceEnv::instance()->pinned_allocator(); +} + +#endif // TILEDARRAY_HAS_DEVICE + +} // namespace detail + +#ifdef TILEDARRAY_HAS_CUDA +namespace nvidia { + +// Color definitions for nvtxcalls +enum class argbColor : uint32_t { + red = 0xFFFF0000, + blue = 0xFF0000FF, + green = 0xFF008000, + yellow = 0xFFFFFF00, + cyan = 0xFF00FFFF, + magenta = 0xFFFF00FF, + gray = 0xFF808080, + purple = 0xFF800080 +}; + +/// enter a profiling range by calling nvtxRangePushEx +/// \param[in] range_title a char string containing the range title +/// \param[in] range_color the range color +inline void range_push(const char* range_title, argbColor range_color) { + nvtxEventAttributes_t eventAttrib = {0}; + eventAttrib.version = NVTX_VERSION; + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; + eventAttrib.colorType = NVTX_COLOR_ARGB; + eventAttrib.color = static_cast(range_color); + eventAttrib.message.ascii = range_title; + nvtxRangePushEx(&eventAttrib); +} + +/// exits the current profiling range by calling nvtxRangePopEx +inline void range_pop() { nvtxRangePop(); } + +} // namespace nvidia +#endif // TILEDARRAY_HAS_CUDA + +} // namespace TiledArray + +#endif // TILEDARRAY_EXTERNAL_DEVICE_H__INCLUDED diff --git a/src/TiledArray/external/librett.h b/src/TiledArray/external/librett.h index 46d116c45b..b6b6cee3bc 100644 --- a/src/TiledArray/external/librett.h +++ b/src/TiledArray/external/librett.h @@ -26,7 +26,7 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE #include #include @@ -74,11 +74,12 @@ inline void permutation_to_col_major(std::vector& perm) { * @param outData pointer to data in output Tensor, must be accessible on GPU * @param range the Range of input Tensor inData * @param perm the permutation object - * @param stream the CUDA stream this permutation will be submitted to + * @param stream the device stream this permutation will be submitted to */ template void librett_permute(T* inData, T* outData, const TiledArray::Range& range, - const TiledArray::Permutation& perm, cudaStream_t stream) { + const TiledArray::Permutation& perm, + device::stream_t stream) { auto extent = range.extent(); std::vector extent_int(extent.begin(), extent.end()); @@ -110,6 +111,6 @@ void librett_permute(T* inData, T* outData, const TiledArray::Range& range, } // namespace TiledArray -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE #endif // TILEDARRAY_EXTERNAL_LIBRETT_H__INCLUDED diff --git a/src/TiledArray/external/madness.h b/src/TiledArray/external/madness.h index ecfa313d9b..bf75813c61 100644 --- a/src/TiledArray/external/madness.h +++ b/src/TiledArray/external/madness.h @@ -20,11 +20,6 @@ #ifndef TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED #define TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED -// This needs to be 
defined before world/worldreduce.h and world/worlddc.h -#ifndef WORLD_INSTANTIATE_STATIC_TEMPLATES -#define WORLD_INSTANTIATE_STATIC_TEMPLATES -#endif // WORLD_INSTANTIATE_STATIC_TEMPLATES - #include #include @@ -133,6 +128,14 @@ inline World split(const World& w, int color, int key = 0) { return std::move(comm); } +namespace detail { +inline std::pair mpi_local_rank_size(World& world) { + auto host_comm = + world.mpi.comm().Split_type(SafeMPI::Intracomm::SHARED_SPLIT_TYPE, 0); + return std::make_pair(host_comm.Get_rank(), host_comm.Get_size()); +} +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_EXTERNAL_MADNESS_H__INCLUDED diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h index 644039abe7..ac23a60260 100644 --- a/src/TiledArray/external/umpire.h +++ b/src/TiledArray/external/umpire.h @@ -33,6 +33,8 @@ #include #include +#include + #include #include @@ -45,7 +47,7 @@ struct NullLock { static void unlock() {} }; -template +template class MutexLock { static std::mutex mtx_; @@ -69,7 +71,7 @@ std::mutex MutexLock::mtx_; /// \tparam StaticLock a type providing static `lock()` and `unlock()` methods ; /// defaults to NullLock which does not lock template -class umpire_allocator_impl { +class umpire_based_allocator_impl { public: using value_type = T; using pointer = value_type*; @@ -87,18 +89,20 @@ class umpire_allocator_impl { typename std::pointer_traits::difference_type; using size_type = std::make_unsigned_t; - umpire_allocator_impl(umpire::Allocator* umpalloc) noexcept + umpire_based_allocator_impl(umpire::Allocator* umpalloc) noexcept : umpalloc_(umpalloc) {} template - umpire_allocator_impl(const umpire_allocator_impl& rhs) noexcept + umpire_based_allocator_impl( + const umpire_based_allocator_impl& rhs) noexcept : umpalloc_(rhs.umpalloc_) {} /// allocates memory using umpire dynamic pool pointer allocate(size_t n) { TA_ASSERT(umpalloc_); - size_t nbytes = n * sizeof(T); + // QuickPool::allocate_internal does not handle zero-size allocations + size_t nbytes = n == 0 ? 1 : n * sizeof(T); pointer result = nullptr; auto* allocation_strategy = umpalloc_->getAllocationStrategy(); @@ -117,7 +121,8 @@ class umpire_allocator_impl { void deallocate(pointer ptr, size_t n) { TA_ASSERT(umpalloc_); - const auto nbytes = n * sizeof(T); + // QuickPool::allocate_internal does not handle zero-size allocations + const auto nbytes = n == 0 ? 1 : n * sizeof(T); auto* allocation_strategy = umpalloc_->getAllocationStrategy(); // N.B. 
with multiple threads would have to do this test in @@ -135,17 +140,67 @@ class umpire_allocator_impl { private: umpire::Allocator* umpalloc_; -}; // class umpire_allocator +}; // class umpire_based_allocator_impl + +template +bool operator==( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { + return lhs.umpire_allocator() == rhs.umpire_allocator(); +} + +template +bool operator!=( + const umpire_based_allocator_impl& lhs, + const umpire_based_allocator_impl& rhs) noexcept { + return !(lhs == rhs); +} -template -bool operator==(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +template +class umpire_based_allocator + : public umpire_based_allocator_impl { + public: + using base_type = umpire_based_allocator_impl; + using typename base_type::const_pointer; + using typename base_type::const_reference; + using typename base_type::pointer; + using typename base_type::reference; + using typename base_type::value_type; + + umpire_based_allocator() noexcept : base_type(&UmpireAllocatorAccessor{}()) {} + + template + umpire_based_allocator( + const umpire_based_allocator& + rhs) noexcept + : base_type( + static_cast&>( + rhs)) {} + + template + friend bool operator==( + const umpire_based_allocator& + lhs, + const umpire_based_allocator& + rhs) noexcept; +}; // class umpire_based_allocator + +template +bool operator==( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { return lhs.umpire_allocator() == rhs.umpire_allocator(); } -template -bool operator!=(const umpire_allocator_impl& lhs, - const umpire_allocator_impl& rhs) noexcept { +template +bool operator!=( + const umpire_based_allocator& lhs, + const umpire_based_allocator& + rhs) noexcept { return !(lhs == rhs); } @@ -169,6 +224,9 @@ class default_init_allocator : public A { using A::A; + default_init_allocator(A const& a) noexcept : A(a) {} + default_init_allocator(A&& a) noexcept : A(std::move(a)) {} + template void construct(U* ptr) noexcept( std::is_nothrow_default_constructible::value) { @@ -182,4 +240,85 @@ class default_init_allocator : public A { } // namespace TiledArray -#endif // TILEDARRAY_CUDA_UM_ALLOCATOR_H___INCLUDED +namespace madness { +namespace archive { + +template +struct ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::umpire_based_allocator_impl& allocator) { + std::string allocator_name; + ar & allocator_name; + allocator = TiledArray::umpire_based_allocator_impl( + umpire::ResourceManager::getInstance().getAllocator(allocator_name)); + } +}; + +template +struct ArchiveStoreImpl< + Archive, TiledArray::umpire_based_allocator_impl> { + static inline void store( + const Archive& ar, + const TiledArray::umpire_based_allocator_impl& allocator) { + ar & allocator.umpire_allocator()->getName(); + } +}; + +template +struct ArchiveLoadImpl> { + static inline void load(const Archive& ar, + TiledArray::default_init_allocator& allocator) { + if constexpr (!std::allocator_traits::is_always_equal::value) { + A base_allocator; + ar & base_allocator; + allocator = TiledArray::default_init_allocator(base_allocator); + } + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::default_init_allocator& allocator) { + if constexpr (!std::allocator_traits::is_always_equal::value) { + ar& static_cast(allocator); + } + } +}; + +} // namespace archive +} // namespace madness + +namespace madness { +namespace archive { + +template +struct 
ArchiveLoadImpl> { + static inline void load( + const Archive& ar, + TiledArray::umpire_based_allocator& allocator) { + allocator = TiledArray::umpire_based_allocator{}; + } +}; + +template +struct ArchiveStoreImpl> { + static inline void store( + const Archive& ar, + const TiledArray::umpire_based_allocator< + T, StaticLock, UmpireAllocatorAccessor>& allocator) {} +}; + +} // namespace archive +} // namespace madness + +#endif // TILEDARRAY_EXTERNAL_UMPIRE_H___INCLUDED diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h index 87af0e6115..e33aea5c18 100644 --- a/src/TiledArray/fwd.h +++ b/src/TiledArray/fwd.h @@ -36,12 +36,30 @@ class aligned_allocator; // fwddecl host_allocator namespace TiledArray { -template -class host_allocator_impl; -template +namespace detail { +struct get_host_allocator; +struct NullLock; +template +class MutexLock; +} // namespace detail + +template +class umpire_based_allocator; + +template > class default_init_allocator; + +namespace host { +class Env; +} +using hostEnv = host::Env; + +/// pooled thread-safe host memory allocator template -using host_allocator = default_init_allocator>; +using host_allocator = + default_init_allocator, + detail::get_host_allocator>>; } // namespace TiledArray namespace madness { @@ -81,31 +99,43 @@ typedef Tensor TensorL; typedef Tensor> TensorZ; typedef Tensor> TensorC; -// CUDA tensor -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE +namespace device { +class Env; +} +using deviceEnv = device::Env; -template -class cuda_um_allocator_impl; +namespace detail { +struct get_um_allocator; +struct get_pinned_allocator; +} // namespace detail -template > -class default_init_allocator; +/// pooled thread-safe unified memory (UM) allocator for device computing +template +using device_um_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_um_allocator>>; +/// pooled thread-safe pinned host memory allocator for device computing template -using cuda_um_allocator = default_init_allocator>; +using device_pinned_allocator = default_init_allocator< + T, umpire_based_allocator, + detail::get_pinned_allocator>>; -/// \brief a vector that lives in CUDA Unified Memory, with most operations +/// \brief a vector that lives in UM, with most operations /// implemented on the CPU template -using cuda_um_btas_varray = ::btas::varray>; +using device_um_btas_varray = + ::btas::varray>; /** - * btas::Tensor with UM storage cuda_um_btas_varray + * btas::Tensor with UM storage device_um_btas_varray */ template using btasUMTensorVarray = - ::btas::Tensor>; + ::btas::Tensor>; -#endif +#endif // TILEDARRAY_HAS_DEVICE template class Tile; @@ -117,10 +147,32 @@ namespace symmetry { class Permutation; } +// shapes +class DenseShape; +template +class SparseShape; + // TiledArray Arrays template class DistArray; +/// Type trait to detect dense shape types +template +struct is_dense : public std::false_type {}; + +template <> +struct is_dense : public std::true_type {}; + +template <> +struct is_dense : public std::true_type {}; + +template +struct is_dense> + : public is_dense::shape_type> {}; + +template +constexpr const bool is_dense_v = is_dense::value; + // Dense Array Typedefs template using TArray = DistArray, DensePolicy>; @@ -151,6 +203,35 @@ using Array enum class HostExecutor { Thread, MADWorld, Default = MADWorld }; +/// fence types +enum class Fence { + Global, //!< global fence (`world.gop.fence()`) + Local, //!< local fence (all local work done, equivalent to + //!< `world.taskq.fence() in absence of active 
messages) + No //!< no fence +}; + +namespace conversions { + +/// user defined conversions + +/// must define +/// \code +/// To operator()(From&& from); +/// \endcode +template +struct to; + +} // namespace conversions + +/// used to indicate that block tensor expression should preserve the underlying +/// tensor's trange lobound +struct preserve_lobound_t {}; + +/// used to tag block tensor expression methods that preserve the underlying +/// tensor's trange lobound +inline constexpr preserve_lobound_t preserve_lobound; + } // namespace TiledArray #ifndef TILEDARRAY_DISABLE_NAMESPACE_TA diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h deleted file mode 100644 index 9e221c42d7..0000000000 --- a/src/TiledArray/host/allocator.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2021 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * Jan 31, 2018 - * - */ - -#ifndef TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED -#define TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED - -#include - -#include -#include - -#include - -#include -#include - -namespace TiledArray { - -/// pooled, thread-safe allocator for host memory -template -class host_allocator_impl - : public umpire_allocator_impl> { - public: - using base_type = umpire_allocator_impl>; - using typename base_type::const_pointer; - using typename base_type::const_reference; - using typename base_type::pointer; - using typename base_type::reference; - using typename base_type::value_type; - - host_allocator_impl() noexcept - : base_type(&hostEnv::instance()->host_allocator()) {} - - template - host_allocator_impl(const host_allocator_impl& rhs) noexcept - : base_type(static_cast&>(rhs)) {} - - template - friend bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept; -}; // class host_allocator_impl - -template -bool operator==(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return lhs.umpire_allocator() == rhs.umpire_allocator(); -} - -template -bool operator!=(const host_allocator_impl& lhs, - const host_allocator_impl& rhs) noexcept { - return !(lhs == rhs); -} - -} // namespace TiledArray - -#endif // TILEDARRAY_HOST_ALLOCATOR_H___INCLUDED diff --git a/examples/cuda/cuda_librett.cpp b/src/TiledArray/host/env.cpp similarity index 60% rename from examples/cuda/cuda_librett.cpp rename to src/TiledArray/host/env.cpp index c513f41af1..16d3a71a50 100644 --- a/examples/cuda/cuda_librett.cpp +++ b/src/TiledArray/host/env.cpp @@ -1,6 +1,6 @@ /* * This file is a part of TiledArray. 
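Note: the is_dense trait introduced in fwd.h above lets generic code branch on an array's storage policy at compile time. A minimal usage sketch; stores_all_tiles is a hypothetical helper, and only the specializations shown in the diff are assumed:

#include <tiledarray.h>

// true for DensePolicy arrays (DenseShape), false for SparsePolicy arrays
// (SparseShape); evaluated entirely at compile time via the new trait
template <typename Array>
constexpr bool stores_all_tiles(const Array&) {
  return TiledArray::is_dense_v<Array>;
}
// e.g. stores_all_tiles(TArrayD{}) -> true, stores_all_tiles(TSpArrayD{}) -> false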
- * Copyright (C) 2018 Virginia Tech + * Copyright (C) 2021 Virginia Tech * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -15,33 +15,22 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . * - * Created by Chong Peng on 7/19/18. + * Chong Peng + * Department of Chemistry, Virginia Tech + * July 23, 2018 * */ -#include +#include -#ifdef TILEDARRAY_HAS_CUDA +namespace TiledArray { -#include -#include +namespace detail { -#include - -/** - * Test LibreTT - */ - -const std::size_t N = 100; -using namespace TiledArray; - -int main(int argc, char* argv[]) { - TA_SCOPED_INITIALIZE(argc, argv); - - std::vector extent{N, N}; - std::vector perm{1, 0}; - - return 0; +umpire::Allocator& get_host_allocator::operator()() { + return TiledArray::host::Env::instance()->host_allocator(); } -#endif +} // namespace detail + +} // namespace TiledArray diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h index 3feef3c4cc..b469704a72 100644 --- a/src/TiledArray/host/env.h +++ b/src/TiledArray/host/env.h @@ -41,37 +41,52 @@ namespace TiledArray { +namespace detail { + +struct get_host_allocator { + umpire::Allocator& operator()(); +}; + +} // namespace detail + +namespace host { + /** - * hostEnv maintains the (host-side, as opposed to device-side) environment, + * Env maintains the (host-side, as opposed to device-side) environment, * such as memory allocators * * \note this is a Singleton */ -class hostEnv { +class Env { public: - ~hostEnv() = default; + ~Env() = default; - hostEnv(const hostEnv&) = delete; - hostEnv(hostEnv&&) = delete; - hostEnv& operator=(const hostEnv&) = delete; - hostEnv& operator=(hostEnv&&) = delete; + Env(const Env&) = delete; + Env(Env&&) = delete; + Env& operator=(const Env&) = delete; + Env& operator=(Env&&) = delete; /// access the singleton instance; if not initialized will be - /// initialized via hostEnv::initialize() with the default params - static std::unique_ptr& instance() { + /// initialized via Env::initialize() with the default params + static std::unique_ptr& instance() { if (!instance_accessor()) { initialize(); } return instance_accessor(); } + // clang-format off /// initialize the instance using explicit params - /// \param max_memory_size max amount of memory (bytes) that TiledArray - /// can use for storage of TA::Tensor objects (these by default + /// \param world the world to use for initialization + /// \param host_alloc_limit the maximum total amount of memory (in bytes) that + /// allocator returned by `this->host_allocator()` can allocate; + /// this allocator is used by TiledArray for storage of TA::Tensor objects (these by default /// store DistArray tile data and (if sparse) shape [default=2^40] /// \param page_size memory added to the pool in chunks of at least /// this size (bytes) [default=2^25] - static void initialize(const std::uint64_t max_memory_size = (1ul << 40), + // clang-format on + static void initialize(World& world = TiledArray::get_default_world(), + const std::uint64_t host_alloc_limit = (1ul << 40), const std::uint64_t page_size = (1ul << 25)) { static std::mutex mtx; // to make initialize() reentrant std::scoped_lock lock{mtx}; @@ -92,14 +107,13 @@ class hostEnv { // use QuickPool for host memory allocation, with min grain of 1 page auto host_size_limited_alloc = rm.makeAllocator( - "SizeLimited_HOST", rm.getAllocator("HOST"), max_memory_size); + "SizeLimited_HOST", 
rm.getAllocator("HOST"), host_alloc_limit); auto host_dynamic_pool = rm.makeAllocator( "QuickPool_SizeLimited_HOST", host_size_limited_alloc, page_size, page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE); - auto host_env = std::unique_ptr( - new hostEnv(TiledArray::get_default_world(), host_dynamic_pool)); + auto host_env = std::unique_ptr(new Env(world, host_dynamic_pool)); instance_accessor() = std::move(host_env); } } @@ -109,7 +123,7 @@ class hostEnv { /// @return an Umpire allocator that allocates from a /// host memory pool /// @warning this is not a thread-safe allocator, should be only used when - /// wrapped into umpire_allocator_impl + /// wrapped into umpire_based_allocator_impl umpire::Allocator& host_allocator() { return host_allocator_; } // clang-format off @@ -126,7 +140,7 @@ class hostEnv { } protected: - hostEnv(World& world, umpire::Allocator host_alloc) + Env(World& world, umpire::Allocator host_alloc) : world_(&world), host_allocator_(host_alloc) {} private: @@ -134,15 +148,17 @@ class hostEnv { World* world_; // allocates from a dynamic, size-limited host memory pool - // N.B. not thread safe, so must be wrapped into umpire_allocator_impl + // N.B. not thread safe, so must be wrapped into umpire_based_allocator_impl umpire::Allocator host_allocator_; - inline static std::unique_ptr& instance_accessor() { - static std::unique_ptr instance_{nullptr}; + inline static std::unique_ptr& instance_accessor() { + static std::unique_ptr instance_{nullptr}; return instance_; } }; +} // namespace host + } // namespace TiledArray #endif // TILEDARRAY_HOST_ENV_H__INCLUDED diff --git a/src/TiledArray/initialize.h b/src/TiledArray/initialize.h index 324f772ccf..7d75d33c0d 100644 --- a/src/TiledArray/initialize.h +++ b/src/TiledArray/initialize.h @@ -17,6 +17,10 @@ bool initialized(); /// @return true if TiledArray has been finalized at least once bool finalized(); +/// @return true if TiledArray (and, necessarily, MADWorld runtime) was +/// initialized to be quiet +bool initialized_to_be_quiet(); + // clang-format off /// @name TiledArray initialization. 
/// These functions initialize TiledArray and (if needed) MADWorld diff --git a/src/TiledArray/math/linalg/basic.h b/src/TiledArray/math/linalg/basic.h index 9fec71f41e..2045c8a82c 100644 --- a/src/TiledArray/math/linalg/basic.h +++ b/src/TiledArray/math/linalg/basic.h @@ -79,14 +79,14 @@ template inline void vec_multiply(DistArray& a1, const DistArray& a2) { auto vars = TiledArray::detail::dummy_annotation(rank(a1)); - a1(vars) = a1(vars) * a2(vars); + a1.make_tsrexpr(vars) = a1.make_tsrexpr(vars) * a2.make_tsrexpr(vars); } template inline void scale(DistArray& a, S scaling_factor) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(a)); - a(vars) = numeric_type(scaling_factor) * a(vars); + a.make_tsrexpr(vars) = numeric_type(scaling_factor) * a.make_tsrexpr(vars); } template @@ -99,7 +99,8 @@ inline void axpy(DistArray& y, S alpha, const DistArray& x) { using numeric_type = typename DistArray::numeric_type; auto vars = TiledArray::detail::dummy_annotation(rank(y)); - y(vars) = y(vars) + numeric_type(alpha) * x(vars); + y.make_tsrexpr(vars) = + y.make_tsrexpr(vars) + numeric_type(alpha) * x.make_tsrexpr(vars); } /// selector for concat @@ -122,6 +123,8 @@ inline DistArray concat(const DistArray& a, case Concat::Both: return TiledArray::concat({a, b}, std::vector{true, true}); + default: + TA_EXCEPTION("Invalid Concat value"); } } @@ -135,7 +138,8 @@ using TiledArray::math::linalg::set_linalg_crossover_to_distributed; namespace Eigen { -// freestanding adaptors for Eigen::MatrixBase needed by solvers like DIIS +// freestanding adaptors for Eigen::MatrixBase and Eigen::Block +// needed by solvers like DIIS template inline void vec_multiply(Eigen::MatrixBase& a1, @@ -143,15 +147,39 @@ inline void vec_multiply(Eigen::MatrixBase& a1, a1.array() *= a2.array(); } +template +inline void vec_multiply( + Eigen::Block& a1, + const Eigen::Block& a2) { + a1.array() *= a2.array(); +} + template inline void scale(Eigen::MatrixBase& a, S scaling_factor) { using numeric_type = typename Eigen::MatrixBase::value_type; a.array() *= numeric_type(scaling_factor); } +template +inline void scale( + Eigen::Block& a, + S scaling_factor) { + using numeric_type = typename Eigen::Block::value_type; + a.array() *= numeric_type(scaling_factor); +} + template inline void zero(Eigen::MatrixBase& a) { - a = Derived::Zero(a.rows(), a.cols()); + a.fill(0); +} + +template +inline void zero( + Eigen::Block& a) { + a.fill(0); } template @@ -161,23 +189,56 @@ inline void axpy(Eigen::MatrixBase& y, S alpha, y.array() += numeric_type(alpha) * x.array(); } +template +inline void axpy( + Eigen::Block& y, S alpha, + const Eigen::Block& x) { + using numeric_type = typename Eigen::Block::value_type; + y.array() += numeric_type(alpha) * x.array(); +} + template inline auto dot(const Eigen::MatrixBase& l, const Eigen::MatrixBase& r) { return l.adjoint().dot(r); } +template +inline auto dot( + const Eigen::Block& l, + const Eigen::Block& r) { + return l.adjoint().dot(r); +} + template inline auto inner_product(const Eigen::MatrixBase& l, const Eigen::MatrixBase& r) { return l.dot(r); } +template +inline auto inner_product( + const Eigen::Block& l, + const Eigen::Block& r) { + return l.dot(r); +} + template inline auto norm2(const Eigen::MatrixBase& m) { return m.template lpNorm<2>(); } +template +inline auto norm2( + const Eigen::Block& m) { + return m.template lpNorm<2>(); +} + } // namespace Eigen #ifndef TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG @@ -198,7 +259,7 @@ inline auto 
norm2(const Eigen::MatrixBase& m) { TiledArray::math::linalg::detail::prefer_distributed(MATRIX)) \ return TiledArray::math::linalg::ttg::FN; \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #elif !TILEDARRAY_HAS_TTG && TILEDARRAY_HAS_SCALAPACK #define TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG(FN, MATRIX) \ @@ -215,7 +276,7 @@ inline auto norm2(const Eigen::MatrixBase& m) { if (get_linalg_backend() == LinearAlgebraBackend::TTG) \ TA_EXCEPTION("TTG linear algebra backend is not available"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #endif // !TILEDARRAY_HAS_TTG && !TILEDARRAY_HAS_SCALAPACK #endif // defined(TILEDARRAY_MATH_LINALG_DISPATCH_W_TTG) @@ -242,7 +303,7 @@ inline auto norm2(const Eigen::MatrixBase& m) { TA_EXCEPTION(TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG_STRINGIFY( \ FN) " is not provided by the TTG backend"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #elif !TILEDARRAY_HAS_TTG && TILEDARRAY_HAS_SCALAPACK #define TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG(FN, MATRIX) \ @@ -259,7 +320,7 @@ inline auto norm2(const Eigen::MatrixBase& m) { if (get_linalg_backend() == LinearAlgebraBackend::TTG) \ TA_EXCEPTION("TTG linear algebra backend is not available"); \ if (get_linalg_backend() == LinearAlgebraBackend::ScaLAPACK) \ - TA_EXCEPTION("ScaLAPACK lineear algebra backend is not available"); \ + TA_EXCEPTION("ScaLAPACK linear algebra backend is not available"); \ return non_distributed::FN; #endif // !TILEDARRAY_HAS_TTG && !TILEDARRAY_HAS_SCALAPACK #endif // defined(TILEDARRAY_MATH_LINALG_DISPATCH_WO_TTG) diff --git a/src/TiledArray/math/linalg/non-distributed/cholesky.h b/src/TiledArray/math/linalg/non-distributed/cholesky.h index 4196002533..fc96a6bf1c 100644 --- a/src/TiledArray/math/linalg/non-distributed/cholesky.h +++ b/src/TiledArray/math/linalg/non-distributed/cholesky.h @@ -42,9 +42,7 @@ auto rank_local_cholesky(const DistArray& A) { World& world = A.world(); auto A_eig = detail::make_matrix(A); - if (world.rank() == 0) { - linalg::rank_local::cholesky(A_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky, world, A_eig); world.gop.broadcast_serializable(A_eig, 0); return A_eig; } @@ -140,11 +138,20 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = TiledRange()) { // if need to return L use its copy to compute inverse decltype(L_eig) L_inv_eig; + std::optional error_opt; if (world.rank() == 0) { - if (Both) L_inv_eig = L_eig; - auto& L_inv_eig_ref = Both ? L_inv_eig : L_eig; - linalg::rank_local::cholesky_linv(L_inv_eig_ref); - detail::zero_out_upper_triangle(L_inv_eig_ref); + try { + if (Both) L_inv_eig = L_eig; + auto& L_inv_eig_ref = Both ? L_inv_eig : L_eig; + linalg::rank_local::cholesky_linv(L_inv_eig_ref); + detail::zero_out_upper_triangle(L_inv_eig_ref); + } catch (lapack::Error& err) { + error_opt = err; + } + } + world.gop.broadcast_serializable(error_opt, 0); + if (error_opt) { + throw error_opt.value(); } world.gop.broadcast_serializable(Both ? 
L_inv_eig : L_eig, 0); @@ -169,9 +176,7 @@ auto cholesky_solve(const Array& A, const Array& B, auto A_eig = detail::make_matrix(A); auto X_eig = detail::make_matrix(B); World& world = A.world(); - if (world.rank() == 0) { - linalg::rank_local::cholesky_solve(A_eig, X_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky_solve, world, A_eig, X_eig); world.gop.broadcast_serializable(X_eig, 0); if (x_trange.rank() == 0) x_trange = B.trange(); return eigen_to_array(world, x_trange, X_eig); @@ -192,9 +197,7 @@ auto cholesky_lsolve(Op transpose, const Array& A, const Array& B, "scalar types"); auto X_eig = detail::make_matrix(B); - if (world.rank() == 0) { - linalg::rank_local::cholesky_lsolve(transpose, L_eig, X_eig); - } + TA_LAPACK_ON_RANK_ZERO(cholesky_lsolve, world, transpose, L_eig, X_eig); world.gop.broadcast_serializable(X_eig, 0); if (l_trange.rank() == 0) l_trange = A.trange(); if (x_trange.rank() == 0) x_trange = B.trange(); diff --git a/src/TiledArray/math/linalg/non-distributed/heig.h b/src/TiledArray/math/linalg/non-distributed/heig.h index 8a7c244bbc..85079f356c 100644 --- a/src/TiledArray/math/linalg/non-distributed/heig.h +++ b/src/TiledArray/math/linalg/non-distributed/heig.h @@ -52,13 +52,11 @@ namespace TiledArray::math::linalg::non_distributed { */ template auto heig(const Array& A, TiledRange evec_trange = TiledRange()) { - using numeric_type = typename detail::array_traits::numeric_type; + using scalar_type = typename detail::array_traits::scalar_type; World& world = A.world(); auto A_eig = detail::make_matrix(A); - std::vector evals; - if (world.rank() == 0) { - linalg::rank_local::heig(A_eig, evals); - } + std::vector evals; + TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, evals); world.gop.broadcast_serializable(A_eig, 0); world.gop.broadcast_serializable(evals, 0); if (evec_trange.rank() == 0) evec_trange = A.trange(); @@ -93,15 +91,13 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange()) { template auto heig(const ArrayA& A, const ArrayB& B, TiledRange evec_trange = TiledRange()) { - using numeric_type = typename detail::array_traits::numeric_type; + using scalar_type = typename detail::array_traits::scalar_type; (void)detail::array_traits{}; World& world = A.world(); auto A_eig = detail::make_matrix(A); auto B_eig = detail::make_matrix(B); - std::vector evals; - if (world.rank() == 0) { - linalg::rank_local::heig(A_eig, B_eig, evals); - } + std::vector evals; + TA_LAPACK_ON_RANK_ZERO(heig, world, A_eig, B_eig, evals); world.gop.broadcast_serializable(A_eig, 0); world.gop.broadcast_serializable(evals, 0); if (evec_trange.rank() == 0) evec_trange = A.trange(); diff --git a/src/TiledArray/math/linalg/non-distributed/lu.h b/src/TiledArray/math/linalg/non-distributed/lu.h index d1b06bbb1c..6a3e1ea424 100644 --- a/src/TiledArray/math/linalg/non-distributed/lu.h +++ b/src/TiledArray/math/linalg/non-distributed/lu.h @@ -27,9 +27,9 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { @@ -37,15 +37,14 @@ namespace TiledArray::math::linalg::non_distributed { * @brief Solve a linear system via LU factorization */ template -auto lu_solve(const ArrayA& A, const ArrayB& B, TiledRange x_trange = TiledRange()) { +auto lu_solve(const ArrayA& A, const ArrayB& B, + TiledRange x_trange = TiledRange()) { (void)detail::array_traits{}; (void)detail::array_traits{}; auto& world = A.world(); auto A_eig = detail::make_matrix(A); auto B_eig = detail::make_matrix(B); - if (world.rank() == 0) { - linalg::rank_local::lu_solve(A_eig, 
B_eig); - } + TA_LAPACK_ON_RANK_ZERO(lu_solve, world, A_eig, B_eig); world.gop.broadcast_serializable(B_eig, 0); if (x_trange.rank() == 0) x_trange = B.trange(); return eigen_to_array(world, x_trange, B_eig); @@ -59,14 +58,12 @@ auto lu_inv(const Array& A, TiledRange ainv_trange = TiledRange()) { (void)detail::array_traits{}; auto& world = A.world(); auto A_eig = detail::make_matrix(A); - if (world.rank() == 0) { - linalg::rank_local::lu_inv(A_eig); - } + TA_LAPACK_ON_RANK_ZERO(lu_inv, world, A_eig); world.gop.broadcast_serializable(A_eig, 0); if (ainv_trange.rank() == 0) ainv_trange = A.trange(); return eigen_to_array(A.world(), ainv_trange, A_eig); } -} // namespace TiledArray::math::linalg::lapack +} // namespace TiledArray::math::linalg::non_distributed #endif // TILEDARRAY_MATH_LINALG_NON_DISTRIBUTED_LU_H__INCLUDED diff --git a/src/TiledArray/math/linalg/non-distributed/qr.h b/src/TiledArray/math/linalg/non-distributed/qr.h index e43cec632d..b66ee222ea 100644 --- a/src/TiledArray/math/linalg/non-distributed/qr.h +++ b/src/TiledArray/math/linalg/non-distributed/qr.h @@ -3,35 +3,32 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { template -auto householder_qr( const ArrayV& V, TiledRange q_trange = TiledRange(), - TiledRange r_trange = TiledRange() ) { - +auto householder_qr(const ArrayV& V, TiledRange q_trange = TiledRange(), + TiledRange r_trange = TiledRange()) { (void)detail::array_traits{}; auto& world = V.world(); auto V_eig = detail::make_matrix(V); decltype(V_eig) R_eig; - if( !world.rank() ) { - linalg::rank_local::householder_qr( V_eig, R_eig ); - } - world.gop.broadcast_serializable( V_eig, 0 ); - if(q_trange.rank() == 0) q_trange = V.trange(); - auto Q = eigen_to_array( world, q_trange, V_eig ); + TA_LAPACK_ON_RANK_ZERO(householder_qr, world, V_eig, R_eig); + world.gop.broadcast_serializable(V_eig, 0); + if (q_trange.rank() == 0) q_trange = V.trange(); + auto Q = eigen_to_array(world, q_trange, V_eig); if constexpr (not QOnly) { - world.gop.broadcast_serializable( R_eig, 0 ); + world.gop.broadcast_serializable(R_eig, 0); if (r_trange.rank() == 0) { // Generate a TRange based on column tiling of V auto col_tiling = V.trange().dim(1); - r_trange = TiledRange( {col_tiling, col_tiling} ); + r_trange = TiledRange({col_tiling, col_tiling}); } - auto R = eigen_to_array( world, r_trange, R_eig ); - return std::make_tuple( Q, R ); + auto R = eigen_to_array(world, r_trange, R_eig); + return std::make_tuple(Q, R); } else { return Q; } diff --git a/src/TiledArray/math/linalg/non-distributed/svd.h b/src/TiledArray/math/linalg/non-distributed/svd.h index 9c146784ef..3e3608240e 100644 --- a/src/TiledArray/math/linalg/non-distributed/svd.h +++ b/src/TiledArray/math/linalg/non-distributed/svd.h @@ -27,23 +27,23 @@ #include -#include -#include #include +#include +#include namespace TiledArray::math::linalg::non_distributed { /** - * @brief Compute the singular value decomposition (SVD) via ScaLAPACK + * @brief Compute the singular value decomposition (SVD) via LAPACK * * A(i,j) = S(k) U(i,k) conj(V(j,k)) * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) + * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) 
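Note: cholesky*, heig, lu_solve, lu_inv, householder_qr, and svd now all funnel through the same idiom, which the new TA_LAPACK_ON_RANK_ZERO macro centralizes: run the serial LAPACK kernel on rank 0 only, then replicate the result to every rank (the macro additionally broadcasts and rethrows any lapack::Error, while the result broadcast stays at the call site). A condensed sketch of the happy path; the helper name is hypothetical and error forwarding is omitted:

#include <madness/world/MADworld.h>

// run `kernel` on rank 0, then ship rank 0's result to all ranks;
// this must be called collectively or the broadcast deadlocks
template <typename Matrix, typename Kernel>
void on_rank_zero_then_bcast(madness::World& world, Matrix& m,
                             Kernel&& kernel) {
  if (world.rank() == 0) kernel(m);        // serial LAPACK call
  world.gop.broadcast_serializable(m, 0);  // collective: everyone receives
}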
* * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * @@ -52,13 +52,14 @@ namespace TiledArray::math::linalg::non_distributed { * @param[in] vt_trange TiledRange for resulting right singular vectors * (transposed). * - * @returns A tuple containing the eigenvalues and eigenvectors of input array - * as std::vector and in TA format, respectively. + * @returns A tuple containing the singular values and singular vectors of + * input array as std::vector and in TA format, respectively. */ -template -auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trange = TiledRange()) { - +template +auto svd(const Array& A, TiledRange u_trange = TiledRange(), + TiledRange vt_trange = TiledRange()) { using T = typename Array::numeric_type; + using TS = typename Array::scalar_type; using Matrix = linalg::rank_local::Matrix; World& world = A.world(); @@ -68,21 +69,19 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trang constexpr bool need_u = (Vectors == SVD::LeftVectors) or svd_all_vectors; constexpr bool need_vt = (Vectors == SVD::RightVectors) or svd_all_vectors; - std::vector S; + std::vector S; std::unique_ptr U, VT; if constexpr (need_u) U = std::make_unique(); if constexpr (need_vt) VT = std::make_unique(); - if (world.rank() == 0) { - linalg::rank_local::svd(A_eig, S, U.get(), VT.get()); - } + TA_LAPACK_ON_RANK_ZERO(svd, world, A_eig, S, U.get(), VT.get()); world.gop.broadcast_serializable(S, 0); if (U) world.gop.broadcast_serializable(*U, 0); if (VT) world.gop.broadcast_serializable(*VT, 0); - auto make_array = [&world](auto && ... args) { + auto make_array = [&world](auto&&... args) { return eigen_to_array(world, args...); }; @@ -97,7 +96,6 @@ auto svd(const Array& A, TiledRange u_trange = TiledRange(), TiledRange vt_trang } if constexpr (!need_u && !need_vt) return S; - } } // namespace TiledArray::math::linalg::non_distributed diff --git a/src/TiledArray/math/linalg/rank-local.cpp b/src/TiledArray/math/linalg/rank-local.cpp index a1e2e5538b..6db050ee5c 100644 --- a/src/TiledArray/math/linalg/rank-local.cpp +++ b/src/TiledArray/math/linalg/rank-local.cpp @@ -40,7 +40,7 @@ inline int ta_lapack_fortran_call(F f, Args... args) { return info; } -#define TA_LAPACK_ERROR(F) throw std::runtime_error("lapack::" #F " failed") +#define TA_LAPACK_ERROR(F) throw lapack::Error("lapack::" #F " failed") #define TA_LAPACK_FORTRAN_CALL(F, ARGS...) 
\ ((ta_lapack_fortran_call(F, ARGS) == 0) || (TA_LAPACK_ERROR(F), 0)) @@ -113,19 +113,24 @@ void cholesky_lsolve(Op transpose, Matrix& A, Matrix& X) { } template -void heig(Matrix& A, std::vector& W) { +void heig(Matrix& A, std::vector>& W) { auto jobz = lapack::Job::Vec; auto uplo = lapack::Uplo::Lower; integer n = A.rows(); T* a = A.data(); integer lda = A.rows(); W.resize(n); - T* w = W.data(); - TA_LAPACK(syev, jobz, uplo, n, a, lda, w); + auto* w = W.data(); + if (n == 0) return; + if constexpr (TiledArray::detail::is_complex_v) + TA_LAPACK(heev, jobz, uplo, n, a, lda, w); + else + TA_LAPACK(syev, jobz, uplo, n, a, lda, w); } template -void heig(Matrix& A, Matrix& B, std::vector& W) { +void heig(Matrix& A, Matrix& B, + std::vector>& W) { integer itype = 1; auto jobz = lapack::Job::Vec; auto uplo = lapack::Uplo::Lower; @@ -135,12 +140,18 @@ void heig(Matrix& A, Matrix& B, std::vector& W) { T* b = B.data(); integer ldb = B.rows(); W.resize(n); - T* w = W.data(); - TA_LAPACK(sygv, itype, jobz, uplo, n, a, lda, b, ldb, w); + auto* w = W.data(); + if (n == 0) return; + if constexpr (TiledArray::detail::is_complex_v) + TA_LAPACK(hegv, itype, jobz, uplo, n, a, lda, b, ldb, w); + else + TA_LAPACK(sygv, itype, jobz, uplo, n, a, lda, b, ldb, w); } template -void svd(Job jobu, Job jobvt, Matrix& A, std::vector& S, Matrix* U, Matrix* VT) { +void svd(Job jobu, Job jobvt, Matrix& A, + std::vector>& S, Matrix* U, + Matrix* VT) { integer m = A.rows(); integer n = A.cols(); integer k = std::min(m, n); @@ -148,40 +159,42 @@ void svd(Job jobu, Job jobvt, Matrix& A, std::vector& S, Matrix* U, Mat integer lda = A.rows(); S.resize(k); - T* s = S.data(); + auto* s = S.data(); - T* u = nullptr; + T* u = nullptr; T* vt = nullptr; integer ldu = 1, ldvt = 1; - if( (jobu == Job::SomeVec or jobu == Job::AllVec) and (not U) ) - TA_LAPACK_ERROR("Requested out-of-place right singular vectors with null U input"); - if( (jobvt == Job::SomeVec or jobvt == Job::AllVec) and (not VT) ) - TA_LAPACK_ERROR("Requested out-of-place left singular vectors with null VT input"); + if ((jobu == Job::SomeVec or jobu == Job::AllVec) and (not U)) + TA_LAPACK_ERROR( + "Requested out-of-place right singular vectors with null U input"); + if ((jobvt == Job::SomeVec or jobvt == Job::AllVec) and (not VT)) + TA_LAPACK_ERROR( + "Requested out-of-place left singular vectors with null VT input"); - if( jobu == Job::SomeVec ) { + if (jobu == Job::SomeVec) { U->resize(m, k); u = U->data(); ldu = m; } - if( jobu == Job::AllVec ) { + if (jobu == Job::AllVec) { U->resize(m, m); u = U->data(); ldu = m; } - if( jobvt == Job::SomeVec ) { + if (jobvt == Job::SomeVec) { VT->resize(k, n); vt = VT->data(); ldvt = k; } - if( jobvt == Job::AllVec ) { + if (jobvt == Job::AllVec) { VT->resize(n, n); vt = VT->data(); ldvt = n; } - + TA_LAPACK(gesvd, jobu, jobvt, m, n, a, lda, s, u, ldu, vt, ldvt); } @@ -208,47 +221,51 @@ void lu_inv(Matrix& A) { } template -void householder_qr( Matrix &V, Matrix &R ) { +void householder_qr(Matrix& V, Matrix& R) { integer m = V.rows(); integer n = V.cols(); - integer k = std::min(m,n); - integer ldv = V.rows(); // Col Major + integer k = std::min(m, n); + integer ldv = V.rows(); // Col Major T* v = V.data(); std::vector tau(k); - lapack::geqrf( m, n, v, ldv, tau.data() ); + lapack::geqrf(m, n, v, ldv, tau.data()); // Extract R - if constexpr ( not QOnly ) { + if constexpr (not QOnly) { // Resize R just in case - R.resize(k,n); + R.resize(k, n); R.fill(0.); // Extract Upper triangle into R integer ldr = R.rows(); T* r = 
R.data(); - lapack::lacpy( lapack::MatrixType::Upper, k, n, v, ldv, r, ldr ); + lapack::lacpy(lapack::MatrixType::Upper, k, n, v, ldv, r, ldr); } // Explicitly form Q // TODO: This is wrong for complex, but it doesn't look like R/C is caught // anywhere else either... - lapack::orgqr( m, n, k, v, ldv, tau.data() ); - + if constexpr (TiledArray::detail::is_complex_v) + lapack::ungqr(m, n, k, v, ldv, tau.data()); + else + lapack::orgqr(m, n, k, v, ldv, tau.data()); } -#define TA_LAPACK_EXPLICIT(MATRIX, VECTOR) \ - template void cholesky(MATRIX&); \ - template void cholesky_linv(MATRIX&); \ - template void cholesky_solve(MATRIX&, MATRIX&); \ - template void cholesky_lsolve(Op, MATRIX&, MATRIX&); \ - template void heig(MATRIX&, VECTOR&); \ - template void heig(MATRIX&, MATRIX&, VECTOR&); \ - template void svd(Job,Job,MATRIX&, VECTOR&, MATRIX*, MATRIX*); \ - template void lu_solve(MATRIX&, MATRIX&); \ - template void lu_inv(MATRIX&); \ - template void householder_qr(MATRIX&,MATRIX&); \ - template void householder_qr(MATRIX&,MATRIX&); +#define TA_LAPACK_EXPLICIT(MATRIX, VECTOR) \ + template void cholesky(MATRIX&); \ + template void cholesky_linv(MATRIX&); \ + template void cholesky_solve(MATRIX&, MATRIX&); \ + template void cholesky_lsolve(Op, MATRIX&, MATRIX&); \ + template void heig(MATRIX&, VECTOR&); \ + template void heig(MATRIX&, MATRIX&, VECTOR&); \ + template void svd(Job, Job, MATRIX&, VECTOR&, MATRIX*, MATRIX*); \ + template void lu_solve(MATRIX&, MATRIX&); \ + template void lu_inv(MATRIX&); \ + template void householder_qr(MATRIX&, MATRIX&); \ + template void householder_qr(MATRIX&, MATRIX&); TA_LAPACK_EXPLICIT(Matrix, std::vector); TA_LAPACK_EXPLICIT(Matrix, std::vector); +TA_LAPACK_EXPLICIT(Matrix>, std::vector); +TA_LAPACK_EXPLICIT(Matrix>, std::vector); } // namespace TiledArray::math::linalg::rank_local diff --git a/src/TiledArray/math/linalg/rank-local.h b/src/TiledArray/math/linalg/rank-local.h index 77774c195a..625807663a 100644 --- a/src/TiledArray/math/linalg/rank-local.h +++ b/src/TiledArray/math/linalg/rank-local.h @@ -42,19 +42,21 @@ template void cholesky_lsolve(Op transpose, Matrix &A, Matrix &X); template -void heig(Matrix &A, std::vector &W); +void heig(Matrix &A, std::vector> &W); template -void heig(Matrix &A, Matrix &B, std::vector &W); +void heig(Matrix &A, Matrix &B, + std::vector> &W); template -void svd(Job jobu, Job jobvt, Matrix &A, std::vector &S, Matrix *U, +void svd(Job jobu, Job jobvt, Matrix &A, + std::vector> &S, Matrix *U, Matrix *VT); template -void svd(Matrix &A, std::vector &S, Matrix *U, Matrix *VT) { - svd(U ? Job::SomeVec : Job::NoVec, VT ? Job::SomeVec : Job::NoVec, A, S, U, - VT); +void svd(Matrix &A, std::vector> &S, + Matrix *U, Matrix *VT) { + svd(U ? Job::AllVec : Job::NoVec, VT ? Job::AllVec : Job::NoVec, A, S, U, VT); } template @@ -68,4 +70,42 @@ void householder_qr(Matrix &V, Matrix &R); } // namespace TiledArray::math::linalg::rank_local +namespace madness::archive { + +/// Serialize (deserialize) a lapack::Error + +/// \tparam Archive The archive type. +template +struct ArchiveSerializeImpl { + static inline void serialize(const Archive &ar, lapack::Error &e) { + MAD_ARCHIVE_DEBUG(std::cout << "(de)serialize lapack::Error" << std::endl); + if constexpr (is_output_archive_v) { // serialize + const std::string msg = e.what(); + ar & msg; + } else { + std::string msg; + ar & msg; + e = lapack::Error(msg); + } + } +}; + +} // namespace madness::archive + +/// TA_LAPACK_ON_RANK_ZERO(fn, world, args...) invokes linalg::rank_local::fn(args...)
+/// on rank 0 and broadcasts/rethrows the exception, if any +#define TA_LAPACK_ON_RANK_ZERO(fn, world, args...) \ + std::optional error_opt; \ + if (world.rank() == 0) { \ + try { \ + linalg::rank_local::fn(args); \ + } catch (lapack::Error & err) { \ + error_opt = err; \ + } \ + } \ + world.gop.broadcast_serializable(error_opt, 0); \ + if (error_opt) { \ + throw error_opt.value(); \ + } + #endif // TILEDARRAY_MATH_LINALG_RANK_LOCAL_H__INCLUDED diff --git a/src/TiledArray/math/linalg/scalapack/block_cyclic.h b/src/TiledArray/math/linalg/scalapack/block_cyclic.h index 902312788b..f47cbc3cb9 100644 --- a/src/TiledArray/math/linalg/scalapack/block_cyclic.h +++ b/src/TiledArray/math/linalg/scalapack/block_cyclic.h @@ -133,7 +133,7 @@ class BlockCyclicMatrix : public madness::WorldObject> { template >> - Tile extract_submatrix(std::vector lo, std::vector up) { + Tile extract_submatrix(std::array lo, std::array up) { assert(bc_dist_.i_own(lo[0], lo[1])); auto [i_st, j_st] = bc_dist_.local_indx(lo[0], lo[1]); @@ -247,8 +247,10 @@ class BlockCyclicMatrix : public madness::WorldObject> { const auto j_block_end = std::min(n, j_block_begin + nb); // Cut block if necessary to adhere to tile dimensions - const auto i_last = std::min(i_block_end, static_cast(up[0])); - const auto j_last = std::min(j_block_end, static_cast(up[1])); + const auto i_last = + std::min(i_block_end, static_cast(up[0])); + const auto j_last = + std::min(j_block_end, static_cast(up[1])); // Calculate extents of the block to be copied i_extent = i_last - i; @@ -263,22 +265,25 @@ class BlockCyclicMatrix : public madness::WorldObject> { local_mat_.block(i_local, j_local, i_extent, j_extent); } else { - std::vector lo{i, j}; - std::vector up{i_last, j_last}; + std::array lo{i, j}; + std::array up{i_last, j_last}; + // N.B. send instead of task guarantees progress madness::Future> remtile_fut = world_base_t::send( owner(i, j), &BlockCyclicMatrix::template extract_submatrix>, lo, up); + // N.B. 
Future::get(dowork=false) since calling from within a task + // and PaRSEC gets sad otherwise if constexpr (TiledArray::detail::is_ta_tensor_v) - tile.block(lo, up) = remtile_fut.get(); + tile.block(lo, up) = remtile_fut.get(/* dowork = */ false); else { auto tile_blk_range = TiledArray::BlockRange( TiledArray::detail::make_ta_range(tile.range()), lo, up); using std::data; auto tile_blk_view = TiledArray::make_map(data(tile), tile_blk_range); - tile_blk_view = remtile_fut.get(); + tile_blk_view = remtile_fut.get(/* dowork = */ false); } } } diff --git a/src/TiledArray/math/linalg/scalapack/heig.h b/src/TiledArray/math/linalg/scalapack/heig.h index bc9edeaa91..d7e84ae706 100644 --- a/src/TiledArray/math/linalg/scalapack/heig.h +++ b/src/TiledArray/math/linalg/scalapack/heig.h @@ -58,7 +58,7 @@ namespace TiledArray::math::linalg::scalapack { template auto heig(const Array& A, TiledRange evec_trange = TiledRange(), size_t NB = default_block_size()) { - using value_type = typename Array::element_type; + using value_type = typename Array::numeric_type; using real_type = scalapackpp::detail::real_t; auto& world = A.world(); @@ -80,9 +80,8 @@ auto heig(const Array& A, TiledRange evec_trange = TiledRange(), scalapack::BlockCyclicMatrix evecs(world, grid, N, N, NB, NB); auto info = scalapackpp::hereig( - scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, - matrix.local_mat().data(), 1, 1, desc, evals.data(), - evecs.local_mat().data(), 1, 1, desc); + scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, matrix.local_mat().data(), + 1, 1, desc, evals.data(), evecs.local_mat().data(), 1, 1, desc); if (info) TA_EXCEPTION("EVP Failed"); if (evec_trange.rank() == 0) evec_trange = A.trange(); @@ -122,8 +121,8 @@ template auto heig(const ArrayA& A, const ArrayB& B, TiledRange evec_trange = TiledRange(), size_t NB = default_block_size()) { - using value_type = typename ArrayA::element_type; - static_assert(std::is_same_v); + using value_type = typename ArrayA::numeric_type; + static_assert(std::is_same_v); using real_type = scalapackpp::detail::real_t; auto& world = A.world(); @@ -150,9 +149,9 @@ auto heig(const ArrayA& A, const ArrayB& B, scalapack::BlockCyclicMatrix evecs(world, grid, N, N, NB, NB); auto info = scalapackpp::hereig_gen( - scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, - A_sca.local_mat().data(), 1, 1, desc, B_sca.local_mat().data(), 1, 1, - desc, evals.data(), evecs.local_mat().data(), 1, 1, desc); + scalapackpp::Job::Vec, blacspp::Uplo::Lower, N, A_sca.local_mat().data(), + 1, 1, desc, B_sca.local_mat().data(), 1, 1, desc, evals.data(), + evecs.local_mat().data(), 1, 1, desc); if (info) TA_EXCEPTION("EVP Failed"); if (evec_trange.rank() == 0) evec_trange = A.trange(); diff --git a/src/TiledArray/math/linalg/scalapack/svd.h b/src/TiledArray/math/linalg/scalapack/svd.h index dc68d374c5..aa9f459ba9 100644 --- a/src/TiledArray/math/linalg/scalapack/svd.h +++ b/src/TiledArray/math/linalg/scalapack/svd.h @@ -42,10 +42,10 @@ namespace TiledArray::math::linalg::scalapack { * * Example Usage: * - * auto S = svd (A, ...) - * auto [S, U] = svd (A, ...) - * auto [S, VT] = svd(A, ...) - * auto [S, U, VT] = svd (A, ...) + * auto S = svd (A, ...) + * auto [S, U] = svd (A, ...) + * auto [S, VT] = svd(A, ...) + * auto [S, U, VT] = svd (A, ...) 
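Note: the two N.B. comments in block_cyclic.h above encode a progress rule worth spelling out: remote tiles are requested via send() (which guarantees progress) and fetched with Future::get(dowork = false), because the default dowork = true lets a blocked get() execute other queued tasks, re-entering the scheduler from inside a task, which the PaRSEC backend does not tolerate. A hedged illustration:

#include <madness/world/MADworld.h>

int fetch_passively(madness::World& world) {
  madness::Future<int> f = world.taskq.add([] { return 42; });
  // in the diff this pattern appears inside a task body; wait passively,
  // do not steal and run other queued tasks while blocked
  return f.get(/* dowork = */ false);
}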
* * @tparam Array Input array type, must be convertible to BlockCyclicMatrix * diff --git a/src/TiledArray/math/linalg/ttg/cholesky.h b/src/TiledArray/math/linalg/ttg/cholesky.h index 66a67a8034..0017d1ae1e 100644 --- a/src/TiledArray/math/linalg/ttg/cholesky.h +++ b/src/TiledArray/math/linalg/ttg/cholesky.h @@ -86,7 +86,7 @@ auto cholesky(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(potrf_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); @@ -175,7 +175,7 @@ auto cholesky_linv(const Array& A, TiledRange l_trange = {}, [[maybe_unused]] auto connected = make_graph_executable(trtri_ttg.get()); // uncomment to trace - ::ttg::trace_on(); + //::ttg::trace_on(); // start ::ttg::execute(); diff --git a/src/TiledArray/math/solvers/conjgrad.h b/src/TiledArray/math/solvers/conjgrad.h index 91992cf7de..cacfd55d63 100644 --- a/src/TiledArray/math/solvers/conjgrad.h +++ b/src/TiledArray/math/solvers/conjgrad.h @@ -60,7 +60,7 @@ namespace TiledArray::math { // clang-format on template struct ConjugateGradientSolver { - typedef typename D::element_type value_type; + typedef typename D::numeric_type value_type; /// \param a object of type F /// \param b RHS diff --git a/src/TiledArray/math/solvers/cp/cp.h b/src/TiledArray/math/solvers/cp/cp.h index 8c82485211..9065776211 100644 --- a/src/TiledArray/math/solvers/cp/cp.h +++ b/src/TiledArray/math/solvers/cp/cp.h @@ -28,7 +28,6 @@ #include #include -#include namespace TiledArray::math::cp { diff --git a/src/TiledArray/math/solvers/cp/cp_reconstruct.h b/src/TiledArray/math/solvers/cp/cp_reconstruct.h index b09165d335..283f96bb76 100644 --- a/src/TiledArray/math/solvers/cp/cp_reconstruct.h +++ b/src/TiledArray/math/solvers/cp/cp_reconstruct.h @@ -29,7 +29,6 @@ #include #include #include -#include namespace TiledArray::math::cp { diff --git a/src/TiledArray/math/solvers/diis.h b/src/TiledArray/math/solvers/diis.h index 252d40480b..1407ff327e 100644 --- a/src/TiledArray/math/solvers/diis.h +++ b/src/TiledArray/math/solvers/diis.h @@ -82,7 +82,7 @@ namespace TiledArray::math { template class DIIS { public: - typedef typename D::element_type value_type; + typedef typename D::numeric_type value_type; typedef typename TiledArray::detail::scalar_t scalar_type; typedef Eigen::Matrix diff --git a/src/TiledArray/meta.h b/src/TiledArray/meta.h index 9dc4ac9f55..18a1bf69f9 100644 --- a/src/TiledArray/meta.h +++ b/src/TiledArray/meta.h @@ -1,70 +1,9 @@ -/* - * This file is a part of TiledArray. - * Copyright (C) 2017 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
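Note: the ConjugateGradientSolver and DIIS changes above (element_type -> numeric_type) mirror the earlier heig/svd changes: for arrays of nested tiles the element is itself a tensor, whereas numeric_type recurses down to the scalar the solver arithmetic needs. A sketch of the distinction, assuming TiledArray's usual trait behavior for tensor-of-tensors arrays:

#include <tiledarray.h>
#include <type_traits>

// hypothetical tensor-of-tensors array type, used only for illustration
using ToT =
    TiledArray::DistArray<TiledArray::Tensor<TiledArray::Tensor<double>>,
                          TiledArray::SparsePolicy>;
// ToT::element_type is the tile's element, here itself a Tensor<double>,
// so it cannot serve as a solver scalar; numeric_type recurses to the number:
static_assert(std::is_same_v<ToT::numeric_type, double>);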
- * - * Eduard Valeyev - * Department of Chemistry, Virginia Tech - * - * meta.h - * April 11, 2017 - * - */ - #ifndef SRC_TILEDARRAY_META_H_ #define SRC_TILEDARRAY_META_H_ -#include -#include -#include -#include - -namespace TiledArray { -namespace meta { - -/// ||'s bools -template -struct or_reduce { - static constexpr bool value = head || or_reduce::value; -}; - -template -struct or_reduce { - static constexpr bool value = b; -}; - -// is any argument a Future? -// - yes: async launch -// - no: direct launch -template -auto invoke(Function&& fn, Args&&... args) -> typename std::enable_if< - !or_reduce>::value...>::value, - decltype(fn(args...))>::type { - return fn(std::forward(args)...); -} - -template < - typename Function, typename... Args, - typename = typename std::enable_if>::value...>::value>::type> -auto invoke(Function&& fn, Args&&... args) { - return TiledArray::get_default_world().taskq.add(std::forward(fn), - std::forward(args)...); -} +#pragma message( \ + "Header `TiledArray/meta.h` is deprecated, use `TiledArray/util/invoke.h` instead.") -} // namespace meta -} // namespace TiledArray +#include #endif // SRC_TILEDARRAY_META_H_ diff --git a/src/TiledArray/permutation.h b/src/TiledArray/permutation.h index cd527dfeef..d70b283034 100644 --- a/src/TiledArray/permutation.h +++ b/src/TiledArray/permutation.h @@ -271,7 +271,10 @@ class Permutation { /// \param i The element index /// \return The i-th element - index_type operator[](unsigned int i) const { return p_[i]; } + index_type operator[](unsigned int i) const { + TA_ASSERT(i < p_.size()); + return p_[i]; + } /// Cycles decomposition @@ -329,6 +332,13 @@ class Permutation { return result; } + /// + /// Checks if this permutation is the identity permutation. + /// + [[nodiscard]] bool is_identity() const { + return std::is_sorted(p_.begin(), p_.end()); + } + /// Identity permutation factory function /// \return An identity permutation @@ -402,11 +412,13 @@ class Permutation { /// Bool conversion /// \return \c true if the permutation is not empty, otherwise \c false. + /// \note equivalent to `this->size() != 0` explicit operator bool() const { return !p_.empty(); } /// Not operator /// \return \c true if the permutation is empty, otherwise \c false. 
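Note: Permutation::is_identity() above can rely on std::is_sorted because a valid Permutation holds each of 0..n-1 exactly once, and the only sorted arrangement of those values has p[i] == i for every i. The same argument as a standalone function:

#include <algorithm>
#include <vector>

// precondition: p contains every value in {0, ..., p.size()-1} exactly once
bool is_identity_perm(const std::vector<unsigned int>& p) {
  // sorted + permutation of 0..n-1  =>  p[i] == i for all i
  return std::is_sorted(p.begin(), p.end());
}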
+ /// \note equivalent to `this->size() == 0` bool operator!() const { return p_.empty(); } /// Permutation data accessor @@ -421,7 +433,7 @@ class Permutation { /// \param[in,out] ar The serialization archive template void serialize(Archive& ar) { - ar& p_; + ar & p_; } }; // class Permutation @@ -721,6 +733,11 @@ class BipartitePermutation { init(); } + BipartitePermutation(Permutation&& p, index_type second_partition_size = 0) + : base_(std::move(p)), second_size_(second_partition_size) { + init(); + } + BipartitePermutation(const Permutation& first, const Permutation& second) : second_size_(second.size()) { vector base; @@ -778,9 +795,14 @@ class BipartitePermutation { } /// \return reference to the first partition - const Permutation& first() const { return first_; } + const Permutation& first() const& { return first_; } /// \return reference to the second partition - const Permutation& second() const { return second_; } + const Permutation& second() const& { return second_; } + + /// \return rvalue-reference to the first partition + Permutation&& first() && { return std::move(first_); } + /// \return rvalue-reference to the second partition + Permutation&& second() && { return std::move(second_); } /// \return the size of the first partition index_type first_size() const { return this->size() - second_size_; } @@ -795,7 +817,7 @@ class BipartitePermutation { /// \param[in,out] ar The serialization archive template void serialize(Archive& ar) { - ar& base_& second_size_; + ar & base_ & second_size_; if constexpr (madness::is_input_archive_v) { first_ = {}; second_ = {}; @@ -858,6 +880,8 @@ inline auto inner(const Permutation& p) { // temporary inline auto outer(const Permutation& p) { return p; } +inline Permutation&& outer(Permutation&& p) { return std::move(p); } + inline auto inner_size(const Permutation& p) { abort(); return 0; } @@ -867,8 +891,16 @@ inline auto outer_size(const Permutation& p) { return p.size(); } inline auto inner(const BipartitePermutation& p) { return p.second(); } +inline Permutation&& inner(BipartitePermutation&& p) { + return std::move(p).second(); +} + inline auto outer(const BipartitePermutation& p) { return p.first(); } +inline Permutation&& outer(BipartitePermutation&& p) { + return std::move(p).first(); +} + inline auto inner_size(const BipartitePermutation& p) { return p.second_size(); } diff --git a/src/TiledArray/pmap/cyclic_pmap.h b/src/TiledArray/pmap/cyclic_pmap.h index 6d2df0088b..250b4f677b 100644 --- a/src/TiledArray/pmap/cyclic_pmap.h +++ b/src/TiledArray/pmap/cyclic_pmap.h @@ -84,10 +84,6 @@ class CyclicPmap : public Pmap { cols_(cols), proc_cols_(proc_cols), proc_rows_(proc_rows) { - // Check that the size is non-zero - TA_ASSERT(rows_ >= 1ul); - TA_ASSERT(cols_ >= 1ul); - // Check limits of process rows and columns TA_ASSERT(proc_rows_ >= 1ul); TA_ASSERT(proc_cols_ >= 1ul); diff --git a/src/TiledArray/pmap/user_pmap.h b/src/TiledArray/pmap/user_pmap.h index 50966f5744..31f5c51e53 100644 --- a/src/TiledArray/pmap/user_pmap.h +++ b/src/TiledArray/pmap/user_pmap.h @@ -52,7 +52,7 @@ class UserPmap : public Pmap { UserPmap(World& world, size_type size, Index2Rank&& i2r) : Pmap(world, size), index2rank_(std::forward(i2r)) {} - /// Constructs map that does not know the number of local elements + /// Constructs map that knows the number of local elements /// \tparam Index2Rank a callable type with `size_type(size_t)` signature /// \param world A reference to the world @@ -88,10 +88,10 @@ class UserPmap : public Pmap { virtual bool known_local_size() const
{ return known_local_size_; } virtual const_iterator begin() const { - return Iterator(*this, 0, this->size_, 0, false); + return Iterator(*this, 0, this->size_, 0, /* checking = */ true); } virtual const_iterator end() const { - return Iterator(*this, 0, this->size_, this->size_, false); + return Iterator(*this, 0, this->size_, this->size_, /* checking = */ true); } private: diff --git a/src/TiledArray/proc_grid.h b/src/TiledArray/proc_grid.h index a401e0ac1e..cd15c1b73e 100644 --- a/src/TiledArray/proc_grid.h +++ b/src/TiledArray/proc_grid.h @@ -288,12 +288,6 @@ class ProcGrid { local_rows_(0ul), local_cols_(0ul), local_size_(0ul) { - // Check for non-zero sizes - TA_ASSERT(rows_ >= 1u); - TA_ASSERT(cols_ >= 1u); - TA_ASSERT(row_size >= 1ul); - TA_ASSERT(col_size >= 1ul); - init(world_->rank(), world_->size(), row_size, col_size); } diff --git a/src/TiledArray/range.h b/src/TiledArray/range.h index 8108ecf227..cdebd7ddfc 100644 --- a/src/TiledArray/range.h +++ b/src/TiledArray/range.h @@ -49,9 +49,12 @@ class Range { typedef Range Range_; ///< This object type typedef TA_1INDEX_TYPE index1_type; ///< 1-index type, to conform to ///< Tensor Working Group (TWG) spec + typedef std::make_signed_t + index1_difference_type; ///< type representing difference of 1-indices typedef container::svector - index_type; ///< Coordinate index type, to conform to - ///< TWG spec + index_type; ///< Coordinate index type, to conform to + ///< TWG spec + typedef container::svector index_difference_type; typedef index_type index; ///< Coordinate index type (deprecated) typedef detail::SizeArray index_view_type; ///< Non-owning variant of index_type @@ -610,10 +613,10 @@ class Range { /// Permuting copy constructor - /// \param perm The permutation applied to other - /// \param other The range to be permuted and copied + /// \param perm The permutation applied to other; if `!perm` then no + /// permutation is applied + /// \param other The range to be permuted and copied Range(const Permutation& perm, const Range_& other) { - TA_ASSERT(perm.size() == other.rank_); + TA_ASSERT(perm.size() == other.rank_ || !perm); if (other.rank_ > 0ul) { rank_ = other.rank_; @@ -946,7 +949,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range @@ -984,7 +987,7 @@ class Range { return *this; } - /// Shift the lower and upper bound of this range + /// Shifts the lower and upper bounds of this range /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range @@ -995,27 +998,28 @@ class Range { return inplace_shift>(bound_shift); } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral range type /// \param bound_shift The shift to be applied to the range /// \return A shifted copy of this range template >> - Range_ shift(const Index& bound_shift) { + [[nodiscard]] Range_ shift(const Index& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; } - /// Create a Range with shiften lower and upper bounds + /// Create a Range with shifted lower and upper bounds /// \tparam Index An integral type /// \param bound_shift The shift to be applied to the range /// \return A shifted copy of this range template >> - Range_ shift(const std::initializer_list& bound_shift) { + [[nodiscard]] Range_ shift(
const std::initializer_list& bound_shift) const { Range_ result(*this); result.inplace_shift(bound_shift); return result; @@ -1136,7 +1140,7 @@ class Range { template void serialize(Archive& ar) { - ar& rank_; + ar & rank_; const auto four_x_rank = rank_ << 2; // read via madness::archive::wrap to be able to // - avoid having to serialize datavec_'s size @@ -1148,7 +1152,7 @@ class Range { ar << madness::archive::wrap(datavec_.data(), four_x_rank); } else abort(); // unreachable - ar& offset_& volume_; + ar & offset_ & volume_; } void swap(Range_& other) { @@ -1245,6 +1249,10 @@ class Range { }; // class Range +// lift Range::index_type and Range::index_view_type into user-land +using Index = Range::index_type; +using IndexView = Range::index_view_type; + inline Range& Range::operator*=(const Permutation& perm) { TA_ASSERT(perm.size() == rank_); if (rank_ > 1ul) { diff --git a/src/TiledArray/range1.h b/src/TiledArray/range1.h index ef6d422dcc..a29e0d607c 100644 --- a/src/TiledArray/range1.h +++ b/src/TiledArray/range1.h @@ -32,7 +32,8 @@ namespace TiledArray { /// an integer range `[first,second)` /// @note previously represented by std::pair, hence the design struct Range1 { - typedef TA_1INDEX_TYPE index1_type; + using index1_type = TA_1INDEX_TYPE; + using signed_index1_type = std::make_signed_t; index1_type first = 0; index1_type second = 0; //< N.B. second >= first @@ -74,6 +75,9 @@ struct Range1 { /// @return the extent of this range, i.e. second - first auto extent() const noexcept { return second - first; } + /// @return the volume of this range, i.e. second - first + auto volume() const noexcept { return second - first; } + /// swaps `*this` with @p other /// @p other a Range1 object void swap(Range1& other) noexcept { @@ -87,6 +91,21 @@ struct Range1 { return std::make_pair(first, second); } + /// Checks if a given index is within this range + /// @return true if \p i is within this range + template + typename std::enable_if::value, bool>::type includes( + const I& i) const { + return first <= i && i < second; + } + + /// Checks if a given range overlaps with this range + + /// @return true if \p r overlaps with this range + bool overlaps_with(const Range1& rng) const { + return lobound() < rng.upbound() && upbound() > rng.lobound(); + } + /// \brief Range1 iterator type /// /// Iterates over Range1 @@ -144,20 +163,43 @@ struct Range1 { /// \return An iterator that points to the beginning of the local element set const_iterator cend() const { return end(); } - /// @} + /// shifts this Range1 + + /// @param[in] shift the shift to apply + /// @return reference to this + Range1& inplace_shift(signed_index1_type shift) { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || upbound() <= 0 || + (shift <= (std::numeric_limits::max() - upbound()))); + TA_ASSERT(shift >= 0 || lobound() >= 0 || + (std::abs(shift) <= + (lobound() - std::numeric_limits::min()))); + first += shift; + second += shift; + return *this; + } + + /// creates a shifted Range1 + + /// @param[in] shift the shift value + /// @return a copy of this shifted by @p shift + [[nodiscard]] Range1 shift(signed_index1_type shift) const { + return Range1(*this).inplace_shift(shift); + } template >>::type* = nullptr> void serialize(Archive& ar) { - ar& first& second; + ar & first & second; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& first& second; + ar & first & second; } }; @@ -172,6 +214,12 @@ inline void swap(Range1& r0, Range1& r1) { // no throw 
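Note: all of the new Range1 helpers assume half-open [first, second) semantics: includes() treats the upper bound as exclusive, and overlaps_with() requires each range to begin before the other ends. A usage sketch (assumes Range1's two-argument constructor):

#include <TiledArray/range1.h>
#include <cassert>

void range1_demo() {
  TiledArray::Range1 r0{0, 10}, r1{5, 15};
  assert(r0.includes(9));        // 9 lies in [0, 10)
  assert(!r0.includes(10));      // the upper bound is exclusive
  assert(r0.overlaps_with(r1));  // [0, 10) and [5, 15) share [5, 10)
  auto r2 = r0.shift(100);       // a copy shifted to [100, 110)
  assert(r2.first == 100 && r2.second == 110);
}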
r0.swap(r1); } +/// Range1 ostream operator +inline std::ostream& operator<<(std::ostream& out, const Range1& rng) { + out << "[ " << rng.first << ", " << rng.second << " )"; + return out; +} + /// Test that two Range1 objects are congruent /// This function tests that the sizes of the two Range1 objects coincide. diff --git a/src/TiledArray/reduce_task.h b/src/TiledArray/reduce_task.h index 753ac5df58..7d8924b0c3 100644 --- a/src/TiledArray/reduce_task.h +++ b/src/TiledArray/reduce_task.h @@ -24,11 +24,12 @@ #include #include -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #include #include +inline std::atomic global_reduce_task_counter(0); #endif namespace TiledArray { @@ -304,9 +305,10 @@ class ReduceTask { }; // class ReduceObject -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE - static void CUDART_CB cuda_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -334,15 +336,15 @@ class ReduceTask { }; /// use madness task to call the destroy function, since it might call - /// cuda API + /// device API world->taskq.add(destroy_vector, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<0>() += + TiledArray::detail::device_callback_duration_ns<0>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB cuda_dependency_dec_callback(void* userData) { + static void DEVICERT_CB device_dependency_dec_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -361,12 +363,12 @@ class ReduceTask { // " call 2\n"; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<1>() += + TiledArray::detail::device_callback_duration_ns<1>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB - cuda_dependency_dec_reduceobject_delete_callback(void* userData) { + static void DEVICERT_CB + device_dependency_dec_reduceobject_delete_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -399,11 +401,11 @@ class ReduceTask { delete objects; const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<2>() += + TiledArray::detail::device_callback_duration_ns<2>() += TiledArray::duration_in_ns(t0, t1); } - static void CUDART_CB cuda_readyresult_reset_callback(void* userData) { + static void DEVICERT_CB device_readyresult_reset_callback(void* userData) { TA_ASSERT(!madness::is_madness_thread()); const auto t0 = TiledArray::now(); @@ -429,7 +431,7 @@ class ReduceTask { world->taskq.add(reset, objects, TaskAttributes::hipri()); const auto t1 = TiledArray::now(); - TiledArray::detail::cuda_callback_duration_ns<3>() += + TiledArray::detail::device_callback_duration_ns<3>() += TiledArray::duration_in_ns(t0, t1); } @@ -455,15 +457,21 @@ class ReduceTask { ready_object_ = nullptr; lock_.unlock(); // <<< End critical section +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the argument that was held by ready_object_ op_(*result, ready_object->arg()); // cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); +#ifdef TILEDARRAY_HAS_DEVICE + 
device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; - /// non-CUDA op - if (stream_ptr == nullptr) { + // need to sync with a device stream? + if (!stream_) { // no ReduceObject::destroy(ready_object); this->dec(); } else { @@ -471,12 +479,11 @@ class ReduceTask { (*callback_object)[0] = &world_; (*callback_object)[1] = this; (*callback_object)[2] = ready_object; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_reduceobject_delete_callback, + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, + device_dependency_dec_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); // std::cout << std::to_string(world().rank()) + " // add 3\n"; } @@ -490,25 +497,32 @@ class ReduceTask { ready_result_.reset(); lock_.unlock(); // <<< End critical section +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the result that was held by ready_result_ op_(*result, *ready_result); // cleanup the result -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); - if (stream_ptr == nullptr) { +#ifdef TILEDARRAY_HAS_DEVICE + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + + // need to sync with a stream? + if (!stream_) { // no ready_result.reset(); - } else { + } else { // yes auto ready_result_heap = new std::shared_ptr(ready_result); auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = ready_result_heap; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_readyresult_reset_callback, callback_object)); - synchronize_stream(nullptr); + auto& [device, stream] = *stream_; + DeviceSafeCall(device::setDevice(device)); + DeviceSafeCall(device::launchHostFunc( + stream, device_readyresult_reset_callback, callback_object)); // std::cout << std::to_string(world().rank()) + " // add 4\n"; } @@ -530,43 +544,49 @@ class ReduceTask { /// \param object The reduction argument to be reduced void reduce_result_object(std::shared_ptr result, const ReduceObject* object) { +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Reduce the argument op_(*result, object->arg()); // Cleanup the argument -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); - if (stream_ptr == nullptr) { +#ifdef TILEDARRAY_HAS_DEVICE + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + + if (!stream_) { ReduceObject::destroy(object); } else { auto callback_object = new std::vector(2); (*callback_object)[0] = &world_; (*callback_object)[1] = const_cast(object); - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object)); - synchronize_stream(nullptr); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_reduceobject_delete_callback, + callback_object)); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else ReduceObject::destroy(object); #endif + // Check for more 
reductions reduce(result); // Decrement the dependency counter for the argument. This must // be done after the reduce call to avoid a race condition. -#ifdef TILEDARRAY_HAS_CUDA - if (stream_ptr == nullptr) { +#ifdef TILEDARRAY_HAS_DEVICE + if (!stream_) { this->dec(); } else { auto callback_object2 = new std::vector(1); (*callback_object2)[0] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } #else @@ -577,6 +597,12 @@ class ReduceTask { /// Reduce two reduction arguments void reduce_object_object(const ReduceObject* object1, const ReduceObject* object2) { +#ifdef TILEDARRAY_HAS_DEVICE + TA_ASSERT(device::detail::madness_task_stream_opt_ptr_accessor() == + nullptr); + device::detail::madness_task_stream_opt_ptr_accessor() = &stream_; +#endif + // Construct an empty result object auto result = std::make_shared(op_()); @@ -585,9 +611,10 @@ class ReduceTask { op_(*result, object2->arg()); // Cleanup arguments -#ifdef TILEDARRAY_HAS_CUDA - auto stream_ptr = tls_cudastream_accessor(); - if (stream_ptr == nullptr) { +#ifdef TILEDARRAY_HAS_DEVICE + device::detail::madness_task_stream_opt_ptr_accessor() = nullptr; + + if (!stream_) { ReduceObject::destroy(object1); ReduceObject::destroy(object2); } else { @@ -595,11 +622,10 @@ class ReduceTask { (*callback_object1)[0] = &world_; (*callback_object1)[1] = const_cast(object1); (*callback_object1)[2] = const_cast(object2); - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_reduceobject_delete_callback, callback_object1)); - synchronize_stream(nullptr); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_reduceobject_delete_callback, + callback_object1)); // std::cout << std::to_string(world().rank()) + " add 1\n"; } #else @@ -612,18 +638,17 @@ class ReduceTask { // Decrement the dependency counter for the two arguments. This // must be done after the reduce call to avoid a race condition. 
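// Aside: a minimal sketch of the host-callback pattern that the
// DEVICERT_CB/DeviceSafeCall wrappers above generalize, written against the
// plain CUDA runtime API (an assumption; HIP is analogous). The callback runs
// only after all previously enqueued work on `stream` completes, and it must
// not itself call CUDA APIs, which is why the code above defers the real
// cleanup to MADNESS tasks.
//
//   #include <cuda_runtime.h>
//   #include <vector>
//
//   static void CUDART_CB cleanup_callback(void* userData) {
//     delete static_cast<std::vector<void*>*>(userData);  // host-only work
//   }
//
//   void enqueue_cleanup(cudaStream_t stream, std::vector<void*>* data) {
//     cudaLaunchHostFunc(stream, cleanup_callback, data);  // fire-and-forget
//   }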
-#ifdef TILEDARRAY_HAS_CUDA - if (stream_ptr == nullptr) { +#ifdef TILEDARRAY_HAS_DEVICE + if (!stream_) { this->dec(); this->dec(); } else { auto callback_object2 = new std::vector(2); (*callback_object2)[0] = this; (*callback_object2)[1] = this; - CudaSafeCall( - cudaSetDevice(cudaEnv::instance()->current_cuda_device_id())); - CudaSafeCall(cudaLaunchHostFunc( - *stream_ptr, cuda_dependency_dec_callback, callback_object2)); + DeviceSafeCall(device::setDevice(stream_->device)); + DeviceSafeCall(device::launchHostFunc( + stream_->stream, device_dependency_dec_callback, callback_object2)); // std::cout << std::to_string(world().rank()) + " add 2\n"; } @@ -633,13 +658,13 @@ class ReduceTask { #endif } -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE template - std::enable_if_t, void> internal_run( + std::enable_if_t, void> internal_run( const madness::TaskThreadEnv&) { TA_ASSERT(ready_result_); - auto post_result = madness::add_cuda_task(world_, op_, *ready_result_); + auto post_result = madness::add_device_task(world_, op_, *ready_result_); result_.set(post_result); if (callback_) { @@ -648,7 +673,7 @@ class ReduceTask { } template - std::enable_if_t, void> + std::enable_if_t, void> #else void #endif @@ -668,7 +693,10 @@ class ReduceTask { Future result_; ///< The result of the reduction task madness::Spinlock lock_; ///< Task lock madness::CallbackInterface* callback_; ///< The completion callback - int task_id_; ///< Task id + std::int64_t task_id_; ///< Task id +#ifdef TILEDARRAY_HAS_DEVICE + std::optional stream_; // round-robined by task_id +#endif public: /// Implementation constructor @@ -679,7 +707,7 @@ class ReduceTask { /// has completed /// \param task_id the task id (for debugging) ReduceTaskImpl(World& world, opT op, madness::CallbackInterface* callback, - int task_id = -1) + std::int64_t task_id = -1) : madness::TaskInterface(1, TaskAttributes::hipri()), world_(world), op_(op), @@ -688,7 +716,16 @@ class ReduceTask { result_(), lock_(), callback_(callback), - task_id_(task_id) {} + task_id_(task_id) { +#ifdef TILEDARRAY_HAS_DEVICE + if (task_id_ == -1) { + task_id_ = global_reduce_task_counter++; + const std::size_t stream_ord = + task_id_ % device::Env::instance()->num_streams_total(); + stream_ = device::Env::instance()->stream(stream_ord); + } +#endif + } virtual ~ReduceTaskImpl() {} @@ -753,7 +790,8 @@ class ReduceTask { /// this task is complete /// \param task_id the task id (for debugging) ReduceTask(World& world, const opT& op = opT(), - madness::CallbackInterface* callback = nullptr, int task_id = -1) + madness::CallbackInterface* callback = nullptr, + std::int64_t task_id = -1) : pimpl_(new ReduceTaskImpl(world, op, callback, task_id)), count_(0ul) {} /// Move constructor diff --git a/src/TiledArray/shape.h b/src/TiledArray/shape.h index b630d7e019..9b8de8f6ef 100644 --- a/src/TiledArray/shape.h +++ b/src/TiledArray/shape.h @@ -23,29 +23,4 @@ #include #include -namespace TiledArray { - -template -class DistArray; -class DensePolicy; - -/// Type trait to detect dense shape types -template -struct is_dense : public std::false_type {}; - -template <> -struct is_dense : public std::true_type {}; - -template <> -struct is_dense : public std::true_type {}; - -template -struct is_dense > - : public is_dense::shape_type> {}; - -template -constexpr const bool is_dense_v = is_dense::value; - -} // namespace TiledArray - #endif // TILEDARRAY_SHAPE_H__INCLUDED diff --git a/src/TiledArray/size_array.h b/src/TiledArray/size_array.h index 6edbecb222..ef2ed1e121 100644 --- 
a/src/TiledArray/size_array.h +++ b/src/TiledArray/size_array.h @@ -26,6 +26,8 @@ #include #include +#include + namespace TiledArray { namespace detail { @@ -42,6 +44,15 @@ class SizeArray { T* first_ = nullptr; ///< First element of the array T* last_ = nullptr; ///< Last element of the array + // can compare to any sized range + template + friend std::enable_if_t< + is_sized_range_v> && + !std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> + operator==(const SizeArray&, SizedRange&&); + public: // type definitions typedef T value_type; @@ -436,6 +447,33 @@ class SizeArray { }; // class SizeArray +} // namespace detail +} // namespace TiledArray + +namespace ranges { +template +inline constexpr bool enable_view> = true; +} // namespace ranges + +static_assert(ranges::range>); +static_assert( + ranges::viewable_range>); + +namespace TiledArray::detail { + +template +std::enable_if_t< + is_sized_range_v> && + !std::is_same_v, std::remove_reference_t> && + !std::is_base_of_v, std::remove_reference_t>, + bool> +operator==(const SizeArray& idx1, SizedRange&& idx2) { + if (idx1.size() == idx2.size()) + return std::equal(idx1.begin(), idx1.end(), idx2.begin()); + else + return false; +} + template inline std::vector operator*(const Permutation& perm, const SizeArray& orig) { @@ -451,7 +489,6 @@ inline std::ostream& operator<<(std::ostream& os, return os; } -} // namespace detail -} // namespace TiledArray +} // namespace TiledArray::detail #endif // TILEDARRAY_SIZE_ARRAY_H__INCLUDED diff --git a/src/TiledArray/sparse_shape.h b/src/TiledArray/sparse_shape.h index 7346f45d1c..a7df1c520c 100644 --- a/src/TiledArray/sparse_shape.h +++ b/src/TiledArray/sparse_shape.h @@ -26,6 +26,8 @@ #ifndef TILEDARRAY_SPARSE_SHAPE_H__INCLUDED #define TILEDARRAY_SPARSE_SHAPE_H__INCLUDED +#include + #include #include #include @@ -514,10 +516,13 @@ class SparseShape { /// Sparsity of the shape - /// \return The fraction of tiles that are zero. + /// \return The fraction of tiles that are zero. Always returns 0 if + /// `this->data().size()` is zero. float sparsity() const { TA_ASSERT(!tile_norms_.empty()); - return float(zero_tile_count_) / float(tile_norms_.size()); + return tile_norms_.size() != 0 + ? 
float(zero_tile_count_) / float(tile_norms_.size()) + : 0.f; } // clang-format off @@ -795,6 +800,13 @@ class SparseShape { return equal; } + /// Bitwise comparison + /// \param other a SparseShape object + /// \return true if this object and @c other object are bitwise NOT identical + inline bool operator!=(const SparseShape& other) const { + return !(*this == other); + } + private: /// Create a copy of a sub-block of the shape @@ -828,7 +840,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i @@ -862,7 +874,7 @@ class SparseShape { // Check that the input indices are in range TA_ASSERT(lower_d >= tile_norms_.range().lobound(d)); - TA_ASSERT(lower_d < upper_d); + TA_ASSERT(lower_d <= upper_d); TA_ASSERT(upper_d <= tile_norms_.range().upbound(d)); // Construct the size vector for rank i @@ -1670,23 +1682,23 @@ class SparseShape { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); // allocate size_vectors_ size_vectors_ = std::move(std::shared_ptr( new vector_type[dim], std::default_delete())); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& tile_norms_; + ar & tile_norms_; const unsigned int dim = tile_norms_.range().rank(); - for (unsigned d = 0; d != dim; ++d) ar& size_vectors_.get()[d]; - ar& zero_tile_count_; + for (unsigned d = 0; d != dim; ++d) ar & size_vectors_.get()[d]; + ar & zero_tile_count_; } private: diff --git a/src/TiledArray/special/diagonal_array.h b/src/TiledArray/special/diagonal_array.h index 825d66fd98..eac0c65e92 100644 --- a/src/TiledArray/special/diagonal_array.h +++ b/src/TiledArray/special/diagonal_array.h @@ -31,6 +31,8 @@ #include #include +#include + #include namespace TiledArray { @@ -43,7 +45,7 @@ namespace detail { /// empty Range /// \param[in] rng an input (rank-d) Range /// \return the range of diagonal elements, as a rank-1 Range -inline Range diagonal_range(Range const &rng) { +inline Range1 diagonal_range(Range const &rng) { const auto rank = rng.rank(); TA_ASSERT(rng.rank() > 0); auto lo = rng.lobound_data(); @@ -56,92 +58,64 @@ inline Range diagonal_range(Range const &rng) { // If the max small elem is less than the min large elem then a diagonal // elem is in this tile; if (max_low < min_up) { - return Range({max_low}, {min_up}); + return Range1{max_low, min_up}; } else { - return Range(); + return Range1{}; } } -/// \brief computes shape data (i.e. 
Frobenius norms of the tiles) for a -/// constant diagonal tensor -/// \tparam T a numeric type -/// \param trange a TiledRange of the result -/// \param val value of the diagonal elements -/// \return a Tensor containing the Frobenius norms of -/// the tiles of a DistArray with \p val on the diagonal and -/// zeroes elsewhere -template -Tensor diagonal_shape(TiledRange const &trange, T val) { - Tensor shape(trange.tiles_range(), 0.0); - - auto ext = trange.elements_range().extent(); - auto diag_extent = *std::min_element(std::begin(ext), std::end(ext)); - - auto ndim = trange.rank(); - auto diag_elem = 0ul; - // the diagonal elements will never be larger than the length of the - // shortest dimension - while (diag_elem < diag_extent) { - // Get the tile index corresponding to the current diagonal_elem - auto tile_idx = - trange.element_to_tile(container::svector(ndim, diag_elem)); - auto tile_range = trange.make_tile_range(tile_idx); - - // Compute the range of diagonal elements in the tile - auto d_range = diagonal_range(tile_range); - - // Since each diag elem has the same value the norm of the tile is - // \sqrt{\sum_{diag} val^2} = \sqrt{ndiags * val^2} - float t_norm = std::sqrt(val * val * d_range.volume()); - shape(tile_idx) = t_norm; - - // Update diag_elem to the next elem not in this tile - diag_elem = d_range.upbound_data()[0]; - } - - return shape; -} - /// \brief computes shape data (i.e. Frobenius norms of the tiles) for a /// non-constant diagonal tensor /// \tparam RandomAccessIterator an iterator over /// the range of diagonal elements +/// \tparam Sentinel sentinel type for the range of diagonal elements /// \param[in] trange a TiledRange of the result /// \param[in] diagonals_begin the begin iterator of the range of the diagonals /// \param[in] diagonals_end the end iterator of the range of the diagonals; if /// not given, default initialized and thus will not be checked /// \return a Tensor containing the Frobenius norms of the tiles of /// a DistArray with \p val on the diagonal and zeroes elsewhere -template +template std::enable_if_t::value, Tensor> diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, - RandomAccessIterator diagonals_end = {}) { - const bool have_end = diagonals_end == RandomAccessIterator{}; + Sentinel diagonals_end = {}) { + bool have_end = false; + if constexpr (detail::is_equality_comparable_v) { + have_end = diagonals_end != Sentinel{}; + } Tensor shape(trange.tiles_range(), 0.0); const auto rank = trange.rank(); - auto ext = trange.elements_range().extent_data(); - auto diag_extent = *std::min_element(ext, ext + rank); + TA_ASSERT(rank > 0); + const auto *lobound = trange.elements_range().lobound_data(); + const auto diag_lobound = *std::max_element(lobound, lobound + rank); + const auto *upbound = trange.elements_range().upbound_data(); + const auto diag_upbound = *std::min_element(upbound, upbound + rank); - auto ndim = trange.rank(); - auto diag_elem = 0ul; + auto diag_elem = diag_lobound; // the diagonal elements will never be larger than the length of the // shortest dimension - while (diag_elem < diag_extent) { + while (diag_elem < diag_upbound) { // Get the tile index corresponding to the current diagonal_elem - auto tile_idx = trange.element_to_tile(std::vector(ndim, diag_elem)); + auto tile_idx = trange.element_to_tile(Index(rank, diag_elem)); auto tile_range = trange.make_tile_range(tile_idx); // Compute the range of diagonal elements in the tile auto d_range = diagonal_range(tile_range); - TA_ASSERT(d_range 
!= Range{}); - TA_ASSERT(diag_elem == d_range.lobound_data()[0]); - const auto beg = diag_elem; - const auto end = d_range.upbound_data()[0]; + TA_ASSERT(d_range != Range1{}); + TA_ASSERT(diag_elem == d_range.lobound()); + const auto beg = d_range.lobound(); + const auto end = d_range.upbound(); if (have_end) { - TA_ASSERT(diagonals_begin + beg < diagonals_end); - TA_ASSERT(diagonals_begin + end <= diagonals_end); + if constexpr (detail::are_less_than_comparable_v) { + TA_ASSERT(diagonals_begin + beg < diagonals_end); + } + if constexpr (detail::are_less_than_or_equal_comparable_v< + RandomAccessIterator, Sentinel>) { + TA_ASSERT(diagonals_begin + end <= diagonals_end); + } } auto t_norm = std::accumulate(diagonals_begin + beg, diagonals_begin + end, @@ -149,7 +123,7 @@ diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, const auto abs_val = std::abs(val); return sum + abs_val * abs_val; }); - shape(tile_idx) = static_cast(t_norm); + shape(tile_idx) = std::sqrt(static_cast(t_norm)); // Update diag_elem to the next elem not in this tile diag_elem = end; @@ -158,36 +132,18 @@ diagonal_shape(TiledRange const &trange, RandomAccessIterator diagonals_begin, return shape; } -/// \brief Writes tiles of a constant diagonal array - -/// \tparam Array a DistArray type +/// \brief computes shape data (i.e. Frobenius norms of the tiles) for a +/// constant diagonal tensor /// \tparam T a numeric type -/// \param[in] A an Array object -/// \param[in] val the value of the diagonal elements of A -template -void write_diag_tiles_to_array_val(Array &A, T val) { - using Tile = typename Array::value_type; - - // Task to create each tile - A.init_tiles([val](const Range &rng) { - // Compute range of diagonal elements in the tile - auto diags = detail::diagonal_range(rng); - const auto rank = rng.rank(); - - Tile tile(rng, 0.0); - - if (diags.volume() > 0) { // If the tile has diagonal elems - - // Loop over the elements and write val into them - auto diag_lo = diags.lobound_data()[0]; - auto diag_hi = diags.upbound_data()[0]; - for (auto elem = diag_lo; elem < diag_hi; ++elem) { - tile(std::vector(rank, elem)) = val; - } - } - - return tile; - }); +/// \param trange a TiledRange of the result +/// \param val value of the diagonal elements +/// \return a Tensor containing the Frobenius norms of +/// the tiles of a DistArray with \p val on the diagonal and +/// zeroes elsewhere +template +Tensor diagonal_shape(TiledRange const &trange, T val) { + auto val_range = ranges::views::repeat(val); + return diagonal_shape(trange, val_range.begin(), val_range.end()); } /// \brief Writes tiles of a nonconstant diagonal array @@ -201,7 +157,8 @@ std::enable_if_t::value, void> write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { using Tile = typename Array::value_type; - A.init_tiles( + // N.B. 
Fence::Local ensures lifetime of the diagonals range + A.template init_tiles( // Task to create each tile [diagonals_begin](const Range &rng) { // Compute range of diagonal elements in the tile @@ -212,10 +169,11 @@ write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { if (diags.volume() > 0) { // If the tile has diagonal elems // Loop over the elements and write val into them - auto diag_lo = diags.lobound_data()[0]; - auto diag_hi = diags.upbound_data()[0]; - for (auto elem = diag_lo; elem < diag_hi; ++elem) { - tile(std::vector(rank, elem)) = *(diagonals_begin + elem); + auto diag_lo = diags.lobound(); + auto diag_hi = diags.upbound(); + auto elem_it = diagonals_begin + diag_lo; + for (auto elem = diag_lo; elem < diag_hi; ++elem, ++elem_it) { + tile(Index(rank, elem)) = *elem_it; } } @@ -225,36 +183,6 @@ write_diag_tiles_to_array_rng(Array &A, RandomAccessIterator diagonals_begin) { } // namespace detail -/// \brief Creates a constant diagonal DistArray - -/// Creates an array whose only nonzero values are the (hyper)diagonal elements -/// (i.e. (n,n,n, ..., n) ), and they are all have the same value -/// \tparam Policy the policy type of the resulting DistArray -/// \tparam T a numeric type -/// \param world The world for the array -/// \param[in] trange The trange for the array -/// \param[in] val The value of the diagonal elements -/// \return a constant diagonal DistArray -template -Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { - using Policy = typename Array::policy_type; - // Init the array - if constexpr (is_dense_v) { - Array A(world, trange); - detail::write_diag_tiles_to_array_val(A, val); - return A; - } else { - // Compute shape and init the Array - auto shape_norm = detail::diagonal_shape(trange, val); - using ShapeType = typename Policy::shape_type; - ShapeType shape(shape_norm, trange); - Array A(world, trange, shape); - detail::write_diag_tiles_to_array_val(A, val); - return A; - } - abort(); // unreachable -} - /// \brief Creates a non-constant diagonal DistArray /// Creates an array whose only nonzero values are the (hyper)diagonal elements @@ -262,24 +190,32 @@ Array diagonal_array(World &world, TiledRange const &trange, T val = 1) { /// input range /// \tparam Array a DistArray type /// \tparam RandomAccessIterator an iterator over the range of diagonal elements +/// \tparam Sentinel sentinel type for the range of diagonal elements /// \param world The world for the array /// \param[in] trange The trange for the array /// \param[in] diagonals_begin the begin iterator of the range of the diagonals /// \param[in] diagonals_end the end iterator of the range of the diagonals; /// if not given, default initialized and thus will not be checked -/// \return a constant diagonal DistArray -template +/// \return a diagonal DistArray +template std::enable_if_t::value, Array> diagonal_array(World &world, TiledRange const &trange, RandomAccessIterator diagonals_begin, - RandomAccessIterator diagonals_end = {}) { + Sentinel diagonals_end = {}) { using Policy = typename Array::policy_type; - if (diagonals_end != RandomAccessIterator{}) { - const auto rank = trange.rank(); - auto ext = trange.elements_range().extent_data(); - [[maybe_unused]] auto diag_extent = *std::min_element(ext, ext + rank); - TA_ASSERT(diagonals_begin + diag_extent <= diagonals_end); + if constexpr (detail::is_equality_comparable_v) { + if (diagonals_end != Sentinel{}) { + auto diagonals_range = detail::diagonal_range(trange.elements_range()); + if 
constexpr (detail::are_less_than_comparable_v<RandomAccessIterator, Sentinel>) {
+ TA_ASSERT(diagonals_begin + diagonals_range.lobound() < diagonals_end);
+ }
+ if constexpr (detail::are_less_than_or_equal_comparable_v<
+ RandomAccessIterator, Sentinel>) {
+ TA_ASSERT(diagonals_begin + diagonals_range.upbound() <= diagonals_end);
+ }
+ }
}
// Init the array
@@ -300,6 +236,23 @@ diagonal_array(World &world, TiledRange const &trange, abort(); // unreachable }
+/// \brief Creates a constant diagonal DistArray
+
+/// Creates an array whose only nonzero values are the (hyper)diagonal elements
+/// (i.e. (n,n,n, ..., n) ), and they all have the same value
+/// \tparam Array a DistArray type
+/// \tparam T a numeric type
+/// \param world The world for the array
+/// \param[in] trange The trange for the array
+/// \param[in] val The value of the diagonal elements
+/// \return a constant diagonal DistArray
+template <typename Array, typename T>
+Array diagonal_array(World &world, TiledRange const &trange, T val = 1) {
+ auto val_range = ranges::views::repeat(val);
+ return diagonal_array<Array>(world, trange, val_range.begin(),
+ val_range.end());
+}
+
} // namespace TiledArray
#endif // TILEDARRAY_SPECIALARRAYS_DIAGONAL_ARRAY_H__INCLUDED
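Since the constant-diagonal overload is now a thin wrapper over the iterator overload (via ranges::views::repeat), both entry points behave identically; a brief usage sketch follows, in which the world handling, the TA::TArrayD alias, and the tiling are illustrative assumptions rather than part of this patch:

#include <tiledarray.h>
#include <vector>

void diagonal_demo(TA::World& world) {
  TA::TiledRange trange{{0, 2, 4}, {0, 2, 4}};  // 4x4 elements in 2x2 tiles
  // explicit diagonal values, supplied as a random-access iterator range
  std::vector<double> diag{1.0, 2.0, 3.0, 4.0};
  auto d = TA::diagonal_array<TA::TArrayD>(world, trange,
                                           diag.begin(), diag.end());
  // constant diagonal, e.g. an identity-like array
  auto id = TA::diagonal_array<TA::TArrayD>(world, trange, 1.0);
}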
diff --git a/src/TiledArray/special/kronecker_delta.h b/src/TiledArray/special/kronecker_delta.h
index 2b1df03294..35a8da6e57 100644
--- a/src/TiledArray/special/kronecker_delta.h
+++ b/src/TiledArray/special/kronecker_delta.h
@@ -37,31 +37,28 @@ #include #include
+namespace TiledArray {
+
+// clang-format off
/// *generalized* (asymmetric) Kronecker delta
-/// *generalized* (asymmetric) Kronecker delta is a product of \c _N ordinary
-/// Kronecker deltas Definition: KroneckerDeltaTile(b,k) = (b==k) ? 1 : 0
-/// KroneckerDeltaTile(b0,k0,b1,k1,b2,k2...bN,kN) = KroneckerDeltaTile(b0,k0)
-/// KroneckerDeltaTile(b1,k1) ...`KroneckerDeltaTile(bN,kN)
-///
-/// \note This is a stateful data tile. Meant to be generated by its (stateless)
-/// lazy generator, \c LazyKroneckerDeltaTile.
-///
-/// \tparam _N the number of ordinal Kronecker deltas in this product
-template
+/// *generalized* (asymmetric) Kronecker delta is a product of `N` ordinary
+/// Kronecker deltas
+/// Definition: `KroneckerDeltaTile(b,k) = (b==k) ? 1 : 0` and
+/// `KroneckerDeltaTile(b0,b1,...bN,k0,k1,...kN) = KroneckerDeltaTile(b0,k0) KroneckerDeltaTile(b1,k1) ... KroneckerDeltaTile(bN,kN)`.
+/// The implicit layout is hardwired to `b0,b1,b2,...,bN,k0,k1,k2,...,kN` since the intended use is for taking slices.
+// clang-format on
class KroneckerDeltaTile { public:
- // Constants
- static constexpr unsigned N = _N;
// Concept typedefs
- typedef TiledArray::Range range_type; // range type
- typedef int value_type; // Element type
+ typedef Range range_type; // range type
+ typedef int value_type; // Element type
typedef value_type numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type private:
- range_type range_;
+ range_type range_; // range_.rank() = 2*N
bool empty_; public:
@@ -69,8 +66,13 @@ class KroneckerDeltaTile { KroneckerDeltaTile() : empty_(true) {} /// Productive ctor 1
+ /// \param[in] range the range of the tile, by definition must be even-order
+ /// such that the number of Kronecker deltas `N` is `range.rank() / 2`
+ /// \pre `range.rank() % 2 == 0`
KroneckerDeltaTile(const range_type& range)
- : range_(range), empty_(is_empty(range_)) {}
+ : range_(range), empty_(is_empty(range_)) {
+ TA_ASSERT(range.rank() % 2 == 0);
+ }
/// copy constructor (= deep copy) KroneckerDeltaTile(const KroneckerDeltaTile&) = default;
@@ -88,6 +90,9 @@ class KroneckerDeltaTile { bool empty() const { return empty_; }
+ /// \return the number of Kronecker deltas in the product
+ unsigned int N() const { return range_.rank() / 2; }
+
/// MADNESS compliant serialization template <typename Archive> void serialize(Archive& ar) {
@@ -100,13 +105,15 @@ class KroneckerDeltaTile { /// @return false if contains any nonzeros static bool is_empty(const range_type& range) { bool empty = false;
- TA_ASSERT(range.rank() == 2 * N);
+ TA_ASSERT(range.rank() % 2 == 0);
+ const auto N = range.rank() / 2;
auto lobound = range.lobound_data(); auto upbound = range.upbound_data();
- for (auto i = 0; i != 2 * N && not empty; i += 2)
- empty = (upbound[i] > lobound[i + 1] && upbound[i + 1] > lobound[i])
- ? 
true - : false; // assumes extents > 0 + for (auto i = 0; i != N && not empty; ++i) { + const auto lo = std::max(lobound[i], lobound[i + N]); + const auto up = std::min(upbound[i], upbound[i + N]); + empty = lo >= up; + } return empty; } @@ -115,155 +122,191 @@ class KroneckerDeltaTile { // these are to satisfy interfaces, but not needed, actually // Sum of hyper diagonal elements -template -typename KroneckerDeltaTile<_N>::numeric_type trace( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type trace(const KroneckerDeltaTile& arg); // foreach(i) result += arg[i] -template -typename KroneckerDeltaTile<_N>::numeric_type sum( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type sum(const KroneckerDeltaTile& arg); // foreach(i) result *= arg[i] -template -typename KroneckerDeltaTile<_N>::numeric_type product( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type product( + const KroneckerDeltaTile& arg); // foreach(i) result += arg[i] * arg[i] -template -typename KroneckerDeltaTile<_N>::numeric_type squared_norm( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type squared_norm( + const KroneckerDeltaTile& arg); // foreach(i) result = min(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type min( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type min(const KroneckerDeltaTile& arg); // foreach(i) result = max(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type max( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type max(const KroneckerDeltaTile& arg); // foreach(i) result = abs_min(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type abs_min( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type abs_min( + const KroneckerDeltaTile& arg); // foreach(i) result = abs_max(result, arg[i]) -template -typename KroneckerDeltaTile<_N>::numeric_type abs_max( - const KroneckerDeltaTile<_N>& arg); +typename KroneckerDeltaTile::numeric_type abs_max( + const KroneckerDeltaTile& arg); // Permutation operation // returns a tile for which result[perm ^ i] = tile[i] -template < - unsigned N, typename Perm, - typename = std::enable_if_t>> -KroneckerDeltaTile permute(const KroneckerDeltaTile& tile, - const Perm& perm) { +template >> +KroneckerDeltaTile permute(const KroneckerDeltaTile& tile, const Perm& perm) { abort(); } // dense_result[i] = dense_arg1[i] * sparse_arg2[i] -template -TiledArray::Tensor mult(const KroneckerDeltaTile<_N>& arg1, - const TiledArray::Tensor& arg2) { +template +Tensor mult(const KroneckerDeltaTile& arg1, const Tensor& arg2) { abort(); } // dense_result[perm ^ i] = dense_arg1[i] * sparse_arg2[i] -template < - typename T, unsigned _N, typename Perm, - typename = std::enable_if_t>> -TiledArray::Tensor mult(const KroneckerDeltaTile<_N>& arg1, - const TiledArray::Tensor& arg2, - const Perm& perm) { +template >> +Tensor mult(const KroneckerDeltaTile& arg1, const Tensor& arg2, + const Perm& perm) { abort(); } // dense_result[i] *= sparse_arg1[i] -template -TiledArray::Tensor& mult_to(TiledArray::Tensor& result, - const KroneckerDeltaTile& arg1) { +template +Tensor& mult_to(Tensor& result, const KroneckerDeltaTile& arg1) { abort(); return result; } // dense_result[i] = binary(dense_arg1[i], sparse_arg2[i], op) -template -TiledArray::Tensor binary(const KroneckerDeltaTile<_N>& arg1, - const TiledArray::Tensor& arg2, Op&& op) { +template 
+Tensor binary(const KroneckerDeltaTile& arg1, const Tensor& arg2, + Op&& op) { abort(); } // dense_result[perm ^ i] = binary(dense_arg1[i], sparse_arg2[i], op) -template < - typename T, unsigned _N, typename Op, typename Perm, - typename = std::enable_if_t>> -TiledArray::Tensor binary(const KroneckerDeltaTile<_N>& arg1, - const TiledArray::Tensor& arg2, Op&& op, - const Perm& perm) { +template >> +Tensor binary(const KroneckerDeltaTile& arg1, const Tensor& arg2, Op&& op, + const Perm& perm) { abort(); } -// Contraction operation +// Contraction operations // GEMM operation with fused indices as defined by gemm_config: -// dense_result[i,j] = dense_arg1[i,k] * sparse_arg2[k,j] -template -TiledArray::Tensor gemm( - const KroneckerDeltaTile& arg1, const TiledArray::Tensor& arg2, - const typename TiledArray::Tensor::numeric_type factor, - const TiledArray::math::GemmHelper& gemm_config) { +// dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] +template +void gemm(Tensor& result, const KroneckerDeltaTile& arg1, + const Tensor& arg2, const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { // preconditions: - // 1. implemented only outer product - assert(gemm_config.result_rank() == - gemm_config.left_rank() + gemm_config.right_rank()); + // 1. implemented only kronecker transform (every mode of arg2 is contracted + // with the matching mode of arg1) + TA_ASSERT((gemm_config.result_rank() == gemm_config.right_rank() && + gemm_config.left_rank() == + gemm_config.result_rank() + gemm_config.right_rank())); auto arg1_range = arg1.range(); auto arg2_range = arg2.range(); - auto result_range = - gemm_config.make_result_range(arg1_range, arg2_range); - TiledArray::Tensor result(result_range, 0); + // if result is empty, initialize it + const auto& result_range = + result.empty() + ? 
gemm_config.make_result_range(arg1_range, arg2_range) + : result.range(); + if (result.empty()) result = Tensor(result_range, 0); auto result_data = result.data(); auto arg1_extents = arg1_range.extent_data(); auto arg2_data = arg2.data(); auto arg2_volume = arg2_range.volume(); - if (not arg1.empty()) { - switch (N) { - case 1: { - auto i0_range = std::min(arg1_extents[0], arg1_extents[1]); - for (decltype(i0_range) i0 = 0; i0 != i0_range; ++i0) { - auto result_i0i0_ptr = - result_data + (i0 * arg1_extents[1] + i0) * arg2_volume; - std::copy(arg2_data, arg2_data + arg2_volume, result_i0i0_ptr); - } - } break; - case 2: { - auto i0_range = std::min(arg1_extents[0], arg1_extents[1]); - auto i1_range = std::min(arg1_extents[2], arg1_extents[3]); - auto ndim23 = arg1_extents[2] * arg1_extents[3]; - for (decltype(i0_range) i0 = 0; i0 != i0_range; ++i0) { - auto result_i0i0i1i1_ptr_offset = - result_data + (i0 * arg1_extents[1] + i0) * ndim23 * arg2_volume; - for (decltype(i1_range) i1 = 0; i1 != i1_range; ++i1) { - auto result_i0i0i1i1_ptr = - result_i0i0i1i1_ptr_offset + - (i1 * arg1_extents[3] + i1) * arg2_volume; - std::copy(arg2_data, arg2_data + arg2_volume, result_i0i0i1i1_ptr); - } - } - } break; - - default: - abort(); // not implemented - } - } + TA_ASSERT(!arg1.empty()); + const auto N = arg1.N(); + auto max = [&](const auto* v1, const auto* v2) { + TA::Index result(N); + for (auto i = 0; i != N; ++i) result[i] = std::max(v1[i], v2[i]); + return result; + }; + auto min = [&](const auto* v1, const auto* v2) { + TA::Index result(N); + for (auto i = 0; i != N; ++i) result[i] = std::min(v1[i], v2[i]); + return result; + }; + const auto read_lobound = + max(result_range.lobound_data(), arg2_range.lobound_data()); + const auto read_upbound = + min(result_range.upbound_data(), arg2_range.upbound_data()); + result.block(read_lobound, read_upbound) = + arg2.block(read_lobound, read_upbound); +} +// GEMM operation with fused indices as defined by gemm_config: +// dense_result[b0,..bN] = kronecker_arg1[b1,...bN,k1,...kN] * +// dense_arg2[k1,...kN] +template +Tensor gemm(const KroneckerDeltaTile& arg1, const Tensor& arg2, + const typename Tensor::numeric_type factor, + const math::GemmHelper& gemm_config) { + Tensor result; + gemm(result, arg1, arg2, factor, gemm_config); return result; } -// GEMM operation with fused indices as defined by gemm_config: -// dense_result[i,j] += dense_arg1[i,k] * sparse_arg2[k,j] -template -void gemm(TiledArray::Tensor& result, const KroneckerDeltaTile& arg1, - const TiledArray::Tensor& arg2, - const typename TiledArray::Tensor::numeric_type factor, - const TiledArray::math::GemmHelper& gemm_config) { - abort(); + +namespace detail { + +/// \brief computes shape data (i.e. Frobenius norms of the tiles) for a +/// DistArray of KroneckerDeltaTile +/// \param trange a TiledRange of the result +/// \return a Tensor containing the Frobenius norms of +/// the tiles of a DistArray of KroneckerDeltaTile's +/// \note Unlike diagonal_shape() which works for hyperdiagonal tensor with +/// `N` modes (`t(i,i,...i) = 1`), this works for product of `N` +/// Kroneckers (`t(i1,...iN,i1,...iN) = 1`, with `N` = `trange.rank() / 2`). 
+inline Tensor kronecker_shape(TiledRange const& trange) { + // preconditions + TA_ASSERT(trange.rank() % 2 == 0); + + Tensor shape(trange.tiles_range(), 0.0); + const auto N = trange.rank() / 2; + + // for every bra-ket pair of modes compute list of + // {bra tile index, ket tile index, number of nonzeros} + using bkn_type = std::tuple; + std::vector> bkns(N); + for (auto d = 0; d != N; ++d) { + auto& bkn = bkns[d]; + auto& bra_tr1 = trange.dim(d); + auto& ket_tr1 = trange.dim(d + N); + auto eidx = std::max(bra_tr1.elements_range().lobound(), + ket_tr1.elements_range().lobound()); + const auto eidx_fence = std::min(bra_tr1.elements_range().upbound(), + ket_tr1.elements_range().upbound()); + while (eidx < eidx_fence) { + const auto bra_tile_idx = bra_tr1.element_to_tile(eidx); + const auto& bra_tile = bra_tr1.tile(bra_tile_idx); + auto ket_tile_idx = ket_tr1.element_to_tile(eidx); + const auto& ket_tile = ket_tr1.tile(ket_tile_idx); + // closest tile boundary + const auto next_eidx = std::min(bra_tile.upbound(), ket_tile.upbound()); + bkn.emplace_back(bra_tile_idx, ket_tile_idx, next_eidx - eidx); + eidx = next_eidx; + } + } + + // number of nonzero tiles per mode + TA::Index nnz_tiles(N); + for (auto d = 0; d != N; ++d) nnz_tiles[d] = bkns[d].size(); + TA::Range nztiles_range(nnz_tiles); + TA::Index tile_idx(2 * N); + for (auto&& nztile : nztiles_range) { + std::size_t nnz_elements = 1; + for (auto d = 0; d != N; ++d) { + tile_idx[d] = std::get<0>(bkns[d][nztile[d]]); + tile_idx[d + N] = std::get<1>(bkns[d][nztile[d]]); + nnz_elements *= std::get<2>(bkns[d][nztile[d]]); + } + shape(tile_idx) = std::sqrt(nnz_elements); + } + + return shape; } +} // namespace detail + +} // namespace TiledArray + #endif // TILEDARRAY_TEST_SPARSE_TILE_H__INCLUDED diff --git a/src/TiledArray/tensor.h b/src/TiledArray/tensor.h index edb7ba2e47..20ecab9e0e 100644 --- a/src/TiledArray/tensor.h +++ b/src/TiledArray/tensor.h @@ -63,8 +63,8 @@ inline std::ostream& operator<<(std::ostream& os, const T& t) { os << t.range() << " { "; const auto n = t.range().volume(); std::size_t offset = 0ul; - const auto more_than_1_batch = t.batch_size() > 1; - for (auto b = 0ul; b != t.batch_size(); ++b) { + const auto more_than_1_batch = t.nbatch() > 1; + for (auto b = 0ul; b != t.nbatch(); ++b) { if (more_than_1_batch) { os << "[batch " << b << "]{ "; } diff --git a/src/TiledArray/tensor/complex.h b/src/TiledArray/tensor/complex.h index 33698521a2..676327427f 100644 --- a/src/TiledArray/tensor/complex.h +++ b/src/TiledArray/tensor/complex.h @@ -27,6 +27,7 @@ #define TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED #include +#include #include namespace TiledArray { @@ -80,30 +81,30 @@ TILEDARRAY_FORCE_INLINE auto inner_product(const L l, const R r) { return TiledArray::detail::conj(l) * r; } -/// Wrapper function for `std::norm` +/// Squared norm of a real number /// This function disables the call to `std::conj` for real values to /// prevent the result from being converted into a complex value. 
/// \tparam R A real scalar type /// \param r The real scalar
-/// \return `r`
+/// \return squared norm of `r`, i.e. `r*r`
template <typename R, typename std::enable_if<is_numeric_v<R> && !is_complex<R>::value>::type* = nullptr>
-TILEDARRAY_FORCE_INLINE R norm(const R r) {
+TILEDARRAY_FORCE_INLINE R squared_norm(const R r) {
return r * r; }
-/// Compute the norm of a complex number `z`
+/// Compute the squared norm of a complex number `z`
/// \f[
-/// {\rm norm}(z) = zz^* = {\rm Re}(z)^2 + {\rm Im}(z)^2
+/// {\rm norm}(z)^2 = zz^* = {\rm Re}(z)^2 + {\rm Im}(z)^2
/// \f]
/// \tparam R The scalar type /// \param z The complex scalar
-/// \return The complex conjugate of `z`
+/// \return squared norm of `z`
template <typename R>
-TILEDARRAY_FORCE_INLINE R norm(const std::complex<R> z) {
+TILEDARRAY_FORCE_INLINE R squared_norm(const std::complex<R> z) {
const R real = z.real(); const R imag = z.imag(); return real * real + imag * imag;
@@ -274,7 +275,46 @@ inline auto abs(const ComplexConjugate& a) { inline int abs(const ComplexConjugate& a) { return 1; }
+template >>
+TILEDARRAY_FORCE_INLINE auto operator*(const L l, const std::complex<R> r) {
+ return static_cast<R>(l) * r;
+}
+
+template >>
+TILEDARRAY_FORCE_INLINE auto operator*(const std::complex<L> l, const R r) {
+ return l * static_cast<L>(r);
+}
+
+template
+TILEDARRAY_FORCE_INLINE
+ std::enable_if_t, std::complex>
+ operator*(const L l, const std::complex<R> r) {
+ return std::complex<R>(l, 0.) * r;
+}
+
+template
+TILEDARRAY_FORCE_INLINE
+ std::enable_if_t, std::complex>
+ operator*(const std::complex<L> l, const R r) {
+ return l * std::complex<L>(r, 0.);
+}
+
} // namespace detail
+
+namespace conversions {
+
+template <typename T>
+struct to<T, std::complex<T>> {
+ T operator()(const std::complex<T>& v) {
+ TA_ASSERT(v.imag() == 0);
+ return v.real();
+ }
+};
+
+} // namespace conversions
+
} // namespace TiledArray
#endif // TILEDARRAY_SRC_TILEDARRAY_TENSOR_COMPLEX_H__INCLUDED
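The norm-to-squared_norm rename is worth pinning down, since std::norm already returns the squared magnitude and the old name suggested the 2-norm; a standalone illustration of the intended semantics (not the TiledArray code itself):

#include <cassert>
#include <complex>

// squared_norm(z) = z * conj(z) = Re(z)^2 + Im(z)^2, i.e. what std::norm
// computes; its square root is the modulus |z|.
template <typename R>
R squared_norm_demo(const std::complex<R> z) {
  return z.real() * z.real() + z.imag() * z.imag();
}

int main() {
  const std::complex<double> z{3.0, 4.0};
  assert(squared_norm_demo(z) == 25.0);  // |z| = 5
  return 0;
}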
diff --git a/src/TiledArray/tensor/kernels.h b/src/TiledArray/tensor/kernels.h
index c65d0e5c69..0b0767ed81 100644
--- a/src/TiledArray/tensor/kernels.h
+++ b/src/TiledArray/tensor/kernels.h
@@ -26,8 +26,11 @@ #ifndef TILEDARRAY_TENSOR_KENERLS_H__INCLUDED #define TILEDARRAY_TENSOR_KENERLS_H__INCLUDED
+#include
+#include
#include #include
+#include
namespace TiledArray {
@@ -36,6 +39,196 @@ class Tensor; namespace detail {
+// -------------------------------------------------------------------------
+// Tensor GEMM
+
+/// Contract two tensors
+
+/// GEMM is limited to matrix-like contractions. For example, the following
+/// contractions are supported:
+/// \code
+/// C[a,b] = A[a,i,j] * B[i,j,b]
+/// C[a,b] = A[a,i,j] * B[b,i,j]
+/// C[a,b] = A[i,j,a] * B[i,j,b]
+/// C[a,b] = A[i,j,a] * B[b,i,j]
+///
+/// C[a,b,c,d] = A[a,b,i,j] * B[i,j,c,d]
+/// C[a,b,c,d] = A[a,b,i,j] * B[c,d,i,j]
+/// C[a,b,c,d] = A[i,j,a,b] * B[i,j,c,d]
+/// C[a,b,c,d] = A[i,j,a,b] * B[c,d,i,j]
+/// \endcode
+/// Notice that in the above contractions, the inner and outer indices of
+/// the arguments form exactly two contiguous groups in each tensor and that
+/// each group is in the same order in all tensors. That is, the indices of
+/// the tensors must fit one of the following patterns:
+/// \code
+/// C[M...,N...] = A[M...,K...] * B[K...,N...]
+/// C[M...,N...] = A[M...,K...] * B[N...,K...]
+/// C[M...,N...] = A[K...,M...] * B[K...,N...]
+/// C[M...,N...] = A[K...,M...] * B[N...,K...]
+/// \endcode
+/// This allows use of optimized BLAS functions to evaluate tensor
+/// contractions. Tensor contractions that do not fit this pattern require
+/// one or more tensor permutations so that the tensors fit the required
+/// pattern.
+/// \tparam U The left-hand tensor element type
+/// \tparam AU The left-hand tensor allocator type
+/// \tparam V The right-hand tensor element type
+/// \tparam AV The right-hand tensor allocator type
+/// \tparam W The type of the scaling factor
+/// \param A The left-hand tensor that will be contracted
+/// \param B The right-hand tensor that will be contracted
+/// \param alpha The contraction result will be scaled by this value, then
+/// accumulated into \c C
+/// \param beta \c C is scaled by this value before the result is accumulated
+/// \param gemm_helper The GEMM operation metadata
+template
+void gemm(Alpha alpha, const Tensor& A, const Tensor& B,
+ Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) {
+ static_assert(!detail::is_tensor_of_tensor_v, Tensor,
+ Tensor>,
+ "TA::Tensor::gemm without custom element op is "
+ "only applicable to "
+ "plain tensors");
+ {
+ // Check that tensor C is not empty and has the correct rank
+ TA_ASSERT(!C.empty());
+ TA_ASSERT(C.range().rank() == gemm_helper.result_rank());
+
+ // Check that the arguments are not empty and have the correct ranks
+ TA_ASSERT(!A.empty());
+ TA_ASSERT(A.range().rank() == gemm_helper.left_rank());
+ TA_ASSERT(!B.empty());
+ TA_ASSERT(B.range().rank() == gemm_helper.right_rank());
+
+ TA_ASSERT(A.nbatch() == 1);
+ TA_ASSERT(B.nbatch() == 1);
+ TA_ASSERT(C.nbatch() == 1);
+
+ // Check that the outer dimensions of left match the corresponding
+ // dimensions in result
+ TA_ASSERT(gemm_helper.left_result_congruent(A.range().extent_data(),
+ C.range().extent_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.left_result_congruent(A.range().lobound_data(),
+ C.range().lobound_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.left_result_congruent(A.range().upbound_data(),
+ C.range().upbound_data()));
+
+ // Check that the outer dimensions of right match the corresponding
+ // dimensions in result
+ TA_ASSERT(gemm_helper.right_result_congruent(B.range().extent_data(),
+ C.range().extent_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.right_result_congruent(B.range().lobound_data(),
+ C.range().lobound_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.right_result_congruent(B.range().upbound_data(),
+ C.range().upbound_data()));
+
+ // Check that the inner dimensions of left and right match
+ TA_ASSERT(gemm_helper.left_right_congruent(A.range().extent_data(),
+ B.range().extent_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.left_right_congruent(A.range().lobound_data(),
+ B.range().lobound_data()));
+ TA_ASSERT(ignore_tile_position() ||
+ gemm_helper.left_right_congruent(A.range().upbound_data(),
+ B.range().upbound_data()));
+
+ // Compute gemm dimensions
+ using integer = TiledArray::math::blas::integer;
+ integer m, n, k;
+ gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range());
+
+ // Get the leading dimension for left and right matrices.
+ const integer lda = std::max(
+ integer{1},
+ (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m));
+ const integer ldb = std::max(
+ integer{1},
+ (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
n + : k)); + + // may need to split gemm into multiply + accumulate for tracing purposes +#ifdef TA_ENABLE_TILE_OPS_LOGGING + { + using numeric_type = typename Tensor::numeric_type; + using T = numeric_type; + const bool twostep = + TiledArray::TileOpsLogger::get_instance().gemm && + TiledArray::TileOpsLogger::get_instance().gemm_print_contributions; + std::unique_ptr data_copy; + size_t tile_volume; + if (twostep) { + tile_volume = C.range().volume(); + data_copy = std::make_unique(tile_volume); + std::copy(C.data(), C.data() + tile_volume, data_copy.get()); + } + non_distributed::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, + k, alpha, A.data(), lda, B.data(), ldb, + twostep ? numeric_type(0) : beta, C.data(), n); + + if (TiledArray::TileOpsLogger::get_instance_ptr() != nullptr && + TiledArray::TileOpsLogger::get_instance().gemm) { + auto& logger = TiledArray::TileOpsLogger::get_instance(); + auto apply = [](auto& fnptr, const Range& arg) { + return fnptr ? fnptr(arg) : arg; + }; + auto tformed_left_range = + apply(logger.gemm_left_range_transform, A.range()); + auto tformed_right_range = + apply(logger.gemm_right_range_transform, B.range()); + auto tformed_result_range = + apply(logger.gemm_result_range_transform, C.range()); + if ((!logger.gemm_result_range_filter || + logger.gemm_result_range_filter(tformed_result_range)) && + (!logger.gemm_left_range_filter || + logger.gemm_left_range_filter(tformed_left_range)) && + (!logger.gemm_right_range_filter || + logger.gemm_right_range_filter(tformed_right_range))) { + logger << "TA::Tensor::gemm+: left=" << tformed_left_range + << " right=" << tformed_right_range + << " result=" << tformed_result_range << std::endl; + if (TiledArray::TileOpsLogger::get_instance() + .gemm_print_contributions) { + if (!TiledArray::TileOpsLogger::get_instance() + .gemm_printer) { // default printer + // must use custom printer if result's range transformed + if (!logger.gemm_result_range_transform) + logger << C << std::endl; + else + logger << make_map(C.data(), tformed_result_range) << std::endl; + } else { + TiledArray::TileOpsLogger::get_instance().gemm_printer( + *logger.log, tformed_left_range, A.data(), + tformed_right_range, B.data(), tformed_right_range, C.data(), + C.nbatch()); + } + } + } + } + + if (twostep) { + for (size_t v = 0; v != tile_volume; ++v) { + C.data()[v] += data_copy[v]; + } + } + } +#else // TA_ENABLE_TILE_OPS_LOGGING + const integer ldc = std::max(integer{1}, n); + math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, + alpha, A.data(), lda, B.data(), ldb, beta, C.data(), ldc); +#endif // TA_ENABLE_TILE_OPS_LOGGING + } +} + /// customization point transform functionality to tensor class T, useful for /// nonintrusive extension of T to be usable as element type T in Tensor template @@ -60,13 +253,14 @@ struct transform; /// \param tensor1 The first argument tensor /// \param tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value>::type* = nullptr> + typename = std::enable_if_t< + detail::is_nested_tensor_v || + std::is_invocable_r_v>> inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... tensors) { if constexpr (std::is_invocable_r_v) { return std::forward(op)(tensor1, tensors...); } else { + static_assert(detail::is_nested_tensor_v); return TiledArray::detail::transform()(std::forward(op), tensor1, tensors...); } @@ -92,8 +286,7 @@ inline TR tensor_op(Op&& op, const T1& tensor1, const Ts&... 
tensors) { /// \param[in] tensors The remaining argument tensors template ::value || - is_tensor_of_tensor::value) && + is_nested_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1, const Ts&... tensors) { @@ -107,12 +300,14 @@ inline TR tensor_op(Op&& op, const Permutation& perm, const T1& tensor1, } /// provides transform functionality to class \p T, useful for nonintrusive -/// extension of a tensor type \p T to be usable as element type \p T in \c -/// Tensor \tparam T a tensor type \note The default implementation +/// extension of a tensor type \p T to be usable as element type \p T in +/// \c Tensor +/// \tparam T a tensor type +/// \note The default implementation /// constructs T, then computes it by coiterating over elements of the argument /// tensors and transforming with the transform \c Op . -/// This should be specialized for classes like TiledArray::Tensor that -/// already include the appropriate transform constructors already +/// This should be specialized for classes like TiledArray::Tensor that +/// already include the appropriate transform constructors already template struct transform { /// creates a result tensor in which element \c i is obtained by \c @@ -216,16 +411,21 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template ::value && + !is_tensor_v && is_contiguous_tensor::value>::type* = nullptr> inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto volume = result.range().volume(); - - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - inplace_tensor_op(op, result.at_ordinal(ord), tensors.at_ordinal(ord)...); + auto volume = result.total_size(); + for (decltype(volume) ord = 0; ord < volume; ++ord) { + if constexpr (is_tensor_of_tensor_v) + if (((tensors.data()[ord].range().volume() == 0) || ...)) continue; + if constexpr (std::is_invocable_r_v) + op(result.data()[ord], tensors.data()[ord]...); + else + inplace_tensor_op(op, result.data()[ord], tensors.data()[ord]...); } } @@ -283,29 +483,23 @@ inline void inplace_tensor_op(InputOp&& input_op, OutputOp&& output_op, /// \endcode /// The expected signature of the output /// operations is: -/// \code void op(TR::value_type::value_type*, const +/// \code +/// void op(TR::value_type::value_type*, const /// TR::value_type::value_type) /// \endcode -/// \tparam InputOp The input operation -/// type +/// \tparam InputOp The input operation type /// \tparam OutputOp The output operation type -/// \tparam TR The result tensor -/// type +/// \tparam TR The result tensor type /// \tparam T1 The first argument tensor type -/// \tparam Ts The remaining -/// argument tensor types +/// \tparam Ts The remaining argument tensor types /// \param[in] input_op The operation that is used to /// generate the output value from the input arguments -/// \param[in] output_op The -/// operation that is used to set the value of the result tensor given the -/// element pointer and the result value -/// \param[in] perm The permutation applied -/// to the argument tensors +/// \param[in] output_op The operation that is used to set the value +/// of the result tensor given the element pointer and the result value +/// \param[in] perm The permutation applied to the argument tensors /// \param[in,out] result The result tensor -/// \param[in] 
-/// tensor1 The first argument tensor -/// \param[in] tensors The remaining argument -/// tensors +/// \param[in] tensor1 The first argument tensor +/// \param[in] tensors The remaining argument tensors template (op), stride, - result.data() + result.range().ordinal(i), - (tensors.data() + tensors.range().ordinal(i))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(result, tensors...); + for (std::decay_t i = 0ul; i < volume; i += stride) + math::inplace_vector_op(std::forward(op), stride, + result.data() + result.range().ordinal(i), + (tensors.data() + tensors.range().ordinal(i))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + std::forward(op)( + result[idx], (tensors[idx - result_lobound + + signed_idx_t(tensors.range().lobound())])...); + } + } } /// In-place tensor of tensors operations with non-contiguous data @@ -384,20 +591,42 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto stride = inner_size(result, tensors...); const auto volume = result.range().volume(); - auto inplace_tensor_range = - [&op, stride]( - typename TR::pointer MADNESS_RESTRICT const result_data, - typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) - inplace_tensor_op(op, result_data[i], tensors_data[i]...); - }; - - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ord += stride) - inplace_tensor_range(result.data() + result.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(result, tensors...); + auto inplace_tensor_range = + [&op, stride]( + typename TR::pointer MADNESS_RESTRICT const result_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { + if constexpr (std::is_invocable_v< + std::remove_reference_t, + typename std::remove_reference_t::value_type&, + typename std::remove_reference_t< + Ts>::value_type const&...>) { + std::forward(op)(result_data[i], tensors_data[i]...); + } else { + inplace_tensor_op(op, result_data[i], tensors_data[i]...); + } + } + }; + + for (std::decay_t ord = 0ul; ord < volume; ord += stride) + inplace_tensor_range(result.data() + result.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + std::forward(op)( + result[idx], (tensors[idx - result_lobound + + signed_idx_t(tensors.range().lobound())])...); + } + } } // ------------------------------------------------------------------------- @@ -407,8 +636,9 @@ inline void inplace_tensor_op(Op&& op, TR& result, const Ts&... 
tensors) { /// Initialize tensor with contiguous tensor arguments /// This function initializes the \c i -th element of \c result with the result -/// of \c op(tensors[i]...) \pre The memory of \c tensor1 has been allocated but -/// not initialized. \tparam Op The element initialization operation type +/// of \c op(tensors[i]...) +/// \pre The memory of \c tensor1 has been allocated but not initialized. +/// \tparam Op The element initialization operation type /// \tparam TR The result tensor type /// \tparam Ts The argument tensor types /// \param[in] op The result tensor element initialization operation @@ -433,12 +663,11 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { tensors.data()...); } -/// Initialize tensor of tensors with contiguous tensor arguments +/// Initialize nested tensor with contiguous tensor arguments /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensors[i]...) -/// \pre The memory of \c tensor1 has been allocated but -/// not initialized. +/// \pre The memory of \c tensor1 has been allocated but not initialized. /// \tparam Op The element initialization operation type /// \tparam TR The result tensor type /// \tparam Ts The argument tensor types @@ -447,17 +676,21 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename... Ts, - typename std::enable_if::value && - is_contiguous_tensor::value>::type* = nullptr> + typename std::enable_if< + (is_nested_tensor::value && !is_tensor::value) && + is_contiguous_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { TA_ASSERT(!empty(result, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensors...)); - const auto volume = result.range().volume(); - - for (decltype(result.range().volume()) ord = 0ul; ord < volume; ++ord) { - new (result.data() + ord) typename TR::value_type( - tensor_op(op, tensors.at_ordinal(ord)...)); + if constexpr (std::is_invocable_r_v) { + result = std::forward(op)(tensors...); + } else { + const auto volume = result.total_size(); + for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { + new (result.data() + ord) typename TR::value_type( + tensor_op(op, (*(tensors.data() + ord))...)); + } } } @@ -467,21 +700,15 @@ inline void tensor_init(Op&& op, TR& result, const Ts&... tensors) { /// of \c op(tensor1[i], tensors[i]...) /// \pre The memory of \c result has been /// allocated but not initialized. -/// \tparam Op The element initialization -/// operation type +/// \tparam Op The element initialization operation type /// \tparam TR The result tensor type -/// \tparam T1 The first -/// argument tensor type +/// \tparam T1 The first argument tensor type /// \tparam Ts The argument tensor types -/// \param[in] op The -/// result tensor element initialization operation -/// \param[in] perm The -/// permutation that will be applied to tensor2 -/// \param[out] result The result -/// tensor +/// \param[in] op The result tensor element initialization operation +/// \param[in] perm The permutation that will be applied to tensor2 +/// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor -/// \param[in] tensors The -/// argument tensors +/// \param[in] tensors The argument tensors template < typename Op, typename TR, typename T1, typename... 
Ts, typename std::enable_if::value>::type* = nullptr> @@ -505,8 +732,7 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c result has been -/// allocated but not initialized. +/// \pre The memory of \c result has been allocated but not initialized. /// \tparam Op The element initialization operation type /// \tparam Perm A permutation type /// \tparam TR The result tensor type @@ -516,9 +742,10 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// \param[out] result The result tensor /// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors -template ::value>::type* = nullptr> +template < + typename Op, typename TR, typename T1, typename... Ts, + typename std::enable_if::value && + !is_tensor::value>::type* = nullptr> inline void tensor_init(Op&& op, const Permutation& perm, TR& result, const T1& tensor1, const Ts&... tensors) { TA_ASSERT(!empty(result, tensor1, tensors...)); @@ -546,18 +773,13 @@ inline void tensor_init(Op&& op, const Permutation& perm, TR& result, /// This function initializes the \c i -th element of \c result with the result /// of \c op(tensor1[i], tensors[i]...) -/// \pre The memory of \c tensor1 has been -/// allocated but not initialized. -/// \tparam Op The element initialization -/// operation type +/// \pre The memory of \c tensor1 has been allocated but not initialized. +/// \tparam Op The element initialization operation type /// \tparam T1 The result tensor type -/// \tparam Ts The argument -/// tensor types -/// \param[in] op The result tensor element initialization -/// operation +/// \tparam Ts The argument tensor types +/// \param[in] op The result tensor element initialization operation /// \param[out] result The result tensor -/// \param[in] tensor1 The first -/// argument tensor +/// \param[in] tensor1 The first argument tensor /// \param[in] tensors The argument tensors template < typename Op, typename TR, typename T1, typename... 
Ts, @@ -569,7 +791,6 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, TA_ASSERT(!empty(result, tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); auto wrapper_op = [&op](typename TR::pointer MADNESS_RESTRICT result_ptr, @@ -578,11 +799,27 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, new (result_ptr) typename T1::value_type(op(value1, values...)); }; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) - math::vector_ptr_op(wrapper_op, stride, result.data() + ord, - (tensor1.data() + tensor1.range().ordinal(ord)), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; + ord += stride) + math::vector_ptr_op(wrapper_op, stride, result.data() + ord, + (tensor1.data() + tensor1.range().ordinal(ord)), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + const signed_idx_t relidx = idx - result_lobound; + wrapper_op( + &(result[idx]), + tensor1[relidx + signed_idx_t(tensor1.range().lobound())], + (tensors[relidx + signed_idx_t(tensors.range().lobound())])...); + } + } } /// Initialize tensor with one or more non-contiguous tensor arguments @@ -591,13 +828,10 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, /// of \c op(tensor1[i],tensors[i]...) /// \pre The memory of \c tensor1 has been /// allocated but not initialized. -/// \tparam Op The element initialization -/// operation type +/// \tparam Op The element initialization operation type /// \tparam T1 The result tensor type -/// \tparam Ts The argument -/// tensor types -/// \param[in] op The result tensor element initialization -/// operation +/// \tparam Ts The argument tensor types +/// \param[in] op The result tensor element initialization operation /// \param[out] result The result tensor /// \param[in] tensor1 The first /// argument tensor @@ -612,24 +846,40 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, TA_ASSERT(!empty(result, tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(result, tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); const auto volume = tensor1.range().volume(); - auto inplace_tensor_range = - [&op, stride]( - typename TR::pointer MADNESS_RESTRICT const result_data, - typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, - typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) - new (result_data + i) - typename TR::value_type(tensor_op( - op, tensor1_data[i], tensors_data[i]...)); - }; - - for (decltype(volume) ord = 0ul; ord < volume; ord += stride) - inplace_tensor_range(result.data() + ord, - (tensor1.data() + tensor1.range().ordinal(ord)), - (tensors.data() + tensors.range().ordinal(ord))...); + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + auto inplace_tensor_range = + [&op, stride]( + typename TR::pointer MADNESS_RESTRICT const result_data, + typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (std::decay_t i = 0ul; i < stride; ++i) + new (result_data + i) + typename TR::value_type(tensor_op( + op, tensor1_data[i], tensors_data[i]...)); + }; + + for (std::decay_t ord = 0ul; ord < volume; ord += stride) + inplace_tensor_range(result.data() + ord, + (tensor1.data() + tensor1.range().ordinal(ord)), + (tensors.data() + tensors.range().ordinal(ord))...); + } else { + auto& result_rng = result.range(); + using signed_idx_t = Range::index_difference_type; + auto result_lobound = signed_idx_t(result_rng.lobound()); + for (auto&& idx : result_rng) { + using namespace container::operators; + const signed_idx_t relidx = idx - result_lobound; + + new (&(result[idx])) + typename TR::value_type(tensor_op( + op, tensor1[relidx + signed_idx_t(tensor1.range().lobound())], + (tensors[relidx + signed_idx_t(tensors.range().lobound())])...)); + } + } } // ------------------------------------------------------------------------- @@ -639,41 +889,45 @@ inline void tensor_init(Op&& op, TR& result, const T1& tensor1, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(result, &tensor1[i], -/// &tensors[i]...)) for each \c i in the index range of \c tensor1 . \c -/// result is initialized to \c identity . If HAVE_INTEL_TBB is defined, the -/// reduction will be executed in an undefined order, otherwise will execute in -/// the order of increasing \c i . -/// \tparam ReduceOp The element-wise reduction -/// operation type +/// &tensors[i]...)) for each \c i in the index range of \c tensor1 . +/// \c result is initialized to \c identity . If `HAVE_INTEL_TBB` is defined, +/// the reduction will be executed in an undefined order, otherwise will +/// execute in the order of increasing \c i . +/// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type -/// \tparam Scalar A -/// scalar type +/// \tparam Identity A type that can be used as an argument to ReduceOp /// \tparam T1 The first argument tensor type -/// \tparam Ts The -/// argument tensor types +/// \tparam Ts The argument tensor types /// \param reduce_op The element-wise reduction operation /// \param identity The initial value for the reduction and the result /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) template < - typename ReduceOp, typename JoinOp, typename Scalar, typename T1, + typename ReduceOp, typename JoinOp, typename Identity, typename T1, typename... 
Ts, typename std::enable_if_t< is_tensor::value && is_contiguous_tensor::value && - !is_reduce_op_v, std::decay_t, + !is_reduce_op_v, std::decay_t, std::decay_t, std::decay_t...>>* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Identity&& identity, + const T1& tensor1, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); + else + return tensor1.size(); + }(); - math::reduce_op(reduce_op, join_op, identity, volume, identity, + auto init = std::forward(identity); + math::reduce_op(std::forward(reduce_op), + std::forward(join_op), init, volume, init, tensor1.data(), tensors.data()...); - return identity; + return init; } /// Reduction operation for tensors @@ -698,8 +952,8 @@ template < is_tensor::value && is_contiguous_tensor::value && is_reduce_op_v, std::decay_t, std::decay_t, std::decay_t...>>* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, + const T1& tensor1, const Ts&... tensors) { reduce_op(identity, &tensor1, &tensors...); return identity; } @@ -708,10 +962,10 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// Perform reduction of the tensor-of-tensors' elements by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . This will execute serially, in the order of increasing \c i (each -/// element's reduction can however be executed in parallel, depending on the -/// element type). +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element's reduction can however be executed in parallel, +/// depending on the element type). /// \tparam ReduceOp The tensor-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type @@ -723,23 +977,30 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) -template ::value && is_contiguous_tensor::value>::type* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, - const T1& tensor1, const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + const Identity& identity, const T1& tensor1, + const Ts&... 
tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); + else + return tensor1.size(); + }(); auto result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; ++ord) { - auto temp = - tensor_reduce(reduce_op, join_op, identity, tensor1.at_ordinal(ord), - tensors.at_ordinal(ord)...); + for (std::remove_cv_t ord = 0ul; ord < volume; ++ord) { + if (tensor1.data()[ord].range().volume() == 0 + || ((tensors.data()[ord].range().volume() == 0) || ...)) continue; + auto temp = tensor_reduce(reduce_op, join_op, identity, tensor1.data()[ord], + tensors.data()[ord]...); join_op(result, temp); } @@ -750,10 +1011,10 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . This will execute serially, in the order of increasing \c i (each -/// element-wise reduction can however be executed in parallel, depending on the -/// element type). +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element-wise reduction can however be executed in parallel, +/// depending on the element type). /// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type @@ -765,28 +1026,49 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, Scalar identity, /// \param tensor1 The first tensor to be reduced /// \param tensors The other tensors to be reduced /// \return The reduced value of the tensor(s) -template ::value && !is_contiguous_tensor::value>::type* = nullptr> -Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - const Scalar identity, const T1& tensor1, - const Ts&... tensors) { +auto tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + const Identity& identity, const T1& tensor1, + const Ts&... 
tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); - const auto stride = inner_size(tensor1, tensors...); - const auto volume = tensor1.range().volume(); + const auto volume = [&tensor1]() { + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); + else + return tensor1.size(); + }(); - Scalar result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) { - Scalar temp = identity; - math::reduce_op(reduce_op, join_op, identity, stride, temp, - tensor1.data() + tensor1.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); - join_op(result, temp); + auto result = identity; + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + for (std::decay_t ord = 0ul; ord < volume; + ord += stride) { + auto temp = identity; + math::reduce_op(reduce_op, join_op, identity, stride, temp, + tensor1.data() + tensor1.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + join_op(result, temp); + } + } else { // if 1+ tensor lacks data() must iterate over individual elements + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the + // remaining tensors + auto& t1_rng = tensor1.range(); + using signed_idx_t = Range::index_difference_type; + auto t1_lobound = signed_idx_t(t1_rng.lobound()); + for (auto&& idx : t1_rng) { + using namespace container::operators; + signed_idx_t relidx = idx - t1_lobound; + reduce_op(result, tensor1[idx], + (tensors[idx - t1_lobound + + signed_idx_t(tensors.range().lobound())])...); + } } return result; @@ -796,10 +1078,11 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, /// Perform an element-wise reduction of the tensors by /// executing join_op(result, reduce_op(tensor1[i], tensors[i]...)) for -/// each \c i in the index range of \c tensor1 . \c result is initialized to \c -/// identity . This will execute serially, in the order of increasing \c i (each -/// element-wise reduction can however be executed in parallel, depending on the -/// element type). \tparam ReduceOp The element-wise reduction operation type +/// each \c i in the index range of \c tensor1 . \c result is initialized to +/// \c identity . This will execute serially, in the order of increasing +/// \c i (each element-wise reduction can however be executed in parallel, +/// depending on the element type). +/// \tparam ReduceOp The element-wise reduction operation type /// \tparam JoinOp The result operation type /// \tparam Scalar A scalar type /// \tparam T1 The first argument tensor type @@ -820,32 +1103,196 @@ Scalar tensor_reduce(ReduceOp&& reduce_op, JoinOp&& join_op, const Ts&... tensors) { TA_ASSERT(!empty(tensor1, tensors...)); TA_ASSERT(is_range_set_congruent(tensor1, tensors...)); + // TA_ASSERT(tensor1.nbatch() == 1); // todo: assert the same for the + // remaining tensors - const auto stride = inner_size(tensor1, tensors...); - const auto volume = tensor1.range().volume(); - - auto tensor_reduce_range = - [&reduce_op, &join_op, &identity, stride]( - Scalar& MADNESS_RESTRICT result, - typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, - typename Ts::const_pointer MADNESS_RESTRICT const... 
tensors_data) { - for (decltype(result.range().volume()) i = 0ul; i < stride; ++i) { - Scalar temp = tensor_reduce(reduce_op, join_op, identity, - tensor1_data[i], tensors_data[i]...); - join_op(result, temp); - } - }; + const auto volume = [&tensor1]() { + if constexpr (detail::has_total_size_v) + return tensor1.total_size(); + else + return tensor1.size(); + }(); Scalar result = identity; - for (decltype(tensor1.range().volume()) ord = 0ul; ord < volume; - ord += stride) { - Scalar temp = tensor_reduce_range( - result, tensor1.data() + tensor1.range().ordinal(ord), - (tensors.data() + tensors.range().ordinal(ord))...); - join_op(result, temp); + + if constexpr (detail::has_member_function_data_anyreturn_v && + (detail::has_member_function_data_anyreturn_v && ...)) { + const auto stride = inner_size(tensor1, tensors...); + auto tensor_reduce_range = + [&reduce_op, &join_op, &identity, stride]( + Scalar& MADNESS_RESTRICT result, + typename T1::const_pointer MADNESS_RESTRICT const tensor1_data, + typename Ts::const_pointer MADNESS_RESTRICT const... tensors_data) { + for (std::remove_cv_t i = 0ul; i < stride; ++i) { + Scalar temp = tensor_reduce(reduce_op, join_op, identity, + tensor1_data[i], tensors_data[i]...); + join_op(result, temp); + } + }; + + for (std::decay_t ord = 0ul; ord < volume; + ord += stride) { + Scalar temp = tensor_reduce_range( + result, tensor1.data() + tensor1.range().ordinal(ord), + (tensors.data() + tensors.range().ordinal(ord))...); + join_op(result, temp); + } + } else { // if 1+ tensor lacks data() must iterate over individual elements + auto& t1_rng = tensor1.range(); + using signed_idx_t = Range::index_difference_type; + auto t1_lobound = signed_idx_t(t1_rng.lobound()); + for (auto&& idx : t1_rng) { + using namespace container::operators; + signed_idx_t relidx = idx - t1_lobound; + + Scalar temp = + tensor_reduce(reduce_op, join_op, identity, tensor1[idx], + (tensors[idx - t1_lobound + + signed_idx_t(tensors.range().lobound())])...); + join_op(result, temp); + } } - return identity; + return result; +} + +/// +/// todo: constraint ResultTensorAllocator type so that non-sensical Allocators +/// are prohibited +/// +template && + is_annotation_v>> +auto tensor_contract(TensorA const& A, Annot const& aA, TensorB const& B, + Annot const& aB, Annot const& aC) { + using Result = result_tensor_t, TensorA, TensorB, + ResultTensorAllocator>; + + using Indices = ::Einsum::index::Index; + using Permutation = ::Einsum::index::Permutation; + using ::Einsum::index::permutation; + + // Check that the ranks of the tensors match that of the annotation. 
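Review note on the index algebra computed just below: the Hadamard (h), external (e), and internal (i) index sets fully determine how tensor_contract lowers to a single GEMM. The following standalone sketch is illustrative only; it substitutes std::set<std::string> for ::Einsum::index::Index and redefines the same set operators (& intersection, ^ symmetric difference, - difference) that the indices struct below relies on:

    #include <algorithm>
    #include <iterator>
    #include <set>
    #include <string>

    using Idx = std::set<std::string>;  // stand-in for ::Einsum::index::Index

    static Idx operator&(const Idx& a, const Idx& b) {  // intersection
      Idx r;
      std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                            std::inserter(r, r.end()));
      return r;
    }
    static Idx operator^(const Idx& a, const Idx& b) {  // symmetric difference
      Idx r;
      std::set_symmetric_difference(a.begin(), a.end(), b.begin(), b.end(),
                                    std::inserter(r, r.end()));
      return r;
    }
    static Idx operator-(const Idx& a, const Idx& b) {  // difference
      Idx r;
      std::set_difference(a.begin(), a.end(), b.begin(), b.end(),
                          std::inserter(r, r.end()));
      return r;
    }

    int main() {
      // A("i,k") * B("k,j") -> C("i,j")
      const Idx A{"i", "k"}, B{"k", "j"}, C{"i", "j"};
      const Idx h = A & B & C;    // {}    Hadamard indices (must be empty here)
      const Idx e = A ^ B;        // {i,j} external indices: they define C's range
      const Idx i = (A & B) - h;  // {k}   internal indices: summed over in the GEMM
      return (h.empty() && !e.empty()) ? 0 : 1;
    }

With h empty and e nonempty (enforced by the two TA_ASSERTs that follow), A and B are permuted into (external-of-A, internal) and (internal, external-of-B) layouts, so the whole contraction becomes one matrix multiplication over the fused index groups.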
+ TA_ASSERT(A.range().rank() == aA.size()); + TA_ASSERT(B.range().rank() == aB.size()); + + struct { + Indices // + A, // indices of A + B, // indices of B + C, // indices of C (target indices) + h, // Hadamard indices (aA intersection aB intersection aC) + e, // external indices (aA symmetric difference aB) + i; // internal indices ((aA intersection aB) set difference aC) + } const indices{aA, + aB, + aC, + (indices.A & indices.B & indices.C), + (indices.A ^ indices.B), + ((indices.A & indices.B) - indices.h)}; + + TA_ASSERT(!indices.h && "Hadamard indices not supported"); + TA_ASSERT(indices.e && "Dot product not supported"); + + struct { + Indices A, B, C; + } const blas_layout{(indices.A - indices.B) | indices.i, + indices.i | (indices.B - indices.A), indices.e}; + + struct { + Permutation A, B, C; + } const perm{permutation(indices.A, blas_layout.A), + permutation(indices.B, blas_layout.B), + permutation(indices.C, blas_layout.C)}; + + struct { + bool A, B, C; + } const do_perm{indices.A != blas_layout.A, indices.B != blas_layout.B, + indices.C != blas_layout.C}; + + math::GemmHelper gemm_helper{blas::Op::NoTrans, blas::Op::NoTrans, + static_cast(indices.e.size()), + static_cast(indices.A.size()), + static_cast(indices.B.size())}; + + // initialize result with the correct extents + Result result; + { + using Index = typename Indices::value_type; + using Extent = std::remove_cv_t< + typename decltype(std::declval().extent())::value_type>; + using ExtentMap = ::Einsum::index::IndexMap; + + // Map tensor indices to their extents. + // Note that whether the contracting indices have matching extents is + // implicitly checked here by the pipe(|) operator on ExtentMap. + + ExtentMap extent = (ExtentMap{indices.A, A.range().extent()} | + ExtentMap{indices.B, B.range().extent()}); + + container::vector rng; + rng.reserve(indices.e.size()); + for (auto&& ix : indices.e) { + // assuming ix _exists_ in extent + rng.emplace_back(extent[ix]); + } + result = Result{TA::Range(rng)}; + } + + using Numeric = typename Result::numeric_type; + + // call gemm + gemm(Numeric{1}, // + do_perm.A ? A.permute(perm.A) : A, // + do_perm.B ? B.permute(perm.B) : B, // + Numeric{0}, result, gemm_helper); + + return do_perm.C ? 
result.permute(perm.C.inv()) : result; +} + +template && + is_annotation_v>> +auto tensor_hadamard(TensorA const& A, Annot const& aA, TensorB const& B, + Annot const& aB, Annot const& aC) { + using ::Einsum::index::Permutation; + using ::Einsum::index::permutation; + using Indices = ::Einsum::index::Index; + + struct { + Permutation // + AB, // permutes A to B + AC, // permutes A to C + BC; // permutes B to C + } const perm{permutation(Indices(aA), Indices(aB)), + permutation(Indices(aA), Indices(aC)), + permutation(Indices(aB), Indices(aC))}; + + struct { + bool no_perm, perm_to_c, perm_a, perm_b; + } const do_this{ + perm.AB.is_identity() && perm.AC.is_identity() && perm.BC.is_identity(), + perm.AB.is_identity(), // + perm.BC.is_identity(), // + perm.AC.is_identity()}; + + if (do_this.no_perm) { + return A.mult(B); + } else if (do_this.perm_to_c) { + return A.mult(B, perm.AC); + } else if (do_this.perm_a) { + auto pA = A.permute(perm.AC); + pA.mult_to(B); + return pA; + } else if (do_this.perm_b) { + auto pB = B.permute(perm.BC); + pB.mult_to(A); + return pB; + } else { + auto pA = A.permute(perm.AC); + return pA.mult_to(B.permute(perm.BC)); + } } } // namespace detail diff --git a/src/TiledArray/tensor/operators.h b/src/TiledArray/tensor/operators.h index f7c7a5f2ae..b8ed77671d 100644 --- a/src/TiledArray/tensor/operators.h +++ b/src/TiledArray/tensor/operators.h @@ -41,11 +41,10 @@ namespace TiledArray { /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] + right[i] template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator+(const T1& left, const T2& right) { - return add(left, right); + typename = std::enable_if_t, detail::remove_cvr_t>>> +inline decltype(auto) operator+(T1&& left, T2&& right) { + return add(std::forward(left), std::forward(right)); } /// Tensor minus operator @@ -57,11 +56,10 @@ inline auto operator+(const T1& left, const T2& right) { /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] - right[i] template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator-(const T1& left, const T2& right) { - return subt(left, right); + typename = std::enable_if_t, detail::remove_cvr_t>>> +inline decltype(auto) operator-(T1&& left, T2&& right) { + return subt(std::forward(left), std::forward(right)); } /// Tensor multiplication operator @@ -72,12 +70,12 @@ inline auto operator-(const T1& left, const T2& right) { /// \param left The left-hand tensor argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left[i] * right[i] -template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator*(const T1& left, const T2& right) { - return mult(left, right); +template < + typename T1, typename T2, + typename std::enable_if, detail::remove_cvr_t>>::type* = nullptr> +inline decltype(auto) operator*(T1&& left, T2&& right) { + return mult(std::forward(left), std::forward(right)); } /// Create a copy of \c left that is scaled by \c right @@ -89,11 +87,11 @@ inline auto operator*(const T1& left, const T2& right) { /// \param right The right-hand scalar argument /// \return A tensor where element \c i is equal to left[i] * right template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator*(const T& left, N right) { - return scale(left, right); + typename 
std::enable_if< + detail::is_nested_tensor_v> && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator*(T&& left, N right) { + return scale(std::forward(left), right); } /// Create a copy of \c right that is scaled by \c left @@ -103,13 +101,13 @@ inline auto operator*(const T& left, N right) { /// \param left The left-hand scalar argument /// \param right The right-hand tensor argument /// \return A tensor where element \c i is equal to left * right[i] -template && - (detail::is_tensor::value || - detail::is_tensor_of_tensor::value)>::type* = nullptr> -inline auto operator*(N left, const T& right) { - return scale(right, left); +template < + typename N, typename T, + typename std::enable_if< + detail::is_numeric_v && + detail::is_nested_tensor_v>>::type* = nullptr> +inline decltype(auto) operator*(N left, T&& right) { + return scale(std::forward(right), left); } /// Create a negated copy of \c arg @@ -117,11 +115,12 @@ inline auto operator*(N left, const T& right) { /// \tparam T The element type of \c arg /// \param arg The argument tensor /// \return A tensor where element \c i is equal to \c -arg[i] -template ::value || - detail::is_tensor_of_tensor< - T>::value>::type* = nullptr> -inline auto operator-(const T& arg) -> decltype(arg.neg()) { - return neg(arg); +template >::value || + detail::is_tensor_of_tensor< + detail::remove_cvr_t>::value>::type* = nullptr> +inline decltype(auto) operator-(T&& arg) { + return neg(std::forward(arg)); } /// Create a permuted copy of \c arg @@ -129,11 +128,12 @@ inline auto operator-(const T& arg) -> decltype(arg.neg()) { /// \tparam T The argument tensor type /// \param perm The permutation to be applied to \c arg /// \param arg The argument tensor to be permuted -template ::value || - detail::is_tensor_of_tensor< - T>::value>::type* = nullptr> -inline auto operator*(const Permutation& perm, const T& arg) { - return permute(arg, perm); +template >::value || + detail::is_tensor_of_tensor< + detail::remove_cvr_t>::value>::type* = nullptr> +inline decltype(auto) operator*(const Permutation& perm, T&& arg) { + return permute(std::forward(arg), perm); } /// Tensor plus operator @@ -146,10 +146,11 @@ inline auto operator*(const Permutation& perm, const T& arg) { /// \return A tensor where element \c i is equal to left[i] + right[i] template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator+=(T1& left, const T2& right) { - return add_to(left, right); + detail::is_tensor, T2>::value || + detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator+=(T1&& left, const T2& right) { + return add_to(std::forward(left), right); } /// Tensor minus operator @@ -162,10 +163,11 @@ inline auto operator+=(T1& left, const T2& right) { /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator-=(T1& left, const T2& right) { - return sub_to(left, right); + detail::is_tensor, T2>::value || + detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator-=(T1&& left, const T2& right) { + return subt_to(std::forward(left), right); } /// In place tensor multiplication @@ -178,10 +180,11 @@ inline auto operator-=(T1& left, const T2& right) { /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value>::type* = nullptr> -inline auto operator*=(T1& left, const T2& right) { - return mult_to(left, right); + detail::is_tensor, T2>::value || + 
detail::is_tensor_of_tensor, + T2>::value>::type* = nullptr> +inline decltype(auto) operator*=(T1&& left, const T2& right) { + return mult_to(std::forward(left), right); } /// In place tensor add constant @@ -193,11 +196,12 @@ inline auto operator*=(T1& left, const T2& right) { /// \param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator+=(T& left, N right) { - return add_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator+=(T&& left, N right) { + return add_to(std::forward(left), right); } /// In place tensor subtract constant @@ -209,11 +213,12 @@ inline auto operator+=(T& left, N right) { /// \param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator-=(T& left, N right) { - return subt_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator-=(T&& left, N right) { + return subt_to(std::forward(left), right); } /// In place tensor scale @@ -225,11 +230,12 @@ inline auto operator-=(T& left, N right) { /// \param right The right-hand scalar argument /// \return A reference to \c left template ::value || - detail::is_tensor_of_tensor::value) && - detail::is_numeric_v>::type* = nullptr> -inline auto operator*=(T& left, N right) { - return scale_to(left, right); + typename std::enable_if< + (detail::is_tensor>::value || + detail::is_tensor_of_tensor>::value) && + detail::is_numeric_v>::type* = nullptr> +inline decltype(auto) operator*=(T&& left, N right) { + return scale_to(std::forward(left), right); } } // namespace TiledArray diff --git a/src/TiledArray/tensor/permute.h b/src/TiledArray/tensor/permute.h index 1b888e3a3d..4d46907172 100644 --- a/src/TiledArray/tensor/permute.h +++ b/src/TiledArray/tensor/permute.h @@ -97,10 +97,14 @@ inline void fuse_dimensions(SizeType* MADNESS_RESTRICT const fused_size, /// The expected signature of the input operations is: /// \code -/// Result::value_type input_op(const Arg0::value_type, const -/// Args::value_type...) \endcode The expected signature of the output -/// operations is: \code void output_op(Result::value_type*, const -/// Result::value_type) \endcode \tparam InputOp The input operation type +/// Result::value_type input_op(const Arg0::value_type, +/// const Args::value_type...) 
+/// \endcode +/// The expected signature of the output +/// operations is: +/// \code void output_op(Result::value_type*, const Result::value_type) +/// \endcode +/// \tparam InputOp The input operation type /// \tparam OutputOp The output operation type /// \tparam Result The result tensor type /// \tparam Arg0 The first tensor argument type @@ -123,6 +127,9 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, const unsigned int ndim1 = ndim - 1; const auto volume = arg0.range().volume(); + // handle the corner case of empty result/args + if (volume == 0) return; + // Get pointer to arg extent const auto* MADNESS_RESTRICT const arg0_extent = arg0.range().extent_data(); @@ -146,13 +153,13 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, }; // Permute the data - for (typename Result::ordinal_type index = 0ul; index < volume; - index += block_size) { - const typename Result::ordinal_type perm_index = perm_index_op(index); + for (typename Result::ordinal_type ord = 0ul; ord < volume; + ord += block_size) { + const typename Result::ordinal_type perm_ord = perm_index_op(ord); // Copy the block - math::vector_ptr_op(op, block_size, result.data() + perm_index, - arg0.data() + index, (args.data() + index)...); + math::vector_ptr_op(op, block_size, result.data() + perm_ord, + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } else { @@ -186,16 +193,16 @@ inline void permute(InputOp&& input_op, OutputOp&& output_op, Result& result, // Copy data from the input to the output matrix via a series of matrix // transposes. for (typename Result::ordinal_type i = 0ul; i < other_fused_size[0]; ++i) { - typename Result::ordinal_type index = i * other_fused_weight[0]; + typename Result::ordinal_type ord = i * other_fused_weight[0]; for (typename Result::ordinal_type j = 0ul; j < other_fused_size[2]; - ++j, index += other_fused_weight[2]) { + ++j, ord += other_fused_weight[2]) { // Compute the ordinal index of the input and output matrices. - typename Result::ordinal_type perm_index = perm_index_op(index); + typename Result::ordinal_type perm_ord = perm_index_op(ord); math::transpose(input_op, output_op, other_fused_size[1], other_fused_size[3], result_outer_stride, - result.data() + perm_index, other_fused_weight[1], - arg0.data() + index, (args.data() + index)...); + &result.at_ordinal(perm_ord), other_fused_weight[1], + &arg0.at_ordinal(ord), &args.at_ordinal(ord)...); } } } diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h index 1f09b92701..a394594b8e 100644 --- a/src/TiledArray/tensor/tensor.h +++ b/src/TiledArray/tensor/tensor.h @@ -22,7 +22,8 @@ #include "TiledArray/config.h" -#include "TiledArray/host/allocator.h" +#include "TiledArray/external/umpire.h" +#include "TiledArray/host/env.h" #include "TiledArray/math/blas.h" #include "TiledArray/math/gemm_helper.h" @@ -36,11 +37,6 @@ namespace TiledArray { -template -void gemm(Alpha alpha, const Tensor& A, const Tensor& B, - Beta beta, Tensor& C, const math::GemmHelper& gemm_helper); - namespace detail { /// Signals that we can take the trace of a Tensor (for numeric \c T) @@ -60,9 +56,9 @@ template class Tensor { // meaningful error if T& is not assignable, see // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=48101 - static_assert( - std::is_assignable, T>::value, - "Tensor: T must be an assignable type (e.g. cannot be const)"); + static_assert(std::is_assignable, T>::value, + "Tensor: T must be an assignable type (e.g. 
" + "cannot be const)"); #ifdef TA_TENSOR_MEM_TRACE template @@ -80,16 +76,17 @@ class Tensor { typedef typename range_type::ordinal_type size_type; ///< Size type (to meet the container concept) typedef Allocator allocator_type; ///< Allocator type - typedef - typename allocator_type::value_type value_type; ///< Array element type - typedef - typename allocator_type::reference reference; ///< Element reference type - typedef typename allocator_type::const_reference - const_reference; ///< Element reference type - typedef typename allocator_type::pointer pointer; ///< Element pointer type - typedef typename allocator_type::const_pointer + typedef typename std::allocator_traits::value_type + value_type; ///< Array element type + typedef std::add_lvalue_reference_t + reference; ///< Element (lvalue) reference type + typedef std::add_lvalue_reference_t> + const_reference; ///< Element (const lvalue) reference type + typedef typename std::allocator_traits::pointer + pointer; ///< Element pointer type + typedef typename std::allocator_traits::const_pointer const_pointer; ///< Element const pointer type - typedef typename allocator_type::difference_type + typedef typename std::allocator_traits::difference_type difference_type; ///< Difference type typedef pointer iterator; ///< Element iterator type typedef const_pointer const_iterator; ///< Element const iterator type @@ -99,6 +96,8 @@ class Tensor { scalar_type; ///< the scalar type that supports T private: + template + using value_t = typename X::value_type; template using numeric_t = typename TiledArray::detail::numeric_type::type; @@ -108,16 +107,45 @@ class Tensor { detail::is_tensor_of_tensor::value; }; + public: + /// compute type of Tensor with different element type + template ::template rebind_alloc> + using rebind_t = Tensor; + + template + struct rebind_numeric; + template + struct rebind_numeric::value>> { + using VU = typename V::template rebind_numeric::type; + using type = Tensor::template rebind_alloc>; + }; + template + struct rebind_numeric::value>> { + using type = Tensor< + U, typename std::allocator_traits::template rebind_alloc>; + }; + + /// compute type of Tensor with different numeric type + template + using rebind_numeric_t = typename rebind_numeric::type; + + private: using default_construct = bool; - Tensor(const range_type& range, size_t batch_size, bool default_construct) - : range_(range), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(const range_type& range, size_t nbatch, bool default_construct) + : range_(range), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); - if (default_construct) { - std::uninitialized_default_construct_n(ptr, size); - // std::uninitialized_value_construct_n(ptr, size); + // default construct elements of data only if can have any effect ... + if constexpr (!std::is_trivially_default_constructible_v) { + // .. 
and requested + if (default_construct) { + std::uninitialized_default_construct_n(ptr, size); + } } auto deleter = [ #ifdef TA_TENSOR_MEM_TRACE @@ -137,7 +165,7 @@ class Tensor { #endif allocator.deallocate(ptr, size); }; - this->data_ = std::shared_ptr(ptr, std::move(deleter)); + this->data_ = std::shared_ptr(ptr, std::move(deleter)); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -148,14 +176,17 @@ class Tensor { #endif } - Tensor(range_type&& range, size_t batch_size, bool default_construct) - : range_(std::move(range)), batch_size_(batch_size) { - size_t size = range_.volume() * batch_size; + Tensor(range_type&& range, size_t nbatch, bool default_construct) + : range_(std::move(range)), nbatch_(nbatch) { + size_t size = range_.volume() * nbatch; allocator_type allocator; auto* ptr = allocator.allocate(size); - if (default_construct) { - std::uninitialized_default_construct_n(ptr, size); - // std::uninitialized_value_construct_n(ptr, size); + // default construct elements of data only if can have any effect ... + if constexpr (!std::is_trivially_default_constructible_v) { + // .. and requested + if (default_construct) { + std::uninitialized_default_construct_n(ptr, size); + } } auto deleter = [ #ifdef TA_TENSOR_MEM_TRACE @@ -175,7 +206,7 @@ class Tensor { #endif allocator.deallocate(ptr, size); }; - this->data_ = std::shared_ptr(ptr, std::move(deleter)); + this->data_ = std::shared_ptr(ptr, std::move(deleter)); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -186,11 +217,25 @@ class Tensor { #endif } + template + static decltype(auto) value_converter(const T_& arg) { + using arg_type = detail::remove_cvr_t; + if constexpr (detail::is_tensor_v) // clone nested tensors + return arg.clone(); + else if constexpr (!std::is_same_v) { // convert + if constexpr (std::is_convertible_v) + return static_cast(arg); + else + return conversions::to()(arg); + } else + return arg; + }; + range_type range_; ///< Range /// Number of `range_`-sized blocks in `data_` /// \note this is not used for (in)equality comparison - size_t batch_size_ = 1; - std::shared_ptr data_; ///< Shared pointer to the data + size_t nbatch_ = 1; + std::shared_ptr data_; ///< Shared pointer to the data public: /// constructs an empty (null) Tensor @@ -203,9 +248,7 @@ class Tensor { /// \post `*this` is a shallow copy of \p other , /// i.e. `*this == other && this->data()==other.data()` Tensor(const Tensor& other) - : range_(other.range_), - batch_size_(other.batch_size_), - data_(other.data_) { + : range_(other.range_), nbatch_(other.nbatch_), data_(other.data_) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( @@ -223,7 +266,7 @@ class Tensor { /// \post `other.empty()` Tensor(Tensor&& other) : range_(std::move(other.range_)), - batch_size_(std::move(other.batch_size_)), + nbatch_(std::move(other.nbatch_)), data_(std::move(other.data_)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -247,12 +290,15 @@ class Tensor { } /// Construct a tensor with a range equal to \c range. The data is - /// uninitialized. + /// default-initialized (which, for `T` with trivial default constructor, + /// means data is uninitialized). 
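Review note: both private constructors above share one allocation pattern. Here is a condensed standalone sketch of it; allocate_batch is a hypothetical name, the TA_TENSOR_MEM_TRACE bookkeeping is omitted, and the deleter is simplified to destroy-then-deallocate (it assumes all size elements are alive when the last owner releases the block):

    #include <memory>
    #include <type_traits>

    template <typename T, typename Allocator = std::allocator<T>>
    std::shared_ptr<T[]> allocate_batch(std::size_t volume, std::size_t nbatch,
                                        bool default_construct) {
      Allocator allocator;
      const std::size_t size = volume * nbatch;  // one range-sized block per batch
      T* ptr = std::allocator_traits<Allocator>::allocate(allocator, size);
      // default-construct elements only if that can have an observable effect
      // (e.g. T is itself a Tensor) and only if the caller asked for it
      if constexpr (!std::is_trivially_default_constructible_v<T>) {
        if (default_construct) std::uninitialized_default_construct_n(ptr, size);
      }
      auto deleter = [size](T* p) {
        std::destroy_n(p, size);  // no-op for trivially destructible T
        Allocator a;
        std::allocator_traits<Allocator>::deallocate(a, p, size);
      };
      return std::shared_ptr<T[]>(ptr, std::move(deleter));
    }

For T = double the if constexpr branch compiles away and the storage is handed out uninitialized, which is what the doxygen note above means; for a nested element type such as T = Tensor<double> every inner tensor is default-constructed (to an empty tensor) so that later assignment is safe.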
/// \param range The range of the tensor
- explicit Tensor(const range_type& range)
- : Tensor(range, 1, default_construct{true}) {}
+ /// \param nbatch The number of batches (default is 1)
+ explicit Tensor(const range_type& range, size_type nbatch = 1)
+ : Tensor(range, nbatch, default_construct{true}) {}
- /// Construct a tensor with a fill value
+ /// Construct a tensor of tensor values, setting all elements to the same
+ /// value
/// \param range An array with the size of each dimension
/// \param value The value of the tensor elements
@@ -269,12 +315,14 @@ class Tensor {
new (data + i) value_type(cloner(value)); }
- /// Construct a tensor with a fill value
+ /// Construct a tensor of scalars, setting all elements to the same value
/// \param range An array with the size of each dimension
/// \param value The value of the tensor elements
- template >::type* = nullptr>
+ template &&
+ !detail::is_tensor::value>::type* =
+ nullptr>
Tensor(const range_type& range, const Value& value)
: Tensor(range, 1, default_construct{false}) {
detail::tensor_init([value]() -> Value { return value; }, *this); }
/// Construct a tensor with a fill op that takes an element index
- /// \tparam ElementIndexOp callable of signature `value_type(const
- /// Range::index_type&)` \param range An array with the size of of each
- /// dimension \param element_idx_op a callable of type ElementIndexOp
+ /// \tparam ElementIndexOp callable of signature
+ /// `value_type(const Range::index_type&)`
+ /// \param range An array with the size of each dimension
+ /// \param element_idx_op a callable of type ElementIndexOp
template >>
Tensor(const range_type& range, const ElementIndexOp& element_idx_op)
: Tensor(range, 1, default_construct{false}) {
- auto* data_ptr = data_.get();
+ pointer MADNESS_RESTRICT const data = this->data();
for (auto&& element_idx : range) {
- data_ptr[range.ordinal(element_idx)] = element_idx_op(element_idx);
+ const auto ord = range.ordinal(element_idx);
+ new (data + ord) value_type(element_idx_op(element_idx)); } }
@@ -304,8 +354,9 @@ class Tensor {
Tensor(const range_type& range, InIter it)
: Tensor(range, 1, default_construct{false}) {
auto n = range.volume();
- pointer MADNESS_RESTRICT const data = this->data();
- for (size_type i = 0ul; i < n; ++i, ++it) data[i] = *it;
+ pointer MADNESS_RESTRICT data = this->data();
+ for (size_type i = 0ul; i < n; ++i, ++it, ++data)
+ new (data) value_type(*it); }
template
@@ -314,7 +365,7 @@ class Tensor {
math::uninitialized_copy_vector(range.volume(), u, this->data()); }
- Tensor(const Range& range, std::initializer_list<T> il)
+ explicit Tensor(const Range& range, std::initializer_list<T> il)
: Tensor(range, il.begin()) {}
/// Construct a copy of a tensor interface object
/// \param other The tensor to be copied
/// \note this constructor is disabled if \p T1 already has a conversion
/// operator to this type
+ /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather
+ /// than copied to make the semantics of this constructor consistent
+ /// between tensors of scalars and tensors of tensors; specifically,
+ /// if `T1` is a tensor of scalars the constructed tensor is
+ /// independent of \p other, thus clone must be applied to inner
+ /// tensor nests to behave similarly for nested tensors
template <
typename T1,
typename std::enable_if<
!detail::has_conversion_operator_v>::type* = nullptr>
explicit Tensor(const T1& other) :
Tensor(detail::clone_range(other), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; - - detail::tensor_init(op, *this, other); + detail::tensor_init(value_converter, *this, other); } /// Construct a permuted tensor copy @@ -341,28 +396,44 @@ class Tensor { /// \tparam Perm A permutation type /// \param other The tensor to be copied /// \param perm The permutation that will be applied to the copy + /// \warning if `T1` is a tensor of tensors its elements are _cloned_ rather + /// than copied to make the semantics of this to be consistent + /// between tensors of scalars and tensors of tensors; specifically, + /// if `T1` is a tensor of scalars the constructed tensor is + /// is independent of \p other, thus should apply clone to inner + /// tensor nests to behave similarly for nested tensors template < typename T1, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& other, const Perm& perm) - : Tensor(outer(perm) * other.range(), 1, default_construct{false}) { - auto op = [](const numeric_t arg) -> numeric_t { return arg; }; - - detail::tensor_init(op, outer(perm), *this, other); + : Tensor(outer(perm) * other.range(), other.nbatch(), + default_construct{false}) { + const auto outer_perm = outer(perm); + if (outer_perm) { + detail::tensor_init(value_converter, outer_perm, + *this, other); + } else { + detail::tensor_init(value_converter, *this, + other); + } // If we actually have a ToT the inner permutation was not applied above so // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { - auto inner_perm = inner(perm); + const auto inner_perm = inner(perm); Permute p; - for (auto& x : *this) x = p(x, inner_perm); + + auto volume = total_size(); + for (decltype(volume) i = 0; i < volume; ++i) { + auto& el = *(data() + i); + if (!el.empty()) + el = p(el, inner_perm); + } } } } @@ -399,10 +470,8 @@ class Tensor { // If we actually have a ToT the inner permutation was not applied above so // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; - constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); + constexpr bool is_bperm = detail::is_bipartite_permutation_v; if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { auto inner_perm = inner(perm); @@ -421,7 +490,7 @@ class Tensor { /// \param right The right-hand tensor argument /// \param op The element-wise operation template ::value>::type* = nullptr> + typename = std::enable_if_t>> Tensor(const T1& left, const T2& right, Op&& op) : Tensor(detail::clone_range(left), 1, default_construct{false}) { detail::tensor_init(op, *this, left, right); @@ -439,7 +508,7 @@ class Tensor { /// \param perm The permutation that will be applied to the arguments template < typename T1, typename T2, typename Op, typename Perm, - typename std::enable_if::value && + typename std::enable_if::value && detail::is_permutation_v>::type* = nullptr> Tensor(const T1& left, const T2& right, Op&& op, const Perm& 
perm) : Tensor(outer(perm) * left.range(), 1, default_construct{false}) { @@ -447,10 +516,8 @@ class Tensor { // If we actually have a ToT the inner permutation was not applied above so // we do that now constexpr bool is_tot = detail::is_tensor_of_tensor_v; - constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); + constexpr bool is_bperm = detail::is_bipartite_permutation_v; if constexpr (is_tot && is_bperm) { if (inner_size(perm) != 0) { auto inner_perm = inner(perm); @@ -462,55 +529,77 @@ class Tensor { /// Construct a tensor with a range equal to \c range using existing data /// \param range The range of the tensor - /// \param batch_size The batch size + /// \param nbatch The number of batches /// \param data shared pointer to the data - Tensor(const range_type& range, size_t batch_size, - std::shared_ptr data) - : range_(range), batch_size_(batch_size), data_(data) { + Tensor(const range_type& range, size_t nbatch, + std::shared_ptr data) + : range_(range), nbatch_(nbatch), data_(std::move(data)) { #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { ptr_registry()->insert( - this, make_string("TA::Tensor(range, batch_size, data)::data_.get()=", + this, make_string("TA::Tensor(range, nbatch, data)::data_.get()=", data_.get())); } #endif } + /// Construct a tensor with a range equal to \c range using existing data + /// assuming unit batch size \param range The range of the tensor \param data + /// shared pointer to the data + Tensor(const range_type& range, std::shared_ptr data) + : range_(range), nbatch_(1), data_(std::move(data)) { +#ifdef TA_TENSOR_MEM_TRACE + if (nbytes() >= trace_if_larger_than_) { + ptr_registry()->insert( + this, + make_string("TA::Tensor(range, data)::data_.get()=", data_.get())); + } +#endif + } + /// The batch size accessor /// @return the size of tensor batch represented by `*this` - size_t batch_size() const { return this->batch_size_; } + size_t nbatch() const { return this->nbatch_; } /// @param[in] idx the batch index - /// @pre `idx < this->batch_size()` - /// @return (plain, i.e. batch_size=1) Tensor representing element \p idx of + /// @pre `idx < this->nbatch()` + /// @return (plain, i.e. 
nbatch=1) Tensor representing element \p idx of /// the batch Tensor batch(size_t idx) const { - TA_ASSERT(idx < this->batch_size()); - std::shared_ptr data(this->data_, - this->data_.get() + idx * this->size()); + TA_ASSERT(idx < this->nbatch()); + std::shared_ptr data(this->data_, + this->data_.get() + idx * this->size()); return Tensor(this->range(), 1, data); } /// Returns Tensor representing the data using another range and batch size /// @param[in] range the Range of the result - /// @param[in] batch_size the batch size of the result + /// @param[in] nbatch the number of batches of the result /// @return Tensor object representing `this->data()` using @p range and @p - /// batch_size - auto reshape(const range_type& range, size_t batch_size = 1) const { - TA_ASSERT(this->range().volume() * this->batch_size() == - range.volume() * batch_size); - return Tensor(range, batch_size, this->data_); + /// nbatch + auto reshape(const range_type& range, size_t nbatch = 1) const { + TA_ASSERT(this->range().volume() * this->nbatch() == + range.volume() * nbatch); + return Tensor(range, nbatch, this->data_); } /// @return a deep copy of `*this` Tensor clone() const { Tensor result; if (data_) { - result = detail::tensor_op( - [](const numeric_type value) -> numeric_type { return value; }, - *this); + if constexpr (detail::is_tensor_of_tensor_v) { + result = Tensor(*this, [](value_type const& el) { return el.clone(); }); + } else { + result = detail::tensor_op( + [](const numeric_type value) -> numeric_type { return value; }, + *this); + } + } else if (range_) { // corner case: data_ = null implies range_.volume() + // == 0; + TA_ASSERT(range_.volume() == 0); + result = Tensor(range_); } return result; } @@ -542,7 +631,7 @@ class Tensor { } #endif range_ = other.range_; - batch_size_ = other.batch_size_; + nbatch_ = other.nbatch_; data_ = other.data_; #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -575,7 +664,7 @@ class Tensor { } #endif range_ = std::move(other.range_); - batch_size_ = std::move(other.batch_size_); + nbatch_ = std::move(other.nbatch_); data_ = std::move(other.data_); #ifdef TA_TENSOR_MEM_TRACE if (nbytes() >= trace_if_larger_than_) { @@ -597,12 +686,16 @@ class Tensor { /// \return The number of elements in the tensor ordinal_type size() const { return (this->range().volume()); } + /// \return The number of elements in the tensor by summing up the sizes of + /// the batches. + ordinal_type total_size() const { return size() * nbatch(); } + /// Tensor data size (in bytes) accessor /// \return The number of bytes occupied by this tensor's data /// \warning this only returns valid value if this is a tensor of scalars std::size_t nbytes() const { - return this->range().volume() * this->batch_size_ * sizeof(T); + return this->range().volume() * this->nbatch_ * sizeof(T); } /// Const element accessor @@ -611,17 +704,17 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference operator[](const Ordinal ord) const { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) 
and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -632,17 +725,17 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference operator[](const Ordinal ord) { TA_ASSERT(!this->empty()); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator[](index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -653,12 +746,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> const_reference at_ordinal(const Ordinal ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -669,12 +762,12 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template ::value>* = nullptr> reference at_ordinal(const Ordinal ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); TA_ASSERT(this->range_.includes_ordinal(ord)); return this->data()[ord]; } @@ -685,12 +778,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -702,12 +795,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . 
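The deep-copy behavior introduced above (clone() of a tensor of tensors clones each inner tensor, matching the \warning on the converting constructor) can be seen in a minimal sketch; the fill pattern below is illustrative and assumes the element assignment and clone() members shown in this patch:

  TA::Tensor<TA::Tensor<double>> tot(TA::Range{2});  // rank-1 outer tensor
  tot[{0}] = TA::Tensor<double>(TA::Range{3}, 1.0);  // distinct inner tensors
  tot[{1}] = TA::Tensor<double>(TA::Range{3}, 2.0);
  auto copy = tot.clone();  // inner tensors are clone()d, not aliased
  copy[{0}][{0}] = 7.0;     // leaves tot's data untouched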
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -719,12 +812,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator[](const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -736,12 +829,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator[](const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -753,14 +846,14 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Const reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Ordinal& ord) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -774,14 +867,14 @@ class Tensor { /// \param[in] ord an ordinal index /// \return Reference to the element at position \c ord . /// \note This asserts (using TA_ASSERT) that this is not empty, \p ord is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Ordinal& ord) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); // can't distinguish between operator[](Index...) and operator[](ordinal) - // thus assume at_ordinal() if this->rank()==1 + // thus insist on at_ordinal() if this->rank()==1 TA_ASSERT(this->range_.rank() != 1 && "use Tensor::operator()(index) or " "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); @@ -795,12 +888,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . 
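All accessors in this block require `nbatch()==1`, and the ordinal overloads refuse rank-1 tensors because `operator[](ordinal)` cannot be told apart from `operator[](index)` there, hence the "insist on at_ordinal()" wording. A minimal sketch, assuming the usual fill constructor and row-major ordinals:

  TA::Tensor<double> t(TA::Range{4, 5}, 0.0);  // rank-2, nbatch()==1
  t(2, 3) = 1.0;                       // coordinate-pack access
  double x = t[{2, 3}];                // index (initializer_list) access
  double y = t.at_ordinal(2 * 5 + 3);  // explicit ordinal access

  TA::Tensor<double> v(TA::Range{7}, 0.0);  // rank-1
  // v[3] and v(3) TA_ASSERT here: ordinal vs index is ambiguous for rank-1
  double z = v.at_ordinal(3);          // the unambiguous spelling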
/// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const Index& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -812,12 +905,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const Index& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -829,12 +922,12 @@ class Tensor { /// \param[in] i an index /// \return Const reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> const_reference operator()(const std::initializer_list& i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -846,12 +939,12 @@ class Tensor { /// \param[in] i an index /// \return Reference to the element at position \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template >* = nullptr> reference operator()(const std::initializer_list& i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); const auto iord = this->range_.ordinal(i); TA_ASSERT(this->range_.includes_ordinal(iord)); return this->data()[iord]; @@ -864,14 +957,20 @@ class Tensor { /// \param[in] i an index \return Const reference to the element at position /// \c i . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -886,14 +985,20 @@ class Tensor { /// \param[in] i an index \return Reference to the element at position \c i /// . /// \note This asserts (using TA_ASSERT) that this is not empty, \p i is - /// included in the range, and `batch_size()==1` + /// included in the range, and `nbatch()==1` template < typename... 
Index, std::enable_if_t<(sizeof...(Index) > 1ul) && detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... i) { TA_ASSERT(!this->empty()); - TA_ASSERT(this->batch_size() == 1); + TA_ASSERT(this->nbatch() == 1); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range_.rank() != 1 && + "use Tensor::operator()(index) or " + "Tensor::at_ordinal(index_ordinal) if this->range().rank()==1"); using Int = std::common_type_t; const auto iord = this->range_.ordinal( std::array{{static_cast(i)...}}); @@ -903,7 +1008,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the first data element + /// \return A const iterator to the first data element const_iterator begin() const { return (this->data() ? this->data() : NULL); } /// Iterator factory @@ -913,7 +1018,7 @@ class Tensor { /// Iterator factory - /// \return An iterator to the last data element + /// \return A const iterator to the last data element const_iterator end() const { return (this->data() ? this->data() + this->size() : NULL); } @@ -923,6 +1028,30 @@ class Tensor { /// \return An iterator to the last data element iterator end() { return (this->data() ? this->data() + this->size() : NULL); } + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() const { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the first data element + const_iterator cbegin() { return (this->data() ? this->data() : NULL); } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() const { + return (this->data() ? this->data() + this->size() : NULL); + } + + /// Iterator factory + + /// \return A const iterator to the last data element + const_iterator cend() { + return (this->data() ? 
this->data() + this->size() : NULL); + } + /// Read-only access to the data /// \return A const pointer to the tensor data @@ -933,15 +1062,33 @@ class Tensor { /// \return A mutable pointer to the tensor data pointer data() { return this->data_.get(); } + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->nbatch()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + const_pointer batch_data(size_t batch_idx) const { + TA_ASSERT(batch_idx < this->nbatch()); + return data() + batch_idx * size(); + } + + /// @param[in] batch_idx the batch index + /// @pre `batch_idx < this->nbatch()` + /// @return A const pointer to the tensor data of the batch \p batch_idx + pointer batch_data(size_t batch_idx) { + TA_ASSERT(batch_idx < this->nbatch()); + return data() + batch_idx * size(); + } + /// Read-only shared_ptr to the data /// \return A const shared_ptr to the tensor data - std::shared_ptr data_shared() const { return this->data_; } + std::shared_ptr data_shared() const { + return this->data_; + } /// Mutable shared_ptr to the data /// \return A mutable shared_ptr to the tensor data - std::shared_ptr data_shared() { return this->data_; } + std::shared_ptr data_shared() { return this->data_; } /// Test if the tensor is empty @@ -952,9 +1099,9 @@ class Tensor { /// (`this->empty()` is equivalent to `*this == Tensor{}`), /// but is not identical /// to a default-constructed Tensor (e.g., `this->empty()` does not - /// imply `this->batch_size() == Tensor{}.batch_size()`) + /// imply `this->nbatch() == Tensor{}.nbatch()`) bool empty() const { - // empty data_ implies default values for range_ (but NOT batch_size_) + // empty data_ implies default values for range_ (but NOT nbatch_) TA_ASSERT( (this->data_.use_count() == 0 && !this->range_) || (this->data_.use_count() != 0 && this->range_)); // range is empty @@ -970,16 +1117,16 @@ class Tensor { void serialize(Archive& ar) { bool empty = this->empty(); auto range = this->range_; - auto batch_size = this->batch_size_; - ar& empty; + auto nbatch = this->nbatch_; + ar & empty; if (!empty) { - ar& range; - ar& batch_size; + ar & range; + ar & nbatch; if constexpr (madness::is_input_archive_v) { - *this = Tensor(std::move(range), batch_size, default_construct{true}); + *this = Tensor(std::move(range), nbatch, default_construct{true}); } ar& madness::archive::wrap(this->data_.get(), - this->range_.volume() * batch_size); + this->range_.volume() * nbatch); } else { if constexpr (madness::is_input_archive_v) { *this = Tensor{}; @@ -1008,7 +1155,7 @@ class Tensor { #endif std::swap(data_, other.data_); std::swap(range_, other.range_); - std::swap(batch_size_, other.batch_size_); + std::swap(nbatch_, other.nbatch_); #ifdef TA_TENSOR_MEM_TRACE if (other_to_be_traced) { ptr_registry()->insert( @@ -1137,7 +1284,8 @@ class Tensor { // clang-format on /// @{ template >> + typename = std::enable_if_t && + !std::is_same_v>> detail::TensorInterface block( const PairRange& bounds) const { return detail::TensorInterface( @@ -1145,7 +1293,8 @@ class Tensor { } template >> + typename = std::enable_if_t && + !std::is_same_v>> detail::TensorInterface block(const PairRange& bounds) { return detail::TensorInterface( BlockRange(this->range_, bounds), this->data()); @@ -1184,6 +1333,38 @@ class Tensor { } /// @} + // clang-format off + /// Constructs a view of the block defined by a TiledArray::Range . 
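The batch members above (`nbatch`, `batch`, `batch_data`, `reshape`, and the shared-pointer constructors) treat one allocation as `nbatch()` contiguous tensors over the same range. A sketch, assuming the shared-pointer constructor takes an array of `nbatch * range.volume()` elements (the exact pointer element type is an assumption here):

  const std::size_t nb = 3;  // 3 batches of 4x4 matrices, one buffer
  TA::Range r{4, 4};
  std::shared_ptr<double[]> buf(new double[nb * r.volume()]());
  TA::Tensor<double> t(r, nb, buf);

  assert(t.nbatch() == 3);
  assert(t.total_size() == nb * t.size());  // size() is per-batch volume
  auto t1 = t.batch(1);            // plain (nbatch()==1) alias of batch 1
  double* p1 = t.batch_data(1);    // == t.data() + 1 * t.size()
  auto flat = t.reshape(TA::Range{48});  // same data, rank 1, nbatch()==1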
+ + /// Examples of using this: + /// \code + /// std::vector lobounds = {0, 1, 2}; + /// std::vector upbounds = {4, 6, 8}; + /// + /// auto tview = t.block(TiledArray::Range(lobounds, upbounds)); + /// \endcode + /// \tparam PairRange Type representing a range of generalized pairs (see TiledArray::detail::is_gpair_v ) + /// \param bounds The block bounds + /// \return a {const,mutable} view of the block defined by its \p bounds + /// \throw TiledArray::Exception When the size of \p lower_bound is not + /// equal to that of \p upper_bound. + /// \throw TiledArray::Exception When `get<0>(bounds[i]) >= get<1>(bounds[i])` + // clang-format on + /// @{ + detail::TensorInterface block( + const Range& bounds) const { + return detail::TensorInterface( + BlockRange(this->range_, bounds.lobound(), bounds.upbound()), + this->data()); + } + + detail::TensorInterface block(const Range& bounds) { + return detail::TensorInterface( + BlockRange(this->range_, bounds.lobound(), bounds.upbound()), + this->data()); + } + /// @} + /// Create a permuted copy of this tensor /// \tparam Perm A permutation tile @@ -1192,33 +1373,7 @@ class Tensor { template >> Tensor permute(const Perm& perm) const { - constexpr bool is_tot = detail::is_tensor_of_tensor_v; - [[maybe_unused]] constexpr bool is_bperm = - detail::is_bipartite_permutation_v; - // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); - if constexpr (!is_tot) { - if constexpr (is_bperm) { - TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, outer(perm)); - } else - return Tensor(*this, perm); - } else { - // If we have a ToT we need to apply the permutation in two steps. 
The - // first step is identical to the non-ToT case (permute the outer modes) - // the second step does the inner modes - Tensor rv(*this, outer(perm)); - if constexpr (is_bperm) { - if (inner_size(perm) != 0) { - auto inner_perm = inner(perm); - Permute p; - for (auto& inner_t : rv) inner_t = p(inner_t, inner_perm); - } - } - return rv; - } - abort(); // unreachable + return Tensor(*this, perm); } /// Shift the lower and upper bound of this tensor @@ -1284,8 +1439,13 @@ class Tensor { /// \c op(*this[i],other[i]) template ::value>::type* = nullptr> - Tensor binary(const Right& right, Op&& op) const { - return Tensor(*this, right, op); + auto binary(const Right& right, Op&& op) const { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + using ResultTensor = Tensor; + return ResultTensor(*this, right, op); } /// Use a binary, element wise operation to construct a new, permuted tensor @@ -1294,33 +1454,40 @@ class Tensor { /// \tparam Op The binary operation type /// \tparam Perm A permutation tile /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i],other[i]) - template < - typename Right, typename Op, typename Perm, - typename std::enable_if::value && - detail::is_permutation_v>::type* = nullptr> - Tensor binary(const Right& right, Op&& op, const Perm& perm) const { - constexpr bool is_tot = detail::is_tensor_of_tensor_v; + template ::value && + detail::is_permutation_v< + std::remove_reference_t>>::type* = + nullptr> + auto binary(const Right& right, Op&& op, Perm&& perm) const { + using result_value_type = decltype(op( + std::declval(), std::declval&>())); + using result_allocator_type = typename std::allocator_traits< + Allocator>::template rebind_alloc; + using ResultTensor = Tensor; + // tile ops pass bipartite permutations here even if the result is a plain + // tensor [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; - // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); - if constexpr (!is_tot) { + constexpr bool result_is_tot = detail::is_tensor_of_tensor_v; + + if constexpr (!result_is_tot) { if constexpr (is_bperm) { - TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, right, op, outer(perm)); + TA_ASSERT(!inner(perm)); // ensure this is a plain permutation since + // ResultTensor is plain + return ResultTensor(*this, right, op, outer(std::forward(perm))); } else - return Tensor(*this, right, op, perm); + return ResultTensor(*this, right, op, std::forward(perm)); } else { // AFAIK the other branch fundamentally relies on raw pointer arithmetic, // which won't work for ToTs. 
auto temp = binary(right, std::forward(op)); - Permute p; - return p(temp, perm); + Permute p; + return p(temp, std::forward(perm)); } abort(); // unreachable } @@ -1330,7 +1497,7 @@ class Tensor { /// \tparam Right The right-hand tensor type /// \tparam Op The binary operation type /// \param right The right-hand argument in the binary operation - /// \param op The binary, element-wise operation + /// \param op The binary element-wise operation /// \return A reference to this object /// \throw TiledArray::Exception When this tensor is empty. /// \throw TiledArray::Exception When \c other is empty. @@ -1338,7 +1505,8 @@ class Tensor { /// to the range of \c other. /// \throw TiledArray::Exception When this and \c other are the same. template ::value>::type* = nullptr> + typename std::enable_if>::type* = + nullptr> Tensor& inplace_binary(const Right& right, Op&& op) { detail::inplace_tensor_op(op, *this, right); return *this; @@ -1347,7 +1515,7 @@ class Tensor { /// Use a unary, element wise operation to construct a new tensor /// \tparam Op The unary operation type - /// \param op The unary, element-wise operation + /// \param op The unary element-wise operation /// \return A tensor where element \c i of the new tensor is equal to /// \c op(*this[i]) /// \throw TiledArray::Exception When this tensor is empty. @@ -1360,31 +1528,31 @@ class Tensor { /// \tparam Op The unary operation type /// \tparam Perm A permutation tile - /// \param op The unary operation + /// \param op The unary element-wise operation /// \param perm The permutation to be applied to this tensor /// \return A permuted tensor with elements that have been modified by \c op /// \throw TiledArray::Exception When this tensor is empty. /// \throw TiledArray::Exception The dimension of \c perm does not match /// that of this tensor. 
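Note that `binary` above no longer hard-codes `Tensor<T>` as its result: the element type is deduced from the op's return type and the allocator is rebound accordingly, so a type-changing op just works. A sketch, assuming <complex> and <type_traits>:

  TA::Tensor<double> a(TA::Range{3, 3}, 2.0);
  TA::Tensor<double> b(TA::Range{3, 3}, 0.5);
  // the result's value_type is deduced from the lambda's return type
  auto c = a.binary(
      b, [](double l, double r) { return std::complex<double>(l, r); });
  static_assert(std::is_same_v<typename decltype(c)::value_type,
                               std::complex<double>>);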
template >> - Tensor unary(Op&& op, const Perm& perm) const { + typename = std::enable_if_t< + detail::is_permutation_v>>> + Tensor unary(Op&& op, Perm&& perm) const { constexpr bool is_tot = detail::is_tensor_of_tensor_v; [[maybe_unused]] constexpr bool is_bperm = detail::is_bipartite_permutation_v; // tile ops pass bipartite permutations here even if this is a plain tensor - // static_assert(is_tot || (!is_tot && !is_bperm), "Permutation type does - // not match Tensor"); if constexpr (!is_tot) { + if (empty()) return *this; if constexpr (is_bperm) { TA_ASSERT(inner_size(perm) == 0); // ensure this is a plain permutation - return Tensor(*this, op, outer(perm)); + return Tensor(*this, op, outer(std::forward(perm))); } else - return Tensor(*this, op, perm); + return Tensor(*this, op, std::forward(perm)); } else { auto temp = unary(std::forward(op)); Permute p; - return p(temp, perm); + return p(temp, std::forward(perm)); } abort(); // unreachable } @@ -1412,8 +1580,11 @@ class Tensor { template >::type* = nullptr> Tensor scale(const Scalar factor) const { - return unary( - [factor](const numeric_type a) -> numeric_type { return a * factor; }); + if (range().volume() == 0) return *this; + return unary([factor](const value_type& a) -> decltype(auto) { + using namespace TiledArray::detail; + return a * factor; + }); } /// Construct a scaled and permuted copy of this tensor @@ -1429,7 +1600,10 @@ class Tensor { detail::is_permutation_v>> Tensor scale(const Scalar factor, const Perm& perm) const { return unary( - [factor](const numeric_type a) -> numeric_type { return a * factor; }, + [factor](const numeric_type a) -> numeric_type { + using namespace TiledArray::detail; + return a * factor; + }, perm); } @@ -1442,12 +1616,12 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& scale_to(const Scalar factor) { return inplace_unary( - [factor](numeric_type& MADNESS_RESTRICT res) { res *= factor; }); + [factor](value_type& MADNESS_RESTRICT res) { res *= factor; }); } // Addition operations - /// Add this and \c other to construct a new tensors + /// Add this and \c other to construct a new tensor /// \tparam Right The right-hand tensor type /// \param right The tensor that will be added to this tensor @@ -1455,14 +1629,32 @@ class Tensor { /// \c this and \c other template ::value>::type* = nullptr> - Tensor add(const Right& right) const { + Tensor add(const Right& right) const& { + if (right.empty()) return *this; return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { + if constexpr (detail::is_tensor_v) { + if (l.empty() && r.empty()) + return value_type{}; + } return l + r; }); } + /// Add this and \c other to construct a new tensor + + /// \tparam Right The right-hand tensor type + /// \param right The tensor that will be added to this tensor + /// \return A new tensor where the elements are the sum of the elements of + /// \c this and \c other + template ::value>::type* = nullptr> + Tensor add(const Right& right) && { + add_to(right); + return std::move(*this); + } + /// Add this and \c other to construct a new, permuted tensor /// \tparam Right The right-hand tensor type @@ -1478,7 +1670,7 @@ class Tensor { Tensor add(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l + r; }, perm); @@ -1497,9 +1689,11 @@ class Tensor { typename 
std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor add(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }); } /// Scale and add this and \c other to construct a new, permuted tensor @@ -1519,8 +1713,9 @@ class Tensor { Tensor add(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l + r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l + r) * factor; + }, perm); } @@ -1557,8 +1752,12 @@ class Tensor { template ::value>::type* = nullptr> Tensor& add_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l += r; }); + if (right.empty()) return *this; + if (empty()) { + *this = Tensor{right.range(), value_type{}}; + } + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t r) { l += r; }); } /// Add \c other to this tensor, and scale the result @@ -1574,8 +1773,8 @@ class Tensor { detail::is_numeric_v>::type* = nullptr> Tensor& add_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l += r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t r) { (l += r) *= factor; }); } /// Add a constant to this tensor @@ -1596,11 +1795,11 @@ class Tensor { /// \return A new tensor where the elements are the different between the /// elements of \c this and \c right template ::value>::type* = nullptr> + typename = std::enable_if< + detail::tensors_have_equal_nested_rank_v>> Tensor subt(const Right& right) const { return binary( - right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + right, [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }); } @@ -1620,7 +1819,7 @@ class Tensor { Tensor subt(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_type& r) -> decltype(auto) { return l - r; }, perm); @@ -1640,9 +1839,11 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor subt(const Right& right, const Scalar factor) const { - return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }); + return binary( + right, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }); } /// Subtract \c right from this and return the result scaled by a scaling \c @@ -1663,8 +1864,9 @@ class Tensor { Tensor subt(const Right& right, const Scalar factor, const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l - r) * factor; }, + [factor](const value_type& l, const value_type& r) -> decltype(auto) { + return (l - r) * factor; + }, perm); } @@ -1695,8 +1897,8 @@ class Tensor { template ::value>::type* = nullptr> Tensor& subt_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l -= r; }); + return inplace_binary( + right, [](auto& MADNESS_RESTRICT 
l, const auto& r) { l -= r; }); } /// Subtract \c right from and scale this tensor @@ -1711,9 +1913,10 @@ class Tensor { typename std::enable_if::value && detail::is_numeric_v>::type* = nullptr> Tensor& subt_to(const Right& right, const Scalar factor) { - return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l -= r) *= factor; }); + return inplace_binary(right, + [factor](auto& MADNESS_RESTRICT l, const auto& r) { + (l -= r) *= factor; + }); } /// Subtract a constant from this tensor @@ -1730,13 +1933,20 @@ class Tensor { /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right template ::value>::type* = nullptr> - Tensor mult(const Right& right) const { - return binary( - right, - [](const numeric_type l, const numeric_t r) -> numeric_type { - return l * r; - }); + typename std::enable_if>::type* = + nullptr> + decltype(auto) mult(const Right& right) const { + + auto mult_op =[](const value_type& l, const value_t& r) -> decltype(auto) { + return l * r; + }; + + if (empty() || right.empty()) { + using res_t = decltype(std::declval().binary(std::declval(), mult_op)); + return res_t{}; + } + + return binary(right, mult_op); } /// Multiply this by \c right to create a new, permuted tensor @@ -1749,12 +1959,12 @@ class Tensor { /// of \c this and \c right template < typename Right, typename Perm, - typename std::enable_if::value && + typename std::enable_if && detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Perm& perm) const { + decltype(auto) mult(const Right& right, const Perm& perm) const { return binary( right, - [](const numeric_type l, const numeric_t r) -> numeric_type { + [](const value_type& l, const value_t& r) -> decltype(auto) { return l * r; }, perm); @@ -1770,12 +1980,12 @@ class Tensor { /// of \c this and \c right, scaled by \c factor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor) const { + decltype(auto) mult(const Right& right, const Scalar factor) const { return binary(right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }); + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }); } /// Scale and multiply this by \c right to create a new, permuted tensor @@ -1788,15 +1998,17 @@ class Tensor { /// \param perm The permutation to be applied to this tensor /// \return A new tensor where the elements are the product of the elements /// of \c this and \c right, scaled by \c factor - template ::value && detail::is_numeric_v && - detail::is_permutation_v>::type* = nullptr> - Tensor mult(const Right& right, const Scalar factor, const Perm& perm) const { + template < + typename Right, typename Scalar, typename Perm, + typename std::enable_if && + detail::is_numeric_v && + detail::is_permutation_v>::type* = nullptr> + decltype(auto) mult(const Right& right, const Scalar factor, + const Perm& perm) const { return binary( right, - [factor](const numeric_type l, const numeric_t r) - -> numeric_type { return (l * r) * factor; }, + [factor](const value_type& l, const value_t& r) + -> decltype(auto) { return (l * r) * factor; }, perm); } @@ -1806,10 +2018,11 @@ class Tensor { /// \param right The tensor that will be multiplied by this tensor /// \return A reference to this tensor template ::value>::type* = nullptr> + 
typename std::enable_if>::type* = + nullptr> Tensor& mult_to(const Right& right) { - return inplace_binary(right, [](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { l *= r; }); + return inplace_binary(right, [](value_type& MADNESS_RESTRICT l, + const value_t& r) { l *= r; }); } /// Scale and multiply this tensor by \c right @@ -1821,12 +2034,12 @@ class Tensor { /// \return A reference to this tensor template < typename Right, typename Scalar, - typename std::enable_if::value && + typename std::enable_if && detail::is_numeric_v>::type* = nullptr> Tensor& mult_to(const Right& right, const Scalar factor) { return inplace_binary( - right, [factor](numeric_type& MADNESS_RESTRICT l, - const numeric_t r) { (l *= r) *= factor; }); + right, [factor](value_type& MADNESS_RESTRICT l, + const value_t& r) { (l *= r) *= factor; }); } // Negation operations @@ -1987,11 +2200,11 @@ class Tensor { if (this->empty()) { *this = Tensor(gemm_helper.make_result_range(A.range_, B.range()), - A.batch_size(), default_construct{true}); + A.nbatch(), default_construct{true}); beta = 0; } - TA_ASSERT(this->batch_size() == A.batch_size()); - TA_ASSERT(this->batch_size() == B.batch_size()); + TA_ASSERT(this->nbatch() == A.nbatch()); + TA_ASSERT(this->nbatch() == B.nbatch()); // may need to split gemm into multiply + accumulate for tracing purposes #ifdef TA_ENABLE_TILE_OPS_LOGGING @@ -2002,11 +2215,11 @@ class Tensor { std::unique_ptr data_copy; size_t tile_volume; if (twostep) { - tile_volume = range().volume() * batch_size(); + tile_volume = range().volume() * nbatch(); data_copy = std::make_unique(tile_volume); std::copy(data_.get(), data_.get() + tile_volume, data_copy.get()); } - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); TiledArray::gemm(alpha, A.batch(i), B.batch(i), twostep ? 
numeric_type(0) : numeric_type(1), Ci, @@ -2047,7 +2260,7 @@ class Tensor { TiledArray::TileOpsLogger::get_instance().gemm_printer( *logger.log, tformed_left_range, A.data(), tformed_right_range, B.data(), tformed_right_range, - this->data(), this->batch_size()); + this->data(), this->nbatch()); } } } @@ -2060,9 +2273,10 @@ class Tensor { } } #else // TA_ENABLE_TILE_OPS_LOGGING - for (size_t i = 0; i < this->batch_size(); ++i) { + for (size_t i = 0; i < this->nbatch(); ++i) { auto Ci = this->batch(i); - TiledArray::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, gemm_helper); + TiledArray::detail::gemm(alpha, A.batch(i), B.batch(i), beta, Ci, + gemm_helper); } #endif // TA_ENABLE_TILE_OPS_LOGGING @@ -2082,6 +2296,8 @@ class Tensor { TA_ASSERT(left.range().rank() == gemm_helper.left_rank()); TA_ASSERT(!right.empty()); TA_ASSERT(right.range().rank() == gemm_helper.right_rank()); + TA_ASSERT(left.nbatch() == right.nbatch()); + const auto batch_sz = left.nbatch(); // Check that the inner dimensions of left and right match TA_ASSERT(gemm_helper.left_right_congruent(left.range().extent_data(), @@ -2095,7 +2311,8 @@ class Tensor { if (this->empty()) { // initialize, if empty *this = Tensor(gemm_helper.make_result_range(left.range(), - right.range())); + right.range()), + batch_sz); } else { // Check that the outer dimensions of left match the corresponding // dimensions in result @@ -2118,6 +2335,9 @@ class Tensor { TA_ASSERT(ignore_tile_position() || gemm_helper.right_result_congruent( right.range().upbound_data(), this->range_.upbound_data())); + + // check that batch size of this matches that of left and right + TA_ASSERT(this->nbatch() == batch_sz); } // Compute gemm dimensions @@ -2131,20 +2351,25 @@ class Tensor { const integer ldb = (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? N : K); - for (integer m = 0; m != M; ++m) { - for (integer n = 0; n != N; ++n) { - auto c_offset = m * N + n; - for (integer k = 0; k != K; ++k) { - auto a_offset = - gemm_helper.left_op() == TiledArray::math::blas::NoTranspose - ? m * lda + k - : k * lda + m; - auto b_offset = - gemm_helper.right_op() == TiledArray::math::blas::NoTranspose - ? k * ldb + n - : n * ldb + k; - elem_muladd_op(*(this->data() + c_offset), *(left.data() + a_offset), - *(right.data() + b_offset)); + for (integer b = 0; b != nbatch(); ++b) { + auto this_data = this->batch_data(b); + auto left_data = left.batch_data(b); + auto right_data = right.batch_data(b); + for (integer m = 0; m != M; ++m) { + for (integer n = 0; n != N; ++n) { + auto c_offset = m * N + n; + for (integer k = 0; k != K; ++k) { + auto a_offset = + gemm_helper.left_op() == TiledArray::math::blas::NoTranspose + ? m * lda + k + : k * lda + m; + auto b_offset = + gemm_helper.right_op() == TiledArray::math::blas::NoTranspose + ? k * ldb + n + : n * ldb + k; + elem_muladd_op(*(this_data + c_offset), *(left_data + a_offset), + *(right_data + b_offset)); + } } } } @@ -2174,18 +2399,19 @@ class Tensor { /// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor, /// the reduction will be executed in an undefined order, otherwise will /// execute in the order of increasing \c i . 
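`reduce` now perfect-forwards a generic `Identity` instead of taking a `Scalar` by value; the calling convention is unchanged: `reduce_op` folds one element into the accumulator, `join_op` merges partial accumulators (this is exactly how `squared_norm` below is assembled). Sketch:

  TA::Tensor<double> t(TA::Range{2, 3}, 2.0);
  auto sq_sum = t.reduce(
      [](double& acc, double v) { acc += v * v; },    // fold one element
      [](double& acc, double part) { acc += part; },  // join partials
      0.0);
  // sq_sum == 24.0 : six elements, each contributing 2*2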
- /// \tparam ReduceOp The reduction - /// operation type + /// \tparam ReduceOp The reduction operation type /// \tparam JoinOp The join operation type - /// \param reduce_op The - /// element-wise reduction operation + /// \tparam T a type that can be used as argument to ReduceOp + /// \param reduce_op The element-wise reduction operation /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template - decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - Scalar identity) const { - return detail::tensor_reduce(reduce_op, join_op, identity, *this); + template + auto reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { + return detail::tensor_reduce(std::forward(reduce_op), + std::forward(join_op), + std::forward(identity), *this); } /// Binary reduction operation @@ -2196,22 +2422,23 @@ class Tensor { /// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous /// tensor, the reduction will be executed in an undefined order, otherwise /// will execute in the order of increasing \c i . - /// \tparam Right The - /// right-hand argument tensor type - /// \tparam ReduceOp The reduction operation - /// type + /// \tparam Right The right-hand argument tensor type + /// \tparam ReduceOp The reduction operation type /// \tparam JoinOp The join operation type - /// \param other The right-hand - /// argument of the binary reduction - /// \param reduce_op The element-wise - /// reduction operation \param join_op The join result operation + /// \tparam Identity A type that can be used as argument to ReduceOp + /// \param other The right-hand argument of the binary reduction + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template ::value>::type* = nullptr> - decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op, - JoinOp&& join_op, Scalar identity) const { - return detail::tensor_reduce(reduce_op, join_op, identity, *this, other); + auto reduce(const Right& other, ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { + return detail::tensor_reduce( + std::forward(reduce_op), std::forward(join_op), + std::forward(identity), *this, other); } /// Sum of elements @@ -2236,9 +2463,20 @@ class Tensor { /// \return The vector norm of this tensor scalar_type squared_norm() const { + if constexpr (detail::is_tensor_v) { + // If uninitialized tensor of tensor return zero. + // All elements of this->data() are empty tensors in this case, + // however, we only look at the first element. + // Because + // - It is expensive to look at all elements. + // - The state of the array having only some empty elements + // is ill-defined and should never happen. 
+ if (detail::empty(*data())) return 0; + } + auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { - res += TiledArray::detail::norm(arg); + res += TiledArray::detail::squared_norm(arg); }; auto sum_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { res += arg; @@ -2381,194 +2619,11 @@ std::size_t Tensor::trace_if_larger_than_ = std::numeric_limits::max(); #endif -template -Tensor operator*(const Permutation& p, const Tensor& t) { +template +Tensor operator*(const Permutation& p, const Tensor& t) { return t.permute(p); } -/// Contract two tensors and accumulate the scaled result to this tensor - -/// GEMM is limited to matrix like contractions. For example, the following -/// contractions are supported: -/// \code -/// C[a,b] = A[a,i,j] * B[i,j,b] -/// C[a,b] = A[a,i,j] * B[b,i,j] -/// C[a,b] = A[i,j,a] * B[i,j,b] -/// C[a,b] = A[i,j,a] * B[b,i,j] -/// -/// C[a,b,c,d] = A[a,b,i,j] * B[i,j,c,d] -/// C[a,b,c,d] = A[a,b,i,j] * B[c,d,i,j] -/// C[a,b,c,d] = A[i,j,a,b] * B[i,j,c,d] -/// C[a,b,c,d] = A[i,j,a,b] * B[c,d,i,j] -/// \endcode -/// Notice that in the above contractions, the inner and outer indices of -/// the arguments for exactly two contiguous groups in each tensor and that -/// each group is in the same order in all tensors. That is, the indices of -/// the tensors must fit the one of the following patterns: -/// \code -/// C[M...,N...] = A[M...,K...] * B[K...,N...] -/// C[M...,N...] = A[M...,K...] * B[N...,K...] -/// C[M...,N...] = A[K...,M...] * B[K...,N...] -/// C[M...,N...] = A[K...,M...] * B[N...,K...] -/// \endcode -/// This allows use of optimized BLAS functions to evaluate tensor -/// contractions. Tensor contractions that do not fit this pattern require -/// one or more tensor permutation so that the tensors fit the required -/// pattern. 
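The contraction patterns listed above are BLAS-able because each contiguous index group fuses into a single matrix dimension; an illustrative size computation for C[a,b] = A[a,i,j] * B[i,j,b] (extents chosen arbitrarily):

  // extents: a=4, b=5, i=2, j=3; fuse {a}->M, {b}->N, {i,j}->K
  const int m = 4;      // rows of C = product of free A extents
  const int n = 5;      // cols of C = product of free B extents
  const int k = 2 * 3;  // contracted dim = product of {i,j} extents
  // A viewed as m-by-k times B viewed as k-by-n yields C as m-by-n,
  // i.e. a single GEMM call; index orders that do not fit the listed
  // patterns need a permutation first, as the text above notes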
-/// \tparam U The left-hand tensor element type -/// \tparam AU The left-hand tensor allocator type -/// \tparam V The right-hand tensor element type -/// \tparam AV The right-hand tensor allocator type -/// \tparam W The type of the scaling factor -/// \param left The left-hand tensor that will be contracted -/// \param right The right-hand tensor that will be contracted -/// \param factor The contraction result will be scaling by this value, then -/// accumulated into \c this \param gemm_helper The *GEMM operation meta data -/// \return A reference to \c this -/// \note if this is uninitialized, i.e., if \c this->empty()==true will -/// this is equivalent to -/// \code -/// return (*this = left.gemm(right, factor, gemm_helper)); -/// \endcode -template -void gemm(Alpha alpha, const Tensor& A, const Tensor& B, - Beta beta, Tensor& C, const math::GemmHelper& gemm_helper) { - static_assert( - !detail::is_tensor_of_tensor_v, Tensor, - Tensor>, - "TA::Tensor::gemm without custom element op is only applicable to " - "plain tensors"); - { - // Check that tensor C is not empty and has the correct rank - TA_ASSERT(!C.empty()); - TA_ASSERT(C.range().rank() == gemm_helper.result_rank()); - - // Check that the arguments are not empty and have the correct ranks - TA_ASSERT(!A.empty()); - TA_ASSERT(A.range().rank() == gemm_helper.left_rank()); - TA_ASSERT(!B.empty()); - TA_ASSERT(B.range().rank() == gemm_helper.right_rank()); - - TA_ASSERT(A.batch_size() == 1); - TA_ASSERT(B.batch_size() == 1); - TA_ASSERT(C.batch_size() == 1); - - // Check that the outer dimensions of left match the corresponding - // dimensions in result - TA_ASSERT(gemm_helper.left_result_congruent(A.range().extent_data(), - C.range().extent_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_result_congruent(A.range().lobound_data(), - C.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_result_congruent(A.range().upbound_data(), - C.range().upbound_data())); - - // Check that the outer dimensions of right match the corresponding - // dimensions in result - TA_ASSERT(gemm_helper.right_result_congruent(B.range().extent_data(), - C.range().extent_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent(B.range().lobound_data(), - C.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.right_result_congruent(B.range().upbound_data(), - C.range().upbound_data())); - - // Check that the inner dimensions of left and right match - TA_ASSERT(gemm_helper.left_right_congruent(A.range().extent_data(), - B.range().extent_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_right_congruent(A.range().lobound_data(), - B.range().lobound_data())); - TA_ASSERT(ignore_tile_position() || - gemm_helper.left_right_congruent(A.range().upbound_data(), - B.range().upbound_data())); - - // Compute gemm dimensions - using integer = TiledArray::math::blas::integer; - integer m, n, k; - gemm_helper.compute_matrix_sizes(m, n, k, A.range(), B.range()); - - // Get the leading dimension for left and right matrices. - const integer lda = - (gemm_helper.left_op() == TiledArray::math::blas::NoTranspose ? k : m); - const integer ldb = - (gemm_helper.right_op() == TiledArray::math::blas::NoTranspose ? 
n : k); - - // may need to split gemm into multiply + accumulate for tracing purposes -#ifdef TA_ENABLE_TILE_OPS_LOGGING - { - using numeric_type = typename Tensor::numeric_type; - using T = numeric_type; - const bool twostep = - TiledArray::TileOpsLogger::get_instance().gemm && - TiledArray::TileOpsLogger::get_instance().gemm_print_contributions; - std::unique_ptr data_copy; - size_t tile_volume; - if (twostep) { - tile_volume = C.range().volume(); - data_copy = std::make_unique(tile_volume); - std::copy(C.data(), C.data() + tile_volume, data_copy.get()); - } - non_distributed::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, - k, alpha, A.data(), lda, B.data(), ldb, - twostep ? numeric_type(0) : beta, C.data(), n); - - if (TiledArray::TileOpsLogger::get_instance_ptr() != nullptr && - TiledArray::TileOpsLogger::get_instance().gemm) { - auto& logger = TiledArray::TileOpsLogger::get_instance(); - auto apply = [](auto& fnptr, const Range& arg) { - return fnptr ? fnptr(arg) : arg; - }; - auto tformed_left_range = - apply(logger.gemm_left_range_transform, A.range()); - auto tformed_right_range = - apply(logger.gemm_right_range_transform, B.range()); - auto tformed_result_range = - apply(logger.gemm_result_range_transform, C.range()); - if ((!logger.gemm_result_range_filter || - logger.gemm_result_range_filter(tformed_result_range)) && - (!logger.gemm_left_range_filter || - logger.gemm_left_range_filter(tformed_left_range)) && - (!logger.gemm_right_range_filter || - logger.gemm_right_range_filter(tformed_right_range))) { - logger << "TA::Tensor::gemm+: left=" << tformed_left_range - << " right=" << tformed_right_range - << " result=" << tformed_result_range << std::endl; - if (TiledArray::TileOpsLogger::get_instance() - .gemm_print_contributions) { - if (!TiledArray::TileOpsLogger::get_instance() - .gemm_printer) { // default printer - // must use custom printer if result's range transformed - if (!logger.gemm_result_range_transform) - logger << C << std::endl; - else - logger << make_map(C.data(), tformed_result_range) << std::endl; - } else { - TiledArray::TileOpsLogger::get_instance().gemm_printer( - *logger.log, tformed_left_range, A.data(), - tformed_right_range, B.data(), tformed_right_range, C.data(), - C.batch_size()); - } - } - } - } - - if (twostep) { - for (size_t v = 0; v != tile_volume; ++v) { - C.data()[v] += data_copy[v]; - } - } - } -#else // TA_ENABLE_TILE_OPS_LOGGING - math::blas::gemm(gemm_helper.left_op(), gemm_helper.right_op(), m, n, k, - alpha, A.data(), lda, B.data(), ldb, beta, C.data(), n); -#endif // TA_ENABLE_TILE_OPS_LOGGING - } -} - // template // const typename Tensor::range_type Tensor::empty_range_; @@ -2576,8 +2631,8 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B, /// \param[in] a a Tensor object /// \param[in] b another Tensor object /// \return true if ranges and data of \p a and \p b are equal -/// \internal this does not compare batch_size so any -/// 2 empty tensors are equal even if their batch_size +/// \internal this does not compare nbatch so any +/// 2 empty tensors are equal even if their nbatch /// differ template bool operator==(const Tensor& a, const Tensor& b) { @@ -2596,16 +2651,16 @@ bool operator!=(const Tensor& a, const Tensor& b) { namespace detail { -/// Implements taking the trace of a Tensor (\c T is a numeric type) +/// Implements taking the trace of a Tensor /// /// \tparam T The type of the elements in the tensor. For this specialization /// to be considered must satisfy the concept of numeric type. 
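This specialization is the customization point behind taking the trace of a tile; a sketch of its effect, assuming (as elsewhere in TiledArray) that the free function `TiledArray::trace` dispatches to `detail::Trace`:

  TA::Tensor<double> t(TA::Range{3, 3}, 0.0);
  t(0, 0) = 1.0; t(1, 1) = 2.0; t(2, 2) = 3.0;
  auto tr = TiledArray::trace(t);  // sums the diagonal: 6.0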
/// \tparam A The type of the allocator for the tensor template struct Trace, detail::enable_if_numeric_t> { - decltype(auto) operator()(const Tensor& t) const { - using size_type = typename Tensor::size_type; - using value_type = typename Tensor::value_type; + decltype(auto) operator()(const Tensor& t) const { + using size_type = typename Tensor::size_type; + using value_type = typename Tensor::value_type; const auto range = t.range(); // Get pointers to the range data @@ -2670,6 +2725,22 @@ struct transform> { }; } // namespace detail +namespace detail { + +template +struct real_t_impl> { + using type = typename Tensor::template rebind_numeric_t< + typename Tensor::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename Tensor::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + +} // namespace detail + #ifndef TILEDARRAY_HEADER_ONLY extern template class Tensor; diff --git a/src/TiledArray/tensor/tensor_interface.h b/src/TiledArray/tensor/tensor_interface.h index f39e6ff88d..5aaf9f511c 100644 --- a/src/TiledArray/tensor/tensor_interface.h +++ b/src/TiledArray/tensor/tensor_interface.h @@ -110,6 +110,9 @@ class TensorInterface { template using numeric_t = typename TiledArray::detail::numeric_type::type; + template + using value_t = typename std::remove_reference_t::value_type; + template friend class TensorInterface; @@ -188,14 +191,16 @@ class TensorInterface { TA_ASSERT(data); } - template ::value>::type* = nullptr> + template ::value>::type* = nullptr> TensorInterface_& operator=(const T1& other) { - TA_ASSERT(data_ != other.data()); + if constexpr (std::is_same_v>) { + TA_ASSERT(data_ != other.data()); + } - detail::inplace_tensor_op([](numeric_type& MADNESS_RESTRICT result, - const numeric_t arg) { result = arg; }, - *this, other); + detail::inplace_tensor_op( + [](value_type& MADNESS_RESTRICT result, auto&& arg) { result = arg; }, + *this, other); return *this; } @@ -217,42 +222,134 @@ class TensorInterface { /// Element subscript accessor - /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - const_reference operator[](const ordinal_type index) const { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + const_reference operator[](const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element subscript accessor /// \param index The ordinal element index - /// \return A const reference to the element at \c index. - reference operator[](const ordinal_type index) { - TA_ASSERT(range_.includes(index)); - return data_[range_.ordinal(index)]; + /// \return A const reference to the element at \c index_ordinal. + reference operator[](const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. 
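The `real_t_impl`/`complex_t_impl` specializations added above let generic code flip a Tensor between real and complex element types via `rebind_numeric_t`; the intended mapping, as a sketch (assumes <complex> and <type_traits>):

  using CT = TA::Tensor<std::complex<double>>;
  // scalar_type of CT is double, so real_t_impl rebinds to Tensor<double>
  using RT = typename TA::detail::real_t_impl<CT>::type;
  static_assert(std::is_same_v<typename RT::value_type, double>);
  // complex_t_impl maps a real tensor back to its complex counterpart
  using CT2 = typename TA::detail::complex_t_impl<RT>::type;
  static_assert(
      std::is_same_v<typename CT2::value_type, std::complex<double>>);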
+ const_reference at_ordinal(const ordinal_type index_ordinal) const { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; } /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \param index_ordinal The ordinal element index + /// \return A const reference to the element at \c index_ordinal. + reference at_ordinal(const ordinal_type index_ordinal) { + TA_ASSERT(range_.includes(index_ordinal)); + return data_[range_.ordinal(index_ordinal)]; + } + + /// Element accessor + + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template reference operator()(const Index&... idx) { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; } /// Element accessor - /// \tparam Index An integral type pack or a single coodinate index type + /// \tparam Index An integral type pack or a single coordinate index type /// \param idx The index pack template const_reference operator()(const Index&... idx) const { - TA_ASSERT(range_.includes(idx...)); - return data_[range_.ordinal(idx...)]; + const auto ord = range_.ordinal(idx...); + return data_[ord]; + } + + /// \brief Tensor interface iterator type + /// + /// Iterates over elements of a tensor interface whose range is iterable + template + class Iterator : public boost::iterator_facade< + Iterator, + std::conditional_t, + const typename TI::value_type, + typename TI::value_type>, + boost::forward_traversal_tag> { + public: + using range_iterator = typename TI::range_type::const_iterator; + + Iterator(range_iterator idx_it, TI& ti) : idx_it(idx_it), ti(ti) {} + + private: + range_iterator idx_it; + TI& ti; + + friend class boost::iterator_core_access; + + /// \brief increments this iterator + void increment() { ++idx_it; } + + /// \brief Iterator comparer + /// \return true, if \c `*this==*other` + bool equal(Iterator const& other) const { + return this->idx_it == other.idx_it; + } + + /// \brief dereferences this iterator + /// \return const reference to the current index + auto& dereference() const { return ti(*idx_it); } + }; + friend class Iterator; + friend class Iterator; + + typedef Iterator iterator; ///< Iterator type + typedef Iterator const_iterator; ///< Iterator type + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator begin() const { + return const_iterator(range().begin(), *this); } + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator end() const { return const_iterator(range().end(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator begin() { return iterator(range().begin(), *this); } + + /// Nonconst begin iterator + + /// \return An iterator that points to the beginning of this tensor view + iterator end() { return iterator(range().end(), *this); } + + /// Const begin iterator + + /// \return An iterator that points to the beginning of this tensor view + const_iterator cbegin() const { + return const_iterator(range().begin(), *this); + } + + /// Const end iterator + + /// \return An iterator that points to the end of this tensor view + const_iterator cend() const { return const_iterator(range().end(), *this); } + /// Check for empty view /// \return \c false @@ -979,17 +1076,20 @@ class TensorInterface { /// 
\c i in the index range of \c this . \c result is initialized to \c /// identity . If HAVE_INTEL_TBB is defined, and this is a contiguous tensor, /// the reduction will be executed in an undefined order, otherwise will - /// execute in the order of increasing \c i . \tparam ReduceOp The reduction - /// operation type \tparam JoinOp The join operation type \param reduce_op The - /// element-wise reduction operation \param join_op The join result operation + /// execute in the order of increasing \c i . + /// \tparam ReduceOp The reduction operation type + /// \tparam JoinOp The join operation type + /// \tparam Identity a type that can be used as argument to ReduceOp + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation /// \param identity The identity value of the reduction /// \return The reduced value - template - numeric_type reduce(ReduceOp&& reduce_op, JoinOp&& join_op, - const numeric_type identity) const { + template + decltype(auto) reduce(ReduceOp&& reduce_op, JoinOp&& join_op, + Identity&& identity) const { return detail::tensor_reduce(std::forward(reduce_op), - std::forward(join_op), identity, - *this); + std::forward(join_op), + std::forward(identity), *this); } /// Binary reduction operation @@ -999,19 +1099,24 @@ class TensorInterface { /// for each \c i in the index range of \c this . \c result is initialized to /// \c identity . If HAVE_INTEL_TBB is defined, and this is a contiguous /// tensor, the reduction will be executed in an undefined order, otherwise - /// will execute in the order of increasing \c i . \tparam Right The - /// right-hand argument tensor type \tparam ReduceOp The reduction operation - /// type \tparam JoinOp The join operation type \param other The right-hand - /// argument of the binary reduction \param reduce_op The element-wise - /// reduction operation \param join_op The join result operation \param - /// identity The identity value of the reduction \return The reduced value + /// will execute in the order of increasing \c i . 
+ /// \tparam Right The right-hand argument tensor type + /// \tparam ReduceOp The reduction operation type + /// \tparam JoinOp The join operation type + /// \tparam Identity a type that can be used as argument to ReduceOp + /// \param other The right-hand argument of the binary reduction + /// \param reduce_op The element-wise reduction operation + /// \param join_op The join result operation + /// \param identity The identity value of the reduction + /// \return The reduced value template ::value>::type* = nullptr> - numeric_type reduce(const Right& other, ReduceOp&& reduce_op, - JoinOp&& join_op, const numeric_type identity) const { - return detail::tensor_reduce(std::forward(reduce_op), - std::forward(join_op), identity, *this, - other); + decltype(auto) reduce(const Right& other, ReduceOp&& reduce_op, + JoinOp&& join_op, Identity&& identity) const { + return detail::tensor_reduce( + std::forward(reduce_op), std::forward(join_op), + std::forward(identity), *this, other); } /// Sum of elements @@ -1038,12 +1143,12 @@ class TensorInterface { scalar_type squared_norm() const { auto square_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { - res += TiledArray::detail::norm(arg); + res += TiledArray::detail::squared_norm(arg); }; auto sum_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { res += arg; }; - return reduce(square_op, sum_op, numeric_type(0)); + return reduce(square_op, sum_op, scalar_type(0)); } /// Vector 2-norm @@ -1077,27 +1182,29 @@ class TensorInterface { /// Absolute minimum element /// \return The minimum elements of this tensor - numeric_type abs_min() const { - auto abs_min_op = [](numeric_type& MADNESS_RESTRICT res, + scalar_type abs_min() const { + auto abs_min_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res = std::min(res, std::abs(arg)); }; - auto min_op = [](numeric_type& MADNESS_RESTRICT res, - const numeric_type arg) { res = std::min(res, arg); }; - return reduce(abs_min_op, min_op, std::numeric_limits::max()); + auto min_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { + res = std::min(res, arg); + }; + return reduce(abs_min_op, min_op, std::numeric_limits::max()); } /// Absolute maximum element /// \return The maximum elements of this tensor - numeric_type abs_max() const { - auto abs_max_op = [](numeric_type& MADNESS_RESTRICT res, + scalar_type abs_max() const { + auto abs_max_op = [](scalar_type& MADNESS_RESTRICT res, const numeric_type arg) { res = std::max(res, std::abs(arg)); }; - auto max_op = [](numeric_type& MADNESS_RESTRICT res, - const numeric_type arg) { res = std::max(res, arg); }; - return reduce(abs_max_op, max_op, numeric_type(0)); + auto max_op = [](scalar_type& MADNESS_RESTRICT res, const scalar_type arg) { + res = std::max(res, arg); + }; + return reduce(abs_max_op, max_op, scalar_type(0)); } /// Vector dot product diff --git a/src/TiledArray/tensor/type_traits.h b/src/TiledArray/tensor/type_traits.h index 62448336a3..a32de32e4a 100644 --- a/src/TiledArray/tensor/type_traits.h +++ b/src/TiledArray/tensor/type_traits.h @@ -28,9 +28,10 @@ #include +#include #include +#include #include -#include namespace Eigen { @@ -60,10 +61,23 @@ class ShiftWrapper; // Note: These type traits help differentiate different implementation // functions for tensors, so a tensor of tensors is not considered a tensor. 
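// Minimal sketch (not part of this patch) of how the nested-tensor traits
// introduced below compose; the header paths and the use of
// TiledArray::Tensor<double> as the plain tile type are assumptions.
#include <TiledArray/tensor.h>
#include <TiledArray/tensor/type_traits.h>
using PlainT = TiledArray::Tensor<double>;                      // nested rank 1
using NestedT = TiledArray::Tensor<TiledArray::Tensor<double>>; // nested rank 2
static_assert(TiledArray::detail::is_tensor_v<PlainT>);
static_assert(!TiledArray::detail::is_tensor_v<NestedT>);  // a ToT is not a tensor of scalars
static_assert(TiledArray::detail::is_tensor_of_tensor_v<NestedT>);
static_assert(TiledArray::detail::is_nested_tensor_v<PlainT, NestedT>);
static_assert(TiledArray::detail::nested_rank<PlainT> == 1);
static_assert(TiledArray::detail::nested_rank<NestedT> == 2);
static_assert(!TiledArray::detail::tensors_have_equal_nested_rank_v<PlainT, NestedT>);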
+/// is true type if all `Ts...` are tensors of scalars
template
struct is_tensor;
+/// is true type if all `Ts...` are tensors of tensors of scalars
template
struct is_tensor_of_tensor;
+/// is true type if all `Ts...` are _nested_ tensors; a nested tensor is a
+/// tensor of scalars or a tensor of nested tensors
+template
+struct is_nested_tensor;
+/// is true type if `T1`, `T2`, and `Ts...` are tensors of the same nested
+/// rank, i.e. they are all tensors of scalars or tensors of tensors of
+/// scalars, etc.;
+/// \warning all of the types must be tensors; if any of them is not a tensor,
+/// `tensors_have_equal_nested_rank` is false
+template
+struct tensors_have_equal_nested_rank;

template
struct is_tensor_helper : public std::false_type {};

@@ -83,23 +97,41 @@ struct is_tensor_helper> : public is_tensor_helper {};
template
struct is_tensor_helper> : public is_tensor_helper {};

+////////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct is_nested_tensor<> : public std::false_type {};
+
template
-struct is_tensor_of_tensor_helper : public std::false_type {};
+struct is_nested_tensor : is_tensor_helper {};

-template
-struct is_tensor_of_tensor_helper> : public is_tensor_helper {};
+template
+struct is_nested_tensor {
+ static constexpr bool value =
+ is_tensor_helper::value && is_nested_tensor::value;
+};

-template
-struct is_tensor_of_tensor_helper>
- : public is_tensor_helper {};
+/// @tparam Ts a parameter pack
+/// @c is_nested_tensor_v is an alias for @c
+/// is_nested_tensor::value
+template
+inline constexpr const bool is_nested_tensor_v = is_nested_tensor::value;

-template
-struct is_tensor_of_tensor_helper>
- : public is_tensor_of_tensor_helper {};
+////////////////////////////////////////////////////////////////////////////////
+
+template
+struct is_tensor_of_tensor_helper : public std::false_type {};

template
-struct is_tensor_of_tensor_helper>
- : public is_tensor_of_tensor_helper {};
+struct is_tensor_of_tensor_helper<
+ T, std::enable_if_t::value>> {
+ static constexpr bool value =
+ is_tensor_helper>::value &&
+ !is_tensor_of_tensor_helper<
+ detail::remove_cvr_t>::value;
+};
+
+////////////////////////////////////////////////////////////////////////////////

template <>
struct is_tensor<> : public std::false_type {};

@@ -119,7 +151,9 @@ struct is_tensor {
/// @tparam Ts a parameter pack
/// @c is_tensor_v is an alias for @c is_tensor::value
template
-constexpr const bool is_tensor_v = is_tensor::value;
+inline constexpr const bool is_tensor_v = is_tensor::value;
+
+////////////////////////////////////////////////////////////////////////////////

template <>
struct is_tensor_of_tensor<> : public std::false_type {};

@@ -139,7 +173,71 @@ struct is_tensor_of_tensor {
/// @tparam Ts a parameter pack
/// @c is_tensor_of_tensor_v is an alias for @c
/// is_tensor_of_tensor::value
template
-constexpr const bool is_tensor_of_tensor_v = is_tensor_of_tensor::value;
+inline constexpr const bool is_tensor_of_tensor_v =
+ is_tensor_of_tensor::value;
+
+////////////////////////////////////////////////////////////////////////////////
+
+template
+struct tensors_have_equal_nested_rank_helper : std::false_type {};
+
+template
+struct tensors_have_equal_nested_rank_helper<
+ T1, T2, std::enable_if_t>> {
+ static constexpr bool value =
+ tensors_have_equal_nested_rank_helper<
+ detail::remove_cvr_t,
+ detail::remove_cvr_t>::value ||
+ (detail::is_numeric_v> &&
+ detail::is_numeric_v>);
+};
+
+template
+struct tensors_have_equal_nested_rank
+ : tensors_have_equal_nested_rank_helper {};
+
+template
+struct 
tensors_have_equal_nested_rank { + static constexpr bool value = + tensors_have_equal_nested_rank::value && + tensors_have_equal_nested_rank::value; +}; + +/// @tparam Ts a parameter pack +/// @c tensors_have_equal_nested_rank_v is an alias for @c +/// tensors_have_equal_nested_rank::value +template +constexpr const bool tensors_have_equal_nested_rank_v = + tensors_have_equal_nested_rank::value; + +template +constexpr size_t nested_rank = 0; + +template +constexpr size_t nested_rank> = 1 + nested_rank; + +template +constexpr size_t nested_rank> = + nested_rank>; + +template +constexpr size_t nested_rank> = nested_rank; + +template +constexpr size_t nested_rank> = + nested_rank>; + +template +constexpr size_t max_nested_rank = 0; + +template +constexpr size_t max_nested_rank = nested_rank; + +template +constexpr size_t max_nested_rank = + std::max(nested_rank, std::max(nested_rank, max_nested_rank)); + +//////////////////////////////////////////////////////////////////////////////// template struct is_ta_tensor : public std::false_type {}; @@ -150,6 +248,8 @@ struct is_ta_tensor> : public std::true_type {}; template constexpr const bool is_ta_tensor_v = is_ta_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is contiguous template @@ -198,6 +298,8 @@ template constexpr const bool is_contiguous_tensor_v = is_contiguous_tensor::value; +//////////////////////////////////////////////////////////////////////////////// + // Test if the tensor is shifted template @@ -241,22 +343,19 @@ template constexpr const bool is_reduce_op_v = is_reduce_op_::value; -/// detect cuda tile -#ifdef TILEDARRAY_HAS_CUDA +/// detect device tile types template -struct is_cuda_tile : public std::false_type {}; +struct is_device_tile : public std::false_type {}; template -struct is_cuda_tile> : public is_cuda_tile {}; +struct is_device_tile> : public is_device_tile {}; template -struct is_cuda_tile> - : public is_cuda_tile::eval_type> {}; +struct is_device_tile> + : public is_device_tile::eval_type> {}; template -static constexpr const auto is_cuda_tile_v = is_cuda_tile::value; - -#endif +static constexpr const auto is_device_tile_v = is_device_tile::value; template struct default_permutation; @@ -279,6 +378,9 @@ using default_permutation_t = typename default_permutation::type; template struct is_permutation : public std::false_type {}; +template +struct is_permutation : public is_permutation {}; + template <> struct is_permutation : public std::true_type {}; @@ -295,13 +397,114 @@ static constexpr const auto is_permutation_v = is_permutation::value; template static constexpr const auto is_bipartite_permutation_v = - std::is_same_v; + std::is_same_v || + std::is_same_v; template static constexpr const auto is_bipartite_permutable_v = is_free_function_permute_anyreturn_v< const T&, const TiledArray::BipartitePermutation&>; +// +template +constexpr bool is_random_access_container_v{}; + +/// +/// - The container concept is weakly tested -- any type that has +/// @c iterator typedef gets picked up. +/// +/// - The iterator category must be std::random_access_iterator_tag -- +/// random-access-ness is strongly tested. 
+/// +/// Following lines compile, for example: +/// +/// @c static_assert(is_random_access_container>); +/// @c static_assert(!is_random_access_container>); +/// +template +constexpr bool is_random_access_container_v< + T, std::void_t, + std::enable_if_t::iterator_category, + std::random_access_iterator_tag>>>{true}; + +// +template +constexpr bool is_annotation_v{}; + +/// +/// An annotation type (T) is a type that satisfies the following constraints: +/// - is_random_access_container_v is true. +/// - The value type of the container T are strictly ordered. Note that T is a +/// container from the first constraint. +/// +template +constexpr bool is_annotation_v< + T, std::void_t, + std::enable_if_t && + is_strictly_ordered_v> + + >{true}; + +namespace { + +template +using binop_result_t = std::invoke_result_t; + +template +constexpr bool is_binop_v{}; + +template +constexpr bool + is_binop_v>>{true}; + +template >> +struct result_tensor_helper { + private: + using TensorA_ = std::remove_reference_t; + using TensorB_ = std::remove_reference_t; + using value_type_A = typename TensorA_::value_type; + using value_type_B = typename TensorB_::value_type; + using allocator_type_A = typename TensorA_::allocator_type; + using allocator_type_B = typename TensorB_::allocator_type; + + public: + using numeric_type = binop_result_t; + using allocator_type = + std::conditional_t && + std::is_same_v, + allocator_type_A, Allocator>; + using result_type = + std::conditional_t, + TA::Tensor, + TA::Tensor>; +}; + +} // namespace + +/// +/// The typedef is a complete TA::Tensor type where +/// - NumericT is determined by Op: +/// - effectively, it is: +/// std::invoke_result_t +/// +/// - AllocatorT is +/// - the default TA::Tensor allocator if @tparam Allocator is void +/// - TensorA::allocator_type if TensorA and TensorB have the same allocator +/// type +/// - the @tparam Allocator otherwise +/// todo: constraint what @tparam Allocator +/// +/// +template >> +using result_tensor_t = + typename result_tensor_helper::result_type; + } // namespace detail /// Specifies how coordinates are mapped to ordinal values @@ -329,6 +532,23 @@ struct ordinal_traits>> { std::decay_t().range())>>::type; }; +template +class has_total_size { + /// true case + template + static auto __test(U* p) -> decltype(p->total_size(), std::true_type()); + /// false case + template + static std::false_type __test(...); + + public: + static constexpr const bool value = + std::is_same(0))>::value; +}; + +template +constexpr inline bool has_total_size_v = has_total_size::value; + } // namespace detail } // namespace TiledArray diff --git a/src/TiledArray/tensor_impl.h b/src/TiledArray/tensor_impl.h index 6811fc6cb2..7ead791fd2 100644 --- a/src/TiledArray/tensor_impl.h +++ b/src/TiledArray/tensor_impl.h @@ -53,6 +53,8 @@ class TensorImpl : private NO_DEFAULTS { const trange_type trange_; ///< Tiled range type std::shared_ptr shape_; ///< Tensor shape std::shared_ptr pmap_; ///< Process map for tiles + mutable std::atomic> + local_nnz_; ///< Number of nonzero tiles assigned to this rank (memoized) public: /// Constructor @@ -74,6 +76,7 @@ class TensorImpl : private NO_DEFAULTS { trange_(trange), shape_(std::make_shared(shape)), pmap_(pmap) { + local_nnz_ = -1; // ensure that shapes are identical on every rank if (replicate_shape && !shape.is_dense()) world.gop.broadcast_serializable(*shape_, 0); @@ -115,8 +118,8 @@ class TensorImpl : private NO_DEFAULTS { /// Tensor tile volume accessor - /// \return The number of tiles in the tensor - /// 
\throw nothing + /// \return The number of tiles in the tensor, equivalent to + /// `this->trange().tiles_range().volume()` \throw nothing ordinal_type size() const { return trange_.tiles_range().volume(); } /// Max count of local tiles @@ -131,6 +134,27 @@ class TensorImpl : private NO_DEFAULTS { return static_cast(pmap_->local_size()); } + /// Count of nonzero local tiles + + /// This function is primarily available for debugging purposes. + /// \return The count of nonzero local tiles; for dense array this will be + /// equal to the value produced by local_size(), for a sparse array this will + /// be less than the value produced by local_size() + ordinal_type local_nnz() const { + if (local_nnz_ == -1) { + if (is_dense()) + local_nnz_ = local_size(); + else { + ordinal_type count = 0; + for (auto&& idx : trange_.tiles_range()) { + if (is_local(idx) && !is_zero(idx)) ++count; + } + local_nnz_ = count; + } + } + return local_nnz_; + } + /// Query a tile owner /// \tparam Index The sized integral range type diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 57366dbe60..90f7366bbc 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -39,19 +39,19 @@ namespace TiledArray { /// object to be used in TiledArray expressions, users must also define the /// following functions: /// \li \c add -/// \li \c add_to +/// \li \c add_to (in-place add) /// \li \c subt -/// \li \c subt_to +/// \li \c subt_to (in-place subt) /// \li \c mult -/// \li \c mult_to +/// \li \c mult_to (in-place mult) /// \li \c scale -/// \li \c scale_to +/// \li \c scale_to (in-place scale) /// \li \c gemm /// \li \c neg /// \li \c permute /// \li \c empty /// \li \c shift -/// \li \c shift_to +/// \li \c shift_to (in-place shift) /// \li \c trace /// \li \c sum /// \li \c product @@ -95,6 +95,31 @@ class Tile { using scalar_type = typename TiledArray::detail::scalar_type< tensor_type>::type; ///< the scalar type that supports T + private: + template + struct rebind; + template + struct rebind>> { + using type = Tile>; + }; + + template + struct rebind_numeric; + template + struct rebind_numeric< + Numeric, std::enable_if_t>> { + using type = Tile>; + }; + + public: + /// compute type of Tile with different element type + template + using rebind_t = typename rebind::type; + + /// compute type of Tile with different numeric type + template + using rebind_numeric_t = typename rebind_numeric::type; + private: std::shared_ptr pimpl_; @@ -176,6 +201,26 @@ class Tile { /// \return A const iterator to the last data element decltype(auto) end() const { return std::end(tensor()); } + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the first data element + decltype(auto) cbegin() const { return std::cbegin(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() { return std::cend(tensor()); } + + /// Iterator factory + + /// \return A const iterator to the last data element + decltype(auto) cend() const { return std::cend(tensor()); } + // Data accessor ------------------------------------------------------- /// Data direct access @@ -190,11 +235,23 @@ class Tile { // Dimension information accessors ----------------------------------------- - /// Size accessors + /// Size accessor /// \return The number of elements in the tensor decltype(auto) size() const { return tensor().size(); } + /// Total 
size accessor + + /// \return The number of elements in the tensor, tallied across batches (if + /// any) + decltype(auto) total_size() const { + if constexpr (detail::has_member_function_total_size_anyreturn_v< + tensor_type>) { + return tensor().total_size(); + } else + return size(); + } + /// Range accessor /// \return An object describes the upper and lower bounds of the tensor data @@ -213,6 +270,11 @@ class Tile { std::enable_if_t::value>* = nullptr> const_reference operator[](const Ordinal ord) const { TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; } @@ -227,6 +289,41 @@ class Tile { template ::value>* = nullptr> reference operator[](const Ordinal ord) { + TA_ASSERT(pimpl_); + // can't distinguish between operator[](Index...) and operator[](ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator[](index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Const element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Const reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + const_reference at_ordinal(const Ordinal ord) const { + TA_ASSERT(pimpl_); + TA_ASSERT(tensor().range().includes_ordinal(ord)); + return tensor().data()[ord]; + } + + /// Element accessor + + /// \tparam Ordinal an integer type that represents an ordinal + /// \param[in] ord an ordinal index + /// \return Reference to the element at position \c ord . + /// \note This asserts (using TA_ASSERT) that this is not empty and ord is + /// included in the range + template ::value>* = nullptr> + reference at_ordinal(const Ordinal ord) { TA_ASSERT(pimpl_); TA_ASSERT(tensor().range().includes_ordinal(ord)); return tensor().data()[ord]; @@ -364,6 +461,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> const_reference operator()(const Index&... i) const { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } @@ -380,6 +483,12 @@ class Tile { detail::is_integral_list::value>* = nullptr> reference operator()(const Index&... i) { TA_ASSERT(pimpl_); + TA_ASSERT(this->range().rank() == sizeof...(Index)); + // can't distinguish between operator()(Index...) 
and operator()(ordinal) + // thus insist on at_ordinal() if this->rank()==1 + TA_ASSERT(this->range().rank() != 1 && + "use Tile::operator()(index) or " + "Tile::at_ordinal(index_ordinal) if this->range().rank()==1"); TA_ASSERT(tensor().range().includes(i...)); return tensor().data()[tensor().range().ordinal(i...)]; } @@ -564,7 +673,7 @@ class Tile { void serialize(Archive& ar) const { // Serialize data for empty tile check bool empty = !static_cast(pimpl_); - ar& empty; + ar & empty; if (!empty) { // Serialize tile data ar&* pimpl_; @@ -577,12 +686,12 @@ class Tile { void serialize(Archive& ar) { // Check for empty tile bool empty = false; - ar& empty; + ar & empty; if (!empty) { // Deserialize tile data tensor_type tensor; - ar& tensor; + ar & tensor; // construct a new pimpl pimpl_ = std::make_shared(std::move(tensor)); @@ -592,10 +701,10 @@ class Tile { } } - constexpr static std::size_t batch_size() { return 1; } + constexpr static std::size_t nbatch() { return 1; } const auto& batch(std::size_t idx) const { - TA_ASSERT(idx < this->batch_size()); + TA_ASSERT(idx < this->nbatch()); return *this; } @@ -1648,6 +1757,22 @@ bool operator!=(const Tile& t1, const Tile& t2) { return !(t1 == t2); } +namespace detail { + +template +struct real_t_impl> { + using type = typename Tile::template rebind_numeric_t< + typename Tile::scalar_type>; +}; + +template +struct complex_t_impl> { + using type = typename Tile::template rebind_numeric_t< + std::complex::scalar_type>>; +}; + +} // namespace detail + } // namespace TiledArray #endif // TILEDARRAY_TILE_H__INCLUDED diff --git a/src/TiledArray/tile_interface/add.h b/src/TiledArray/tile_interface/add.h index 9c0e02e558..879d2ed9d2 100644 --- a/src/TiledArray/tile_interface/add.h +++ b/src/TiledArray/tile_interface/add.h @@ -39,10 +39,21 @@ namespace TiledArray { /// \param left The left-hand argument to be added /// \param right The right-hand argument to be added /// \return A tile that is equal to (left + right) -template -inline auto add(const Left& left, const Right& right) - -> decltype(left.add(right)) { - return left.add(right); +template || + detail::has_member_function_add_anyreturn_v>> +inline decltype(auto) add(Left&& left, Right&& right) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right)); + else + return std::forward(right).add(std::forward(left)); } /// Add and scale tile arguments @@ -56,9 +67,26 @@ inline auto add(const Left& left, const Right& right) /// \return A tile that is equal to (left + right) * factor template < typename Left, typename Right, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline auto add(const Left& left, const Right& right, const Scalar factor) { - return left.add(right, factor); + typename = std::enable_if_t && + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Scalar> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Scalar>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + 
!detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right), factor); + else + return std::forward(right).add(std::forward(left), factor); } /// Add and permute tile arguments @@ -72,10 +100,25 @@ inline auto add(const Left& left, const Right& right, const Scalar factor) { template < typename Left, typename Right, typename Perm, typename = std::enable_if_t && - detail::has_member_function_add_anyreturn_v< - const Left, const Right&, const Perm&>>> -inline auto add(const Left& left, const Right& right, const Perm& perm) { - return left.add(right, perm); + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Perm&> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Perm&>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Perm& perm) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right), perm); + else + return std::forward(right).add(std::forward(left), perm); } /// Add, scale, and permute tile arguments @@ -88,13 +131,31 @@ inline auto add(const Left& left, const Right& right, const Perm& perm) { /// \param factor The scaling factor /// \param perm The permutation to be applied to the result /// \return A tile that is equal to perm ^ (left + right) * factor -template < - typename Left, typename Right, typename Scalar, typename Perm, - typename std::enable_if && - detail::is_permutation_v>::type* = nullptr> -inline auto add(const Left& left, const Right& right, const Scalar factor, - const Perm& perm) { - return left.add(right, factor, perm); +template && detail::is_permutation_v && + (detail::has_member_function_add_anyreturn_v< + Left&&, Right&&, const Scalar, const Perm&> || + detail::has_member_function_add_anyreturn_v< + Right&&, Left&&, const Scalar, const Perm&>)>> +inline decltype(auto) add(Left&& left, Right&& right, const Scalar factor, + const Perm& perm) { + constexpr auto left_right = + (detail::has_member_function_add_anyreturn_v && + detail::has_member_function_add_anyreturn_v && + !std::is_reference_v && std::is_reference_v) || + (detail::has_member_function_add_anyreturn_v && + !detail::has_member_function_add_anyreturn_v); + if constexpr (left_right) + return std::forward(left).add(std::forward(right), factor, + perm); + else + return std::forward(right).add(std::forward(left), factor, + perm); } /// Add to the result tile @@ -104,9 +165,12 @@ inline auto add(const Left& left, const Right& right, const Scalar factor, /// \param result The result tile /// \param arg The argument to be added to the result /// \return A tile that is equal to result[i] += arg[i] -template -inline Result& add_to(Result& result, const Arg& arg) { - return result.add_to(arg); +template < + typename Result, typename Arg, + typename = std::enable_if_t< + detail::has_member_function_add_to_anyreturn_v>> +inline decltype(auto) add_to(Result&& result, const Arg& arg) { + return std::forward(result).add_to(arg); } /// Add and scale to the result tile @@ -118,11 +182,14 @@ inline Result& add_to(Result& result, const Arg& arg) { /// \param arg The argument to be added to \c result /// \param factor The scaling factor /// \return A tile that is equal to (result[i] += arg[i]) *= factor 
-template < - typename Result, typename Arg, typename Scalar, - typename std::enable_if>::type* = nullptr> -inline Result& add_to(Result& result, const Arg& arg, const Scalar factor) { - return result.add_to(arg, factor); +template && + detail::has_member_function_add_to_anyreturn_v< + Result&&, const Arg&, const Scalar>>::type* = nullptr> +inline decltype(auto) add_to(Result&& result, const Arg& arg, + const Scalar factor) { + return std::forward(result).add_to(arg, factor); } namespace tile_interface { diff --git a/src/TiledArray/tile_interface/cast.h b/src/TiledArray/tile_interface/cast.h index c22b97b051..52c7a550be 100644 --- a/src/TiledArray/tile_interface/cast.h +++ b/src/TiledArray/tile_interface/cast.h @@ -26,8 +26,8 @@ #ifndef TILEDARRAY_TILE_INTERFACE_CAST_H__INCLUDED #define TILEDARRAY_TILE_INTERFACE_CAST_H__INCLUDED -#include "../meta.h" -#include "../type_traits.h" +#include "TiledArray/type_traits.h" +#include "TiledArray/util/invoke.h" namespace TiledArray { @@ -80,7 +80,7 @@ class Cast(std::forward(arg)); }; - return TiledArray::meta::invoke(exec, arg); + return TiledArray::detail::invoke(exec, arg); } template static auto invoker( @@ -93,7 +93,7 @@ class Cast>(std::forward(arg)); }; - return TiledArray::meta::invoke(exec, std::forward(arg)); + return TiledArray::detail::invoke(exec, std::forward(arg)); } public: @@ -151,7 +151,7 @@ class Cast>::type> auto invoke_cast(Arg&& arg) { Cast> cast; - return TiledArray::meta::invoke(cast, std::forward(arg)); + return TiledArray::detail::invoke(cast, std::forward(arg)); } } // namespace TiledArray diff --git a/src/TiledArray/tile_op/binary_reduction.h b/src/TiledArray/tile_op/binary_reduction.h index d65d133f32..4bbac16bcf 100644 --- a/src/TiledArray/tile_op/binary_reduction.h +++ b/src/TiledArray/tile_op/binary_reduction.h @@ -63,8 +63,8 @@ class DotReduction { void operator()(result_type& result, const first_argument_type& left, const second_argument_type& right) const { using TiledArray::dot; - TA_ASSERT(left.batch_size() == right.batch_size()); - size_t nb = left.batch_size(); + TA_ASSERT(left.nbatch() == right.nbatch()); + size_t nb = left.nbatch(); for (size_t i = 0; i < nb; ++i) { result += dot(left.batch(i), right.batch(i)); } diff --git a/src/TiledArray/tile_op/binary_wrapper.h b/src/TiledArray/tile_op/binary_wrapper.h index b66be2986d..07dd9d19fd 100644 --- a/src/TiledArray/tile_op/binary_wrapper.h +++ b/src/TiledArray/tile_op/binary_wrapper.h @@ -129,9 +129,11 @@ class BinaryWrapper { BinaryWrapper& operator=(const BinaryWrapper&) = default; BinaryWrapper& operator=(BinaryWrapper&&) = default; - template >> - BinaryWrapper(const Op& op, const Perm& perm) : op_(op), perm_(perm) {} + template >>> + BinaryWrapper(const Op& op, Perm&& perm) + : op_(op), perm_(std::forward(perm)) {} BinaryWrapper(const Op& op) : op_(op), perm_() {} @@ -222,7 +224,7 @@ class BinaryWrapper { madness::future_to_ref_t r) { return BinaryWrapper_::operator()(l, r); }; - return meta::invoke(continuation, eval_left, eval_right); + return detail::invoke(continuation, eval_left, eval_right); } /// Evaluate lazy and non-lazy tiles @@ -247,7 +249,7 @@ class BinaryWrapper { R&& r) { return BinaryWrapper_::operator()(l, std::forward(r)); }; - return meta::invoke(continuation, eval_left, std::forward(right)); + return detail::invoke(continuation, eval_left, std::forward(right)); } /// Evaluate non-lazy and lazy tiles @@ -271,7 +273,7 @@ class BinaryWrapper { [this](L&& l, madness::future_to_ref_t r) { return BinaryWrapper_::operator()(std::forward(l), r); 
}; - return meta::invoke(continuation, std::forward(left), eval_right); + return detail::invoke(continuation, std::forward(left), eval_right); } /// Evaluate two lazy-array tiles @@ -292,21 +294,21 @@ class BinaryWrapper { auto eval_left = invoke_cast(std::forward(left)); auto eval_right = invoke_cast(std::forward(right)); - if (perm_) return meta::invoke(op_, eval_left, eval_right, perm_); + if (perm_) return detail::invoke(op_, eval_left, eval_right, perm_); - auto op_left = [=](eval_t& _left, eval_t& _right) { + auto op_left = [this](eval_t& _left, eval_t& _right) { return op_.consume_left(_left, _right); }; - auto op_right = [=](eval_t& _left, eval_t& _right) { + auto op_right = [this](eval_t& _left, eval_t& _right) { return op_.consume_right(_left, _right); }; // Override consumable if (is_consumable_tile>::value && left.is_consumable()) - return meta::invoke(op_left, eval_left, eval_right); + return detail::invoke(op_left, eval_left, eval_right); if (is_consumable_tile>::value && right.is_consumable()) - return meta::invoke(op_right, eval_left, eval_right); + return detail::invoke(op_right, eval_left, eval_right); - return meta::invoke(op_, eval_left, eval_right); + return detail::invoke(op_, eval_left, eval_right); } template < diff --git a/src/TiledArray/tile_op/contract_reduce.h b/src/TiledArray/tile_op/contract_reduce.h index 48b7936d26..2a5e90ea5d 100644 --- a/src/TiledArray/tile_op/contract_reduce.h +++ b/src/TiledArray/tile_op/contract_reduce.h @@ -64,17 +64,20 @@ class ContractReduceBase { using elem_muladd_op_type = void(result_value_type&, const left_value_type&, const right_value_type&); - static_assert( - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v == - TiledArray::detail::is_tensor_v, - "ContractReduce can only handle plain tensors or nested tensors " - "(tensors-of-tensors); mixed contractions are not supported"); static constexpr bool plain_tensors = - !(TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v && - TiledArray::detail::is_tensor_v); + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v && + !TiledArray::detail::is_nested_tensor_v; + static constexpr bool nested_tensors = + TiledArray::detail::is_nested_tensor_v; + static constexpr bool mixed_tensors = !plain_tensors && !nested_tensors; + static_assert(!mixed_tensors || + (mixed_tensors && + TiledArray::detail::is_nested_tensor_v), + "ContractReduce applied to 1 plain tensor and 1 nested tensor " + "must produce a nested tensor " + "(tensors-of-tensors)"); private: struct Impl { @@ -82,17 +85,18 @@ class ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v< + std::remove_reference_t> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> Impl(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : gemm_helper_(left_op, right_op, result_rank, left_rank, right_rank), alpha_(alpha), - perm_(perm), + perm_(std::forward(perm)), elem_muladd_op_(std::forward(elem_muladd_op)) { // non-unit alpha must be absorbed into elem_muladd_op if (elem_muladd_op_) 
TA_ASSERT(alpha == scalar_type(1)); @@ -138,7 +142,7 @@ class ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> @@ -146,10 +150,11 @@ class ContractReduceBase { const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, - const unsigned int right_rank, const Perm& perm = {}, + const unsigned int right_rank, Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : pimpl_(std::make_shared( - left_op, right_op, alpha, result_rank, left_rank, right_rank, perm, + left_op, right_op, alpha, result_rank, left_rank, right_rank, + std::forward(perm), std::forward(elem_muladd_op))) {} /// Gemm meta data accessor @@ -273,16 +278,16 @@ class ContractReduce : public ContractReduceBase { typename Perm = BipartitePermutation, typename ElemMultAddOp = TiledArray::function_ref, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, - right_rank, perm, + right_rank, std::forward(perm), std::forward(elem_muladd_op)) {} /// Create a result type object @@ -321,17 +326,16 @@ class ContractReduce : public ContractReduceBase { /// \param[in] right The right-hand tile to be contracted void operator()(result_type& result, const first_argument_type& left, const second_argument_type& right) const { + using TiledArray::empty; + using TiledArray::gemm; + if (empty(left) || empty(right)) return; + if constexpr (!ContractReduceBase_::plain_tensors) { TA_ASSERT(this->elem_muladd_op()); - // not yet implemented - using TiledArray::empty; - using TiledArray::gemm; gemm(result, left, right, ContractReduceBase_::gemm_helper(), this->elem_muladd_op()); } else { // plain tensors TA_ASSERT(!this->elem_muladd_op()); - using TiledArray::empty; - using TiledArray::gemm; if (empty(result)) result = gemm(left, right, ContractReduceBase_::factor(), ContractReduceBase_::gemm_helper()); @@ -401,16 +405,16 @@ class ContractReduce, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && std::is_invocable_r_v, result_value_type&, const left_value_type&, const right_value_type&>>> ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op, const scalar_type alpha, const unsigned int result_rank, const unsigned int left_rank, const unsigned int right_rank, - const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) + Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {}) : ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank, - right_rank, perm, + right_rank, std::forward(perm), std::forward(elem_muladd_op)) {} /// Create a result type object @@ -527,16 +531,16 @@ class ContractReduce, typename = std::enable_if_t< - TiledArray::detail::is_permutation_v && + TiledArray::detail::is_permutation_v> && 
std::is_invocable_r_v, result_value_type&,
const left_value_type&, const right_value_type&>>>
ContractReduce(const math::blas::Op left_op, const math::blas::Op right_op,
const scalar_type alpha, const unsigned int result_rank,
const unsigned int left_rank, const unsigned int right_rank,
- const Perm& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
+ Perm&& perm = {}, ElemMultAddOp&& elem_muladd_op = {})
: ContractReduceBase_(left_op, right_op, alpha, result_rank, left_rank,
- right_rank, perm,
+ right_rank, std::forward(perm),
std::forward(elem_muladd_op)) {}

/// Create a result type object

diff --git a/src/TiledArray/tile_op/mult.h b/src/TiledArray/tile_op/mult.h
index b9da1d5e24..577ea94115 100644
--- a/src/TiledArray/tile_op/mult.h
+++ b/src/TiledArray/tile_op/mult.h
@@ -128,17 +128,30 @@ class Mult {
template ::type* = nullptr>
result_type eval(left_type& first, const right_type& second) const {
- TA_ASSERT(!element_op_);
- using TiledArray::mult_to;
- return mult_to(first, second);
+ if (!element_op_) {
+ using TiledArray::mult_to;
+ return mult_to(first, second);
+ } else {
+ // TODO figure out why the following does not compile
+ // using TiledArray::inplace_binary;
+ // return inplace_binary(first, second, element_op_);
+ using TiledArray::binary;
+ return binary(first, second, element_op_);
+ }
}

template ::type* = nullptr>
result_type eval(const left_type& first, right_type& second) const {
- TA_ASSERT(!element_op_);
- using TiledArray::mult_to;
- return mult_to(second, first);
+ if (!element_op_) {
+ using TiledArray::mult_to;
+ return mult_to(second, first);
+ } else { // WARNING: element_op_ might be noncommuting, so first and
+ // second cannot be swapped; for GEMM this could be optimized,
+ // but element_op_ cannot be introspected
+ using TiledArray::binary;
+ return binary(first, second, element_op_);
+ }
}

template ::type* = nullptr>

diff --git a/src/TiledArray/tile_op/scal.h b/src/TiledArray/tile_op/scal.h
index 54d5337ed4..a89770c5a7 100644
--- a/src/TiledArray/tile_op/scal.h
+++ b/src/TiledArray/tile_op/scal.h
@@ -128,6 +128,8 @@ class Scal {
return Scal_::template eval(arg);
}

+ void set_factor(const scalar_type factor) { factor_ = factor; }
+
}; // class Scal

} // namespace detail

diff --git a/src/TiledArray/tile_op/tile_interface.h b/src/TiledArray/tile_op/tile_interface.h
index 65d970ebeb..ee8c1093a2 100644
--- a/src/TiledArray/tile_op/tile_interface.h
+++ b/src/TiledArray/tile_op/tile_interface.h
@@ -372,8 +372,8 @@ inline auto subt(const Arg& arg, const Scalar value, const Perm& perm) {
/// \param arg The argument to be subtracted from the result
/// \return A tile that is equal to result[i] -= arg[i]
template
-inline Result& subt_to(Result& result, const Arg& arg) {
- return result.subt_to(arg);
+inline decltype(auto) subt_to(Result&& result, const Arg& arg) {
+ return std::forward(result).subt_to(arg);
}

/// Subtract and scale from the result tile

diff --git a/src/TiledArray/tile_op/unary_wrapper.h b/src/TiledArray/tile_op/unary_wrapper.h
index 3712aca4f1..e1b89e02a7 100644
--- a/src/TiledArray/tile_op/unary_wrapper.h
+++ b/src/TiledArray/tile_op/unary_wrapper.h
@@ -152,8 +152,9 @@ class UnaryWrapper {
/// `arg`.
template >* = nullptr>
auto operator()(A&& arg) const {
- return (perm_ ? meta::invoke(op_, invoke_cast(std::forward(arg)), perm_)
- : meta::invoke(op_, invoke_cast(std::forward(arg))));
+ return (perm_
+ ? 
detail::invoke(op_, invoke_cast(std::forward(arg)), perm_) + : detail::invoke(op_, invoke_cast(std::forward(arg)))); } /// Evaluate a lazy array tile @@ -176,10 +177,10 @@ class UnaryWrapper { // return op_.consume(std::forward(arg)); // }; auto op_consume = [this](eval_t& arg) { return op_.consume(arg); }; - return (perm_ ? meta::invoke(op_, std::move(cast_arg), perm_) + return (perm_ ? detail::invoke(op_, std::move(cast_arg), perm_) : (arg.is_consumable() - ? meta::invoke(op_consume, cast_arg) - : meta::invoke(op_, std::move(cast_arg)))); + ? detail::invoke(op_consume, cast_arg) + : detail::invoke(op_, std::move(cast_arg)))); } /// Consume a lazy tile @@ -196,8 +197,8 @@ class UnaryWrapper { // return op_.consume(std::forward(arg)); // }; auto op_consume = [this](eval_t& arg) { return op_.consume(arg); }; - return (perm_ ? meta::invoke(op_, std::move(cast_arg), perm_) - : meta::invoke(op_consume, cast_arg)); + return (perm_ ? detail::invoke(op_, std::move(cast_arg), perm_) + : detail::invoke(op_consume, cast_arg)); } template >* = nullptr> diff --git a/src/TiledArray/tiled_range.h b/src/TiledArray/tiled_range.h index 8c0714aa7d..fb73512560 100644 --- a/src/TiledArray/tiled_range.h +++ b/src/TiledArray/tiled_range.h @@ -277,6 +277,18 @@ class TiledRange { return result; } + /// Convert an element index to a tile index + + /// \tparam Integer An integral type + /// \param index The element index to convert + /// \return The tile index that corresponds to the given element index + template >> + typename range_type::index element_to_tile( + const std::initializer_list& index) const { + return this->element_to_tile>(index); + } + /// The rank accessor /// \return the rank (=number of dimensions) of this object @@ -312,18 +324,73 @@ class TiledRange { std::swap(ranges_, other.ranges_); } + /// Shifts the lower and upper bounds of this range + + /// \tparam Index An integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const Index& bound_shift) { + elements_range_.inplace_shift(bound_shift); + using std::begin; + auto bound_shift_it = begin(bound_shift); + for (std::size_t d = 0; d != rank(); ++d, ++bound_shift_it) { + ranges_[d].inplace_shift(*bound_shift_it); + } + return *this; + } + + /// Shifts the lower and upper bound of this range + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A reference to this range + template >> + TiledRange_& inplace_shift(const std::initializer_list& bound_shift) { + return inplace_shift>(bound_shift); + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An integral range type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift(const Index& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + + /// Create a TiledRange with shifted lower and upper bounds + + /// \tparam Index An integral type + /// \param bound_shift The shift to be applied to the range + /// \return A shifted copy of this range + template >> + [[nodiscard]] TiledRange_ shift( + const std::initializer_list& bound_shift) const { + TiledRange_ result(*this); + result.inplace_shift(bound_shift); + return result; + } + template >>::type* = nullptr> void serialize(Archive& ar) { - ar& range_& elements_range_& ranges_; + ar & range_ & 
elements_range_ & ranges_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& range_& elements_range_& ranges_; + ar & range_ & elements_range_ & ranges_; } private: @@ -356,6 +423,19 @@ inline bool operator==(const TiledRange& r1, const TiledRange& r2) { std::equal(r1.data().begin(), r1.data().end(), r2.data().begin()); } +/// Test that two TiledRange objects are congruent + +/// Two tranges are congruent if one is a translation of another (i.e. their +/// ranks and extents of all tiles) agree \param r1 a TiledRange object \param +/// r2 a TiledRange object +inline bool is_congruent(const TiledRange& r1, const TiledRange& r2) { + return r1.rank() == r2.rank() && + std::equal(r1.begin(), r1.end(), r2.begin(), + [](const auto& tr1_1, const auto& tr1_2) { + return is_congruent(tr1_1, tr1_2); + }); +} + inline bool operator!=(const TiledRange& r1, const TiledRange& r2) { return !operator==(r1, r2); } diff --git a/src/TiledArray/tiled_range1.h b/src/TiledArray/tiled_range1.h index 239321b567..aa75916442 100644 --- a/src/TiledArray/tiled_range1.h +++ b/src/TiledArray/tiled_range1.h @@ -27,8 +27,10 @@ #include #include #include + #include #include +#include #include #include @@ -36,9 +38,11 @@ namespace TiledArray { /// TiledRange1 class defines a non-uniformly-tiled, contiguous, one-dimensional /// range. The tiling data is constructed with and stored in an array with -/// the format {a0, a1, a2, ...}, where 0 <= a0 < a1 < a2 < ... Each tile is +/// the format {a0, a1, a2, ...}, where a0 <= a1 <= a2 <= ... Each tile is /// defined as [a0,a1), [a1,a2), ... The number of tiles in the range will be /// equal to one less than the number of elements in the array. +/// \note if TiledArray was configured with `TA_SIGNED_1INDEX_TYPE=OFF` then the +/// tile boundaries must be non-negative. class TiledRange1 { private: struct Enabler {}; @@ -46,6 +50,7 @@ class TiledRange1 { public: using range_type = Range1; using index1_type = range_type::index1_type; + using signed_index1_type = range_type::signed_index1_type; using const_iterator = std::vector::const_iterator; /// Default constructor creates an empty range (tile and element ranges are @@ -56,18 +61,16 @@ class TiledRange1 { /// assert(tr.elements_range() == (TiledRange1::range_type{0,0})); /// assert(tr.begin() == tr.end()); /// \endcode - TiledRange1() - : range_(0, 0), elements_range_(0, 0), tiles_ranges_(), elem2tile_() {} + TiledRange1() : range_(0, 0), elements_range_(0, 0) {} - /// Constructs a range with the boundaries provided by + /// Constructs a range with the tile boundaries ("hashmarks") provided by /// the range [ \p first , \p last ). /// \note validity of the [ \p first , \p last ) range is checked using /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined template ::value>::type* = nullptr> - explicit TiledRange1(RandIter first, RandIter last) - : range_(), elements_range_(), tiles_ranges_(), elem2tile_() { + explicit TiledRange1(RandIter first, RandIter last) { init_tiles_(first, last, 0); } @@ -79,7 +82,7 @@ class TiledRange1 { /// Construct a 1D tiled range. - /// This will construct a 1D tiled range with tile boundaries + /// This will construct a 1D tiled range with tile boundaries ("hashmarks") /// {\p t0 , \p t_rest... } /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0, t1), [t1, t2), [t2, t3), ... @@ -96,19 +99,38 @@ class TiledRange1 { /// Construct a 1D tiled range. 
- /// This will construct a 1D tiled range with tile boundaries - /// {\p t0 , \p t_rest... } + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). + /// The number of tile boundaries is n + 1, where n is the number of tiles. + /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... + /// Tiles are indexed starting with 0. + /// \tparam Integer An integral type + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest + /// \note validity of the {\p t0 , \p t_rest... } range is checked using + /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined + template >> + explicit TiledRange1(Range&& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); + } + + /// Construct a 1D tiled range. + + /// This will construct a 1D tiled range from range {t0, t1, t2, ... tn} + /// specifying the tile boundaries (hashmarks). /// The number of tile boundaries is n + 1, where n is the number of tiles. /// Tiles are defined as [\p t0 , t1), [t1, t2), [t2, t3), ... /// Tiles are indexed starting with 0. /// \tparam Integer An integral type - /// \param list The list of tile boundaries in order from smallest to largest + /// \param tile_boundaries The list of tile boundaries in order from smallest + /// to largest /// \note validity of the {\p t0 , \p t_rest... } range is checked using /// #TA_ASSERT() only if preprocessor macro \c NDEBUG is not defined template >> - explicit TiledRange1(const std::initializer_list& list) { - init_tiles_(list.begin(), list.end(), 0); + explicit TiledRange1(const std::initializer_list& tile_boundaries) { + init_tiles_(tile_boundaries.begin(), tile_boundaries.end(), 0); } /// Copy assignment operator @@ -158,6 +180,18 @@ class TiledRange1 { /// \return the number of elements in the range index1_type extent() const { return TiledArray::extent(elements_range_); } + // clang-format off + /// Elements range lobound accessor + /// \return lower bound of the elements range (i.e., the smallest index in the elements range, `a` in `[a,b)`) + // clang-format on + index1_type lobound() const { return elements_range_.lobound(); } + + // clang-format off + /// Elements range upbound accessor + /// \return upper bound of the elements range (i.e., the smallest index greater than any in the elements range, `b` in `[a,b)`) + // clang-format on + index1_type upbound() const { return elements_range_.upbound(); } + /// Computes hashmarks /// \return the hashmarks of the tiled range, consisting of the following /// values: @@ -227,9 +261,10 @@ class TiledRange1 { /// across ALL TiledRange1 instances. const index1_type& element_to_tile(const index1_type& i) const { TA_ASSERT(includes(elements_range_, i)); - if (elem2tile_.empty()) { + if (!elem2tile_) { init_elem2tile_(); } + // N.B. 
only track elements in this range
+ return elem2tile_[i - elements_range_.first];
}

@@ -241,28 +276,108 @@
// clang-format off
/// @brief makes a uniform (or, as uniform as possible) TiledRange1
- /// @param[in] range_size the range size
- /// @param[in] target_block_size the desired block size
- /// @return TiledRange1 obtained by tiling range `[0,range_size)` into `(range_size + target_block_size - 1)/target_block_size`
- /// blocks of approximately @p target_block_size size
+ /// @param[in] range the Range to be tiled
+ /// @param[in] target_tile_size the desired tile size
+ /// @return TiledRange1 obtained by tiling \p range into
+ /// `ntiles = (range.extent() + target_tile_size - 1)/target_tile_size`
+ /// tiles; the first `x = (range.extent() - 1) % ntiles + 1` tiles have size
+ /// `ceil(range.extent() / ntiles)` and the remaining `ntiles - x` tiles
+ /// have one element fewer, i.e. all tile sizes differ by at most 1 .
// clang-format on
- static TiledRange1 make_uniform(std::size_t range_size,
- std::size_t target_block_size) {
- if (range_size > 0) {
- TA_ASSERT(target_block_size > 0);
- std::size_t nblocks =
- (range_size + target_block_size - 1) / target_block_size;
- std::size_t block_size = (range_size + nblocks - 1) / nblocks;
+ static TiledRange1 make_uniform(const Range1& range,
+ std::size_t target_tile_size) {
+ const auto range_extent = range.extent();
+ if (range_extent > 0) {
+ TA_ASSERT(target_tile_size > 0);
+ std::size_t ntiles =
+ (range_extent + target_tile_size - 1) / target_tile_size;
+ auto dv = std::div((long)(range_extent + ntiles - 1), (long)ntiles);
+ auto avg_tile_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1;
std::vector hashmarks;
- hashmarks.reserve(nblocks + 1);
- hashmarks.push_back(0);
- for (auto i = block_size; i < range_size; i += block_size) {
- hashmarks.push_back(i);
+ hashmarks.reserve(ntiles + 1);
+ std::size_t element = range.lobound();
+ for (auto i = 0; i < num_avg_plus_one;
+ ++i, element += avg_tile_size + 1) {
+ hashmarks.push_back(element);
+ }
+ for (auto i = num_avg_plus_one; i < ntiles;
+ ++i, element += avg_tile_size) {
+ hashmarks.push_back(element);
}
- hashmarks.push_back(range_size);
+ hashmarks.push_back(range.upbound());
return TiledRange1(hashmarks.begin(), hashmarks.end());
} else
- return TiledRange1{};
+ return TiledRange1{range.lobound()};
+ }
+
+ /// same as make_uniform(const Range1&, std::size_t) for a 0-based range
+ /// specified by its extent
+ static TiledRange1 make_uniform(std::size_t range_extent,
+ std::size_t target_tile_size) {
+ return make_uniform(Range1(0, range_extent), target_tile_size);
+ }
+
+ /// same as make_uniform(const Range1&, std::size_t), using the
+ /// elements_range of this TiledRange1
+ TiledRange1 make_uniform(std::size_t target_tile_size) const {
+ return make_uniform(this->elements_range(), target_tile_size);
+ }
+
+ /// makes a TiledRange1 spanning the same elements range as this, tiled as
+ /// uniformly as possible into the same number of tiles
+ TiledRange1 make_uniform() const {
+ return make_uniform(
+ this->elements_range(),
+ (this->elements_range().extent() + this->tile_extent() - 1) /
+ this->tile_extent());
+ }
+
+ /// shifts this TiledRange1
+
+ /// @param[in] shift the shift to apply
+ /// @return reference to this
+ TiledRange1& inplace_shift(signed_index1_type shift) {
+ if (shift == 0) return *this;
+ // ensure that it's safe to shift
+ TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 ||
+ (shift <= (std::numeric_limits::max() -
elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + elements_range_.inplace_shift(shift); + for (auto& tile : tiles_ranges_) { + tile.inplace_shift(shift); + } + elem2tile_.reset(); + return *this; + } + + /// creates a shifted TiledRange1 + + /// equivalent to (but more efficient than) `TiledRange1(*this).shift(shift)` + /// @param[in] shift the shift value + [[nodiscard]] TiledRange1 shift(signed_index1_type shift) const { + if (shift == 0) return *this; + // ensure that it's safe to shift + TA_ASSERT(shift <= 0 || elements_range().upbound() <= 0 || + (shift <= (std::numeric_limits::max() - + elements_range().upbound()))); + TA_ASSERT(shift >= 0 || elements_range().lobound() >= 0 || + (std::abs(shift) <= (elements_range().lobound() - + std::numeric_limits::min()))); + std::vector hashmarks; + hashmarks.reserve(tile_extent() + 1); + if (tiles_ranges_.empty()) + hashmarks.emplace_back(elements_range_.lobound() + shift); + else { + for (auto& t : tiles_ranges_) { + hashmarks.push_back(t.first + shift); + } + hashmarks.push_back(elements_range_.upbound() + shift); + } + return TiledRange1(hashmarks.begin(), hashmarks.end()); } /// swapper @@ -280,14 +395,14 @@ class TiledRange1 { typename std::enable_if>>::type* = nullptr> void serialize(Archive& ar) { - ar& range_& elements_range_& tiles_ranges_& elem2tile_; + ar & range_ & elements_range_ & tiles_ranges_; } template >>::type* = nullptr> void serialize(Archive& ar) const { - ar& range_& elements_range_& tiles_ranges_& elem2tile_; + ar & range_ & elements_range_ & tiles_ranges_; } private: @@ -298,21 +413,22 @@ class TiledRange1 { /// Validates tile_boundaries template static void valid_(RandIter first, RandIter last) { - // Verify at least 2 elements are present if the vector is not empty. - TA_ASSERT((std::distance(first, last) >= 2) && - "TiledRange1 construction failed: You need at least 2 " - "elements in the tile boundary list."); - // Verify the requirement that a0 < a1 < a2 < ... + // Need at least 1 tile hashmark to position the element range + // (zero hashmarks is handled by the default ctor) + TA_ASSERT((std::distance(first, last) >= 1) && + "TiledRange1 construction failed: You need at least 1 " + "element in the tile boundary list."); + // Verify the requirement that a0 <= a1 <= a2 <= ... for (; first != (last - 1); ++first) { TA_ASSERT( - *first < *(first + 1) && + *first <= *(first + 1) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. "); TA_ASSERT( - static_cast(*first) < + static_cast(*first) <= static_cast(*(first + 1)) && "TiledRange1 construction failed: Invalid tile boundary, tile " - "boundary i must be greater than tile boundary i+1 for all i. "); + "boundary i must not be greater than tile boundary i+1 for all i. "); } } @@ -324,7 +440,9 @@ class TiledRange1 { valid_(first, last); #endif // NDEBUG range_.first = start_tile_index; - range_.second = start_tile_index + last - first - 1; + using std::distance; + range_.second = + start_tile_index + static_cast(distance(first, last)) - 1; elements_range_.first = *first; elements_range_.second = *(last - 1); for (; first != (last - 1); ++first) @@ -335,19 +453,30 @@ class TiledRange1 { void init_elem2tile_() const { using TiledArray::extent; // check for 0 size range. 
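As an aside on the TiledRange1 members added above: the following hypothetical snippet (not part of the diff) illustrates the tiling rule implemented by make_uniform together with the effect of shift; the hashmark values follow from the arithmetic documented in make_uniform's doc comment.

#include <tiledarray.h>

void tiledrange1_example() {
  using TiledArray::TiledRange1;
  // 11 elements with target tile size 4: ntiles = ceil(11/4) = 3 and
  // std::div(11 + 3 - 1, 3) = {quot: 4, rem: 1}, so the first rem + 1 = 2
  // tiles hold 4 elements and the last tile holds 3; hashmarks = {0, 4, 8, 11}
  auto tr = TiledRange1::make_uniform(11, 4);
  // shift translates every hashmark without changing the tile sizes:
  // hashmarks become {5, 9, 13, 16}
  auto tr5 = tr.shift(5);
}
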
-    if (extent(elements_range_) == 0) return;
+    const auto n = extent(elements_range_);
+    if (n == 0) return;
     static std::mutex mtx;
     {
       std::lock_guard<std::mutex> lock(mtx);
-      if (elem2tile_.empty()) {
+      if (!elem2tile_) {
         // initialize elem2tile map
-        elem2tile_.resize(extent(elements_range_));
+        auto e2t =
+            // #if __cplusplus >= 202002L ... still broken in Xcode 14
+            // std::make_shared<index1_type[]>(n);
+            // #else
+            std::shared_ptr<index1_type[]>(
+                new index1_type[n], [](index1_type* ptr) { delete[] ptr; });
+        // #endif
         const auto end = extent(range_);
         for (index1_type t = 0; t < end; ++t)
-          for (index1_type e = tiles_ranges_[t].first;
-               e < tiles_ranges_[t].second; ++e)
-            elem2tile_[e - elements_range_.first] = t + range_.first;
+          for (auto e : tiles_ranges_[t]) {
+            // only track elements in this range
+            e2t[e - elements_range_.first] = t + range_.first;
+          }
+        auto e2t_const = std::const_pointer_cast<const index1_type[]>(e2t);
+        // commit the changes
+        std::swap(elem2tile_, e2t_const);
       }
     }
   }
@@ -359,7 +488,7 @@
   range_type elements_range_;  ///< the range of element indices
   std::vector<range_type>
       tiles_ranges_;  ///< ranges of each tile (NO GAPS between tiles)
-  mutable std::vector<index1_type>
+  mutable std::shared_ptr<const index1_type[]>
       elem2tile_;  ///< maps element index to tile index (memoized data).
 };  // class TiledRange1
@@ -383,10 +512,8 @@ inline bool operator!=(const TiledRange1& r1, const TiledRange1& r2) {
 /// TiledRange1 ostream operator
 inline std::ostream& operator<<(std::ostream& out, const TiledRange1& rng) {
-  out << "( tiles = [ " << rng.tiles_range().first << ", "
-      << rng.tiles_range().second << " ), elements = [ "
-      << rng.elements_range().first << ", " << rng.elements_range().second
-      << " ) )";
+  out << "( tiles = " << rng.tiles_range()
+      << ", elements = " << rng.elements_range() << " )";
   return out;
 }
@@ -423,9 +550,8 @@ inline TiledRange1 concat(const TiledRange1& r1, const TiledRange1& r2) {
 /// Test that two TiledRange1 objects are congruent
 /// This function tests that the tile sizes of the two ranges coincide.
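To contrast is_congruent with operator==, a speculative example (not from the diff; it assumes the shift semantics shown above, which translate hashmarks while preserving tile extents):

#include <cassert>
#include <tiledarray.h>

void congruence_example() {
  TiledArray::TiledRange1 a{0, 4, 8, 11};  // tile extents {4, 4, 3}
  auto b = a.shift(5);                     // hashmarks become {5, 9, 13, 16}
  assert(a != b);                          // hashmarks differ ...
  assert(is_congruent(a, b));              // ... but tile extents coincide
}
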
-/// \tparam Range The range type -/// \param r1 an TiledRange1 object -/// \param r2 an TiledRange1 object +/// \param r1 a TiledRange1 object +/// \param r2 a TiledRange1 object inline bool is_congruent(const TiledRange1& r1, const TiledRange1& r2) { return r1.tile_extent() == r2.tile_extent() && std::equal(r1.begin(), r1.end(), r2.begin(), diff --git a/src/TiledArray/tiledarray.cpp b/src/TiledArray/tiledarray.cpp index 47ccc00d8e..2a4b3d1199 100644 --- a/src/TiledArray/tiledarray.cpp +++ b/src/TiledArray/tiledarray.cpp @@ -6,9 +6,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA -#include -#include +#ifdef TILEDARRAY_HAS_DEVICE +#include +#include #include #endif @@ -16,34 +16,42 @@ #include #endif +#ifdef IntelMKL_FAIR_DISPATCH +extern "C" void intel_mkl_use_fair_dispatch(); +#endif + #include +#include #include namespace TiledArray { namespace { -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE /// initialize cuda environment -inline void cuda_initialize() { - /// initialize cudaGlobal - cudaEnv::instance(); - // - cuBLASHandlePool::handle(); +inline void device_initialize() { + /// initialize deviceEnv + deviceEnv::instance(); +#if defined(TILEDARRAY_HAS_DEVICE) + BLASQueuePool::initialize(); +#endif // initialize LibreTT librettInitialize(); } /// finalize cuda environment -inline void cuda_finalize() { - CudaSafeCall(cudaDeviceSynchronize()); +inline void device_finalize() { + DeviceSafeCall(device::deviceSynchronize()); librettFinalize(); - cublasDestroy(cuBLASHandlePool::handle()); - delete &cuBLASHandlePool::handle(); - // although TA::cudaEnv is a singleton, must explicitly delete it so - // that CUDA runtime is not finalized before the cudaEnv dtor is called - cudaEnv::instance().reset(nullptr); -} +#if defined(TILEDARRAY_HAS_DEVICE) + BLASQueuePool::finalize(); #endif + // although TA::deviceEnv is a singleton, must explicitly delete it so + // that the device runtime is not finalized before the deviceEnv dtor is + // called + deviceEnv::instance().reset(nullptr); +} +#endif // TILEDARRAY_HAS_DEVICE inline bool& initialized_madworld_accessor() { static bool flag = false; @@ -59,28 +67,20 @@ inline bool& finalized_accessor() { return flag; } +inline bool& quiet_accessor() { + static bool quiet = false; + return quiet; +} + } // namespace } // namespace TiledArray -/// @return true if TiledArray (and, necessarily, MADWorld runtime) is in an -/// initialized state bool TiledArray::initialized() { return initialized_accessor(); } -/// @return true if TiledArray has been finalized at least once bool TiledArray::finalized() { return finalized_accessor(); } -/// @name TiledArray initialization. -/// These functions initialize TiledArray and (if needed) MADWorld -/// runtime. -/// @note the default World object is set to the object returned by these. -/// @warning MADWorld can only be initialized/finalized once, hence if -/// TiledArray initializes MADWorld -/// it can also be initialized/finalized only once. - -/// @{ +bool TiledArray::initialized_to_be_quiet() { return quiet_accessor(); } -/// @throw TiledArray::Exception if TiledArray initialized MADWorld and -/// TiledArray::finalize() had been called TiledArray::World& TiledArray::initialize(int& argc, char**& argv, const SafeMPI::Intracomm& comm, bool quiet) { @@ -102,16 +102,28 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv, ? 
          madness::initialize(argc, argv, comm, quiet)
          : *madness::World::find_instance(comm);
   TiledArray::set_default_world(default_world);
-#ifdef TILEDARRAY_HAS_CUDA
-  TiledArray::cuda_initialize();
+#ifdef TILEDARRAY_HAS_DEVICE
+  TiledArray::device_initialize();
+#endif
+#ifdef IntelMKL_FAIR_DISPATCH
+  intel_mkl_use_fair_dispatch();
 #endif
   TiledArray::max_threads = TiledArray::get_num_threads();
   TiledArray::set_num_threads(1);
   madness::print_meminfo_disable();
   initialized_accessor() = true;
+  quiet_accessor() = quiet;

-  // if have TTG initialize it also
+  // if have TTG, initialize it also
 #if TILEDARRAY_HAS_TTG
+  // MADNESS/PaRSEC creates a PaRSEC context that uses MPI_COMM_SELF to avoid
+  // creating a PaRSEC comm thread; to be able to use TTG/PaRSEC we need to
+  // tell the PaRSEC context to use the full communicator
+  if (madness::ParsecRuntime::context()->nb_nodes != default_world.size()) {
+    auto default_world_comm = default_world.mpi.comm().Get_mpi_comm();
+    parsec_remote_dep_set_ctx(madness::ParsecRuntime::context(),
+                              (intptr_t)default_world_comm);
+  }
   ttg::initialize(argc, argv, -1, madness::ParsecRuntime::context());
 #endif
@@ -152,8 +164,6 @@ TiledArray::World& TiledArray::initialize(int& argc, char**& argv,
   throw Exception("TiledArray already initialized");
 }

-/// Finalizes TiledArray (and MADWorld runtime, if it had not been initialized
-/// when TiledArray::initialize was called).
 void TiledArray::finalize() {
   // finalize in the reverse order of initialize
 #if TILEDARRAY_HAS_TTG
@@ -164,8 +174,8 @@ void TiledArray::finalize() {
   TiledArray::set_num_threads(TiledArray::max_threads);
   TiledArray::get_default_world().gop.fence();  // this should ensure no pending
                                                 // tasks using cuda allocators
-#ifdef TILEDARRAY_HAS_CUDA
-  TiledArray::cuda_finalize();
+#ifdef TILEDARRAY_HAS_DEVICE
+  TiledArray::device_finalize();
 #endif
   if (initialized_madworld()) {
     madness::finalize();
@@ -185,7 +195,17 @@ TiledArray::detail::Finalizer::~Finalizer() noexcept {
 TiledArray::detail::Finalizer TiledArray::scoped_finalizer() { return {}; }

-void TiledArray::ta_abort() { SafeMPI::COMM_WORLD.Abort(); }
+void TiledArray::ta_abort() {
+  // if have a custom signal handler for SIGABRT (i.e.
we are running under a + // debugger) then call abort() + struct sigaction sa; + auto rc = sigaction(SIGABRT, NULL, &sa); + if (rc == 0 && sa.sa_handler != SIG_DFL) { + abort(); + } else { + SafeMPI::COMM_WORLD.Abort(); + } +} void TiledArray::ta_abort(const std::string& m) { std::cerr << m << std::endl; diff --git a/src/TiledArray/type_traits.h b/src/TiledArray/type_traits.h index ece535d929..80c6bd924f 100644 --- a/src/TiledArray/type_traits.h +++ b/src/TiledArray/type_traits.h @@ -108,9 +108,9 @@ class LazyArrayTile; struct Derived : T, Fallback {}; \ \ template \ - static No& test(decltype(U::Member)*); \ + static No &test(decltype(U::Member) *); \ template \ - static Yes& test(U*); \ + static Yes &test(U *); \ \ public: \ static constexpr bool value = \ @@ -141,9 +141,9 @@ class LazyArrayTile; struct Derived : T, Fallback {}; \ \ template \ - static No& test(typename U::Type*); \ + static No &test(typename U::Type *); \ template \ - static Yes& test(U*); \ + static Yes &test(U *); \ \ public: \ static constexpr bool value = \ @@ -177,11 +177,11 @@ class LazyArrayTile; template \ struct CheckConst; \ template \ - static Yes test_const(CheckConst*); \ + static Yes test_const(CheckConst *); \ template \ static No test_const(...); \ template \ - static Yes test_nonconst(Check*); \ + static Yes test_nonconst(Check *); \ template \ static No test_nonconst(...); \ \ @@ -215,7 +215,7 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) \ + static auto func(void *) \ -> decltype(std::add_pointer_t().Member( \ std::declval()...))>{}, \ Yes{}); \ @@ -248,9 +248,10 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) -> decltype( \ - std::add_pointer_t()...))>{}, \ - Yes{}); \ + static auto func(void *) \ + -> decltype(std::add_pointer_t< \ + decltype(Function(std::declval()...))>{}, \ + Yes{}); \ template \ static No func(...); \ \ @@ -277,7 +278,7 @@ class LazyArrayTile; using Yes = char; \ using No = int; \ template \ - static auto func(void*) \ + static auto func(void *) \ -> decltype(std::add_pointer_t()...))>{}, \ Yes{}); \ @@ -321,6 +322,8 @@ GENERATE_HAS_MEMBER_TYPE(mapped_type) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(size) GENERATE_HAS_MEMBER_FUNCTION(size) +GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(total_size) +GENERATE_HAS_MEMBER_FUNCTION(total_size) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(data) GENERATE_HAS_MEMBER_FUNCTION(data) GENERATE_HAS_MEMBER_FUNCTION_ANYRETURN(empty) @@ -450,7 +453,7 @@ template struct has_conversion_operator< From, To, typename std::enable_if< - is_type().operator To&())>::value>::type> + is_type().operator To &())>::value>::type> : std::true_type {}; #else template @@ -472,7 +475,7 @@ struct has_conversion_operator { /* operator exists */ template static decltype(test(&A::operator To)) test(decltype(&A::operator To), - void*) { + void *) { /* Operator exists. What about sig? 
*/ typedef decltype(test(&A::operator To)) return_type; return return_type(); @@ -632,6 +635,42 @@ struct is_complex> : public std::true_type {}; template constexpr const bool is_complex_v = is_complex::value; +template +struct complex_t_impl; + +template +struct complex_t_impl> { + using type = std::complex; +}; + +template +struct complex_t_impl>> { + using type = std::complex; +}; + +/// evaluates to std::complex if T is real, else T +/// @note specialize complex_t_impl to customize the behavior for type T +template +using complex_t = typename complex_t_impl::type; + +template +struct real_t_impl; + +template +struct real_t_impl> { + using type = T; +}; + +template +struct real_t_impl>> { + using type = T; +}; + +/// evaluates to U if T is std::complex, or if T is real then evaluates to T +/// @note specialize real_t_impl to customize the behavior for type T +template +using real_t = typename real_t_impl::type; + template struct is_numeric : public std::is_arithmetic {}; @@ -659,6 +698,25 @@ struct is_scalar> : public std::false_type {}; template constexpr const bool is_scalar_v = is_scalar::value; +template +struct is_blas_numeric : public std::false_type {}; + +template <> +struct is_blas_numeric : public std::true_type {}; + +template <> +struct is_blas_numeric : public std::true_type {}; + +template <> +struct is_blas_numeric> : public std::true_type {}; + +template <> +struct is_blas_numeric> : public std::true_type {}; + +/// \c is_blas_numeric_v is an alias for \c is_blas_numeric::value +template +constexpr const bool is_blas_numeric_v = is_blas_numeric::value; + /// Detect tiles used by \c ArrayEvalImpl /// \c is_array_tile evaluates to \c std::true_type when \c T is a \c @@ -760,14 +818,40 @@ struct scalar_type>::type> template using scalar_t = typename TiledArray::detail::scalar_type::type; +/// is true type if `T::rebind_t` is defined +template +struct has_rebind : std::false_type {}; +template +struct has_rebind>> + : std::true_type {}; + +/// alias to has_rebind::value +template +inline constexpr bool has_rebind_v = has_rebind::value; + +/// is true type if `T::rebind_numeric_t` is defined +template +struct has_rebind_numeric : std::false_type {}; +template +struct has_rebind_numeric< + T, Numeric, std::void_t>> + : std::true_type {}; + +/// alias to has_rebind_numeric::value +template +inline constexpr bool has_rebind_numeric_v = + has_rebind_numeric::value; + template struct is_strictly_ordered_helper { using Yes = char; using No = int; template - static auto test(void*) -> decltype( - std::add_pointer_t() < std::declval())>{}, - Yes{}); + static auto test(void *) + -> decltype(std::add_pointer_t() < + std::declval())>{}, + Yes{}); template static No test(...); @@ -775,6 +859,160 @@ struct is_strictly_ordered_helper { static constexpr const bool value = sizeof(test(0)) == sizeof(Yes); }; +///////// is_less_than_comparable ///////// + +template > +struct is_less_than_comparable : public std::false_type {}; + +template +struct is_less_than_comparable() < + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_less_than_comparable_v = + is_less_than_comparable::value; + +///////// are_less_than_comparable ///////// + +template > +struct are_less_than_comparable : public std::false_type {}; + +template +struct are_less_than_comparable< + T, U, + std::void_t() < + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_less_than_comparable_v = + are_less_than_comparable::value; + +///////// 
is_less_than_or_equal_comparable ///////// + +template > +struct is_less_than_or_equal_comparable : public std::false_type {}; + +template +struct is_less_than_or_equal_comparable< + T, std::void_t() <= + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_less_than_or_equal_comparable_v = + is_less_than_or_equal_comparable::value; + +///////// are_less_than_comparable ///////// + +template > +struct are_less_than_or_equal_comparable : public std::false_type {}; + +template +struct are_less_than_or_equal_comparable< + T, U, + std::void_t() <= + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_less_than_or_equal_comparable_v = + are_less_than_or_equal_comparable::value; + +///////// is_greater_than_comparable ///////// + +template > +struct is_greater_than_comparable : public std::false_type {}; + +template +struct is_greater_than_comparable< + T, std::void_t() > + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_greater_than_comparable_v = + is_greater_than_comparable::value; + +///////// are_greater_than_comparable ///////// + +template > +struct are_greater_than_comparable : public std::false_type {}; + +template +struct are_greater_than_comparable< + T, U, + std::void_t() > + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_greater_than_comparable_v = + are_greater_than_comparable::value; + +///////// is_greater_than_or_equal_comparable ///////// + +template > +struct is_greater_than_or_equal_comparable : public std::false_type {}; + +template +struct is_greater_than_or_equal_comparable< + T, std::void_t() >= + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_greater_than_or_equal_comparable_v = + is_greater_than_or_equal_comparable::value; + +///////// are_greater_than_comparable ///////// + +template > +struct are_greater_than_or_equal_comparable : public std::false_type {}; + +template +struct are_greater_than_or_equal_comparable< + T, U, + std::void_t() >= + std::declval())>> : public std::true_type { +}; + +template +static constexpr bool are_greater_than_or_equal_comparable_v = + are_greater_than_or_equal_comparable::value; + +///////// is_equality_comparable ///////// + +template > +struct is_equality_comparable : public std::false_type {}; + +template +struct is_equality_comparable() == + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool is_equality_comparable_v = + is_equality_comparable::value; + +///////// are_equality_comparable ///////// + +template > +struct are_equality_comparable : public std::false_type {}; + +template +struct are_equality_comparable() == + std::declval())>> + : public std::true_type {}; + +template +static constexpr bool are_equality_comparable_v = + are_equality_comparable::value; + /// \c is_strictly_ordered::value is true if strict order is defined for T, /// i.e. 
"T < T" is defined template @@ -834,7 +1072,7 @@ struct is_std_gettable : std::false_type {}; template struct is_std_gettable< - I, T, std::void_t(std::declval()))>> + I, T, std::void_t(std::declval()))>> : std::true_type {}; template @@ -845,7 +1083,7 @@ struct is_boost_gettable : std::false_type {}; template struct is_boost_gettable< - I, T, std::void_t(std::declval()))>> + I, T, std::void_t(std::declval()))>> : std::true_type {}; template @@ -856,7 +1094,7 @@ constexpr const bool is_gettable_v = is_std_gettable_v || is_boost_gettable_v; template -auto get(T&& t) { +auto get(T &&t) { using boost::get; using std::get; return get(std::forward(t)); @@ -1017,22 +1255,22 @@ struct is_iterator -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; template -struct is_iterator : std::true_type { +struct is_iterator : std::true_type { typedef std::random_access_iterator_tag iterator_category; }; @@ -1071,8 +1309,8 @@ template struct is_range : std::false_type {}; template -struct is_range()), - std::end(std::declval()))>> +struct is_range()), + std::end(std::declval()))>> : std::true_type {}; /// \c is_range_v is an alias for \c is_range::value @@ -1087,7 +1325,7 @@ template struct is_sized_range : std::false_type {}; template -struct is_sized_range()))>> +struct is_sized_range()))>> : is_range {}; /// `is_sized_range_v` is an alias for `is_sized_range::value` @@ -1102,9 +1340,8 @@ template struct is_contiguous_range : std::false_type {}; template -struct is_contiguous_range()))>> - : is_range {}; +struct is_contiguous_range< + T, std::void_t()))>> : is_range {}; /// `is_contiguous_range_v` is an alias for `is_contiguous_range::value` template @@ -1115,14 +1352,14 @@ static constexpr bool is_contiguous_range_v = is_contiguous_range::value; /// std::begin(T&) /// @warning will be replaced by C++20 ranges::iterator_t template -using iterator_t = decltype(std::begin(std::declval())); +using iterator_t = decltype(std::begin(std::declval())); /// @tparam T a range type /// @c value_t is the value type, i.e. 
the type to which @c std::begin(T&) /// dereferences to /// @warning will be replaced by C++20 ranges::value_t template -using value_t = remove_cvr_t()))>; +using value_t = remove_cvr_t()))>; /// @tparam T a type /// `is_integral_range::value` is true if @p T is a range type that @@ -1176,8 +1413,9 @@ struct is_array : public std::false_type {}; template struct is_array> : public std::true_type {}; -template -static constexpr bool is_array_v = is_array::value; +template +constexpr bool is_array_v = + (is_array>::value && ...); template using trange_t = typename T::trange_type; @@ -1278,7 +1516,7 @@ static constexpr bool is_gpair_range_v = is_gpair_range::value; template >>> -decltype(auto) at(GeneralizedPair&& v, std::size_t idx) { +decltype(auto) at(GeneralizedPair &&v, std::size_t idx) { assert(idx == 0 || idx == 1); if constexpr (is_gettable_pair_v>) { #if __cplusplus <= 201703L diff --git a/src/TiledArray/util/bug.cpp b/src/TiledArray/util/bug.cpp index 5e58ba667c..0105635f37 100644 --- a/src/TiledArray/util/bug.cpp +++ b/src/TiledArray/util/bug.cpp @@ -77,7 +77,6 @@ Debugger::~Debugger() { for (int i = 0; i < NSIG; i++) { if (mysigs_[i]) signals[i] = nullptr; } - delete[] mysigs_; } void Debugger::init() { @@ -91,7 +90,7 @@ void Debugger::init() { debug_ = 1; wait_for_debugger_ = 1; - mysigs_ = new int[NSIG]; + mysigs_ = std::make_unique(NSIG); for (int i = 0; i < NSIG; i++) { mysigs_[i] = 0; } @@ -106,14 +105,14 @@ static void handler(int sig) { void Debugger::handle(int sig) { if (sig >= NSIG) return; typedef void (*handler_type)(int); - signal(sig, (handler_type)handler); + std::signal(sig, (handler_type)handler); signals[sig] = this; mysigs_[sig] = 1; } void Debugger::release(int sig) { if (sig >= NSIG) return; - signal(sig, SIG_DFL); + std::signal(sig, SIG_DFL); signals[sig] = nullptr; mysigs_[sig] = 0; } @@ -166,7 +165,7 @@ void Debugger::set_prefix(const char *p) { void Debugger::set_prefix(int i) { char p[128]; - sprintf(p, "%3d: ", i); + snprintf(p, sizeof(p), "%3d: ", i); set_prefix(p); } @@ -180,25 +179,48 @@ void Debugger::default_cmd() { } } +const std::string Debugger::gdb_cmd_ = + "gdb -ex \"set variable debugger_ready_=1\" --pid=$(PID) $(EXEC)"; +const std::string Debugger::lldb_cmd_ = + "lldb -p $(PID) -o \"expr debugger_ready_=1\""; + void Debugger::resolve_cmd_alias() { if (cmd_ == "gdb_xterm") { - cmd_ = - "xterm -title \"$(PREFIX)$(EXEC)\" -e gdb -ex \"set variable " - "debugger_ready_=1\" --pid=$(PID) $(EXEC) &"; + cmd_ = "xterm -title \"$(PREFIX)$(EXEC)\" -e " + gdb_cmd_ + " &"; } else if (cmd_ == "lldb_xterm") { - cmd_ = - "xterm -title \"$(PREFIX)$(EXEC)\" -e lldb -p $(PID) -o \"expr " - "debugger_ready_=1\" &"; + cmd_ = "xterm -title \"$(PREFIX)$(EXEC)\" -e " + lldb_cmd_ + " &"; } } +std::string Debugger::replace_macros(std::string str) { + if (!str.empty()) { + int pid = getpid(); + std::string::size_type pos; + std::string pidvar("$(PID)"); + while ((pos = str.find(pidvar)) != std::string::npos) { + std::string pidstr; + pidstr += std::to_string(pid); + str.replace(pos, pidvar.size(), pidstr); + } + std::string execvar("$(EXEC)"); + while ((pos = str.find(execvar)) != std::string::npos) { + str.replace(pos, execvar.size(), exec_); + } + std::string prefixvar("$(PREFIX)"); + while ((pos = str.find(prefixvar)) != std::string::npos) { + str.replace(pos, prefixvar.size(), prefix_); + } + } + return str; +} + void Debugger::set_cmd(const char *cmd) { if (cmd) { cmd_ = cmd; - resolve_cmd_alias(); } else { cmd_.resize(0); } + this->resolve_cmd_alias(); } void 
Debugger::debug(const char *reason) { @@ -209,58 +231,48 @@ void Debugger::debug(const char *reason) { std::cout << "no reason given"; std::cout << std::endl; - if (!cmd_.empty()) { - int pid = getpid(); - // contruct the command name - std::string cmd = cmd_; - std::string::size_type pos; - std::string pidvar("$(PID)"); - while ((pos = cmd.find(pidvar)) != std::string::npos) { - std::string pidstr; - pidstr += std::to_string(pid); - cmd.replace(pos, pidvar.size(), pidstr); - } - std::string execvar("$(EXEC)"); - while ((pos = cmd.find(execvar)) != std::string::npos) { - cmd.replace(pos, execvar.size(), exec_); - } - std::string prefixvar("$(PREFIX)"); - while ((pos = cmd.find(prefixvar)) != std::string::npos) { - cmd.replace(pos, prefixvar.size(), prefix_); - } - - // start the debugger - // before starting the debugger de-register signal handler for SIGTRAP to - // let the debugger take over - release(SIGTRAP); + const std::string cmd = replace_macros(cmd_); + // start the debugger + // before starting the debugger de-register signal handler for SIGTRAP to + // let the debugger take over + release(SIGTRAP); + int system_retvalue = 0; + if (!cmd.empty()) { std::cout << prefix_ << "Debugger: starting \"" << cmd << "\"" << std::endl; - debugger_ready_ = 0; - const auto system_retvalue = system(cmd.c_str()); - if (system_retvalue != 0) { // call to system() failed - std::cout << prefix_ - << "Failed debugger launch: system() did not succeed ..." - << std::endl; - } else { // call to system() succeeded - // wait until the debugger is ready - if (sleep_) { - std::cout << prefix_ << "Sleeping " << sleep_ - << " seconds to wait for debugger ..." << std::endl; - sleep(sleep_); - } - if (wait_for_debugger_) { - std::string make_ready_message; - if (cmd_.find(" gdb ") != std::string::npos || - cmd_.find(" lldb ") != std::string::npos) { - make_ready_message = - " configure debugging session (set breakpoints/watchpoints, " - "etc.) then type 'c' to continue running"; - } - - std::cout << prefix_ << ": waiting for the user ..." - << make_ready_message << std::endl; - while (!debugger_ready_) - ; + system_retvalue = std::system(cmd.c_str()); + } + if (system_retvalue != 0) { + std::cout << prefix_ + << "Failed debugger launch: system() did not succeed ..." + << std::endl; + } else { // call to system() succeeded + // wait until the debugger is ready + if (sleep_) { + std::cout << prefix_ << "Debugger: sleeping " << sleep_ + << " seconds to wait for debugger ..." << std::endl; + sleep(sleep_); + } + if (wait_for_debugger_) { + std::cout << prefix_ << "Debugger: waiting for the user ..."; + if (cmd_.find(" gdb ") != std::string::npos || + cmd_.find(" lldb ") != std::string::npos) { + std::cout + << " configure debugging session (set breakpoints/watchpoints, " + "etc.) 
then type 'c' to continue running"; + } else if (cmd.empty()) { + std::cout << " attach debugger to process " << std::to_string(getpid()) + << " as follows:" << std::endl + << prefix_ + << "Debugger: - if using gdb: " << replace_macros(gdb_cmd_) + << std::endl + << prefix_ + << "Debugger: - if using lldb: " << replace_macros(lldb_cmd_); } + std::cout << std::endl; + + debugger_ready_ = 0; + while (!debugger_ready_) + ; } } } @@ -286,6 +298,10 @@ void Debugger::got_signal(int sig) { else signame = "UNKNOWN SIGNAL"; + for (auto const &action : actions_) { + action(); + } + actions_.clear(); if (traceback_) { traceback(signame); } @@ -355,6 +371,10 @@ void Debugger::__traceback(const std::string &prefix, const char *reason) { std::cout << result.str(nframes_to_skip) << std::endl; } +void Debugger::register_prelaunch_action(std::function action) { + actions_.push_back(action); +} + void create_debugger(const char *cmd, const char *exec, std::int64_t rank) { auto debugger = std::make_shared(); if (cmd) debugger->set_cmd(cmd); diff --git a/src/TiledArray/util/bug.h b/src/TiledArray/util/bug.h index 829c592ee1..5367497b62 100644 --- a/src/TiledArray/util/bug.h +++ b/src/TiledArray/util/bug.h @@ -29,6 +29,8 @@ #define TILEDARRAY_UTIL_BUG_H_ #include +#include +#include #include #include #include @@ -290,7 +292,7 @@ class Debugger { bool sleep_; bool wait_for_debugger_; bool handle_sigint_; - int *mysigs_; + std::unique_ptr mysigs_; void init(); @@ -324,11 +326,11 @@ class Debugger { @param reason optional string specifying the reason for traceback */ virtual void traceback(const char *reason); - /// Turn on or off debugging on a signel. The default is on. + /// Turn on or off debugging on a signal. The default is on. virtual void set_debug_on_signal(int); - /// Turn on or off traceback on a signel. The default is on. + /// Turn on or off traceback on a signal. The default is on. virtual void set_traceback_on_signal(int); - /// Turn on or off exit after a signel. The default is on. + /// Turn on or off exit after a signal. The default is on. virtual void set_exit_on_signal(int); /** Turn on or off running an infinite loop after the debugger is started. This loop gives the debugger a chance to attack to the process. @@ -343,9 +345,9 @@ class Debugger { /// This calls handle(int) with all of the major signals. virtual void handle_defaults(); - /// This sets a prefix which preceeds all messages printing by Debugger. + /// This sets a prefix which precedes all messages printing by Debugger. virtual void set_prefix(const char *p); - /// Set the prefix to the decimal represention of p followed by a ": ". + /// Set the prefix to the decimal representation of p followed by a ": ". virtual void set_prefix(int p); // clang-format off @@ -369,7 +371,7 @@ class Debugger { virtual void default_cmd(); /** Set the name of the executable for the current process. It is up to the programmer to set this, even if the Debugger - is initialized with the KeyVal constructor. */ + is initialized with the constructor. */ virtual void set_exec(const char *); /// Called when signal sig is received. This is mainly for internal use. @@ -380,9 +382,23 @@ class Debugger { /// Return the global default debugger. 
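For orientation, a hedged sketch of how the reworked Debugger is typically wired up; it mirrors the create_debugger helper later in this file, and only the member names visible in this diff are assumed:

#include <TiledArray/util/bug.h>

#include <memory>

void install_debugger(const char* argv0, int mpi_rank) {
  auto dbg = std::make_shared<TiledArray::Debugger>();
  dbg->set_exec(argv0);       // used to expand $(EXEC) in the launch command
  dbg->set_prefix(mpi_rank);  // prefixes Debugger output with "%3d: "
  dbg->set_cmd("gdb_xterm");  // alias resolved by resolve_cmd_alias()
  dbg->handle_defaults();     // install handlers for the major signals
}
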
static std::shared_ptr default_debugger(); + /// Register a (one-time) action to be executed when debugger is launched + /// @param action an action to be executed + /// @note multiple actions registered via this will be executed in order of + /// their registration + void register_prelaunch_action(std::function action); + private: /// Replaces alias in cmd_ with its full form void resolve_cmd_alias(); + /// Replace macros (\c PID , \c EXEC , \c PREFIX ) in \p cmd by their values + /// \param cmd a string + /// \return processed str + std::string replace_macros(std::string cmd); + + static const std::string gdb_cmd_; + static const std::string lldb_cmd_; + std::vector> actions_; // prelaunch actions }; /// Use this to create a Debugger object and make it the default diff --git a/src/TiledArray/util/invoke.h b/src/TiledArray/util/invoke.h new file mode 100644 index 0000000000..ff8bbed191 --- /dev/null +++ b/src/TiledArray/util/invoke.h @@ -0,0 +1,70 @@ +/* + * This file is a part of TiledArray. + * Copyright (C) 2017 Virginia Tech + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Eduard Valeyev + * Department of Chemistry, Virginia Tech + * + * meta.h + * April 11, 2017 + * + */ + +#ifndef TILEDARRAY_UTIL_INVOKE_H +#define TILEDARRAY_UTIL_INVOKE_H + +#include +#include +#include +#include + +namespace TiledArray { +namespace detail { + +/// ||'s bools +template +struct or_reduce { + static constexpr bool value = head || or_reduce::value; +}; + +template +struct or_reduce { + static constexpr bool value = b; +}; + +// is any argument a Future? +// - yes: async launch +// - no: direct launch +template +auto invoke(Function&& fn, Args&&... args) -> typename std::enable_if< + !or_reduce>::value...>::value, + decltype(fn(args...))>::type { + return fn(std::forward(args)...); +} + +template < + typename Function, typename... Args, + typename = typename std::enable_if>::value...>::value>::type> +auto invoke(Function&& fn, Args&&... 
args) { + return TiledArray::get_default_world().taskq.add(std::forward(fn), + std::forward(args)...); +} + +} // namespace detail +} // namespace TiledArray + +#endif // TILEDARRAY_UTIL_INVOKE_H diff --git a/src/TiledArray/util/random.h b/src/TiledArray/util/random.h index b096654bc6..15daf0d716 100644 --- a/src/TiledArray/util/random.h +++ b/src/TiledArray/util/random.h @@ -20,12 +20,13 @@ #ifndef TILEDARRAY_RANDOM_H__INCLUDED #define TILEDARRAY_RANDOM_H__INCLUDED +#include + #include // for std::complex +#include // for std::int64_t #include // for std::rand #include // for true_type, false_type, and enable_if -#include - namespace TiledArray { /// \return reference to the thread-specific random engine used to implement diff --git a/src/TiledArray/util/time.h b/src/TiledArray/util/time.h index aa0639bc0a..8ae649a6af 100644 --- a/src/TiledArray/util/time.h +++ b/src/TiledArray/util/time.h @@ -26,7 +26,10 @@ #ifndef TILEDARRAY_UTIL_TIME_H__INCLUDED #define TILEDARRAY_UTIL_TIME_H__INCLUDED +#include #include +#include +#include namespace TiledArray { @@ -46,6 +49,86 @@ inline int64_t duration_in_ns(time_point const &t0, time_point const &t1) { return std::chrono::duration_cast(t1 - t0).count(); } +namespace detail { +inline std::deque &call_durations_accessor() { + static std::deque call_durations; + return call_durations; +} +} // namespace detail + +/// Access recorded durations +inline const std::deque &durations() { + return detail::call_durations_accessor(); +} + +/// Clear recorded durations +inline void clear_durations() { detail::call_durations_accessor().clear(); } + +/// Record duration since the given time point +/// \param tp_start The start time point +inline void record_duration_since(const time_point &tp_start) { + detail::call_durations_accessor().push_back(duration_in_s(tp_start, now())); +} + +/// Record duration of a single function call +template +void record_duration(F &&f, Args &&...args) { + auto tp_start = now(); + std::forward(f)(std::forward(args)...); + record_duration_since(tp_start); +} + +/// Statistics of recorded durations +struct duration_stats_t { + double min = 0.0; + double max = 0.0; + double mean = 0.0; + double stddev = 0.0; + double median = 0.0; + double mean_reciprocal = 0.0; +}; + +/// Compute statistics of recorded durations +/// \return Statistics of recorded durations +inline duration_stats_t duration_statistics() { + duration_stats_t stats; + auto &durations = detail::call_durations_accessor(); + if (durations.empty()) return stats; + + stats.min = durations.front(); + stats.max = durations.front(); + stats.mean = durations.front(); + stats.mean_reciprocal = 1.0 / durations.front(); + double total = stats.mean; + double total_reciprocal = stats.mean_reciprocal; + for (size_t i = 1; i < durations.size(); ++i) { + total += durations[i]; + total_reciprocal += 1. / durations[i]; + stats.min = std::min(stats.min, durations[i]); + stats.max = std::max(stats.max, durations[i]); + } + stats.mean = total / durations.size(); + stats.mean_reciprocal = total_reciprocal / durations.size(); + + double sum_sq = 0.0; + for (size_t i = 0; i < durations.size(); ++i) { + sum_sq += (durations[i] - stats.mean) * (durations[i] - stats.mean); + } + stats.stddev = + durations.size() > 1 ? 
std::sqrt(sum_sq / (durations.size() - 1)) : 0.0; + + std::sort(durations.begin(), durations.end()); + stats.median = durations[durations.size() / 2]; + + return stats; +} + } // namespace TiledArray +#ifndef TA_RECORD_DURATION +/// Record duration of a statement +#define TA_RECORD_DURATION(statement) \ + TiledArray::record_duration([&] { statement; }); +#endif // !defined(TA_RECORD_DURATION) + #endif // TILEDARRAY_UTIL_TIME_H__INCLUDED diff --git a/src/TiledArray/util/vector.h b/src/TiledArray/util/vector.h index 12c5d0dfcd..6e69f523f4 100644 --- a/src/TiledArray/util/vector.h +++ b/src/TiledArray/util/vector.h @@ -27,13 +27,16 @@ #define TILEDARRAY_UTIL_VECTOR_H #include +#include // Boost.Container 1.75 and earlier uses standard exception classes, 1.76+ use -// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is defined: +// Boost.Container exceptions, unless BOOST_CONTAINER_USE_STD_EXCEPTIONS is +// defined: // https://www.boost.org/doc/libs/master/doc/html/container/release_notes.html#container.release_notes.release_notes_boost_1_76_00 -// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception checking can use this macro with all versions of Boost +// Define BOOST_CONTAINER_USE_STD_EXCEPTIONS for Boost <1.76 so that exception +// checking can use this macro with all versions of Boost #if BOOST_VERSION < 107600 && !defined(BOOST_CONTAINER_USE_STD_EXCEPTIONS) -# define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1 +#define BOOST_CONTAINER_USE_STD_EXCEPTIONS 1 #endif #include @@ -41,6 +44,7 @@ #include #include +#include "TiledArray/error.h" namespace TiledArray { @@ -90,6 +94,32 @@ constexpr auto iv(Int i0, Ints... rest) { return result; } +namespace operators { + +template +decltype(auto) operator+(const boost::container::small_vector& v1, + const boost::container::small_vector& v2) { + TA_ASSERT(v1.size() == v2.size()); + boost::container::small_vector, std::max(N1, N2)> + result(v1.size()); + std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(), + [](auto&& a, auto&& b) { return a + b; }); + return result; +} + +template +decltype(auto) operator-(const boost::container::small_vector& v1, + const boost::container::small_vector& v2) { + TA_ASSERT(v1.size() == v2.size()); + boost::container::small_vector, std::max(N1, N2)> + result(v1.size()); + std::transform(v1.begin(), v1.end(), v2.begin(), result.begin(), + [](auto&& a, auto&& b) { return a - b; }); + return result; +} + +} // namespace operators + } // namespace container } // namespace TiledArray diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 32a8e9ee6c..a30770fb18 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,8 @@ set(executable ta_test) set(ta_test_src_files ta_test.cpp range1.cpp range.cpp + block_range.cpp + type_traits.cpp tensor.cpp tensor_of_tensor.cpp tensor_tensor_view.cpp @@ -99,14 +101,24 @@ set(ta_test_src_files ta_test.cpp einsum.cpp linalg.cpp cp.cpp + btas.cpp ) -if(CUDA_FOUND) - list(APPEND ta_test_src_files librett.cpp expressions_cuda_um.cpp tensor_um.cpp) +if(TILEDARRAY_HAS_CUDA OR TILEDARRAY_HAS_HIP) + list(APPEND ta_test_src_files librett.cpp expressions_device_um.cpp tensor_um.cpp) +endif() + +# if using C++20 must use Boost 1.74 or later: +# - https://en.cppreference.com/w/cpp/io/basic_ostream/operator_ltlt2 +# - https://github.com/boostorg/test/commit/db6b98c72783351e0acd3c558691323a7a103ba9 +if (CMAKE_CXX_STANDARD GREATER_EQUAL 20 AND DEFINED Boost_VERSION) + if (Boost_VERSION VERSION_LESS 1.74.0) + message(FATAL_ERROR 
"Boost 1.74 or later required to build TA unit tests when using C++20 or higher") + endif() endif() # if tiledarray library was compiled without exceptions, use TA header-only (see below) -if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT CUDA_FOUND AND FALSE) +if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT TILEDARRAY_HAS_CUDA AND FALSE) add_ta_executable(${executable} "${ta_test_src_files}" "MADworld;${TILEDARRAY_PRIVATE_LINK_LIBRARIES}") target_compile_definitions(${executable} PRIVATE TILEDARRAY_HEADER_ONLY=1) if (LAPACK_INCLUDE_DIRS) @@ -121,6 +133,10 @@ if (NOT (TA_ASSERT_POLICY STREQUAL TA_ASSERT_THROW) AND NOT CUDA_FOUND AND FALSE else() add_ta_executable(${executable} "${ta_test_src_files}" "tiledarray") endif() +# if Boost is modularized, need to explicitly state that we need Boost::test module +if (Boost_IS_MODULARIZED) + target_link_libraries(${executable} PRIVATE Boost::unit_test_framework) +endif() # Add include directories and compiler flags for ta_test target_include_directories(${executable} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} @@ -133,16 +149,14 @@ target_include_directories(${executable} PRIVATE # is too late to do this here; must set TA_ERROR=throw if want to run unit tests target_compile_definitions(${executable} PRIVATE TILEDARRAY_NO_USER_ERROR_MESSAGES=1 MADNESS_DISPLAY_EXCEPTION_BREAK_MESSAGE=0) -# optional dependencies -if (TARGET range-v3::range-v3) - target_link_libraries(${executable} PRIVATE range-v3::range-v3) - target_compile_definitions(${executable} PRIVATE TILEDARRAY_HAS_RANGEV3=1) -endif (TARGET range-v3::range-v3) # Add targets add_test(tiledarray/unit/build "${CMAKE_COMMAND}" --build ${PROJECT_BINARY_DIR} --target ${executable}) set_tests_properties(tiledarray/unit/build PROPERTIES FIXTURES_SETUP TA_UNIT_TESTS_EXEC) +# N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now +set(TA_UNIT_TESTS_ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1") + # Add a test(s) if(ENABLE_MPI) set (${executable}_np_1_args --run_test=!@distributed) @@ -155,22 +169,19 @@ if(ENABLE_MPI) $ --log_level=unit_scope ${${executable}_np_${p}_args} ${MPIEXEC_POSTFLAGS} ) - # N.B. some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now + if (p GREATER 1) + set(TA_UNIT_TESTS_ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT};TA_UT_DISTRIBUTED=1") + endif() set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}" ) - - if (p GREATER 1) - set_tests_properties(tiledarray/unit/run-np-${p} PROPERTIES ENVIRONMENT TA_UT_DISTRIBUTED=1) - endif() endforeach(p) else() add_test(NAME tiledarray/unit/run-np-1 COMMAND ${executable}) - # N.B. 
some CUDA unit tests require TA_CUDA_NUM_STREAMS=1 for now set_tests_properties(tiledarray/unit/run-np-1 PROPERTIES FIXTURES_REQUIRED TA_UNIT_TESTS_EXEC - ENVIRONMENT "MAD_NUM_THREADS=2;TA_CUDA_NUM_STREAMS=1" + ENVIRONMENT "${TA_UNIT_TESTS_ENVIRONMENT}" ) endif() diff --git a/tests/annotation.cpp b/tests/annotation.cpp index f3494b5ac9..48acaa189c 100644 --- a/tests/annotation.cpp +++ b/tests/annotation.cpp @@ -201,7 +201,7 @@ BOOST_AUTO_TEST_SUITE(split_index_fxn) BOOST_AUTO_TEST_CASE(invalid_idx) { if (TiledArray::get_default_world().nproc() == 1) - BOOST_CHECK_THROW(split_index("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(split_index("i,"), TiledArray::Exception); } BOOST_AUTO_TEST_CASE(non_tot) { diff --git a/tests/bipartite_index_list.cpp b/tests/bipartite_index_list.cpp index 71025297af..364f894659 100644 --- a/tests/bipartite_index_list.cpp +++ b/tests/bipartite_index_list.cpp @@ -122,7 +122,7 @@ BOOST_AUTO_TEST_CASE(default_ctor) { */ BOOST_AUTO_TEST_CASE(string_ctor) { if (world.nproc() == 1) { - BOOST_CHECK_THROW(BipartiteIndexList("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(BipartiteIndexList("i,"), TiledArray::Exception); } for (auto&& [str, idx] : idxs) { @@ -192,7 +192,7 @@ BOOST_AUTO_TEST_CASE(copy_assignment) { BOOST_AUTO_TEST_CASE(string_assignment) { if (world.nproc() == 1) { BipartiteIndexList v1; - BOOST_CHECK_THROW(v1.operator=("i,"), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.operator=("i,"), TiledArray::Exception); } for (auto&& [str, idx] : idxs) { @@ -282,7 +282,7 @@ BOOST_AUTO_TEST_CASE(permute_in_place) { if (world.nproc() == 1) { BipartiteIndexList v0; Permutation p{0, 1}; - BOOST_CHECK_THROW(v0 *= p, TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v0 *= p, TiledArray::Exception); } Permutation p({1, 2, 3, 0}); @@ -335,13 +335,13 @@ BOOST_AUTO_TEST_CASE(end_itr) { BOOST_AUTO_TEST_CASE(at_member) { for (auto&& [str, idx] : idxs) { if (world.nproc() == 1) { - BOOST_CHECK_THROW(idx.at(idx.size()), + BOOST_CHECK_TA_ASSERT(idx.at(idx.size()), #ifdef BOOST_CONTAINER_USE_STD_EXCEPTIONS - std::out_of_range + std::out_of_range #else - boost::container::out_of_range + boost::container::out_of_range #endif - ); + ); } auto [outer, inner] = detail::split_index(str); for (size_type i = 0; i < outer.size(); ++i) @@ -498,23 +498,23 @@ BOOST_AUTO_TEST_CASE(permutation_fxn) { { // not both ToT BipartiteIndexList v1("i;j"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // wrong size BipartiteIndexList v1("i"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // not a permutation BipartiteIndexList v1("i, a"); - BOOST_CHECK_THROW(v1.permutation(v0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v0), TiledArray::Exception); } { // ToTs mix outer and inner BipartiteIndexList v1("i,j;k,l"); BipartiteIndexList v2("i,k;j,l"); - BOOST_CHECK_THROW(v1.permutation(v2), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(v1.permutation(v2), TiledArray::Exception); } } diff --git a/tests/bitset.cpp b/tests/bitset.cpp index 289a47c295..0ffcf56114 100644 --- a/tests/bitset.cpp +++ b/tests/bitset.cpp @@ -102,8 +102,8 @@ BOOST_AUTO_TEST_CASE(accessor) { // Check that exceptions are thrown when accessing an element that is out of // range. 
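The recurring BOOST_CHECK_THROW to BOOST_CHECK_TA_ASSERT substitution in these tests accounts for builds in which TA_ASSERT is compiled out or aborts rather than throws. A rough sketch of the intent, using a hypothetical feature macro in place of whatever the unit-test config headers actually define:

// Hypothetical sketch only: CHECK_TA_ASSERT_ENABLED stands in for the real
// detection logic in the unit-test config headers.
#ifdef CHECK_TA_ASSERT_ENABLED
#define BOOST_CHECK_TA_ASSERT(expr, exc) BOOST_CHECK_THROW(expr, exc)
#else
// TA_ASSERT is inactive in this build, so there is nothing to check
#define BOOST_CHECK_TA_ASSERT(expr, exc) ((void)0)
#endif
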
- BOOST_CHECK_THROW(set[set.size()], Exception); - BOOST_CHECK_THROW(set[set.size() + 1], Exception); + BOOST_CHECK_TA_ASSERT(set[set.size()], Exception); + BOOST_CHECK_TA_ASSERT(set[set.size() + 1], Exception); } BOOST_AUTO_TEST_CASE(set_bit) { diff --git a/tests/block_range.cpp b/tests/block_range.cpp index 135c36d0b4..47f9d88e8f 100644 --- a/tests/block_range.cpp +++ b/tests/block_range.cpp @@ -25,9 +25,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include "TiledArray/block_range.h" #include "range_fixture.h" @@ -72,7 +70,7 @@ BOOST_AUTO_TEST_CASE(block_zero_lower_bound) { for (unsigned int i = 0u; i < upper.size(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r0) { return l < r0; })) { + [](std::size_t l, std::size_t r0) { return l <= r0; })) { if (count_valid == target_count) continue; ++count_valid; @@ -141,7 +139,7 @@ BOOST_AUTO_TEST_CASE(block) { for (unsigned int i = 0u; i < r.rank(); ++i) ++(upper[i]); if (std::equal(lower.begin(), lower.end(), upper.begin(), - [](std::size_t l, std::size_t r) { return l < r; })) { + [](std::size_t l, std::size_t r) { return l <= r; })) { if (count_valid == target_count) continue; ++count_valid; @@ -229,14 +227,12 @@ BOOST_AUTO_TEST_CASE(block) { BlockRange br2(r, boost::combine(lobounds, upbounds)); BOOST_CHECK_EQUAL(br2, bref); -#ifdef TILEDARRAY_HAS_RANGEV3 // using zipped ranges of bounds (using Ranges-V3) // need to #include BOOST_CHECK_NO_THROW( BlockRange br3(r, ranges::views::zip(lobounds, upbounds))); BlockRange br3(r, ranges::views::zip(lobounds, upbounds)); BOOST_CHECK_EQUAL(br3, bref); -#endif // using nested initializer_list BOOST_CHECK_NO_THROW(BlockRange br4(r, {{0, 4}, {1, 6}, {2, 8}})); @@ -269,4 +265,26 @@ BOOST_AUTO_TEST_CASE(block) { end:; } +BOOST_AUTO_TEST_CASE(empty_trange1) { + using TiledArray::eigen::iv; + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW(BlockRange(r, iv(3, 3, 3), iv(4, 3, 5))); + BlockRange br(r, iv(3, 3, 3), iv(4, 3, 5)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } + + // host range is non-empty but one of the dimensions will have no tiles + { + BOOST_CHECK_NO_THROW( + BlockRange(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1))); + BlockRange br(Range({Range1{0, 3}, Range1{}, Range1{0, 4}}), iv(0, 0, 0), + iv(1, 0, 1)); + BOOST_CHECK_EQUAL(br.volume(), 0); + BOOST_CHECK_TA_ASSERT(br.ordinal(0), Exception); + } +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/blocked_pmap.cpp b/tests/blocked_pmap.cpp index 4ad055d885..80ab449570 100644 --- a/tests/blocked_pmap.cpp +++ b/tests/blocked_pmap.cpp @@ -25,7 +25,7 @@ using namespace TiledArray; struct BlockedPmapFixture { - BlockedPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -34,7 +34,7 @@ struct BlockedPmapFixture { BOOST_FIXTURE_TEST_SUITE(blocked_pmap_suite, BlockedPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { BOOST_REQUIRE_NO_THROW( TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles)); TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); @@ -51,7 +51,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; 
tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -71,7 +71,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); std::size_t total_size = pmap.local_size(); @@ -87,7 +87,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::BlockedPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/btas.cpp b/tests/btas.cpp index a31329a80d..c396110a2f 100644 --- a/tests/btas.cpp +++ b/tests/btas.cpp @@ -256,6 +256,27 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor_ctor, Tensor, tensor_types) { BOOST_REQUIRE_NO_THROW(Tensor t1 = t0); Tensor t1 = t0; BOOST_CHECK(t1.empty()); + + // can copy TA::Tensor to btas::Tensor + TA::Tensor ta_tensor; + ta_tensor = make_rand_tile(r); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor)); + Tensor t2(ta_tensor); + for (auto i : r) { + BOOST_CHECK_EQUAL(ta_tensor(i), t2(i)); + } + + // can copy TA::TensorInterface to btas::Tensor + { + const auto l = {3, 3, 3}; + const auto u = r.upbound(); + BOOST_REQUIRE(r.includes(l)); + BOOST_REQUIRE_NO_THROW(Tensor(ta_tensor.block(l, u))); + Tensor t3(ta_tensor.block(l, u)); + for (auto i : t3.range()) { + BOOST_CHECK_EQUAL(ta_tensor(i), t3(i)); + } + } } BOOST_AUTO_TEST_CASE_TEMPLATE(copy, Array, array_types) { @@ -324,8 +345,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { // make tiled range using trange1_t = TiledArray::TiledRange1; - TiledArray::TiledRange trange( - {trange1_t(0, 10, 20), trange1_t(0, 11, 22), trange1_t(0, 12, 24)}); + TiledArray::TiledRange trange({trange1_t(0, 10, 20), + trange1_t(0, 11, 22).inplace_shift(1), + trange1_t(0, 12, 24).inplace_shift(2)}); // convert to a replicated DistArray using T = typename bTensor::value_type; @@ -371,6 +393,22 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(dense_array_conversion, bTensor, tensor_types) { BOOST_CHECK(src_copy == btas::Tensor{}); } } + + // convert the replicated DistArray back to a btas::Tensor while preserving + // the DistArray range + { + btas::Tensor src_copy; + BOOST_REQUIRE_NO_THROW( + src_copy = array_to_btas_tensor(dst, TiledArray::preserve_lobound)); + BOOST_CHECK(ranges::equal(src_copy.range().lobound(), + dst.trange().elements_range().lobound())); + for (const auto& i : src.range()) { + auto i_copy = i; + i_copy[1] += 1; + i_copy[2] += 2; + BOOST_CHECK_EQUAL(src(i), src_copy(i_copy)); + } + } } BOOST_AUTO_TEST_CASE_TEMPLATE(sparse_array_conversion, bTensor, tensor_types) { diff --git a/tests/conversions.cpp b/tests/conversions.cpp index a66386564d..9cab83bca7 100644 --- a/tests/conversions.cpp +++ b/tests/conversions.cpp @@ -23,14 +23,21 @@ * */ -#include "kmp5_compute_trange1.h" -#include "range_fixture.h" -#include "tiledarray.h" #include "unit_test_config.h" -#include "TiledArray/conversions/concat.h" #include "TiledArray/conversions/vector_of_arrays.h" +#include "TiledArray/conversions/concat.h" + +#include "TiledArray/conversions/dense_to_sparse.h" +#include "TiledArray/conversions/make_array.h" +#include 
"TiledArray/conversions/sparse_to_dense.h" +#include "TiledArray/conversions/to_new_tile_type.h" + +#include "TiledArray/expressions/tsr_expr.h" + +#include "range_fixture.h" + using namespace TiledArray; struct ConversionsFixture : public TiledRangeFixture { @@ -340,8 +347,8 @@ BOOST_AUTO_TEST_CASE(tiles_of_arrays_non_unit_blocking) { std::size_t dim_one = 1336; std::size_t dim_two = 552; { - TA::TiledRange1 tr1_mode0 = kmp5_compute_trange1(dim_one, block_size); - TA::TiledRange1 tr1_mode1 = kmp5_compute_trange1(dim_two, 10); + TA::TiledRange1 tr1_mode0 = TiledRange1::make_uniform(dim_one, block_size); + TA::TiledRange1 tr1_mode1 = TiledRange1::make_uniform(dim_two, 10); tr = TiledArray::TiledRange({tr1_mode0, tr1_mode1}); tr_split = TiledArray::TiledRange({tr1_mode1}); } @@ -531,12 +538,12 @@ BOOST_AUTO_TEST_CASE(concat) { } } // ranges of non-concatted dims must match - BOOST_CHECK_THROW((TiledArray::concat( - {a, b_t}, std::vector{false, true})), - TiledArray::Exception); - BOOST_CHECK_THROW((TiledArray::concat( - {a, b_t}, std::vector{true, false})), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT((TiledArray::concat( + {a, b_t}, std::vector{false, true})), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT((TiledArray::concat( + {a, b_t}, std::vector{true, false})), + TiledArray::Exception); }; do_test(static_cast(nullptr)); diff --git a/tests/cyclic_pmap.cpp b/tests/cyclic_pmap.cpp index 509b9f92bf..a1a029c1cd 100644 --- a/tests/cyclic_pmap.cpp +++ b/tests/cyclic_pmap.cpp @@ -24,7 +24,7 @@ using namespace TiledArray; struct CyclicPmapFixture { - CyclicPmapFixture() {} + constexpr static std::size_t max_ntiles_per_dim = 4ul; }; // ============================================================================= @@ -60,28 +60,22 @@ BOOST_AUTO_TEST_CASE(constructor) { ProcessID size = GlobalFixture::world->size(); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 0ul, 10ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 0ul, 1, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 10ul, 0, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap(*GlobalFixture::world, - 10ul, 10ul, 1, 0), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, size * 2, 1), - TiledArray::Exception); - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, 1, size * 2), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 0, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 1, 0), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, size * 2, 1), + TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, 1, size * 2), + TiledArray::Exception); if (size > 1) { - BOOST_CHECK_THROW(TiledArray::detail::CyclicPmap pmap( - *GlobalFixture::world, 10ul, 10ul, size, size), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledArray::detail::CyclicPmap pmap( + *GlobalFixture::world, 10ul, 10ul, size, size), + TiledArray::Exception); } } @@ -92,8 +86,8 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various 
pmap sizes - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( ((GlobalFixture::world->size() + y - 1ul) / y), 1ul); @@ -129,8 +123,8 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( ((GlobalFixture::world->size() + y - 1ul) / y), 1ul); @@ -162,8 +156,8 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t x = 1ul; x < 10ul; ++x) { - for (std::size_t y = 1ul; y < 10ul; ++y) { + for (std::size_t x = 1ul; x < max_ntiles_per_dim; ++x) { + for (std::size_t y = 1ul; y < max_ntiles_per_dim; ++y) { // Compute the limits for process rows const std::size_t min_proc_rows = std::max( ((GlobalFixture::world->size() + y - 1ul) / y), 1ul); diff --git a/tests/dist_array.cpp b/tests/dist_array.cpp index 4f2e1dbe9b..64f69e69db 100644 --- a/tests/dist_array.cpp +++ b/tests/dist_array.cpp @@ -60,7 +60,7 @@ namespace { std::string to_parallel_archive_file_name(const char* prefix_name, int rank) { char buf[256]; MADNESS_ASSERT(strlen(prefix_name) + 7 <= sizeof(buf)); - sprintf(buf, "%s.%5.5d", prefix_name, rank); + snprintf(buf, sizeof(buf), "%s.%5.5d", prefix_name, rank); return buf; } } // namespace @@ -76,6 +76,13 @@ BOOST_AUTO_TEST_CASE(constructors) { for (ArrayN::const_iterator it = ad.begin(); it != ad.end(); ++it) BOOST_CHECK(!it->probe()); + // Construct a dense array in default world + { + BOOST_REQUIRE_NO_THROW(ArrayN ad(tr)); + ArrayN ad(tr); + BOOST_CHECK_EQUAL(ad.world().id(), get_default_world().id()); + } + // Construct a sparse array BOOST_REQUIRE_NO_THROW( SpArrayN as(world, tr, TiledArray::SparseShape(shape_tensor, tr))); @@ -88,6 +95,14 @@ BOOST_AUTO_TEST_CASE(constructors) { // now fill it BOOST_REQUIRE_NO_THROW(as.fill(1)); + // Construct a sparse array in default world + { + BOOST_REQUIRE_NO_THROW( + SpArrayN as(tr, TiledArray::SparseShape(shape_tensor, tr))); + SpArrayN as(tr, TiledArray::SparseShape(shape_tensor, tr)); + BOOST_CHECK_EQUAL(as.world().id(), get_default_world().id()); + } + // Construct a sparse array from another sparse array { auto op = [](auto& result, const auto& input) { result = input.clone(); }; @@ -107,6 +122,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { ++itr; } } + + // now with default world + { + TArray a_vector(il); + BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id()); + } } // Create a matrix with an initializer list @@ -122,6 +143,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_matrix(il); + BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id()); + } } // Create a rank 3 tensor with an initializer list @@ -144,6 +171,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor3(il); + BOOST_CHECK_EQUAL(a_tensor3.world().id(), get_default_world().id()); + } } // Create a rank 4 tensor with an initializer list @@ -168,6 +201,12 @@ 
BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor4(il); + BOOST_CHECK_EQUAL(a_tensor4.world().id(), get_default_world().id()); + } } // Create a rank 5 tensor with an initializer list @@ -194,6 +233,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor5(il); + BOOST_CHECK_EQUAL(a_tensor5.world().id(), get_default_world().id()); + } } // Create a rank 6 tensor with an initializer list @@ -222,6 +267,12 @@ BOOST_AUTO_TEST_CASE(single_tile_initializer_list_ctors) { } } } + + // now with default world + { + TArray a_tensor6(il); + BOOST_CHECK_EQUAL(a_tensor6.world().id(), get_default_world().id()); + } } } @@ -232,6 +283,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 3}}; TArray a_vector(world, tr, il); BOOST_CHECK_EQUAL(a_vector.size(), 2); + + // now with default world + { + TArray a_vector(tr, il); + BOOST_CHECK_EQUAL(a_vector.world().id(), get_default_world().id()); + } } { @@ -239,6 +296,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 3}}; TArray a_matrix(world, tr, il); BOOST_CHECK_EQUAL(a_matrix.size(), 4); + + // now with default world + { + TArray a_matrix(tr, il); + BOOST_CHECK_EQUAL(a_matrix.world().id(), get_default_world().id()); + } } { @@ -247,6 +310,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 8); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -257,6 +326,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 16); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -269,6 +344,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { TiledRange tr{{0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 32); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } { @@ -286,6 +367,12 @@ BOOST_AUTO_TEST_CASE(multi_tile_initializer_list_ctors) { {0, 1, 2}, {0, 1, 2}, {0, 1, 3}}; TArray a_tensor(world, tr, il); BOOST_CHECK_EQUAL(a_tensor.size(), 64); + + // now with default world + { + TArray a_tensor(tr, il); + BOOST_CHECK_EQUAL(a_tensor.world().id(), get_default_world().id()); + } } } @@ -513,7 +600,7 @@ BOOST_AUTO_TEST_CASE(make_replicated) { BOOST_REQUIRE_NO_THROW(a.make_replicated()); // check for cda7b8a33b85f9ebe92bc369d6a362c94f1eae40 bug - for (const auto &tile : a) { + for (const auto& tile : a) { BOOST_CHECK(tile.get().size() != 0); } @@ -532,7 +619,6 @@ BOOST_AUTO_TEST_CASE(make_replicated) { it != tile.get().end(); ++it) BOOST_CHECK_EQUAL(*it, distributed_pmap->owner(i) + 1); } - } BOOST_AUTO_TEST_CASE(serialization_by_tile) { @@ -630,7 +716,7 @@ BOOST_AUTO_TEST_CASE(parallel_serialization) { mktemp(archive_file_prefix_name); madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name, nio); - oar& a; + oar & a; oar.close(); madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name, @@ -654,7 +740,7 @@ 
BOOST_AUTO_TEST_CASE(parallel_sparse_serialization) { mktemp(archive_file_prefix_name); madness::archive::ParallelOutputArchive<> oar(world, archive_file_prefix_name, nio); - oar& b; + oar & b; oar.close(); madness::archive::ParallelInputArchive<> iar(world, archive_file_prefix_name, @@ -697,7 +783,7 @@ BOOST_AUTO_TEST_CASE(issue_225) { madness::archive::BinaryFstreamInputArchive iar(archive_file_name); decltype(S) S_read; decltype(St) St_read; - iar& S_read& St_read; + iar & S_read & St_read; BOOST_CHECK_EQUAL(S_read.trange(), S.trange()); BOOST_REQUIRE(S_read.shape() == S.shape()); @@ -710,4 +796,152 @@ BOOST_AUTO_TEST_CASE(issue_225) { std::remove(archive_file_name); } +BOOST_AUTO_TEST_CASE(rebind) { + static_assert( + std::is_same_v, TArrayD>); + static_assert( + std::is_same_v, + TArrayD>); + static_assert( + std::is_same_v, TSpArrayD>); + static_assert( + std::is_same_v, + TSpArrayD>); + static_assert(std::is_same_v, TArrayD>); + static_assert( + std::is_same_v, TArrayZ>); + static_assert( + std::is_same_v, TSpArrayD>); + static_assert( + std::is_same_v, TSpArrayZ>); + + // DistArray of Tensors + using SpArrayTD = DistArray, SparsePolicy>; + using SpArrayTZ = DistArray, SparsePolicy>; + static_assert(std::is_same_v, + TSpArrayZ>); + static_assert( + std::is_same_v< + typename SpArrayTD::template rebind_numeric_t>, + SpArrayTZ>); + static_assert( + std::is_same_v, SpArrayTD>); + static_assert( + std::is_same_v, SpArrayTZ>); +} + +BOOST_AUTO_TEST_CASE(volume) { + using T = Tensor; + using ToT = Tensor; + using Policy = SparsePolicy; + using ArrayToT = DistArray; + + size_t constexpr nrows = 3; + size_t constexpr ncols = 4; + TiledRange const trange({{0, 2, 5, 7}, {0, 5, 7, 10, 12}}); + TA_ASSERT(trange.tiles_range().extent().at(0) == nrows && + trange.tiles_range().extent().at(1) == ncols, + "Following code depends on this condition."); + + // this Range is used to construct all inner tensors of the tile with + // tile index @c tix. 
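+  // (worked example, derived from the rows/cols tables below: the outer tile
+  //  at index {1, 2} is assigned inner Range{rows[1 % 3], cols[2 % 4]},
+  //  i.e. Range{8, 9})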
+ auto inner_dims = [nrows, ncols](Range::index_type const& tix) -> Range { + static std::array const rows{7, 8, 9}; + static std::array const cols{7, 8, 9, 10}; + + TA_ASSERT(tix.size() == 2, "Only rank-2 tensor expected."); + return Range({rows[tix.at(0) % nrows], cols[tix.at(1) % ncols]}); + }; + + // let's make all 'diagonal' tiles zero + auto zero_tile = [](Range::index_type const& tix) -> bool { + return tix.at(0) == tix.at(1); + }; + + auto make_tile = [inner_dims, zero_tile, &trange](auto& tile, + auto const& rng) { + auto&& tix = trange.element_to_tile(rng.lobound()); + if (zero_tile(tix)) + return 0.; + else { + tile = ToT(rng, [inner_rng = inner_dims(tix)](auto&&) { + return T(inner_rng, 0.1); + }); + return tile.norm(); + } + }; + + auto& world = get_default_world(); + auto array = make_array(world, trange, make_tile); + + // manually compute the volume of array + size_t vol = 0; + for (auto&& tix : trange.tiles_range()) + if (!zero_tile(tix)) + vol += trange.tile(tix).volume() * inner_dims(tix).volume(); + + BOOST_REQUIRE(vol == TA::volume(array)); +} + +BOOST_AUTO_TEST_CASE(reduction) { + using Numeric = double; + using T = Tensor; + using ToT = Tensor; + using Policy = SparsePolicy; + using ArrayToT = DistArray; + + auto unit_T = [](Range const& rng) { return T(rng, Numeric{1}); }; + + auto unit_ToT = [unit_T](Range const& rngo, Range const& rngi) { + return ToT(rngo, unit_T(rngi)); + }; + + size_t constexpr nrows = 3; + size_t constexpr ncols = 4; + TiledRange const trange({{0, 2, 5, 7}, {0, 5, 7, 10, 12}}); + TA_ASSERT(trange.tiles_range().extent().at(0) == nrows && + trange.tiles_range().extent().at(1) == ncols, + "Following code depends on this condition."); + + // this Range is used to construct all inner tensors of the tile with + // tile index @c tix. + auto inner_dims = [nrows, ncols](Range::index_type const& tix) -> Range { + static std::array const rows{7, 8, 9}; + static std::array const cols{7, 8, 9, 10}; + + TA_ASSERT(tix.size() == 2, "Only rank-2 tensor expected."); + return Range({rows[tix.at(0) % nrows], cols[tix.at(1) % ncols]}); + }; + + // let's make all 'diagonal' tiles zero + auto zero_tile = [](Range::index_type const& tix) -> bool { + return tix.at(0) == tix.at(1); + }; + + auto make_tile = [inner_dims, // + zero_tile, // + &trange, // + unit_ToT](auto& tile, auto const& rng) { + auto&& tix = trange.element_to_tile(rng.lobound()); + if (zero_tile(tix)) + return 0.; + else { + tile = unit_ToT(rng, inner_dims(tix)); + return tile.norm(); + } + }; + + auto& world = get_default_world(); + + // all non-zero inner tensors of this ToT array are unit (ie all + // inner tensors' elements are 1.) + auto array = make_array(world, trange, make_tile); + + // since all inner tensors are filled with 1. 
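+  // (each such element contributes 1^2 to the squared Frobenius norm, so
+  //  norm2(array) == sqrt(number of stored elements) == sqrt(TA::volume(array)))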
+  double array_norm = std::sqrt(TA::volume(array));
+
+  BOOST_REQUIRE(array_norm == TA::norm2(array));
+  BOOST_REQUIRE(array_norm == std::sqrt(TA::dot(array, array)));
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/tests/dist_op_communicator.cpp b/tests/dist_op_communicator.cpp
index 4eac7a135c..28922e8d6c 100644
--- a/tests/dist_op_communicator.cpp
+++ b/tests/dist_op_communicator.cpp
@@ -30,9 +30,9 @@ struct DistOpFixture {
   DistOpFixture()
       : group_list(),
         world_group_list(),
-        group_did(GlobalFixture::world->unique_obj_id(),
+        group_did(GlobalFixture::world->make_unique_obj_id(),
                   GlobalFixture::world->rank() % 2),
-        world_did(GlobalFixture::world->unique_obj_id(),
+        world_did(GlobalFixture::world->make_unique_obj_id(),
                   GlobalFixture::world->size()) {
     for (ProcessID p = GlobalFixture::world->rank() % 2;
          p < GlobalFixture::world->size(); p += 2)
diff --git a/tests/dist_op_group.cpp b/tests/dist_op_group.cpp
index 8027eab7e1..b4846716f4 100644
--- a/tests/dist_op_group.cpp
+++ b/tests/dist_op_group.cpp
@@ -56,14 +56,14 @@ BOOST_AUTO_TEST_CASE(constructor_empty) {
 
 #if defined(MADNESS_ASSERTIONS_THROW)
   // Check that accessing group data throws exceptions for an empty group.
-  BOOST_CHECK_THROW(empty_group.id(), madness::MadnessException);
-  BOOST_CHECK_THROW(empty_group.get_world(), madness::MadnessException);
-  BOOST_CHECK_THROW(empty_group.rank(), madness::MadnessException);
-  BOOST_CHECK_THROW(empty_group.rank(0), madness::MadnessException);
-  BOOST_CHECK_THROW(empty_group.world_rank(0), madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.id(), madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.get_world(), madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.rank(), madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.rank(0), madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.world_rank(0), madness::MadnessException);
   ProcessID parent, child1, child2;
-  BOOST_CHECK_THROW(empty_group.make_tree(0, parent, child1, child2),
-                    madness::MadnessException);
+  BOOST_CHECK_TA_ASSERT(empty_group.make_tree(0, parent, child1, child2),
+                        madness::MadnessException);
 #endif  // MADNESS_ASSERTIONS_THROW
 }
diff --git a/tests/distributed_storage.cpp b/tests/distributed_storage.cpp
index 9dec84f967..895b734911 100644
--- a/tests/distributed_storage.cpp
+++ b/tests/distributed_storage.cpp
@@ -79,8 +79,8 @@ BOOST_AUTO_TEST_CASE(set_value) {
   BOOST_CHECK_EQUAL(n, t.max_size());
 
   // Check throw for an out-of-range set.
-  BOOST_CHECK_THROW(t.set(t.max_size(), 1), TiledArray::Exception);
-  BOOST_CHECK_THROW(t.set(t.max_size() + 2, 1), TiledArray::Exception);
+  BOOST_CHECK_TA_ASSERT(t.set(t.max_size(), 1), TiledArray::Exception);
+  BOOST_CHECK_TA_ASSERT(t.set(t.max_size() + 2, 1), TiledArray::Exception);
 }
 
 BOOST_AUTO_TEST_CASE(array_operator) {
@@ -97,7 +97,7 @@ BOOST_AUTO_TEST_CASE(array_operator) {
   BOOST_CHECK_EQUAL(n, t.max_size());
 
   // Check throw for an out-of-range set.
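+  // N.B. an assumption, not verified here: BOOST_CHECK_TA_ASSERT presumably
+  // expects TiledArray::Exception only in builds where TA_ASSERT is active,
+  // which would explain why only the first check below was converted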
-  BOOST_CHECK_THROW(t.get(t.max_size()), TiledArray::Exception);
+  BOOST_CHECK_TA_ASSERT(t.get(t.max_size()), TiledArray::Exception);
   BOOST_CHECK_THROW(t.get(t.max_size() + 2), TiledArray::Exception);
 }
diff --git a/tests/eigen.cpp b/tests/eigen.cpp
index bfa4f1a0db..11ca7088b1 100644
--- a/tests/eigen.cpp
+++ b/tests/eigen.cpp
@@ -29,9 +29,16 @@ struct EigenFixture : public TiledRangeFixture {
       : trange(dims.begin(), dims.begin() + 2),
         trange1(dims.begin(), dims.begin() + 1),
         trangeN(dims.begin(), dims.begin() + GlobalFixture::dim),
+        trange_base1(dims_base1.begin(), dims_base1.begin() + 2),
+        trange1_base1(dims_base1.begin(), dims_base1.begin() + 1),
+        trangeN_base1(dims_base1.begin(),
+                      dims_base1.begin() + GlobalFixture::dim),
         array(*GlobalFixture::world, trange),
         array1(*GlobalFixture::world, trange1),
         arrayN(*GlobalFixture::world, trangeN),
+        array_base1(*GlobalFixture::world, trange_base1),
+        array1_base1(*GlobalFixture::world, trange1_base1),
+        arrayN_base1(*GlobalFixture::world, trangeN_base1),
         matrix(dims[0].elements_range().second,
                dims[1].elements_range().second),
         rmatrix(dims[0].elements_range().second,
@@ -43,9 +50,15 @@ struct EigenFixture : public TiledRangeFixture {
   TiledRange trange;
   TiledRange trange1;
   TiledRange trangeN;
+  TiledRange trange_base1;   // base-1 version of trange
+  TiledRange trange1_base1;  // base-1 version of trange1
+  TiledRange trangeN_base1;  // base-1 version of trangeN
   TArrayI array;
   TArrayI array1;
   TArrayI arrayN;
+  TArrayI array_base1;   // base-1 version of array
+  TArrayI array1_base1;  // base-1 version of array1
+  TArrayI arrayN_base1;  // base-1 version of arrayN
   Eigen::MatrixXi matrix;
   EigenMatrixXi rmatrix;
   Eigen::VectorXi vector;
@@ -172,15 +185,23 @@ BOOST_AUTO_TEST_CASE(matrix_to_array) {
       (array = eigen_to_array(*GlobalFixture::world, trange, matrix)));
 
   // Check that the data in array is equal to that in matrix
-  for (Range::const_iterator it = array.tiles_range().begin();
-       it != array.tiles_range().end(); ++it) {
-    Future tile = array.find(*it);
-    for (Range::const_iterator tile_it = tile.get().range().begin();
-         tile_it != tile.get().range().end(); ++tile_it) {
-      BOOST_CHECK_EQUAL(tile.get()[*tile_it],
-                        matrix((*tile_it)[0], (*tile_it)[1]));
+  auto test = [&](const auto& array, auto base = 0) {
+    for (Range::const_iterator it = array.tiles_range().begin();
+         it != array.tiles_range().end(); ++it) {
+      Future tile = array.find(*it);
+      for (Range::const_iterator tile_it = tile.get().range().begin();
+           tile_it != tile.get().range().end(); ++tile_it) {
+        BOOST_CHECK_EQUAL(tile.get()[*tile_it],
+                          matrix((*tile_it)[0] - base, (*tile_it)[1] - base));
+      }
     }
-  }
+  };
+  test(array, 0);
+
+  // same with base-1
+  BOOST_CHECK_NO_THROW((array_base1 = eigen_to_array(
+                            *GlobalFixture::world, trange_base1, matrix)));
+  test(array_base1, 1);
 }
 
 BOOST_AUTO_TEST_CASE(vector_to_array) {
@@ -193,14 +214,23 @@ BOOST_AUTO_TEST_CASE(vector_to_array) {
                            trange1, vector)));
 
   // Check that the data in array matches the data in vector
-  for (Range::const_iterator it = array1.tiles_range().begin();
-       it != array1.tiles_range().end(); ++it) {
-    Future tile = array1.find(*it);
-    for (Range::const_iterator tile_it = tile.get().range().begin();
-         tile_it != tile.get().range().end(); ++tile_it) {
-      BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0]));
+  auto test = [&](const auto& array1, auto base = 0) {
+    for (Range::const_iterator it = array1.tiles_range().begin();
+         it != array1.tiles_range().end(); ++it) {
+      Future tile = array1.find(*it);
+      for (Range::const_iterator tile_it = tile.get().range().begin();
+           tile_it != tile.get().range().end(); ++tile_it) {
+        BOOST_CHECK_EQUAL(tile.get()[*tile_it], vector((*tile_it)[0] - base));
+      }
     }
-  }
+  };
+
+  test(array1, 0);
+
+  // same with base-1
+  BOOST_CHECK_NO_THROW((array1_base1 = eigen_to_array(
+                            *GlobalFixture::world, trange1_base1, vector)));
+  test(array1_base1, 1);
 }
 
 BOOST_AUTO_TEST_CASE(array_to_matrix) {
@@ -208,168 +238,180 @@ BOOST_AUTO_TEST_CASE(array_to_matrix) {
     return array_to_eigen, DensePolicy, Eigen::RowMajor>(array);
   };
 
-  if (GlobalFixture::world->size() == 1) {
-    // Fill the array with random data
-    GlobalFixture::world->srand(27);
-    for (Range::const_iterator it = array.tiles_range().begin();
-         it != array.tiles_range().end(); ++it) {
-      TArrayI::value_type tile(array.trange().make_tile_range(*it));
-      for (TArrayI::value_type::iterator tile_it = tile.begin();
-           tile_it != tile.end(); ++tile_it) {
-        *tile_it = GlobalFixture::world->rand();
+  for (auto base : {0, 1}) {
+    auto& arr = base == 1 ? array_base1 : array;
+
+    if (GlobalFixture::world->size() == 1) {
+      // Fill the array with random data
+      GlobalFixture::world->srand(27);
+      for (Range::const_iterator it = arr.tiles_range().begin();
+           it != arr.tiles_range().end(); ++it) {
+        TArrayI::value_type tile(arr.trange().make_tile_range(*it));
+        for (TArrayI::value_type::iterator tile_it = tile.begin();
+             tile_it != tile.end(); ++tile_it) {
+          *tile_it = GlobalFixture::world->rand();
+        }
+        arr.set(*it, tile);
       }
-      array.set(*it, tile);
-    }
-
-    // Convert the array to an Eigen matrices: column-major (matrix) and
-    // row-major (rmatrix)
-    BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array));
-    BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array));
-    // Check that the matrix dimensions are the same as the array
-    BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0));
-    BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1));
-    BOOST_CHECK_EQUAL(rmatrix.rows(),
-                      array.trange().elements_range().extent(0));
-    BOOST_CHECK_EQUAL(rmatrix.cols(),
-                      array.trange().elements_range().extent(1));
-
-    // Check that the data in matrix matches the data in array
-    for (Range::const_iterator it = array.tiles_range().begin();
-         it != array.tiles_range().end(); ++it) {
-      Future tile = array.find(*it);
-      for (Range::const_iterator tile_it = tile.get().range().begin();
-           tile_it != tile.get().range().end(); ++tile_it) {
-        BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]),
-                          tile.get()[*tile_it]);
-        BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]),
-                          tile.get()[*tile_it]);
+      // Convert the array to Eigen matrices: column-major (matrix) and
+      // row-major (rmatrix)
+      BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr));
+      BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr));
+
+      // Check that the matrix dimensions are the same as the array
+      BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0));
+      BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1));
+      BOOST_CHECK_EQUAL(rmatrix.rows(),
+                        arr.trange().elements_range().extent(0));
+      BOOST_CHECK_EQUAL(rmatrix.cols(),
+                        arr.trange().elements_range().extent(1));
+
+      // Check that the data in matrix matches the data in array
+      for (Range::const_iterator it = arr.tiles_range().begin();
+           it != arr.tiles_range().end(); ++it) {
+        Future tile = arr.find(*it);
+        for (Range::const_iterator tile_it =
tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array), TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr), TiledArray::Exception); + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - array.set(*it, tile); - } - - // Distribute the data of array1 to all nodes - array.make_replicated(); - - BOOST_CHECK(array.pmap()->is_replicated()); - - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(matrix = array_to_eigen(array)); - BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(array)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(matrix.rows(), array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(matrix.cols(), array.trange().elements_range().extent(1)); - BOOST_CHECK_EQUAL(rmatrix.rows(), - array.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(rmatrix.cols(), - array.trange().elements_range().extent(1)); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - BOOST_CHECK(array.is_local(*it)); - - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(matrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0], (*tile_it)[1]), - tile.get()[*tile_it]); + // Distribute the data of array1 to all nodes + arr.make_replicated(); + + BOOST_CHECK(arr.pmap()->is_replicated()); + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(matrix = array_to_eigen(arr)); + BOOST_CHECK_NO_THROW(rmatrix = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(matrix.rows(), arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(matrix.cols(), arr.trange().elements_range().extent(1)); + BOOST_CHECK_EQUAL(rmatrix.rows(), + arr.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(rmatrix.cols(), + arr.trange().elements_range().extent(1)); + + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + 
BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(matrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rmatrix((*tile_it)[0] - base, (*tile_it)[1] - base), + tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(array_to_vector) { - if (GlobalFixture::world->size() == 1) { - // Fill the array with random data - GlobalFixture::world->srand(27); - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr1 = base == 1 ? array1_base1 : array1; + + if (GlobalFixture::world->size() == 1) { + // Fill the array with random data + GlobalFixture::world->srand(27); + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); - - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); + + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } - } - } else { - // Check that eigen_to_array throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen(array1), TiledArray::Exception); - - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = array1.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = array1.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(array1.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + } else { + // Check that eigen_to_array throws when there is more than one node + BOOST_CHECK_THROW(array_to_eigen(arr1), TiledArray::Exception); + + 
// Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr1.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr1.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr1.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr1.set(*it, tile); } - array1.set(*it, tile); - } - // Distribute the data of array1 to all nodes - array1.make_replicated(); + // Distribute the data of array1 to all nodes + arr1.make_replicated(); - BOOST_CHECK(array1.pmap()->is_replicated()); + BOOST_CHECK(arr1.pmap()->is_replicated()); - // Convert the array to an Eigen vector - BOOST_CHECK_NO_THROW(vector = array_to_eigen(array1)); + // Convert the array to an Eigen vector + BOOST_CHECK_NO_THROW(vector = array_to_eigen(arr1)); - // Check that the matrix dimensions are the same as the array - BOOST_CHECK_EQUAL(vector.rows(), - array1.trange().elements_range().extent(0)); - BOOST_CHECK_EQUAL(vector.cols(), 1); + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL(vector.rows(), + arr1.trange().elements_range().extent(0)); + BOOST_CHECK_EQUAL(vector.cols(), 1); - // Check that the data in vector matches the data in array - for (Range::const_iterator it = array1.tiles_range().begin(); - it != array1.tiles_range().end(); ++it) { - BOOST_CHECK(array1.is_local(*it)); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr1.tiles_range().begin(); + it != arr1.tiles_range().end(); ++it) { + BOOST_CHECK(arr1.is_local(*it)); - Future tile = array1.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(vector((*tile_it)[0]), tile.get()[*tile_it]); + Future tile = arr1.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(vector((*tile_it)[0] - base), tile.get()[*tile_it]); + } } } - } + + } // base=0,1 } BOOST_AUTO_TEST_CASE(subtensor_to_tensor) { @@ -421,26 +463,35 @@ BOOST_AUTO_TEST_CASE(tensor_to_array) { decltype(tensor) tensor_copy; if (GlobalFixture::world->rank() == 1) tensor_copy = tensor; GlobalFixture::world->gop.broadcast_serializable(tensor_copy, 1); +// Eigen::TensorBase::operator== is ambiguously defined in C++20 +#if __cplusplus >= 202002L + Eigen::Tensor eq = ((tensor - tensor_copy).abs() == 0).all(); +#else Eigen::Tensor eq = (tensor == tensor_copy).all(); +#endif BOOST_CHECK(eq() == true); } - // Copy matrix to array - BOOST_CHECK_NO_THROW((array = eigen_tensor_to_array( - *GlobalFixture::world, trangeN, tensor))); - - // Check that the data in array is equal to that in matrix - for (Range::const_iterator it = array.tiles_range().begin(); - it != array.tiles_range().end(); ++it) { - Future tile = array.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - std::array idx; - auto& t_idx = *tile_it; - std::copy(t_idx.begin(), t_idx.end(), idx.begin()); - BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + for (auto base : {0, 1}) { + auto& tr = base == 1 ? trangeN_base1 : trangeN; + auto& arr = base == 1 ? 
arrayN_base1 : arrayN; + // Copy matrix to array + BOOST_CHECK_NO_THROW((arr = eigen_tensor_to_array( + *GlobalFixture::world, tr, tensor))); + + // Check that the data in array is equal to that in matrix + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + auto& t_idx = *tile_it; + std::array idx; + for (auto d = 0; d != GlobalFixture::dim; ++d) idx[d] = t_idx[d] - base; + BOOST_CHECK_EQUAL(tile.get()[*tile_it], tensor(idx)); + } } - } + } // base } BOOST_AUTO_TEST_CASE(array_to_tensor) { @@ -457,57 +508,70 @@ BOOST_AUTO_TEST_CASE(array_to_tensor) { return result; }; - // Fill local tiles with data - GlobalFixture::world->srand(27); - TArrayI::pmap_interface::const_iterator it = arrayN.pmap()->begin(); - TArrayI::pmap_interface::const_iterator end = arrayN.pmap()->end(); - for (; it != end; ++it) { - TArrayI::value_type tile(arrayN.trange().make_tile_range(*it)); - for (TArrayI::value_type::iterator tile_it = tile.begin(); - tile_it != tile.end(); ++tile_it) { - *tile_it = GlobalFixture::world->rand(); + for (auto base : {0, 1}) { + auto& arr = base == 1 ? arrayN_base1 : arrayN; + + auto to_base0 = [&](const auto& arr) { + std::array result; + for (int i = 0; i < GlobalFixture::dim; ++i) result[i] = arr[i] - base; + return result; + }; + + // Fill local tiles with data + GlobalFixture::world->srand(27); + TArrayI::pmap_interface::const_iterator it = arr.pmap()->begin(); + TArrayI::pmap_interface::const_iterator end = arr.pmap()->end(); + for (; it != end; ++it) { + TArrayI::value_type tile(arr.trange().make_tile_range(*it)); + for (TArrayI::value_type::iterator tile_it = tile.begin(); + tile_it != tile.end(); ++tile_it) { + *tile_it = GlobalFixture::world->rand(); + } + arr.set(*it, tile); } - arrayN.set(*it, tile); - } - if (GlobalFixture::world->size() > 1) { - // Check that array_to_eigen_tensor throws when there is more than one node - BOOST_CHECK_THROW(array_to_eigen_tensor(arrayN), - TiledArray::Exception); - } + if (GlobalFixture::world->size() > 1) { + // Check that array_to_eigen_tensor throws when there is more than one + // node + BOOST_CHECK_THROW(array_to_eigen_tensor(arr), + TiledArray::Exception); + } - // Distribute the data of arrayN to all nodes - if (GlobalFixture::world->size() > 1) { - arrayN.make_replicated(); - BOOST_CHECK(arrayN.pmap()->is_replicated()); - } + // Distribute the data of arrayN to all nodes + if (GlobalFixture::world->size() > 1) { + arr.make_replicated(); + BOOST_CHECK(arr.pmap()->is_replicated()); + } + + // Convert the array to an Eigen matrix + BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arr)); + BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arr)); + + // Check that the matrix dimensions are the same as the array + BOOST_CHECK_EQUAL_COLLECTIONS( + tensor.dimensions().begin(), tensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + arr.trange().elements_range().extent().end()); + BOOST_CHECK_EQUAL_COLLECTIONS( + rtensor.dimensions().begin(), rtensor.dimensions().end(), + arr.trange().elements_range().extent().begin(), + arr.trange().elements_range().extent().end()); - // Convert the array to an Eigen matrix - BOOST_CHECK_NO_THROW(tensor = array_to_eigen_tensor(arrayN)); - BOOST_CHECK_NO_THROW(rtensor = a_to_e_rowmajor(arrayN)); - - // Check that the matrix dimensions are the same as the array - 
BOOST_CHECK_EQUAL_COLLECTIONS( - tensor.dimensions().begin(), tensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - BOOST_CHECK_EQUAL_COLLECTIONS( - rtensor.dimensions().begin(), rtensor.dimensions().end(), - arrayN.trange().elements_range().extent().begin(), - arrayN.trange().elements_range().extent().end()); - - // Check that the data in vector matches the data in array - for (Range::const_iterator it = arrayN.tiles_range().begin(); - it != arrayN.tiles_range().end(); ++it) { - BOOST_CHECK(arrayN.is_local(*it)); - - Future tile = arrayN.find(*it); - for (Range::const_iterator tile_it = tile.get().range().begin(); - tile_it != tile.get().range().end(); ++tile_it) { - BOOST_CHECK_EQUAL(tensor(to_array(*tile_it)), tile.get()[*tile_it]); - BOOST_CHECK_EQUAL(rtensor(to_array(*tile_it)), tile.get()[*tile_it]); + // Check that the data in vector matches the data in array + for (Range::const_iterator it = arr.tiles_range().begin(); + it != arr.tiles_range().end(); ++it) { + BOOST_CHECK(arr.is_local(*it)); + + Future tile = arr.find(*it); + for (Range::const_iterator tile_it = tile.get().range().begin(); + tile_it != tile.get().range().end(); ++tile_it) { + BOOST_CHECK_EQUAL(tensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + BOOST_CHECK_EQUAL(rtensor(to_base0(to_array(*tile_it))), + tile.get()[*tile_it]); + } } - } + } // base=0,1 } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/einsum.cpp b/tests/einsum.cpp index 1c0172e554..6be4a4a99d 100644 --- a/tests/einsum.cpp +++ b/tests/einsum.cpp @@ -25,6 +25,430 @@ #include "TiledArray/expressions/contraction_helpers.h" +BOOST_AUTO_TEST_SUITE(manual) + +namespace { +using il_trange = std::initializer_list>; +using il_extent = std::initializer_list; +} // namespace + +template >> +bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { + auto out = TA::einsum(annot, A, B); + auto ref = manual_eval(annot, A, B); + + using Policy = typename decltype(out)::policy_type; + if constexpr (ShapeCompFlag == ShapeComp::True && + std::is_same_v) { + out.truncate(); + } + return ToTArrayFixture::are_equal(ref, out); +} + +template >> +bool check_manual_eval(std::string const& annot, ArrayA A, ArrayB B) { + return check_manual_eval(annot, A, B); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB) { + static_assert(detail::is_array_v && + detail::is_tensor_v); + auto A = random_array(TA::TiledRange(trangeA)); + auto B = random_array(TA::TiledRange(trangeB)); + return check_manual_eval(annot, A, B); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB) { + return check_manual_eval(annot, trangeA, + trangeB); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extents) { + static_assert(detail::is_array_v); + + if constexpr (detail::is_tensor_of_tensor_v) { + static_assert(!detail::is_tensor_of_tensor_v); + return check_manual_eval( + annot, random_array(trangeA, inner_extents), + random_array(trangeB)); + } else { + static_assert(detail::is_tensor_of_tensor_v); + return check_manual_eval( + annot, random_array(trangeA), + random_array(trangeB, inner_extents)); + } +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extents) { + return check_manual_eval( + annot, trangeA, trangeB); +} + +template +bool 
check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extentsA, + il_extent inner_extentsB) { + static_assert(detail::is_array_v && + detail::is_tensor_of_tensor_v); + return check_manual_eval( + annot, random_array(trangeA, inner_extentsA), + random_array(trangeB, inner_extentsB)); +} + +template +bool check_manual_eval(std::string const& annot, il_trange trangeA, + il_trange trangeB, il_extent inner_extentsA, + il_extent inner_extentsB) { + return check_manual_eval( + annot, trangeA, trangeB, inner_extentsA, inner_extentsB); +} + +BOOST_AUTO_TEST_CASE(contract) { + using Array = TA::DistArray>; + + BOOST_REQUIRE(check_manual_eval("ij,j->i", + {{0, 2, 4}, {0, 4, 8}}, // A's trange + {{0, 4, 8}} // B's trange + )); + BOOST_REQUIRE(check_manual_eval("ik,jk->ji", + {{0, 2, 4}, {0, 4, 8}}, // A's trange + {{0, 3}, {0, 4, 8}} // B's trange + )); + + BOOST_REQUIRE(check_manual_eval( + "ijkl,jm->lkmi", // + {{0, 2}, {0, 4, 8}, {0, 3}, {0, 7}}, // + {{0, 4, 8}, {0, 5}} // + )); +} + +BOOST_AUTO_TEST_CASE(hadamard) { + using Array = TA::DistArray>; + BOOST_REQUIRE(check_manual_eval("i,i->i", // + {{0, 1}}, // + {{0, 1}} // + )); + BOOST_REQUIRE(check_manual_eval("i,i->i", // + {{0, 2, 4}}, // + {{0, 2, 4}} // + )); + + BOOST_REQUIRE(check_manual_eval("ijk,kij->ikj", // + {{0, 2, 4}, {0, 2, 3}, {0, 5}}, // + {{0, 5}, {0, 2, 4}, {0, 2, 3}} // + )); +} + +BOOST_AUTO_TEST_CASE(general) { + using Array = TA::DistArray>; + BOOST_REQUIRE(check_manual_eval("ijk,kil->ijl", // + {{0, 2}, {0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 2}, {0, 1}} // + )); + using Tensor = typename Array::value_type; + using namespace std::string_literals; + + Tensor A(TA::Range{2, 3}, {1, 2, 3, 4, 5, 6}); + Tensor B(TA::Range{2}, {2, 10}); + Tensor C(TA::Range{2, 3}, {2, 4, 6, 40, 50, 60}); + BOOST_REQUIRE( + C == general_product(A, B, ProductSetup("ij"s, "i"s, "ij"s))); +} + +BOOST_AUTO_TEST_CASE(equal_nested_ranks) { + using ArrayToT = TA::DistArray>>; + + // H;H (Hadamard outer; Hadamard inner) + BOOST_REQUIRE(check_manual_eval("ij;mn,ji;nm->ij;mn", // + {{0, 2, 4}, {0, 3}}, // + {{0, 3}, {0, 2, 4}}, // + {5, 7}, // + {7, 5} // + )); + + // H;C (Hadamard outer; contraction inner) + BOOST_REQUIRE(check_manual_eval("ij;mo,ji;on->ij;mn", // + {{0, 2, 4}, {0, 3}}, // + {{0, 3}, {0, 2, 4}}, // + {3, 7}, // + {7, 4} // + )); + + // H;C + BOOST_REQUIRE(check_manual_eval("ij;mo,ji;o->ij;m", // + {{0, 2, 4}, {0, 3}}, // + {{0, 3}, {0, 2, 4}}, // + {3, 7}, // + {7} // + )); + + // C;C + BOOST_REQUIRE(check_manual_eval("ik;mo,kj;on->ij;mn", // + {{0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 2}}, // + {2, 2}, // + {2, 2})); + + // C;C + BOOST_REQUIRE(check_manual_eval("ijk;dcb,ik;bc->ij;d", // + {{0, 3}, {0, 4}, {0, 5}}, // + {{0, 3}, {0, 5}}, // + {2, 3, 4}, // + {4, 3})); + + // H+C;H + BOOST_REQUIRE(check_manual_eval("ijk;mn,ijk;nm->ij;mn", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {2, 2}, // + {2, 2})); + + // H+C;C + BOOST_REQUIRE(check_manual_eval("ijk;mo,ijk;no->ij;nm", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {3, 2}, // + {3, 2})); + + // H+C;C + BOOST_REQUIRE(check_manual_eval("ijk;m,ijk;n->ij;nm", // + {{0, 2}, {0, 3}, {0, 2}}, // + {{0, 2}, {0, 3}, {0, 2}}, // + {3}, // + {2})); + // H+C;H+C not supported + + // H;C(op) + BOOST_REQUIRE(check_manual_eval( + "ijk;bc,j;d->kji;dcb", {{0, 1}, {0, 1}, {0, 1}}, {{0, 1}}, {2, 3}, {4})); +} + +BOOST_AUTO_TEST_CASE(different_nested_ranks) { + using ArrayT = TA::DistArray>; + using ArrayToT = 
TA::DistArray>>; + + { + // these tests do not involve permutation of inner tensors + // H + BOOST_REQUIRE( + (check_manual_eval("ij;mn,ji->ji;mn", // + {{0, 2, 5}, {0, 3, 5, 9}}, // + {{0, 3, 5, 9}, {0, 2, 5}}, // + {2, 1}))); + + // H (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ji,ij;mn->ji;mn", // + {{0, 3, 5, 9}, {0, 2, 5}}, // + {{0, 2, 5}, {0, 3, 5, 9}}, // + {2, 4}))); + + // C (outer product) + BOOST_REQUIRE((check_manual_eval("i;mn,j->ij;mn", // + {{0, 5}}, // + {{0, 3, 8}}, // + {3, 2}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,i;mn->ij;mn", // + {{0, 3, 8}}, // + {{0, 5}}, // + {2, 2}))); + } + + // C (outer product) + BOOST_REQUIRE((check_manual_eval("ik;mn,j->ijk;nm", // + {{0, 2, 4}, {0, 4}}, // + {{0, 3, 5}}, // + {3, 2}))); + + // C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("jl,ik;mn->ijkl;nm", // + {{0, 3, 5}, {0, 3}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}))); + + // H+C (outer product) + BOOST_REQUIRE((check_manual_eval("ij;mn,ik->ijk;nm", // + {{0, 2, 5}, {0, 3, 7}}, // + {{0, 2, 5}, {0, 4, 7}}, // + {2, 5}))); + + // H+C (outer product) (reversed arguments) + BOOST_REQUIRE((check_manual_eval("ik,ij;mn->ijk;nm", // + {{0, 2, 5}, {0, 4, 7}}, // + {{0, 2, 5}, {0, 3, 7}}, // + {2, 5}))); + + { + // these tests do not involve permutation of inner tensors + // H+C + BOOST_REQUIRE( + (check_manual_eval("ik;mn,ijk->ij;mn", // + {{0, 2}, {0, 3}}, // + {{0, 2}, {0, 2}, {0, 3}}, // + {2, 2}))); + + // H+C (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ijk,ik;mn->ij;mn", // + {{0, 2}, {0, 2}, {0, 3}}, // + {{0, 2}, {0, 3}}, // + {2, 2}))); + } + + // H + BOOST_REQUIRE((check_manual_eval("ij;mn,ji->ji;nm", // + {{0, 2, 4, 6}, {0, 3}}, // + {{0, 3}, {0, 2, 4, 6}}, // + {4, 2}))); + + // H (reversed arguments) + BOOST_REQUIRE((check_manual_eval("ji,ij;mn->ji;nm", // + {{0, 3, 5}, {0, 2, 4}}, // + {{0, 2, 4}, {0, 3, 5}}, // + {1, 2}))); + + // C + BOOST_REQUIRE((check_manual_eval("ij;m,j->i;m", // + {{0, 5}, {0, 2, 3}}, // + {{0, 2, 3}}, // + {3}))); + + // C (reversed arguments) + BOOST_REQUIRE((check_manual_eval("j,ij;m->i;m", // + {{0, 2}}, // + {{0, 1}, {0, 2}}, // + {3}))); + + // H+C + BOOST_REQUIRE(( + check_manual_eval("ik;mn,ijk->ij;nm", // + {{0, 2}, {0, 3, 5}}, // + {{0, 2}, {0, 2, 4, 6}, {0, 3, 5}}, // + {2, 2}))); + + // H+C (reversed arguments) + BOOST_REQUIRE( + (check_manual_eval("ijk,ik;mn->ij;nm", // + {{0, 2}, {0, 4}, {0, 3}}, // + {{0, 2}, {0, 3}}, // + {2, 4}))); +} + +BOOST_AUTO_TEST_CASE(nested_rank_reduction) { + using T = TA::Tensor; + using ToT = TA::Tensor; + using Array = TA::DistArray; + using ArrayToT = TA::DistArray; + BOOST_REQUIRE( + (check_manual_eval("ij;ab,ij;ab->ij", // + {{0, 2, 4}, {0, 4}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}, // + {3, 2}))); + BOOST_REQUIRE( + (check_manual_eval("ij;ab,ij;ab->i", // + {{0, 2, 4}, {0, 4}}, // + {{0, 2, 4}, {0, 4}}, // + {3, 2}, // + {3, 2}))); +} + +BOOST_AUTO_TEST_CASE(corner_cases) { + using T = TA::Tensor; + using ToT = TA::Tensor; + using ArrayT = TA::DistArray; + using ArrayToT = TA::DistArray; + + BOOST_REQUIRE(check_manual_eval("ia,i->ia", // + {{0, 2, 5}, {0, 7, 11, 16}}, // + {{0, 2, 5}})); + + BOOST_REQUIRE(check_manual_eval("i,ai->ia", // + {{0, 2, 5}}, // + {{0, 7, 11, 16}, {0, 2, 5}})); + + BOOST_REQUIRE(check_manual_eval("ijk,kj->kij", // + {{0, 2, 5}, {0, 3, 6}, {0, 2, 7}}, // + {{0, 2, 7}, {0, 3, 6}})); + + BOOST_REQUIRE(check_manual_eval("kj,ijk->kij", // + {{0, 2, 7}, {0, 3, 6}}, // + {{0, 2, 
5}, {0, 3, 6}, {0, 2, 7}})); + + BOOST_REQUIRE(check_manual_eval("kij;ab,kj;bc->kji;ac", // + {{0, 2}, {0, 3, 5}, {0, 4, 7}}, // + {{0, 2}, {0, 4, 7}}, // + {3, 5}, {5, 2})); + + BOOST_REQUIRE( + (check_manual_eval("ijk;ab,kj->kij;ba", // + {{0, 2}, {0, 4, 6}, {0, 3, 5}}, // + {{0, 3, 5}, {0, 4, 6}}, // + {7, 5}))); + + BOOST_REQUIRE( + (check_manual_eval("ij,jik;ab->kji;ab", // + {{0, 3, 5}, {0, 3, 8}}, // + {{0, 3, 8}, {0, 3, 5}, {0, 2}}, // + {3, 9}))); + + BOOST_REQUIRE(check_manual_eval("bi,bi->i", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}})); + + BOOST_REQUIRE(check_manual_eval("bi;a,bi;a->i;a", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}}, // + {3}, {3})); + + BOOST_REQUIRE( + (check_manual_eval("jk;a,ijk->i;a", // + {{0, 2}, {0, 4}}, // + {{0, 3}, {0, 2}, {0, 4}}, // + {5}))); + + BOOST_REQUIRE((check_manual_eval("bi;a,bi->i;a", // + {{0, 4, 8}, {0, 4}}, // + {{0, 4, 8}, {0, 4}}, // + {8}))); + + BOOST_REQUIRE(check_manual_eval("il;bae,il;e->li;ab", // + {{0, 2}, {0, 4}}, // + {{0, 2}, {0, 4}}, // + {4, 2, 3}, // + {3})); + + BOOST_REQUIRE( + check_manual_eval("ijkl;abecdf,k;e->ijl;bafdc", // + {{0, 2}, {0, 3}, {0, 4}, {0, 5}}, // + {{0, 4}}, // + {2, 3, 6, 4, 5, 7}, // + {6})); +} + +BOOST_AUTO_TEST_SUITE_END() + using namespace TiledArray; using namespace TiledArray::expressions; @@ -580,6 +1004,83 @@ BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mn_times_ji_mn) { BOOST_CHECK(are_equal); } +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_kj_mn) { + using tot_type = DistArray>, DensePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + + auto random_tot = [](TA::Range const& rng) { + TA::Range inner_rng{7, 14}; + TA::Tensor t{inner_rng}; + std::generate(t.begin(), t.end(), []() -> double { + return TA::detail::MakeRandom::generate_value(); + }); + TA::Tensor> result{rng}; + for (auto& e : result) e = t; + return result; + }; + + auto random_tot_darr = [&random_tot](World& world, TiledRange const& tr) { + tot_type result(world, tr); + for (auto it = result.begin(); it != result.end(); ++it) { + auto tile = + TA::get_default_world().taskq.add(random_tot, it.make_range()); + *it = tile; + } + return result; + }; + + TiledRange lhs_trange{{0, 2, 4}, {0, 2, 5}}; + auto lhs = random_tot_darr(world, lhs_trange); + + TiledRange rhs_trange{{0, 2, 4, 6}, {0, 2, 5}}; + auto rhs = random_tot_darr(world, rhs_trange); + tot_type result; + BOOST_REQUIRE_NO_THROW( + result = einsum(lhs("i,j;m,n"), rhs("k,j;m,n"), "i,j,k;m,n")); + + // i,j,k;m,n = i,j;m,n * k,j;m,n + TiledRange ref_result_trange{lhs.trange().dim(0), lhs.trange().dim(1), + rhs.trange().dim(0)}; + tot_type ref_result(world, ref_result_trange); + + // to be able to pull remote tiles make them local AND ready + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + auto make_tile = [&lhs, &rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); + auto rhs_tile_ix = rhs.trange().element_to_tile({k, j}); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = result_tile({i, j, k}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({k, j}); + res_el = lhs_el.mult(rhs_el); // m,n * m,n -> m,n + } + return result_tile; + }; + + using std::begin; + using std::end; + + for 
(auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + *it = world.taskq.add(make_tile, it.make_range()); + } + } + bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_REQUIRE(are_equal); +} + BOOST_AUTO_TEST_CASE(xxx) { using dist_array_t = DistArray>, DensePolicy>; using matrix_il = TiledArray::detail::matrix_il>; @@ -714,7 +1215,394 @@ BOOST_AUTO_TEST_CASE(xxx) { BOOST_CHECK(are_equal); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_CASE(ij_mn_eq_ij_mo_times_ji_on) { + auto& world = TA::get_default_world(); + + using Array = TA::DistArray>, TA::DensePolicy>; + using Perm = TA::Permutation; + + TA::TiledRange lhs_trng{{0, 2, 3}, {0, 1}}; + TA::TiledRange rhs_trng{{0, 1}, {0, 2, 3}}; + TA::Range lhs_inner_rng{1, 1}; + TA::Range rhs_inner_rng{1, 1}; + + auto lhs = random_array(lhs_trng, lhs_inner_rng); + auto rhs = random_array(rhs_trng, rhs_inner_rng); + Array out; + BOOST_REQUIRE_NO_THROW(out("i,j;m,n") = lhs("i,j;m,o") * rhs("j,i;o,n")); +} + +BOOST_AUTO_TEST_CASE(ij_mn_eq_ijk_mo_times_ijk_no) { + using Array = TA::DistArray>, TA::DensePolicy>; + using Ix = typename TA::Range::index1_type; + using namespace std::string_literals; + auto& world = TA::get_default_world(); + + Ix const K = 2; // the extent of contracted outer mode + + TA::Range const inner_rng{3, 7}; + TA::TiledRange const lhs_trng{ + std::initializer_list>{ + {0, 2, 4}, {0, 2}, {0, 2}}}; + TA::TiledRange const rhs_trng(lhs_trng); + TA::TiledRange const ref_trng{lhs_trng.dim(0), lhs_trng.dim(1)}; + TA::Range const ref_inner_rng{3, 3}; // contract(3x7,3x7) -> (3,3) + auto lhs = random_array(lhs_trng, inner_rng); + auto rhs = random_array(rhs_trng, inner_rng); + + // + // manual evaluation: ij;mn = ijk;mo * ijk;no + // + Array ref{world, ref_trng}; + { + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + + auto make_tile = [lhs, rhs, ref_inner_rng](TA::Range const& rng) { + using InnerT = typename Array::value_type::value_type; + typename Array::value_type result_tile{rng}; + + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + InnerT mn; + for (Ix k = 0; k < K; ++k) { + auto lhs_tile = + lhs.find_local(lhs.trange().element_to_tile({i, j, k})) + .get(/*dowork = */ false); + auto rhs_tile = + rhs.find_local(rhs.trange().element_to_tile({i, j, k})) + .get(/*dowork = */ false); + mn.add_to(tensor_contract("mo,no->mn", lhs_tile({i, j, k}), + rhs_tile({i, j, k}))); + } + result_tile({i, j}) = std::move(mn); + } + return result_tile; + }; + using std::begin; + using std::end; + + for (auto it = begin(ref); it != end(ref); ++it) + if (ref.is_local(it.index())) { + auto tile = world.taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + auto out = einsum(lhs("i,j,k;m,o"), rhs("i,j,k;n,o"), "i,j;m,n"); + bool are_equal = ToTArrayFixture::are_equal(ref, out); + + BOOST_CHECK(are_equal); +} + +#ifdef TILEDARRAY_HAS_BTAS +BOOST_AUTO_TEST_CASE(tensor_contract) { + using TensorT = TA::Tensor; + + TA::Range const rng_A{2, 3, 4}; + TA::Range const rng_B{4, 3, 2}; + auto const A = random_tensor(rng_A); + auto const B = random_tensor(rng_B); + + BOOST_CHECK(tensor_contract_equal("ijk,klm->ijlm", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,klm->milj", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,kjm->im", A, B)); + BOOST_CHECK(tensor_contract_equal("ijk,kli->lj", A, B)); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() // einsum_tot + +BOOST_AUTO_TEST_SUITE(einsum_tot_t) + 
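+// N.B. the test names in this suite encode the einsum annotation they
+// exercise: e.g. ilkj_nm_eq_ij_mn_times_kl below checks
+// "i,l,k,j;n,m = i,j;m,n * k,l", a tensor-of-tensors times a plain tensor
+// with the inner modes permuted ([m,n] -> [n,m])
+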
+BOOST_AUTO_TEST_CASE(ilkj_nm_eq_ij_mn_times_kl) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(1), + rhs_trange.dim(0), lhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + // + // i,l,k,j;n,m = i,j;m,n * k,l + // + + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + + // why cannot lhs and rhs be captured by ref? + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto l = res_ix[1]; + auto k = res_ix[2]; + auto j = res_ix[3]; + + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile({k, l}); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = result_tile({i, l, k, j}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({k, l}); + + res_el = tot_type::element_type( + lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{1, 0}); // permute [0,1] -> [1,0] + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = lhs("i,j;m,n") * rhs("k,l")); + + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + + { // reverse the order + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,l,k,j;n,m") = rhs("k,l") * lhs("i,j;m,n")); + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); + } +} + +BOOST_AUTO_TEST_CASE(ijk_mn_eq_ij_mn_times_jk) { + using t_type = DistArray, SparsePolicy>; + using tot_type = DistArray>, SparsePolicy>; + using matrix_il = TiledArray::detail::matrix_il>; + auto& world = TiledArray::get_default_world(); + Tensor lhs_elem_0_0( + Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57}); + Tensor lhs_elem_0_1( + Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 
30, 26, 88, 48, 74}); + Tensor lhs_elem_1_0( + Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89}); + Tensor lhs_elem_1_1( + Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71}); + Tensor lhs_elem_2_0( + Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14}); + Tensor lhs_elem_2_1( + Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24}); + Tensor lhs_elem_3_0( + Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79}); + Tensor lhs_elem_3_1( + Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97}); + matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, + {lhs_elem_1_0, lhs_elem_1_1}, + {lhs_elem_2_0, lhs_elem_2_1}, + {lhs_elem_3_0, lhs_elem_3_1}}; + TiledRange lhs_trange{{0, 2, 4}, {0, 2}}; + tot_type lhs(world, lhs_trange, lhs_il); + + TiledRange rhs_trange{{0, 2}, {0, 2, 4, 6}}; + t_type rhs(world, rhs_trange); + rhs.fill_random(); + + // i,j;m,n * j,k => i,j,k;m,n + TiledRange ref_result_trange{lhs_trange.dim(0), rhs_trange.dim(0), + rhs_trange.dim(1)}; + tot_type ref_result(world, ref_result_trange); + + lhs.make_replicated(); + rhs.make_replicated(); + world.gop.fence(); + + // + // why cannot lhs and rhs be captured by ref? + // + auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + auto k = res_ix[2]; + + auto lhs_tile_ix = lhs.trange().element_to_tile({i, j}); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork = */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile({j, k}); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork = */ false); + + auto& res_el = result_tile({i, j, k}); + auto const& lhs_el = lhs_tile({i, j}); + auto rhs_el = rhs_tile({j, k}); + + res_el = lhs_el.scale(rhs_el); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + ///////////////////////////////////////////////////////// + // ToT * T + + // this is not supported by the expression layer since this is a + // - general product w.r.t. 
outer indices
+  //   - involves ToT * T
+  // tot_type result;
+  // BOOST_REQUIRE_NO_THROW(result("i,j,k;m,n") = lhs("i,j;m,n") * rhs("j,k"));
+
+  // will try to make this work
+  tot_type result = einsum(lhs("i,j;m,n"), rhs("j,k"), "i,j,k;m,n");
+  bool are_equal =
+      ToTArrayFixture::are_equal(result, ref_result);
+  BOOST_REQUIRE(are_equal);
+  {
+    result = einsum(rhs("j,k"), lhs("i,j;m,n"), "i,j,k;m,n");
+    are_equal =
+        ToTArrayFixture::are_equal(result, ref_result);
+    BOOST_REQUIRE(are_equal);
+  }
+}
+
+BOOST_AUTO_TEST_CASE(ij_mn_eq_ji_mn_times_ij) {
+  using t_type = DistArray<Tensor<int>, SparsePolicy>;
+  using tot_type = DistArray<Tensor<Tensor<int>>, SparsePolicy>;
+  using matrix_il = TiledArray::detail::matrix_il<Tensor<int>>;
+  auto& world = TiledArray::get_default_world();
+  Tensor<int> lhs_elem_0_0(
+      Range{7, 2}, {49, 73, 28, 46, 12, 83, 29, 61, 61, 98, 57, 28, 96, 57});
+  Tensor<int> lhs_elem_0_1(
+      Range{7, 2}, {78, 15, 69, 55, 87, 94, 28, 94, 79, 30, 26, 88, 48, 74});
+  Tensor<int> lhs_elem_1_0(
+      Range{7, 2}, {70, 32, 25, 71, 6, 56, 4, 13, 72, 50, 15, 95, 52, 89});
+  Tensor<int> lhs_elem_1_1(
+      Range{7, 2}, {12, 29, 17, 68, 37, 79, 5, 52, 13, 35, 53, 54, 78, 71});
+  Tensor<int> lhs_elem_2_0(
+      Range{7, 2}, {77, 39, 34, 94, 16, 82, 63, 27, 75, 12, 14, 59, 3, 14});
+  Tensor<int> lhs_elem_2_1(
+      Range{7, 2}, {65, 90, 37, 41, 65, 75, 59, 16, 44, 85, 86, 11, 40, 24});
+  Tensor<int> lhs_elem_3_0(
+      Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79});
+  Tensor<int> lhs_elem_3_1(
+      Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97});
+  Tensor<int> lhs_elem_4_0(
+      Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79});
+  Tensor<int> lhs_elem_4_1(
+      Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97});
+  Tensor<int> lhs_elem_5_0(
+      Range{7, 2}, {77, 53, 11, 6, 99, 63, 46, 68, 83, 56, 76, 86, 91, 79});
+  Tensor<int> lhs_elem_5_1(
+      Range{7, 2}, {56, 11, 33, 90, 36, 38, 33, 54, 60, 21, 16, 28, 6, 97});
+  matrix_il lhs_il{{lhs_elem_0_0, lhs_elem_0_1}, {lhs_elem_1_0, lhs_elem_1_1},
+                   {lhs_elem_2_0, lhs_elem_2_1}, {lhs_elem_3_0, lhs_elem_3_1},
+                   {lhs_elem_4_0, lhs_elem_4_1}, {lhs_elem_5_0, lhs_elem_5_1}};
+  TiledRange lhs_trange{{0, 2, 6}, {0, 2}};
+  tot_type lhs(world, lhs_trange, lhs_il);
+
+  TiledRange rhs_trange{{0, 2}, {0, 2, 6}};
+  t_type rhs(world, rhs_trange);
+  rhs.fill_random();
+
+  //
+  // i,j;m,n = j,i;m,n * i,j
+  //
+  TiledRange ref_result_trange{rhs_trange.dim(0), rhs_trange.dim(1)};
+  tot_type ref_result(world, ref_result_trange);
+
+  lhs.make_replicated();
+  rhs.make_replicated();
+  world.gop.fence();
+
+  // why cannot lhs and rhs be captured by ref?
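+  // (a plausible answer, stated here as an assumption: make_tile is
+  //  dispatched to the MADNESS task queue and may execute after the enclosing
+  //  scope has exited, so by-reference captures could dangle; DistArray has
+  //  shallow-copy (handle) semantics, so capturing lhs and rhs by value is
+  //  cheap and safe)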
+ auto make_tile = [lhs, rhs](TA::Range const& rng) { + tot_type::value_type result_tile{rng}; + for (auto&& res_ix : result_tile.range()) { + auto i = res_ix[0]; + auto j = res_ix[1]; + + auto lhs_tile_ix = lhs.trange().element_to_tile({j, i}); + auto lhs_tile = lhs.find_local(lhs_tile_ix).get(/* dowork */ false); + + auto rhs_tile_ix = rhs.trange().element_to_tile({i, j}); + auto rhs_tile = rhs.find_local(rhs_tile_ix).get(/* dowork */ false); + + auto& res_el = result_tile({i, j}); + auto const& lhs_el = lhs_tile({j, i}); + auto rhs_el = rhs_tile({i, j}); + res_el = tot_type::element_type(lhs_el.scale(rhs_el), // scale + TiledArray::Permutation{0, 1} // permute + ); + } + return result_tile; + }; + + using std::begin; + using std::end; + + for (auto it = begin(ref_result); it != end(ref_result); ++it) { + if (ref_result.is_local(it.index())) { + auto tile = TA::get_default_world().taskq.add(make_tile, it.make_range()); + *it = tile; + } + } + + tot_type result; + BOOST_REQUIRE_NO_THROW(result("i,j;m,n") = lhs("j,i;m,n") * rhs("i,j")); + + const bool are_equal = + ToTArrayFixture::are_equal(result, ref_result); + BOOST_CHECK(are_equal); +} + +BOOST_AUTO_TEST_SUITE_END() // einsum_tot_t // Eigen einsum indices BOOST_AUTO_TEST_SUITE(einsum_index, TA_UT_LABEL_SERIAL) @@ -740,7 +1628,7 @@ BOOST_AUTO_TEST_CASE(einsum_index) { BOOST_CHECK((v.range() == Range{src})); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_index #include "TiledArray/einsum/eigen.h" @@ -764,7 +1652,12 @@ bool isApprox(const Eigen::TensorBase& A, Eigen::Tensor r; if constexpr (std::is_integral_v && std::is_integral_v) { +// Eigen::TensorBase::operator== is ambiguously defined in C++20 +#if __cplusplus >= 202002L + r = ((derived(A) - derived(B)).abs() == 0).all(); +#else r = (derived(A) == derived(B)).all(); +#endif } else { // soft floating-point comparison r = ((derived(A) - derived(B)).abs() <= abs_comparison_threshold).all(); } @@ -914,7 +1807,7 @@ BOOST_AUTO_TEST_CASE(einsum_eigen_hji_jih_hj) { BOOST_CHECK(isApprox(reference, result)); } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_eigen // TiledArray einsum expressions BOOST_AUTO_TEST_SUITE(einsum_tiledarray) @@ -986,7 +1879,7 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_abi_cdi_cdab) { "abi,cdi->cdab"); } -BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_ai_abcd) { +BOOST_AUTO_TEST_CASE(einsum_tiledarray_icd_bai_abcd) { einsum_tiledarray_check<3, 3, 4>(random(3, 12, 13), random(14, 15, 3), "icd,bai->abcd"); @@ -1045,6 +1938,13 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_hji_jih_hj) { "hji,jih->hj"); } +BOOST_AUTO_TEST_CASE(einsum_tiledarray_ik_jk_ijk) { + einsum_tiledarray_check<2, 2, 3>(random(7, 5), + random(14, 5), "ik,jk->ijk"); + einsum_tiledarray_check<2, 2, 3>(sparse_zero(7, 5), sparse_zero(14, 5), + "ik,jk->ijk"); +} + BOOST_AUTO_TEST_CASE(einsum_tiledarray_replicated) { einsum_tiledarray_check<3, 3, 3>(replicated(random(7, 14, 3)), random(7, 15, 3), @@ -1093,4 +1993,4 @@ BOOST_AUTO_TEST_CASE(einsum_tiledarray_dot) { // BOOST_CHECK(hik_hkj_hji == hkj_hji_hik); // } -BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() // einsum_tiledarray diff --git a/tests/expressions_btas.cpp b/tests/expressions_btas.cpp index 83ff4b1ed0..7b1ae422ce 100644 --- a/tests/expressions_btas.cpp +++ b/tests/expressions_btas.cpp @@ -23,6 +23,8 @@ * */ +#include + #ifdef TILEDARRAY_HAS_BTAS #include "expressions_fixture.h" diff --git a/tests/expressions_cuda_um.cpp b/tests/expressions_device_um.cpp similarity index 99% rename from 
tests/expressions_cuda_um.cpp rename to tests/expressions_device_um.cpp index a17b749789..d49b425372 100644 --- a/tests/expressions_cuda_um.cpp +++ b/tests/expressions_device_um.cpp @@ -25,9 +25,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include #include #include "unit_test_config.h" @@ -76,8 +76,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { static UMTensor permute_fn(const madness::Future& tensor_f, const Permutation& perm) { - return madness::add_cuda_task(*GlobalFixture::world, permute_task, tensor_f, - perm) + return madness::add_device_task(*GlobalFixture::world, permute_task, + tensor_f, perm) .get(); } @@ -85,7 +85,8 @@ struct UMExpressionsFixture : public TiledRangeFixture { template static Tile make_rand_tile(const typename TA::Range& r) { Tile tile(r); - for (std::size_t i = 0ul; i < tile.size(); ++i) set_random(tile[i]); + for (std::size_t i = 0ul; i < tile.size(); ++i) + set_random(tile.at_ordinal(i)); return tile; } @@ -2489,6 +2490,9 @@ BOOST_AUTO_TEST_CASE(dot) { } BOOST_AUTO_TEST_CASE(dot_permute) { + // loosen the default tolerance + constexpr auto tolerance = 5e-13; + Permutation perm({2, 1, 0}); // Test the dot expression function double result = 0; diff --git a/tests/expressions_fixture.h b/tests/expressions_fixture.h index 3e493b1200..7a7be4c9af 100644 --- a/tests/expressions_fixture.h +++ b/tests/expressions_fixture.h @@ -28,9 +28,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include @@ -59,22 +57,36 @@ struct ExpressionsFixture : public TiledRangeFixture { ExpressionsFixture() : s_tr_1(make_random_sparseshape(tr)), s_tr_2(make_random_sparseshape(tr)), + s_tr_base1_1(make_random_sparseshape(tr_base1)), + s_tr_base1_2(make_random_sparseshape(tr_base1)), s_tr1_1(make_random_sparseshape(trange1)), s_tr1_2(make_random_sparseshape(trange1)), s_tr2(make_random_sparseshape(trange2)), + s_trC(make_random_sparseshape(trangeC)), + s_trC_f(make_random_sparseshape(trangeC_f)), a(*GlobalFixture::world, tr, s_tr_1), b(*GlobalFixture::world, tr, s_tr_2), c(*GlobalFixture::world, tr, s_tr_2), + a_base1(*GlobalFixture::world, tr_base1, s_tr_base1_1), + b_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), + c_base1(*GlobalFixture::world, tr_base1, s_tr_base1_2), + aC(*GlobalFixture::world, trangeC, s_trC), + aC_f(*GlobalFixture::world, trangeC_f, s_trC_f), u(*GlobalFixture::world, trange1, s_tr1_1), v(*GlobalFixture::world, trange1, s_tr1_2), w(*GlobalFixture::world, trange2, s_tr2) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); + random_fill(aC); GlobalFixture::world->gop.fence(); a.truncate(); b.truncate(); + a_base1.truncate(); + b_base1.truncate(); u.truncate(); v.truncate(); } @@ -86,13 +98,22 @@ struct ExpressionsFixture : public TiledRangeFixture { : a(*GlobalFixture::world, tr), b(*GlobalFixture::world, tr), c(*GlobalFixture::world, tr), + a_base1(*GlobalFixture::world, tr_base1), + b_base1(*GlobalFixture::world, tr_base1), + c_base1(*GlobalFixture::world, tr_base1), u(*GlobalFixture::world, trange1), v(*GlobalFixture::world, trange1), - w(*GlobalFixture::world, trange2) { + w(*GlobalFixture::world, trange2), + aC(*GlobalFixture::world, trangeC), + aC_f(*GlobalFixture::world, trangeC_f) { random_fill(a); random_fill(b); + random_fill(a_base1); + random_fill(b_base1); random_fill(u); random_fill(v); + random_fill(aC); + random_fill(aC_f); GlobalFixture::world->gop.fence(); } @@ -213,17 +234,33 @@ struct 
ExpressionsFixture : public TiledRangeFixture { const TiledRange trange1{{0, 2, 5, 10, 17, 28, 41}}; const TiledRange trange2{{0, 2, 5, 10, 17, 28, 41}, {0, 3, 6, 11, 18, 29, 42}}; + // contains empty trange1 + const TiledRange trangeC{TiledRange1{0, 2, 5, 10}, TiledRange1{}, + TiledRange1{0, 2, 7, 11}}; + // like trC, but with all dimension nonempty + const TiledRange trangeC_f{trangeC.dim(0), TiledRange1{0, 4, 7}, + trangeC.dim(2)}; + SparseShape s_tr_1; SparseShape s_tr_2; + SparseShape s_tr_base1_1; + SparseShape s_tr_base1_2; SparseShape s_tr1_1; SparseShape s_tr1_2; SparseShape s_tr2; + SparseShape s_trC; + SparseShape s_trC_f; TArray a; TArray b; TArray c; + TArray a_base1; + TArray b_base1; + TArray c_base1; TArray u; TArray v; TArray w; + TArray aC; + TArray aC_f; }; // ExpressionsFixture #endif // TILEDARRAY_TEST_EXPRESSIONS_FIXTURE_H diff --git a/tests/expressions_impl.h b/tests/expressions_impl.h index 388cdd8e5d..e7c781ccc6 100644 --- a/tests/expressions_impl.h +++ b/tests/expressions_impl.h @@ -31,6 +31,8 @@ constexpr int nrepeats = 5; BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { auto& a = F::a; auto& c = F::c; + auto& aC = F::aC; + auto& a_base1 = F::a_base1; const auto& ca = a; const std::array lobound{{3, 3, 3}}; @@ -64,6 +66,20 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(tensor_factories, F, Fixtures, F) { ca("a,b,c").block(boost::combine(lobound, upbound))); BOOST_CHECK_NO_THROW(c("a,b,c") = ca("a,b,c").block(iv(3, 3, 3), iv(5, 5, 5))); + + BOOST_CHECK_NO_THROW(c("a,b,c") = a_base1("a,b,c").block(lobound, upbound)); + + // make sure that c("abc") = a("abc") does a deep copy + { + BOOST_CHECK_NO_THROW(c("a,b,c") = a("a, b, c")); + for (auto&& idx : c.tiles_range()) { + if (c.is_local(idx) && !c.is_local(idx) && a.is_local(idx) && + !a.is_zero(idx)) { + BOOST_CHECK(c.find_local(idx).get().data() != + a.find_local(idx).get().data()); + } + } + } } BOOST_FIXTURE_TEST_CASE_TEMPLATE(block_tensor_factories, F, Fixtures, F) { @@ -278,6 +294,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(block, F, Fixtures, F) { auto& a = F::a; auto& b = F::b; auto& c = F::c; + auto& a_base1 = F::a_base1; BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -602,6 +619,7 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { for (int repeat = 0; repeat != nrepeats; ++repeat) BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = 2 * a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); BlockRange block_range(a.trange().tiles_range(), {3, 3, 3}, {5, 5, 5}); @@ -670,6 +688,82 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block, F, Fixtures, F) { } } +BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_block_base1, F, Fixtures, F) { + auto& a = F::a; + auto& b = F::b; + auto& c = F::c; + auto& a_base1 = F::a_base1; + auto& c_base1 = F::c_base1; + auto& ntiles = F::ntiles; + + c.fill_local(0.0); + c_base1.fill_local(0.0); + + // block expressions by default have trange lobound (=base) set to 0 ... 
+ // this is done to allow block expressions involving multiple arrays with + // different lobounds all work correctly + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c_base1("a,b,c").block({3, 3, 3}, {5, 5, 5}) = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + BOOST_REQUIRE_NO_THROW(c("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = + a_base1("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c)); + BOOST_REQUIRE_NO_THROW( + c_base1("a,b,c").block({0, 0, 0}, {ntiles, ntiles, ntiles}) = a("a,b,c")); + BOOST_REQUIRE(tile_ranges_match_trange(c_base1)); + + // however user can override the trange lobound using set_trange_lobound + { + decltype(F::c) a_block; + // default trange lobound is 0 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a_base1("a,b,c").block({3, 3, 3}, {5, 5, 5})); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + (Range::index_type{0, 0, 0})); + + // this preserves tile's lobounds, so that tile {0,0,0} in a_block has + // identical range to that of tile {3, 3, 3} in a_base1 + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = a_base1("a,b,c").block( + {3, 3, 3}, {5, 5, 5}, preserve_lobound)); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + a_base1.trange().make_tile_range({3, 3, 3}).lobound()); + // this explicitly makes the trange lobound of a_block to be {1,1,1} + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") = + a("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({1, 1, 1})); + BOOST_REQUIRE_EQUAL(a_block.trange().elements_range().lobound(), + Range::index_type({1, 1, 1})); + // trange of source block is ignored when it is assigned to a block of an + // existing array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c").block({0, 0, 0}, {2, 2, 2}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + // overriding trange of result block is not allowed ... + BOOST_REQUIRE_THROW( + a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({0, 0, 0}) = a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0}), + Exception); + // ... 
unless makes it same as trange lobound of the underlying array + BOOST_REQUIRE_NO_THROW(a_block("a,b,c") + .block({0, 0, 0}, {2, 2, 2}) + .set_trange_lobound({1, 1, 1}) = + a_base1("a,b,c") + .block({3, 3, 3}, {5, 5, 5}) + .set_trange_lobound({0, 0, 0})); + } +} + BOOST_FIXTURE_TEST_CASE_TEMPLATE(assign_subblock_permute_block, F, Fixtures, F) { auto& a = F::a; @@ -2929,6 +3023,58 @@ BOOST_FIXTURE_TEST_CASE_TEMPLATE(inner_product, F, Fixtures, F) { BOOST_CHECK_EQUAL(result, expected); } +// corner case: expressions involving array with empty trange1 +BOOST_FIXTURE_TEST_CASE_TEMPLATE(empty_trange1, F, Fixtures, F) { + auto& c = F::c; + auto& aC = F::aC; + auto& aC_f = F::aC_f; + + // unary/binary expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,b,c")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,b,c").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,c,b")); + BOOST_CHECK_NO_THROW(c("a,b,c") += 2 * aC("a,c,b").conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") *= 2 * aC("a,c,b").conj()); + } + + using TiledArray::eigen::iv; + const std::array lobound{{0, 0, 1}}; + const std::array upbound{{1, 0, 2}}; + + // unary/binary block expressions + { + BOOST_CHECK_NO_THROW(c("a,b,c") = aC("a,b,c").block(lobound, upbound)); + BOOST_CHECK_NO_THROW(c("a,b,c") += + 2 * aC("a,b,c").block(lobound, upbound).conj()); + BOOST_CHECK_NO_THROW(c("a,b,c") = + 2 * conj(aC("a,c,b").block(lobound, upbound))); + } + + // contraction expressions + { + std::decay_t t2, t4; + // contraction over empty dim + BOOST_CHECK_NO_THROW(t4("a,c,e,d") = aC("a,b,c") * aC("d,b,e")); + // contraction over empty and nonempty dims + BOOST_CHECK_NO_THROW(t2("a,d") = aC("a,b,c") * aC("d,b,c")); + // contraction over nonempty dims + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * aC("d,e,c")); + // contraction over nonempty dims, involving expressions with nonzero-volume + BOOST_CHECK_NO_THROW(t4("b,a,e,d") = aC("a,b,c") * (2. 
* aC_f("d,e,c"))); + } + + // reduction expressions + { + // contraction over empty dim + BOOST_CHECK_NO_THROW(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get()); + BOOST_CHECK_EQUAL(aC("a,b,c").dot(2 * aC("a,b,c").conj()).get(), 0); + } +} + BOOST_AUTO_TEST_SUITE_END() #endif // TILEDARRAY_TEST_EXPRESSIONS_IMPL_H diff --git a/tests/expressions_mixed.cpp b/tests/expressions_mixed.cpp index 6b92cec695..bf79d86fc1 100644 --- a/tests/expressions_mixed.cpp +++ b/tests/expressions_mixed.cpp @@ -23,6 +23,7 @@ * */ +#include "TiledArray/special/diagonal_array.h" #include "TiledArray/special/kronecker_delta.h" #include "range_fixture.h" #include "sparse_tile.h" @@ -37,8 +38,8 @@ struct tag {}; struct MixedExpressionsFixture : public TiledRangeFixture { typedef DistArray>, DensePolicy> TArrayDS1; typedef DistArray>, DensePolicy> TArrayDS2; - typedef DistArray, DensePolicy> - ArrayKronDelta1; // will be turned into SparsePolicy next + typedef DistArray ArrayKronDelta; + typedef DistArray SpArrayKronDelta; MixedExpressionsFixture() : u(*GlobalFixture::world, trange2), @@ -46,25 +47,17 @@ struct MixedExpressionsFixture : public TiledRangeFixture { e2(*GlobalFixture::world, trange2e), e4(*GlobalFixture::world, trange4e), v(*GlobalFixture::world, trange2), - w(*GlobalFixture::world, trange2), - delta1(*GlobalFixture::world, trange2, DenseShape(), - std::make_shared( - *GlobalFixture::world, trange2.tiles_range().volume())), - delta1e(*GlobalFixture::world, trange2e, DenseShape(), - std::make_shared( - *GlobalFixture::world, trange2e.tiles_range().volume())) { + w(*GlobalFixture::world, trange2) { random_fill(u); random_fill(v); u2.fill(0); random_fill(e2); e4.fill(0); - init_kronecker_delta(delta1); - init_kronecker_delta(delta1e); GlobalFixture::world->gop.fence(); } - template - static void random_fill(DistArray& array) { + template + static void random_fill(DistArray& array) { array.fill_random(); } @@ -110,10 +103,12 @@ struct MixedExpressionsFixture : public TiledRangeFixture { return matrix; } - template - static void init_kronecker_delta(DistArray& array) { - array.init_tiles( - [=](const TiledArray::Range& range) { return Tile(range); }); + template + static void init_kronecker_delta( + DistArray& array) { + array.init_tiles([=](const TiledArray::Range& range) { + return KroneckerDeltaTile(range); + }); } ~MixedExpressionsFixture() { GlobalFixture::world->gop.fence(); } @@ -132,8 +127,6 @@ struct MixedExpressionsFixture : public TiledRangeFixture { TArrayDS1 v; TArrayDS1 v1; TArrayDS2 w; - ArrayKronDelta1 delta1; - ArrayKronDelta1 delta1e; }; // MixedExpressionsFixture // Instantiate static variables for fixture @@ -183,21 +176,40 @@ BOOST_AUTO_TEST_CASE(mult_factories) { // BOOST_CHECK_NO_THROW(w("a,b") = u("a,b") * v("a,b")); } -BOOST_AUTO_TEST_CASE(outer_product_factories) { +BOOST_AUTO_TEST_CASE(kronecker) { #if !MULT_DENSE_SPARSE_TO_SPARSE // ok BOOST_CHECK_NO_THROW(u2("a,b,c,d") += u("a,b") * v("c,d")); #endif - // these can only work if nproc == 1 since KroneckerDelta does not travel, and - // SUMMA does not support replicated args - if (GlobalFixture::world->nproc() == 1) { - // ok - BOOST_CHECK_NO_THROW(u2("a,b,c,d") += delta1("a,b") * u("c,d")); + // retile test + TSpArrayD x(*GlobalFixture::world, trange2); + random_fill(x); - // ok - BOOST_CHECK_NO_THROW(e4("a,c,b,d") += delta1e("a,b") * e2("c,d")); - } + // includes target tiles that receive contributions from multiple source + // tiles, tiny target tiles with single contribution, and tiles partially and + // completely outside the 
source range +#ifdef TA_SIGNED_1INDEX_TYPE + TA::TiledRange yrange{{-1, 18, 20, 45, 47}, {-1, 20, 22, 45, 47}}; +#else + TA::TiledRange yrange{{5, 18, 20, 45, 47}, {7, 20, 22, 45, 47}}; +#endif + TA::TSpArrayD y1; + // identical to y1 = TA::detail::retile_v1(x, yrange); + TA::TiledRange retiler_range{yrange.dim(0), yrange.dim(1), trange2.dim(0), + trange2.dim(1)}; + SpArrayKronDelta retiler( + *GlobalFixture::world, retiler_range, + SparseShape(detail::kronecker_shape(retiler_range), retiler_range), + std::make_shared( + *GlobalFixture::world, retiler_range.tiles_range().volume())); + init_kronecker_delta(retiler); + y1("d1,d2") = retiler("d1,d2,s1,s2") * x("s1,s2"); + // std::cout << "y1 = " << y1 << std::endl; + + auto y_ref = TA::retile(x, yrange); + // std::cout << "y_ref = " << y_ref << std::endl; + BOOST_CHECK((y1("d1,d2") - y_ref("d1,d2")).norm().get() == 0.); } BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/hash_pmap.cpp b/tests/hash_pmap.cpp index a9b573802c..06d721dceb 100644 --- a/tests/hash_pmap.cpp +++ b/tests/hash_pmap.cpp @@ -24,7 +24,7 @@ using namespace TiledArray; struct HashPmapFixture { - HashPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -33,7 +33,7 @@ struct HashPmapFixture { BOOST_FIXTURE_TEST_SUITE(hash_pmap_suite, HashPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { BOOST_REQUIRE_NO_THROW( TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles)); TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); @@ -50,7 +50,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID* p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -77,7 +77,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::HashPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/index_list.cpp b/tests/index_list.cpp index bf75aaffac..c53bdd9de7 100644 --- a/tests/index_list.cpp +++ b/tests/index_list.cpp @@ -135,11 +135,11 @@ BOOST_AUTO_TEST_CASE(accessors) { BOOST_CHECK_EQUAL(v.at(3), "d"); // check last variable access BOOST_CHECK_EQUAL(v[0], "a"); // check 1st variable access BOOST_CHECK_EQUAL(v[3], "d"); // check last variable access - BOOST_CHECK_THROW(v.at(4), + BOOST_CHECK_TA_ASSERT(v.at(4), #ifdef BOOST_CONTAINER_USE_STD_EXCEPTIONS - std::out_of_range + std::out_of_range #else - boost::container::out_of_range + boost::container::out_of_range #endif ); // check for out of range throw. 
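+  // (editorial aside, an assumption about this macro rather than documented
+  //  behavior: BOOST_CHECK_TA_ASSERT presumably degrades to a no-op when
+  //  TiledArray assertions are compiled out, unlike BOOST_CHECK_THROW, which
+  //  would then report a failure because no exception is thrown)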
} @@ -175,11 +175,11 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(v10.at(2), "c"); BOOST_CHECK_EQUAL(v10.at(3), "d"); - BOOST_CHECK_THROW(IndexList v3(",a,b,c"), - Exception); // check invalid input - BOOST_CHECK_THROW(IndexList v4("a,,b,c"), Exception); - BOOST_CHECK_THROW(IndexList v5(" ,a,b"), Exception); - BOOST_CHECK_THROW(IndexList v6("a, b, , c"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v3(",a,b,c"), + Exception); // check invalid input + BOOST_CHECK_TA_ASSERT(IndexList v4("a,,b,c"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v5(" ,a,b"), Exception); + BOOST_CHECK_TA_ASSERT(IndexList v6("a, b, , c"), Exception); IndexList v7(" a , b, c, d , e e ,f f, g10,h, i "); // check input with // various spacings. @@ -193,7 +193,7 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(v7.at(7), "h"); BOOST_CHECK_EQUAL(v7.at(8), "i"); - BOOST_REQUIRE_THROW( + BOOST_REQUIRE_TA_ASSERT( IndexList v11(""), TiledArray::Exception); // Empty string is not permitted constructor } diff --git a/tests/initializer_list.cpp b/tests/initializer_list.cpp index 884f5c61fd..3f5ad27b80 100644 --- a/tests/initializer_list.cpp +++ b/tests/initializer_list.cpp @@ -198,7 +198,7 @@ BOOST_AUTO_TEST_CASE(scalar) { BOOST_AUTO_TEST_CASE(empty_vector) { vector_il il{}; if (world.rank() == 0) // only rank 0 does the work - BOOST_CHECK_THROW(tiled_range_from_il(il), Exception); + BOOST_CHECK_TA_ASSERT(tiled_range_from_il(il), Exception); } BOOST_AUTO_TEST_CASE(vector) { @@ -471,7 +471,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(vector, T, scalar_type_list) { auto array = array_from_il>(world, tr, il); using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 2.0}), - tile_type(tr.make_tile_range(1), {3.0})}; + tile_type(tr.make_tile_range(1), std::initializer_list{3.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; tile_type tile = array.find(i); @@ -486,7 +486,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(matrix, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; @@ -503,11 +503,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tensor, T, scalar_type_list) { using tile_type = typename TArray::value_type; std::vector corr{tile_type(tr.make_tile_range(0), {1.0, 4.0}), tile_type(tr.make_tile_range(1), {2.0, 3.0, 5.0, 6.0}), - tile_type(tr.make_tile_range(2), {7.0}), + tile_type(tr.make_tile_range(2), std::initializer_list{7.0}), tile_type(tr.make_tile_range(3), {8.0, 9.0}), tile_type(tr.make_tile_range(4), {10.0, 13.0}), tile_type(tr.make_tile_range(5), {11.0, 12.0, 14.0, 15.0}), - tile_type(tr.make_tile_range(6), {16.0}), + tile_type(tr.make_tile_range(6), std::initializer_list{16.0}), tile_type(tr.make_tile_range(7), {17.0, 18.0})}; for (auto i = 0; i < array.size(); ++i) { if (!array.is_local(i)) continue; diff --git a/tests/kmp5_compute_trange1.h b/tests/kmp5_compute_trange1.h deleted file mode 100644 index 1e0d0b9a47..0000000000 --- a/tests/kmp5_compute_trange1.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * This file is a part of TiledArray. 
- * Copyright (C) 2018 Virginia Tech - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Karl Pierce - * Department of Chemistry, Virginia Tech - * - * make_trange1.cpp - * June 7, 2022 - * - */ - -#ifndef TILEDARRAY_COMPUTE_TRANGE1__H -#define TILEDARRAY_COMPUTE_TRANGE1__H - -#include "tiledarray.h" - -namespace TiledArray { - -/// this creates "uniform" TiledRange1 object using same logic as assumed in -/// vector_of_array.h -inline TiledArray::TiledRange1 kmp5_compute_trange1( - std::size_t range_size, std::size_t target_block_size) { - if (range_size > 0) { - std::size_t nblocks = - (range_size + target_block_size - 1) / target_block_size; - auto dv = std::div((int)(range_size + nblocks - 1), (int)nblocks); - auto avg_block_size = dv.quot - 1, num_avg_plus_one = dv.rem + 1; - std::vector hashmarks; - hashmarks.reserve(nblocks + 1); - auto block_counter = 0; - for (auto i = 0; i < num_avg_plus_one; - ++i, block_counter += avg_block_size + 1) { - hashmarks.push_back(block_counter); - } - for (auto i = num_avg_plus_one; i < nblocks; - ++i, block_counter += avg_block_size) { - hashmarks.push_back(block_counter); - } - hashmarks.push_back(range_size); - return TA::TiledRange1(hashmarks.begin(), hashmarks.end()); - } else - return TA::TiledRange1{}; -} - -} // namespace TiledArray - -#endif // TILEDARRAY_COMPUTE_TRANGE1__H diff --git a/tests/librett.cpp b/tests/librett.cpp index 91c5b5b8ad..58093d0e06 100644 --- a/tests/librett.cpp +++ b/tests/librett.cpp @@ -22,9 +22,9 @@ #include -#ifdef TILEDARRAY_HAS_CUDA +#ifdef TILEDARRAY_HAS_DEVICE -#include +#include #include "unit_test_config.h" struct LibreTTFixture { @@ -55,12 +55,18 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { iter++; } } + + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; - cudaMalloc(&a_device, A * A * sizeof(int)); + TiledArray::device::malloc(&a_device, A * A * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * A * sizeof(int)); + TiledArray::device::malloc(&b_device, A * A * sizeof(int)); - cudaMemcpy(a_device, a_host, A * A * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * A * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); std::vector extent({A, A}); TiledArray::extent_to_col_major(extent); @@ -69,19 +75,23 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { TiledArray::permutation_to_col_major(perm); librettHandle plan; - //librettResult_t status; librettResult status; - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); status = librettExecute(plan, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); - librettDestroy(plan); - cudaMemcpy(b_host, b_device, A * A * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, 
b_device, A * A * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); + + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -94,8 +104,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { @@ -109,15 +119,19 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; - cudaMalloc(&a_device, A * B * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * sizeof(int)); - cudaMemcpy(a_device, a_host, A * B * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); librettHandle plan; - //librettResult_t status; librettResult status; std::vector extent({B, A}); @@ -126,16 +140,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); status = librettExecute(plan, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); - librettDestroy(plan); - cudaMemcpy(b_host, b_device, A * B * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); + + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -148,8 +167,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { @@ -165,17 +184,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; - cudaMalloc(&a_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - cudaMemcpy(a_device, a_host, A * B * C * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); // b(j,i,k) = a(i,j,k) librettHandle plan; - //librettResult_t status; librettResult status; std::vector extent3{int(A), int(B), int(C)}; @@ -183,8 +206,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { std::vector perm3{1, 0, 2}; // std::vector perm3{0, 2, 1}; - status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), sizeof(int), - 0, a_device, b_device); + status = librettPlanMeasure(&plan, 3, extent3.data(), perm3.data(), + sizeof(int), q.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -192,7 +215,10 @@ 
BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); status = librettDestroy(plan); @@ -211,8 +237,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_column_major) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { @@ -228,17 +254,21 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + int* a_device; - cudaMalloc(&a_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&a_device, A * B * C * sizeof(int)); int* b_device; - cudaMalloc(&b_device, A * B * C * sizeof(int)); + TiledArray::device::malloc(&b_device, A * B * C * sizeof(int)); - cudaMemcpy(a_device, a_host, A * B * C * sizeof(int), cudaMemcpyHostToDevice); + TiledArray::device::memcpyAsync(a_device, a_host, A * B * C * sizeof(int), + TiledArray::device::MemcpyHostToDevice, + q.stream); // b(j,i,k) = a(i,j,k) librettHandle plan; - //librettResult_t status; librettResult status; std::vector extent({A, B, C}); @@ -247,8 +277,8 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { std::vector perm({1, 0, 2}); TiledArray::permutation_to_col_major(perm); - status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), 0, - a_device, b_device); + status = librettPlanMeasure(&plan, 3, extent.data(), perm.data(), sizeof(int), + q.stream, a_device, b_device); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -256,7 +286,10 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { BOOST_CHECK(status == LIBRETT_SUCCESS); - cudaMemcpy(b_host, b_device, A * B * C * sizeof(int), cudaMemcpyDeviceToHost); + TiledArray::device::memcpyAsync(b_host, b_device, A * B * C * sizeof(int), + TiledArray::device::MemcpyDeviceToHost, + q.stream); + TiledArray::device::streamSynchronize(q.stream); status = librettDestroy(plan); @@ -275,16 +308,16 @@ BOOST_AUTO_TEST_CASE(librett_gpu_mem_nonsym_rank_three_row_major) { free(a_host); free(b_host); - cudaFree(a_device); - cudaFree(b_device); + TiledArray::device::free(a_device); + TiledArray::device::free(b_device); } BOOST_AUTO_TEST_CASE(librett_unified_mem) { int* a_um; - cudaMallocManaged(&a_um, A * A * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * A * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * A * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * A * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -294,8 +327,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - //librettResult_t status; librettResult status; std::vector extent({A, A}); @@ -304,7 +339,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); 
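+  // note: the plan is now created on the fixture's device stream rather than
+  // the legacy default stream 0, so plan execution stays ordered with the
+  // async copies and the streamSynchronize() issued on the same stream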
BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -312,9 +348,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { BOOST_CHECK(status == LIBRETT_SUCCESS); - librettDestroy(plan); + TiledArray::device::streamSynchronize(q.stream); - cudaDeviceSynchronize(); + librettDestroy(plan); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -324,16 +360,16 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem) { } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { int* a_um; - cudaMallocManaged(&a_um, A * B * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * B * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * B * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * B * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -343,8 +379,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - //librettResult_t status; librettResult status; std::vector extent({B, A}); @@ -353,7 +391,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { std::vector perm({1, 0}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 2, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -361,8 +400,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { BOOST_CHECK(status == LIBRETT_SUCCESS); + TiledArray::device::streamSynchronize(q.stream); + librettDestroy(plan); - cudaDeviceSynchronize(); iter = 0; for (std::size_t i = 0; i < B; i++) { @@ -371,16 +411,16 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_nonsym) { iter++; } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { int* a_um; - cudaMallocManaged(&a_um, A * B * C * sizeof(int)); + TiledArray::device::mallocManaged(&a_um, A * B * C * sizeof(int)); int* b_um; - cudaMallocManaged(&b_um, A * B * C * sizeof(int)); + TiledArray::device::mallocManaged(&b_um, A * B * C * sizeof(int)); int iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -392,8 +432,10 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } } + auto q = TiledArray::deviceEnv::instance()->stream(0); + DeviceSafeCall(TiledArray::device::setDevice(q.device)); + librettHandle plan; - //librettResult_t status; librettResult status; // b(k,i,j) = a(i,j,k) @@ -404,7 +446,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { std::vector perm({2, 0, 1}); TiledArray::permutation_to_col_major(perm); - status = librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), 0); + status = + librettPlan(&plan, 3, extent.data(), perm.data(), sizeof(int), q.stream); BOOST_CHECK(status == LIBRETT_SUCCESS); @@ -412,8 +455,9 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { BOOST_CHECK(status == LIBRETT_SUCCESS); + TiledArray::device::streamSynchronize(q.stream); + librettDestroy(plan); - cudaDeviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { @@ -424,8 +468,8 @@ BOOST_AUTO_TEST_CASE(librett_unified_mem_rank_three) { } } } - cudaFree(a_um); - cudaFree(b_um); + TiledArray::device::free(a_um); + TiledArray::device::free(b_um); } BOOST_AUTO_TEST_CASE(librett_um_tensor) { @@ -450,7 +494,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor) { auto b = permute(a, permutation); - 
cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < A; j++) { @@ -482,7 +526,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_nonsym) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < B; i++) { for (std::size_t j = 0; j < A; j++) { @@ -516,7 +560,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < B; j++) { @@ -534,7 +578,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_three) { auto b = permute(a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); iter = 0; for (std::size_t i = 0; i < A; i++) { for (std::size_t j = 0; j < B; j++) { @@ -579,7 +623,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -600,7 +644,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_four) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -654,7 +698,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -679,7 +723,7 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { auto tile_b = permute(tile_a, permutation); - cudaDeviceSynchronize(); + TiledArray::device::deviceSynchronize(); // validate iter = 0; for (std::size_t i = 0; i < a; i++) { @@ -700,4 +744,4 @@ BOOST_AUTO_TEST_CASE(librett_um_tensor_rank_six) { } BOOST_AUTO_TEST_SUITE_END() -#endif // TILEDARRAY_HAS_CUDA +#endif // TILEDARRAY_HAS_DEVICE diff --git a/tests/perm_index.cpp b/tests/perm_index.cpp index 3ba48aa7a1..8a1326d7df 100644 --- a/tests/perm_index.cpp +++ b/tests/perm_index.cpp @@ -49,8 +49,7 @@ const std::array PermIndexFixture::start = { const std::array PermIndexFixture::finish = { {3ul, 5ul, 7ul, 11ul}}; -BOOST_FIXTURE_TEST_SUITE(perm_index_suite, PermIndexFixture, - TA_UT_LABEL_SERIAL) +BOOST_FIXTURE_TEST_SUITE(perm_index_suite, PermIndexFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK_NO_THROW(PermIndex x;); @@ -61,7 +60,7 @@ BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK(!x.data()); // Check that an exception is thrown when using a default constructed object - BOOST_CHECK_THROW(x(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(x(0), TiledArray::Exception); } BOOST_AUTO_TEST_CASE(constructor) { diff --git a/tests/range.cpp b/tests/range.cpp index 1ad294363f..71f20aeb3f 100644 --- a/tests/range.cpp +++ b/tests/range.cpp @@ -19,9 +19,7 @@ #include #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include #include @@ -65,15 +63,37 @@ BOOST_FIXTURE_TEST_SUITE(range_suite, RangeFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(dimension_accessor) { BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound_data(), r.lobound_data() + r.rank(), start.begin(), start.end()); // check start() + BOOST_CHECK_EQUAL_COLLECTIONS(r.lobound().begin(), r.lobound().end(), + start.begin(), start.end()); // check start() + 
BOOST_CHECK_EQUAL(r.lobound(), start); // check start() + BOOST_CHECK_EQUAL(r.lobound(), + (Index{start.begin(), start.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound_data(), r.upbound_data() + r.rank(), finish.begin(), finish.end()); // check finish() + BOOST_CHECK_EQUAL_COLLECTIONS(r.upbound().begin(), r.upbound().end(), + finish.begin(), + finish.end()); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), finish); // check finish() + BOOST_CHECK_EQUAL(r.upbound(), + (Index{finish.begin(), finish.end()})); // check finish() BOOST_CHECK_EQUAL_COLLECTIONS(r.extent_data(), r.extent_data() + r.rank(), size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL_COLLECTIONS(r.extent().begin(), r.extent().end(), + size.begin(), size.end()); // check size() + BOOST_CHECK_EQUAL(r.extent(), size); // check size() + BOOST_CHECK_EQUAL(r.extent(), + (Index{size.begin(), size.end()})); // check size() BOOST_CHECK_EQUAL_COLLECTIONS(r.stride_data(), r.stride_data() + r.rank(), weight.begin(), weight.end()); // check weight() - BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() + BOOST_CHECK_EQUAL_COLLECTIONS(r.stride().begin(), r.stride().end(), + weight.begin(), + weight.end()); // check weight() + BOOST_CHECK_EQUAL(r.stride(), weight); // check weight() + BOOST_CHECK_EQUAL(r.stride(), + (Index{weight.begin(), weight.end()})); // check weight() + BOOST_CHECK_EQUAL(r.volume(), volume); // check volume() for (size_t d = 0; d != r.rank(); ++d) { auto range_d = r.dim(d); BOOST_CHECK_EQUAL(range_d.first, start[d]); @@ -147,10 +167,8 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_REQUIRE_NO_THROW(Range r2(p2, f2)); // uses index containers BOOST_REQUIRE_NO_THROW( Range r(boost::combine(p2, f2))); // uses zipped range of p2 and f2 -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( Range r(ranges::views::zip(p2, f2))); // uses zipped range of p2 and f2 -#endif BOOST_CHECK_THROW(Range r2(f2, p2), Exception); // lobound > upbound Range r2(p2, f2); @@ -168,11 +186,9 @@ BOOST_AUTO_TEST_CASE(constructors) { Range should_be_copy_of_r2( boost::combine(p2, f2)); // uses zipped range of p2 and f2 BOOST_CHECK_EQUAL(r2, should_be_copy_of_r2); -#ifdef TILEDARRAY_HAS_RANGEV3 Range should_be_another_copy_of_r2( ranges::views::zip(p2, f2)); // uses zipped range of p2 and f2 BOOST_CHECK_EQUAL(r2, should_be_another_copy_of_r2); -#endif // test the rest of bound-based ctors { @@ -221,10 +237,8 @@ BOOST_AUTO_TEST_CASE(constructors) { // uses zipped bounds Range r7(boost::combine(std::vector{0, 1, 2}, std::array{4, 6, 8})); BOOST_CHECK_EQUAL(ref, r7); -#ifdef TILEDARRAY_HAS_RANGEV3 -// Range r8(ranges::views::zip(std::array{0, 1, 2}, std::vector{4, 6, 8})); -// BOOST_CHECK_EQUAL(ref, r8); -#endif + // Range r8(ranges::views::zip(std::array{0, 1, 2}, std::vector{4, 6, + // 8})); BOOST_CHECK_EQUAL(ref, r8); // zipped bounds with Eigen vectors { @@ -256,11 +270,9 @@ BOOST_AUTO_TEST_CASE(constructors) { Range r14(boost::combine(iv({0, 1, 2}), iv(iv({0, 1, 2}) + iv(4, 5, 6)))); BOOST_CHECK_EQUAL(ref, r14); -#ifdef TILEDARRAY_HAS_RANGEV3 -// this requires Eigen ~3.4 (3.3.90 docs suggest it should be sufficient) -// Range r15(ranges::views::zip(iv(0, 1, 2), iv(4, 6, 8))); -// BOOST_CHECK_EQUAL(ref, r15); -#endif + // this requires Eigen ~3.4 (3.3.90 docs suggest it should be sufficient) + // Range r15(ranges::views::zip(iv(0, 1, 2), iv(4, 6, 8))); + // BOOST_CHECK_EQUAL(ref, r15); } // container::svector as bounds @@ -322,8 +334,8 @@ BOOST_AUTO_TEST_CASE(constructors) { BOOST_CHECK_EQUAL(r2.volume(), 48); 
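+  // (editorial note: the extents here are 1-(-1)=2, 2-(-2)=4, and 6-0=6,
+  //  hence the expected volume 2*4*6 == 48)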
 }
 #else  // TA_SIGNED_1INDEX_TYPE
-  BOOST_REQUIRE_THROW(Range r2({{-1, 1}, {-2, 2}, {0, 6}}),
-                      TiledArray::Exception);
+  BOOST_REQUIRE_TA_ASSERT(Range r2({{-1, 1}, {-2, 2}, {0, 6}}),
+                          TiledArray::Exception);
 #endif  // TA_SIGNED_1INDEX_TYPE

   // Copy Constructor
@@ -495,6 +507,9 @@ BOOST_AUTO_TEST_CASE(permutation) {
   BOOST_CHECK_EQUAL_COLLECTIONS(r3.stride_data(), r3.stride_data() + r3.rank(),
                                 r2.stride_data(), r2.stride_data() + r2.rank());
   BOOST_CHECK_EQUAL(r3, r2);
+
+  // using null Permutation is allowed
+  BOOST_CHECK_EQUAL(Range(Permutation{}, r1), r1);
 }

 BOOST_AUTO_TEST_CASE(include) {
@@ -678,13 +693,13 @@ BOOST_AUTO_TEST_CASE(serialization) {
       2 * (sizeof(Range) + sizeof(std::size_t) * (4 * GlobalFixture::dim + 1));
   unsigned char* buf = new unsigned char[buf_size];
   madness::archive::BufferOutputArchive oar(buf, buf_size);
-  oar& r;
+  oar & r;
   std::size_t nbyte = oar.size();
   oar.close();

   Range rs;
   madness::archive::BufferInputArchive iar(buf, nbyte);
-  iar& rs;
+  iar & rs;
   iar.close();

   delete[] buf;
diff --git a/tests/range1.cpp b/tests/range1.cpp
index bc2fabdd6c..f8d05ed4c0 100644
--- a/tests/range1.cpp
+++ b/tests/range1.cpp
@@ -65,7 +65,7 @@ BOOST_AUTO_TEST_CASE(constructors) {
   BOOST_CHECK_NO_THROW((Range1{1, 1}));
   BOOST_CHECK_NO_THROW(Range1(1, 1));
   BOOST_CHECK_EQUAL(Range1(1, 1).first, 1);
-  BOOST_CHECK_EQUAL(Range1(1, 1).first, 1);
+  BOOST_CHECK_EQUAL(Range1(1, 1).second, 1);

   BOOST_CHECK_NO_THROW((Range1{-11, 13}));
   BOOST_CHECK_EQUAL(Range1(-11, 13).first, -11);
@@ -86,6 +86,15 @@ BOOST_AUTO_TEST_CASE(accessors) {
   BOOST_CHECK_EQUAL(r.upbound(), 10);
   BOOST_CHECK_NO_THROW(r.extent());
   BOOST_CHECK_EQUAL(r.extent(), 9);
+
+  // corner case: empty range
+  Range1 r1{1, 1};
+  BOOST_CHECK_NO_THROW(r1.lobound());
+  BOOST_CHECK_EQUAL(r1.lobound(), 1);
+  BOOST_CHECK_NO_THROW(r1.upbound());
+  BOOST_CHECK_EQUAL(r1.upbound(), 1);
+  BOOST_CHECK_NO_THROW(r1.extent());
+  BOOST_CHECK_EQUAL(r1.extent(), 0);
 }

 BOOST_AUTO_TEST_CASE(iteration) {
@@ -128,19 +137,56 @@ BOOST_AUTO_TEST_CASE(comparison) {
   BOOST_CHECK(r1 != r4);
 }

+BOOST_AUTO_TEST_CASE(shift) {
+  Range1 r0;
+  Range1 r0_plus_1;
+  BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1));
+  BOOST_CHECK_EQUAL(r0_plus_1, Range1(1, 1));
+  BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1));
+  BOOST_CHECK_EQUAL(r0_plus_1, r0);
+
+  using index1_type = Range1::index1_type;
+  BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits<index1_type>::max() - 1,
+                                std::numeric_limits<index1_type>::max()}
+                             .inplace_shift(1)),
+                        Exception);
+  BOOST_CHECK_TA_ASSERT((Range1{std::numeric_limits<index1_type>::min(),
+                                std::numeric_limits<index1_type>::min() + 1}
+                             .inplace_shift(-1)),
+                        Exception);
+  Range1 tmp;
+  BOOST_CHECK_TA_ASSERT(
+      tmp = (Range1{std::numeric_limits<index1_type>::max() - 1,
+                    std::numeric_limits<index1_type>::max()}
+                 .shift(1)),
+      Exception);
+  BOOST_CHECK_TA_ASSERT(
+      tmp = (Range1{std::numeric_limits<index1_type>::min(),
+                    std::numeric_limits<index1_type>::min() + 1}
+                 .shift(-1)),
+      Exception);
+
+  Range1 r1{1, 3};
+  Range1 r1_minus_1;
+  BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1));
+  BOOST_CHECK_EQUAL(r1_minus_1, Range1(0, 2));
+  BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1));
+  BOOST_CHECK_EQUAL(r1_minus_1, r1);
+}
+
 BOOST_AUTO_TEST_CASE(serialization) {
   Range1 r{1, 10};

   std::size_t buf_size = sizeof(Range1);
   unsigned char* buf = new unsigned char[buf_size];
   madness::archive::BufferOutputArchive oar(buf, buf_size);
-  oar& r;
+  oar & r;
   std::size_t nbyte = oar.size();
   oar.close();

   Range1 rs;
   madness::archive::BufferInputArchive iar(buf, nbyte);
-  iar& rs;
+  iar & rs;
   iar.close();

   delete[] buf;
diff --git a/tests/range_fixture.h
b/tests/range_fixture.h index 3eb9afd611..5a554eab7c 100644 --- a/tests/range_fixture.h +++ b/tests/range_fixture.h @@ -65,37 +65,46 @@ struct RangeFixture { }; struct Range1Fixture { - static const size_t ntiles = 5; + using index1_type = Range1::index1_type; + static const inline size_t ntiles = 5; Range1Fixture() - : a(init_tiling()), - tiles(0, a.size() - 1), - elements(a.front(), a.back()), - tr1(a.begin(), a.end()) {} + : tr1_hashmarks(make_hashmarks()), + a(tr1_hashmarks), + tiles(0, tr1_hashmarks.size() - 1), + elements(tr1_hashmarks.front(), tr1_hashmarks.back()), + tr1(tr1_hashmarks), + tr1_base1(make_hashmarks(1)) {} ~Range1Fixture() {} template - static std::array init_tiling() { - std::array result; - result[0] = 0u; + static std::array make_hashmarks(index1_type offset = 0) { + std::array result; + result[0] = offset; for (std::size_t i = 1; i < D; ++i) result[i] = result[i - 1] + GlobalFixture::primes[i - 1]; return result; } - const std::array a; - const TiledRange1::range_type tiles; - const TiledRange1::range_type elements; - TiledRange1 tr1; + const std::array tr1_hashmarks; + const std::array + a; // copy of tr1_hashmarks, to make legacy tests build + const TiledRange1::range_type tiles; // = tr1.tiles_range() + const TiledRange1::range_type elements; // = tr1.elements_range() + TiledRange1 tr1; // base-0 TiledRange1 std::array tile; + TiledRange1 tr1_base1; // base-1 TiledRange1 }; struct TiledRangeFixtureBase : public Range1Fixture { TiledRangeFixtureBase() { std::fill(dims.begin(), dims.end(), tr1); std::fill(extents.begin(), extents.end(), tr1.extent()); + std::fill(dims_base1.begin(), dims_base1.end(), tr1_base1); } - std::array dims; + std::array dims; // base-0 TiledRange1's + std::array + dims_base1; // base-1 version of dims std::array extents; }; // struct TiledRangeFixtureBase @@ -106,17 +115,21 @@ struct TiledRangeFixture : public RangeFixture, public TiledRangeFixtureBase { TiledRangeFixture() : tiles_range(TiledRangeFixture::index(GlobalFixture::dim, 0), TiledRangeFixture::index(GlobalFixture::dim, 5)), - elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, 0), - TiledRangeFixture::tile_index(GlobalFixture::dim, a[5])), - tr(dims.begin(), dims.end()) {} + elements_range(TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.front()), + TiledRangeFixture::tile_index(GlobalFixture::dim, + tr1_hashmarks.back())), + tr(dims.begin(), dims.end()), + tr_base1(dims_base1.begin(), dims_base1.end()) {} ~TiledRangeFixture() {} static tile_index fill_tile_index(TRangeN::range_type::index::value_type); const TRangeN::range_type tiles_range; - const TRangeN::range_type elements_range; - TRangeN tr; + const TRangeN::range_type elements_range; // elements range of tr + TRangeN tr; // base-0 TiledRangeN + TRangeN tr_base1; // base-1 version of tr }; #endif // TILEDARRAY_RANGE_FIXTURE_H__INCLUDED diff --git a/tests/replicated_pmap.cpp b/tests/replicated_pmap.cpp index 1a06b85ea4..f9c8b45618 100644 --- a/tests/replicated_pmap.cpp +++ b/tests/replicated_pmap.cpp @@ -27,16 +27,13 @@ #include "unit_test_config.h" struct ReplicatedPmapFixture { - ReplicatedPmapFixture() {} - - ~ReplicatedPmapFixture() {} - + constexpr static std::size_t max_ntiles = 10ul; }; // Fixture BOOST_FIXTURE_TEST_SUITE(replicated_pmap_suite, ReplicatedPmapFixture) BOOST_AUTO_TEST_CASE(constructor) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { BOOST_REQUIRE_NO_THROW( TiledArray::detail::ReplicatedPmap 
pmap(*GlobalFixture::world, tiles)); TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); @@ -50,7 +47,7 @@ BOOST_AUTO_TEST_CASE(owner) { const std::size_t rank = GlobalFixture::world->rank(); // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -60,7 +57,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); // Check that the total number of elements in all local groups is equal to @@ -71,7 +68,7 @@ BOOST_AUTO_TEST_CASE(local_size) { } BOOST_AUTO_TEST_CASE(local_group) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::ReplicatedPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/retile.cpp b/tests/retile.cpp index 2d9884e8af..6ac15a48c4 100644 --- a/tests/retile.cpp +++ b/tests/retile.cpp @@ -6,26 +6,100 @@ BOOST_AUTO_TEST_SUITE(retile_suite) BOOST_AUTO_TEST_CASE(retile_tensor) { - TA::detail::matrix_il some_values = { - {0.1, 0.2, 0.3, 0.4, 0.5}, - {0.6, 0.7, 0.8, 0.9, 1.0}, - {1.1, 1.2, 1.3, 1.4, 1.5}, - {1.6, 1.7, 1.8, 1.9, 2.0}, - {2.1, 2.2, 2.3, 2.4, 2.5} - }; - - auto range0 = TA::TiledRange1(0, 3, 5); - auto range1 = TA::TiledRange1(0, 4, 5); - auto trange = TA::TiledRange({range0, range1}); - - TA::TArrayD default_dense(*GlobalFixture::world, some_values); - TA::TSpArrayD default_sparse(*GlobalFixture::world, some_values); - - auto result_dense = retile(default_dense, trange); - auto result_sparse = retile(default_sparse, trange); - - BOOST_CHECK_EQUAL(result_dense.trange(), trange); - BOOST_CHECK_EQUAL(result_sparse.trange(), trange); + TA::detail::matrix_il some_values = {{0.1, 0.2, 0.3, 0.4, 0.5}, + {0.6, 0.7, 0.8, 0.9, 1.0}, + {1.1, 1.2, 1.3, 1.4, 1.5}, + {1.6, 1.7, 1.8, 1.9, 2.0}, + {2.1, 2.2, 2.3, 2.4, 2.5}}; + + auto range0 = TA::TiledRange1(0, 3, 5); + auto range1 = TA::TiledRange1(0, 4, 5); + auto trange = TA::TiledRange({range0, range1}); + + TA::TArrayD default_dense(*GlobalFixture::world, some_values); + TA::TSpArrayD default_sparse(*GlobalFixture::world, some_values); + + auto result_dense = retile(default_dense, trange); + auto result_sparse = retile(default_sparse, trange); + + BOOST_CHECK_EQUAL(result_dense.trange(), trange); + BOOST_CHECK_EQUAL(result_sparse.trange(), trange); +} + +BOOST_AUTO_TEST_CASE(retile_more) { + using Numeric = int; + using T = TA::Tensor; + using ToT = TA::Tensor; + using ArrayT = TA::DistArray; + using ArrayToT = TA::DistArray; + + auto& world = TA::get_default_world(); + + auto const tr_source = TA::TiledRange({{0, 2, 4, 8}, {0, 3, 5}}); + auto const tr_target = TA::TiledRange({{0, 4, 6, 8}, {0, 2, 4, 5}}); + auto const& elem_rng = tr_source.elements_range(); + + BOOST_REQUIRE(elem_rng.volume() == tr_target.elements_range().volume()); + + auto const inner_rng = TA::Range({3, 3}); + + auto rand_tensor = [](auto const& rng) -> T { + return T(rng, [](auto&&) { + return TA::detail::MakeRandom::generate_value(); + }); + }; + + auto set_random_tensor_tile = [rand_tensor](auto& tile, auto const& rng) { + tile = rand_tensor(rng); + return tile.norm(); + }; 
+ + auto rand_tensor_of_tensor = [rand_tensor, + inner_rng](auto const& rng) -> ToT { + return ToT(rng, [rand_tensor, inner_rng](auto&&) { + return rand_tensor(inner_rng); + }); + }; + + auto set_random_tensor_of_tensor_tile = [rand_tensor_of_tensor]( + auto& tile, auto const& rng) { + tile = rand_tensor_of_tensor(rng); + return tile.norm(); + }; + + auto arr_source0 = + TA::make_array(world, tr_source, set_random_tensor_tile); + auto arr_target0 = TA::retile(arr_source0, tr_target); + + auto get_elem = [](auto const& arr, auto const& eix) { + auto tix = arr.trange().element_to_tile(eix); + auto&& tile = arr.find(tix).get(false); + return tile(eix); + }; + + for (auto&& eix : elem_rng) { + auto tix = arr_source0.trange().element_to_tile(eix); + BOOST_REQUIRE(arr_source0.is_zero(tix) == arr_target0.is_zero(tix)); + if (arr_source0.is_zero(tix)) continue; + BOOST_REQUIRE(get_elem(arr_source0, eix) == get_elem(arr_target0, eix)); + } + + auto arr_source = TA::make_array(world, tr_source, + set_random_tensor_of_tensor_tile); + auto arr_target = TA::retile(arr_source, tr_target); + + arr_source.make_replicated(); + arr_target.make_replicated(); + arr_source.truncate(); + arr_target.truncate(); + world.gop.fence(); + + for (auto&& eix : elem_rng) { + auto tix = arr_source.trange().element_to_tile(eix); + BOOST_REQUIRE(arr_source.is_zero(tix) == arr_target.is_zero(tix)); + if (arr_source.is_zero(tix)) continue; + BOOST_REQUIRE(get_elem(arr_source, eix) == get_elem(arr_target, eix)); + } } -BOOST_AUTO_TEST_SUITE_END() \ No newline at end of file +BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/round_robin_pmap.cpp b/tests/round_robin_pmap.cpp index 4851c5b5b1..7c601d4bfd 100644 --- a/tests/round_robin_pmap.cpp +++ b/tests/round_robin_pmap.cpp @@ -25,7 +25,7 @@ using namespace TiledArray; struct RoundRobinPmapFixture { - RoundRobinPmapFixture() {} + constexpr static std::size_t max_ntiles = 10ul; }; // ============================================================================= @@ -51,7 +51,7 @@ BOOST_AUTO_TEST_CASE(owner) { ProcessID *p_owner = new ProcessID[size]; // Check various pmap sizes - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); for (std::size_t tile = 0; tile < tiles; ++tile) { @@ -71,7 +71,7 @@ BOOST_AUTO_TEST_CASE(owner) { } BOOST_AUTO_TEST_CASE(local_size) { - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); std::size_t total_size = pmap.local_size(); @@ -87,7 +87,7 @@ BOOST_AUTO_TEST_CASE(local_size) { BOOST_AUTO_TEST_CASE(local_group) { ProcessID tile_owners[100]; - for (std::size_t tiles = 1ul; tiles < 100ul; ++tiles) { + for (std::size_t tiles = 1ul; tiles < max_ntiles; ++tiles) { TiledArray::detail::RoundRobinPmap pmap(*GlobalFixture::world, tiles); // Check that all local elements map to this rank diff --git a/tests/sparse_shape.cpp b/tests/sparse_shape.cpp index 77ada97028..8bf1c4ae3b 100644 --- a/tests/sparse_shape.cpp +++ b/tests/sparse_shape.cpp @@ -24,9 +24,7 @@ */ #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include "TiledArray/sparse_shape.h" #include "sparse_shape_fixture.h" @@ -49,36 +47,36 @@ BOOST_AUTO_TEST_CASE(default_constructor) { BOOST_CHECK(!x.validate(tr.tiles_range())); BOOST_CHECK_EQUAL(x.init_threshold(), SparseShape::threshold()); - BOOST_CHECK_THROW(x.nnz(), 
Exception); + BOOST_CHECK_TA_ASSERT(x.nnz(), Exception); - BOOST_CHECK_THROW(x[0], Exception); + BOOST_CHECK_TA_ASSERT(x[0], Exception); - BOOST_CHECK_THROW(x.perm(perm), Exception); + BOOST_CHECK_TA_ASSERT(x.perm(perm), Exception); - BOOST_CHECK_THROW(x.scale(2.0), Exception); - BOOST_CHECK_THROW(x.scale(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.scale(2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.scale(2.0, perm), Exception); - BOOST_CHECK_THROW(x.add(y), Exception); - BOOST_CHECK_THROW(x.add(y, 2.0), Exception); - BOOST_CHECK_THROW(x.add(y, perm), Exception); - BOOST_CHECK_THROW(x.add(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.add(2.0), Exception); - BOOST_CHECK_THROW(x.add(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.add(2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.add(2.0, perm), Exception); - BOOST_CHECK_THROW(x.subt(y), Exception); - BOOST_CHECK_THROW(x.subt(y, 2.0), Exception); - BOOST_CHECK_THROW(x.subt(y, perm), Exception); - BOOST_CHECK_THROW(x.subt(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.subt(2.0), Exception); - BOOST_CHECK_THROW(x.subt(2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.subt(2.0, perm), Exception); - BOOST_CHECK_THROW(x.mult(y), Exception); - BOOST_CHECK_THROW(x.mult(y, 2.0), Exception); - BOOST_CHECK_THROW(x.mult(y, perm), Exception); - BOOST_CHECK_THROW(x.mult(y, 2.0, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, 2.0), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.mult(y, 2.0, perm), Exception); - BOOST_CHECK_THROW(x.gemm(y, 2.0, gemm_helper), Exception); - BOOST_CHECK_THROW(x.gemm(y, 2.0, gemm_helper, perm), Exception); + BOOST_CHECK_TA_ASSERT(x.gemm(y, 2.0, gemm_helper), Exception); + BOOST_CHECK_TA_ASSERT(x.gemm(y, 2.0, gemm_helper, perm), Exception); } BOOST_AUTO_TEST_CASE(non_comm_constructor) { @@ -121,9 +119,12 @@ BOOST_AUTO_TEST_CASE(non_comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK(x.nnz() == x.data().size() - zero_tile_count); // use the sparse ctor @@ -194,9 +195,12 @@ BOOST_AUTO_TEST_CASE(comm_constructor) { } } - BOOST_CHECK_CLOSE(x.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + x.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(x.nnz(), x.data().size() - zero_tile_count); // use the sparse ctor @@ -270,7 +274,7 @@ BOOST_AUTO_TEST_CASE(block) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); for (auto lower_it = tr.tiles_range().begin(); lower_it != tr.tiles_range().end(); ++lower_it) { @@ -281,7 +285,7 @@ BOOST_AUTO_TEST_CASE(block) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper)); @@ -321,7 +325,9 @@ BOOST_AUTO_TEST_CASE(block) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -342,16 +348,14 @@ BOOST_AUTO_TEST_CASE(block) { sparse_shape.block(boost::combine(lower, upper))); auto result3 = sparse_shape.block(boost::combine(lower, upper)); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper))); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper)); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -361,7 +365,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; for (auto lower_it = tr.tiles_range().begin(); @@ -373,7 +377,7 @@ BOOST_AUTO_TEST_CASE(block_scale) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = @@ -413,7 +417,9 @@ BOOST_AUTO_TEST_CASE(block_scale) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? 
float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -437,18 +443,16 @@ BOOST_AUTO_TEST_CASE(block_scale) { sparse_shape.block(boost::combine(lower, upper), factor)); auto result3 = sparse_shape.block(boost::combine(lower, upper), factor); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), factor)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), factor); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -458,7 +462,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const auto inv_perm = perm.inv(); for (auto lower_it = tr.tiles_range().begin(); @@ -470,7 +474,7 @@ BOOST_AUTO_TEST_CASE(block_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW(result = sparse_shape.block(lower, upper, perm)); @@ -513,7 +517,9 @@ BOOST_AUTO_TEST_CASE(block_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -536,18 +542,16 @@ BOOST_AUTO_TEST_CASE(block_perm) { sparse_shape.block(boost::combine(lower, upper), perm)); auto result3 = sparse_shape.block(boost::combine(lower, upper), perm); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), perm)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), perm); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -557,7 +561,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { // change default threshold to make sure it's not inherited auto resetter = set_threshold_to_max(); - auto less = std::less(); + auto less_equal = std::less_equal(); const float factor = 3.3; const auto inv_perm = perm.inv(); @@ -570,7 +574,7 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { auto upper = *upper_it; for (auto it = upper.begin(); it != upper.end(); ++it) *it += 1; - if (std::equal(lower.begin(), lower.end(), upper.begin(), less)) { + if (std::equal(lower.begin(), lower.end(), upper.begin(), less_equal)) { // Check that the block function does not throw an exception SparseShape result; BOOST_REQUIRE_NO_THROW( @@ -614,7 +618,9 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { } BOOST_CHECK_CLOSE( result.sparsity(), - float(zero_tile_count) / float(result.data().range().volume()), + result.data().range().volume() > 0 + ? 
float(zero_tile_count) / float(result.data().range().volume()) + : 0, tolerance); // validate other block functions @@ -639,18 +645,16 @@ BOOST_AUTO_TEST_CASE(block_scale_perm) { auto result3 = sparse_shape.block(boost::combine(lower, upper), factor, perm); BOOST_CHECK_EQUAL(result, result3); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_REQUIRE_NO_THROW( sparse_shape.block(ranges::views::zip(lower, upper), factor, perm)); auto result4 = sparse_shape.block(ranges::views::zip(lower, upper), factor, perm); BOOST_CHECK_EQUAL(result, result4); -#endif } else { // Check that block throws an exception with a bad block range - BOOST_CHECK_THROW(sparse_shape.block(lower, upper), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(sparse_shape.block(lower, upper), + TiledArray::Exception); } } } @@ -706,9 +710,12 @@ BOOST_AUTO_TEST_CASE(transform) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mask) { @@ -745,9 +752,12 @@ BOOST_AUTO_TEST_CASE(mask) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale) { @@ -778,9 +788,12 @@ BOOST_AUTO_TEST_CASE(scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(scale_perm) { @@ -812,9 +825,12 @@ BOOST_AUTO_TEST_CASE(scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add) { @@ -848,9 +864,12 @@ BOOST_AUTO_TEST_CASE(add) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); BOOST_CHECK_EQUAL(result.nnz(), result.data().size() - zero_tile_count); } @@ -885,9 +904,12 @@ BOOST_AUTO_TEST_CASE(add_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_perm) { @@ -922,9 +944,12 @@ BOOST_AUTO_TEST_CASE(add_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_scale_perm) { @@ -959,9 +984,12 @@ BOOST_AUTO_TEST_CASE(add_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const) { @@ -998,9 +1026,12 @@ BOOST_AUTO_TEST_CASE(add_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(add_const_perm) { @@ -1037,9 +1068,12 @@ BOOST_AUTO_TEST_CASE(add_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt) { @@ -1073,9 +1107,12 @@ BOOST_AUTO_TEST_CASE(subt) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale) { @@ -1109,9 +1146,12 @@ BOOST_AUTO_TEST_CASE(subt_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_perm) { @@ -1146,9 +1186,12 @@ BOOST_AUTO_TEST_CASE(subt_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_scale_perm) { @@ -1183,9 +1226,12 @@ BOOST_AUTO_TEST_CASE(subt_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const) { @@ -1220,9 +1266,12 @@ BOOST_AUTO_TEST_CASE(subt_const) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(subt_const_perm) { @@ -1260,9 +1309,12 @@ BOOST_AUTO_TEST_CASE(subt_const_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? 
float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult) { @@ -1295,9 +1347,12 @@ BOOST_AUTO_TEST_CASE(mult) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale) { @@ -1330,9 +1385,12 @@ BOOST_AUTO_TEST_CASE(mult_scale) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_perm) { @@ -1368,9 +1426,12 @@ BOOST_AUTO_TEST_CASE(mult_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(mult_scale_perm) { @@ -1406,9 +1467,12 @@ BOOST_AUTO_TEST_CASE(mult_scale_perm) { } } - BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(tr.tiles_range().volume()), - tolerance); + BOOST_CHECK_CLOSE( + result.sparsity(), + tr.tiles_range().volume() > 0 + ? float(zero_tile_count) / float(tr.tiles_range().volume()) + : 0, + tolerance); } BOOST_AUTO_TEST_CASE(gemm) { @@ -1470,7 +1534,9 @@ BOOST_AUTO_TEST_CASE(gemm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } @@ -1538,7 +1604,9 @@ BOOST_AUTO_TEST_CASE(gemm_perm) { } BOOST_CHECK_CLOSE(result.sparsity(), - float(zero_tile_count) / float(result_norms.size()), + result_norms.size() > 0 + ? 
float(zero_tile_count) / float(result_norms.size()) + : 0, tolerance); } diff --git a/tests/sparse_tile.h b/tests/sparse_tile.h index 6c365334fa..70897d7ca1 100644 --- a/tests/sparse_tile.h +++ b/tests/sparse_tile.h @@ -24,8 +24,6 @@ #include #include -#include - #include // Array class @@ -37,6 +35,8 @@ #include #include +#include + // sparse 2-dimensional matrix type, with tag type thrown in to make expression // engine work harder template > @@ -47,6 +47,8 @@ class EigenSparseTile { typedef T value_type; // Element type typedef T numeric_type; // The scalar type that is compatible with value_type typedef size_t size_type; // Size type + typedef const T& const_reference; + typedef size_type ordinal_type; // other typedefs typedef Eigen::SparseMatrix matrix_type; @@ -122,10 +124,49 @@ class EigenSparseTile { matrix_type& matrix() { return std::get<0>(*impl_); } /// data read-write accessor - template + template >> value_type& operator[](const Index& idx) { auto start = range().lobound_data(); - return std::get<0>(*impl_).coeffRef(idx[0] - start[0], idx[1] - start[1]); + return matrix().coeffRef(idx[0] - start[0], idx[1] - start[1]); + } + + /// data read-write accessor + template >* = nullptr> + value_type& operator[](const Ordinal& ord) { + auto idx = range().idx(ord); + auto start = range().lobound_data(); + return matrix().coeffRef(idx[0] - start[0], idx[1] - start[1]); + } + + /// data read-only accessor + template + std::enable_if_t, const value_type&> + operator[](const Index& idx) const { + static const value_type zero = 0; + auto start = range().lobound_data(); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; + } + + /// data read-only accessor + template >> + const value_type& operator[](const Ordinal& ord) const { + static const value_type zero = 0; + auto idx = range().idx(ord); + auto start = range().lobound_data(); + auto* ptr = coeffPtr(idx[0] - start[0], idx[1] - start[1]); + return ptr == nullptr ? zero : *ptr; + } + + const value_type& at_ordinal(const ordinal_type index_ordinal) const { + return this->operator[](index_ordinal); + } + + value_type& at_ordinal(const ordinal_type index_ordinal) { + return this->operator[](index_ordinal); } /// Maximum # of elements in the tile @@ -138,8 +179,8 @@ class EigenSparseTile { // output template >::type* = nullptr> + typename std::enable_if< + madness::is_output_archive_v>::type* = nullptr> void serialize(Archive& ar) { if (impl_) { ar & true; @@ -151,7 +192,7 @@ class EigenSparseTile { for (typename matrix_type::InnerIterator it(mat, k); it; ++it) { datavec.push_back(Eigen::Triplet(it.row(), it.col(), it.value())); } - ar& datavec & this->range(); + ar & datavec& this->range(); } else { ar & false; } @@ -159,15 +200,15 @@ class EigenSparseTile { // output template >::type* = nullptr> + typename std::enable_if< + madness::is_input_archive_v>::type* = nullptr> void serialize(Archive& ar) { bool have_impl = false; - ar& have_impl; + ar & have_impl; if (have_impl) { std::vector> datavec; range_type range; - ar& datavec& range; + ar & datavec & range; auto extents = range.extent(); matrix_type mat(extents[0], extents[1]); mat.setFromTriplets(datavec.begin(), datavec.end()); @@ -192,6 +233,32 @@ class EigenSparseTile { private: std::shared_ptr impl_; + // pointer-based coeffRef + const value_type* coeffPtr(Eigen::Index row, Eigen::Index col) const { + auto& mat = matrix(); + constexpr bool IsRowMajor = + std::decay_t::Flags & Eigen::RowMajorBit ? 
1 : 0; + using Eigen::Index; + const Index outer = IsRowMajor ? row : col; + const Index inner = IsRowMajor ? col : row; + + auto* outerIndexPtr = mat.outerIndexPtr(); + auto* innerNonZeros = mat.innerNonZeroPtr(); + const auto start = outerIndexPtr[outer]; + const auto end = innerNonZeros ? outerIndexPtr[outer] + innerNonZeros[outer] + : outerIndexPtr[outer + 1]; + TA_ASSERT(end >= start && + "you probably called coeffRef on a non finalized matrix"); + if (end <= start) return nullptr; + const Index p = mat.data().searchLowerIndex( + start, end - 1, + (typename std::decay_t::StorageIndex)inner); + if ((p < end) && (mat.data().index(p) == inner)) + return &(mat.data().value(p)); + else + return nullptr; + } + }; // class EigenSparseTile // configure TA traits to be usable as tile @@ -229,22 +296,22 @@ EigenSparseTile add(const EigenSparseTile& arg1, arg1.range()); } -// dense_result[i] = dense_arg1[i] + sparse_arg2[i] -template -TiledArray::Tensor add(const TiledArray::Tensor& arg1, - const EigenSparseTile& arg2) { - TA_ASSERT(arg1.range() == arg2.range()); - - // this could be done better ... - return TiledArray::add(arg1, static_cast>(arg2)); -} - -// dense_result[i] = sparse_arg1[i] + dense_arg2[i] -template -TiledArray::Tensor add(const EigenSparseTile& arg1, - const TiledArray::Tensor& arg2) { - return TiledArray::add(arg2, arg1); -} +//// dense_result[i] = dense_arg1[i] + sparse_arg2[i] +// template +// TiledArray::Tensor add(const TiledArray::Tensor& arg1, +// const EigenSparseTile& arg2) { +// TA_ASSERT(arg1.range() == arg2.range()); +// +// // this could be done better ... +// return TiledArray::add(arg1, static_cast>(arg2)); +// } +// +//// dense_result[i] = sparse_arg1[i] + dense_arg2[i] +// template +// TiledArray::Tensor add(const EigenSparseTile& arg1, +// const TiledArray::Tensor& arg2) { +// return TiledArray::add(arg2, static_cast>(arg1)); +// } // dense_result[perm ^ i] = dense_arg1[i] + sparse_arg2[i] template < @@ -633,7 +700,7 @@ struct ArchiveLoadImpl> { static inline void load(const Archive& ar, Eigen::Triplet& obj) { int row, col; T value; - ar& row& col& value; + ar & row & col & value; obj = Eigen::Triplet(row, col, value); } }; @@ -641,7 +708,7 @@ struct ArchiveLoadImpl> { template struct ArchiveStoreImpl> { static inline void store(const Archive& ar, const Eigen::Triplet& obj) { - ar& obj.row() & obj.col() & obj.value(); + ar & obj.row() & obj.col() & obj.value(); } }; } // namespace archive diff --git a/tests/ta_test.cpp b/tests/ta_test.cpp index 8d81e66849..7e5f2184bf 100644 --- a/tests/ta_test.cpp +++ b/tests/ta_test.cpp @@ -28,9 +28,6 @@ #endif #include -#if (TA_ASSERT_POLICY != TA_ASSERT_THROW) -#error "TiledArray unit tests require TA_ASSERT_POLICY=TA_ASSERT_THROW" -#endif GlobalFixture::GlobalFixture() { if (world == nullptr) { diff --git a/tests/tensor.cpp b/tests/tensor.cpp index b329b5af44..99b10fc7b7 100644 --- a/tests/tensor.cpp +++ b/tests/tensor.cpp @@ -18,9 +18,7 @@ */ #include -#ifdef TILEDARRAY_HAS_RANGEV3 #include -#endif #include #include "TiledArray/math/gemm_helper.h" @@ -709,9 +707,7 @@ BOOST_AUTO_TEST_CASE(block) { // need to #include BOOST_CHECK_NO_THROW(s.block(boost::combine(lobound, upbound))); -#ifdef TILEDARRAY_HAS_RANGEV3 BOOST_CHECK_NO_THROW(s.block(ranges::views::zip(lobound, upbound))); -#endif auto sview0 = s.block(lobound, upbound); BOOST_CHECK(sview0.range().includes(lobound)); @@ -724,4 +720,28 @@ BOOST_AUTO_TEST_CASE(block) { #endif } +BOOST_AUTO_TEST_CASE(allocator) { + TensorD x(r, 1.0); + Tensor> y(r, 1.0); + 
static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + BOOST_REQUIRE_NO_THROW(x.add_to(y)); + BOOST_REQUIRE_NO_THROW(x.subt_to(y)); + BOOST_REQUIRE_NO_THROW(x.mult_to(y)); +} + +BOOST_AUTO_TEST_CASE(rebind) { + static_assert( + std::is_same_v>, TensorZ>); + static_assert( + std::is_same_v>, TensorZ>); + static_assert( + std::is_same_v, TensorZ>); + static_assert(std::is_same_v, TensorD>); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor_of_tensor.cpp b/tests/tensor_of_tensor.cpp index 0f4683d174..f0aa8be3e8 100644 --- a/tests/tensor_of_tensor.cpp +++ b/tests/tensor_of_tensor.cpp @@ -47,7 +47,10 @@ struct TensorOfTensorFixture { TensorOfTensorFixture() : a(make_rand_tensor_of_tensor(Range(size))), b(make_rand_tensor_of_tensor(Range(size))), - c(a - b) + c(a - b), + aa(make_rand_tensor(Range(size))), + bb(make_rand_tensor(Range(size))), + cc(aa - bb) #ifdef TILEDARRAY_HAS_BTAS , d(make_rand_TobT(Range(size))), @@ -123,13 +126,15 @@ struct TensorOfTensorFixture { static const BipartitePermutation bperm; Tensor> a, b, c; + Tensor aa, bb, cc; #ifdef TILEDARRAY_HAS_BTAS Tensor d, e, f, g, h; #endif // defined(TILEDARRAY_HAS_BTAS) template Tensor& ToT(size_t idx); - + template + T& ToS(size_t idx); }; // TensorOfTensorFixture template <> @@ -158,6 +163,18 @@ Tensor& TensorOfTensorFixture::ToT(size_t idx) { } #endif +template <> +Tensor& TensorOfTensorFixture::ToS>(size_t idx) { + if (idx == 0) + return aa; + else if (idx == 1) + return bb; + else if (idx == 2) + return cc; + else + throw std::range_error("idx out of range"); +} + const std::array TensorOfTensorFixture::size{{10, 11}}; const Permutation TensorOfTensorFixture::perm{1, 0}; const BipartitePermutation TensorOfTensorFixture::bperm(Permutation{1, 0, 3, 2}, @@ -171,6 +188,7 @@ typedef boost::mpl::list, bTensorI> itensor_types; #else typedef boost::mpl::list> itensor_types; #endif +typedef boost::mpl::list> itensor_nobtas_types; BOOST_AUTO_TEST_CASE_TEMPLATE(default_constructor, ITensor, itensor_types) { BOOST_CHECK_NO_THROW(Tensor t); @@ -182,6 +200,15 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(default_constructor, ITensor, itensor_types) { BOOST_AUTO_TEST_CASE_TEMPLATE(unary_constructor, ITensor, itensor_types) { const auto& a = ToT(0); + + // apply element-wise op with default initializer + // this is a reproducer for + // https://github.com/ValeevGroup/tiledarray/issues/445 + { + BOOST_CHECK_NO_THROW( + Tensor t(a.range(), [](auto&& l) { return ITensor(); })); + } + // apply element-wise op BOOST_CHECK_NO_THROW(Tensor t(a, [](const int l) { return l * 2; })); Tensor t(a, [](const int l) { return l * 2; }); @@ -964,6 +991,46 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(scal_mult_to, ITensor, itensor_types) { } } +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_TxS, ITensor, itensor_nobtas_types) { + const auto& a = ToT(0); + const auto& b = ToS(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), a(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j)[index] * b(i, j)); + } + } + } +} + +BOOST_AUTO_TEST_CASE_TEMPLATE(mixed_mult_SxT, ITensor, 
itensor_nobtas_types) { + const auto& a = ToS(0); + const auto& b = ToT(0); + Tensor t; + BOOST_CHECK_NO_THROW(t = a.mult(b)); + + BOOST_CHECK(!t.empty()); + BOOST_CHECK_EQUAL(t.range(), a.range()); + + for (decltype(t.range().extent(0)) i = 0; i < t.range().extent(0); ++i) { + for (decltype(t.range().extent(1)) j = 0; j < t.range().extent(1); ++j) { + BOOST_CHECK(!t(i, j).empty()); + BOOST_CHECK_EQUAL(t(i, j).range(), b(i, j).range()); + for (std::size_t index = 0ul; index < t(i, j).size(); ++index) { + BOOST_CHECK_EQUAL(t(i, j)[index], a(i, j) * b(i, j)[index]); + } + } + } +} + BOOST_AUTO_TEST_CASE_TEMPLATE(neg, ITensor, itensor_types) { const auto& a = ToT(0); Tensor t; @@ -1234,4 +1301,19 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(serialization, ITensor, itensor_types) { cend(a_roundtrip)); } +BOOST_AUTO_TEST_CASE_TEMPLATE(rebind, ITensor, itensor_types) { + using ITensorD = typename ITensor::template rebind_t; + using ITensorZ = typename ITensor::template rebind_t>; + static_assert( + std::is_same_v::template rebind_t, + TensorD>); + static_assert(std::is_same_v< + typename Tensor::template rebind_numeric_t, + Tensor>); + static_assert(std::is_same_v>, + Tensor>); + static_assert(std::is_same_v>, + Tensor>); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tensor_um.cpp b/tests/tensor_um.cpp index 33efbfd7d4..d860b7c813 100644 --- a/tests/tensor_um.cpp +++ b/tests/tensor_um.cpp @@ -18,7 +18,7 @@ * Chong Peng on 9/19/18. */ -#include +#include #include "global_fixture.h" #include "unit_test_config.h" diff --git a/tests/tile_op_add.cpp b/tests/tile_op_add.cpp index b264ae15bf..c2e08c170c 100644 --- a/tests/tile_op_add.cpp +++ b/tests/tile_op_add.cpp @@ -26,6 +26,7 @@ #include "../src/TiledArray/tile_op/add.h" #include "../src/tiledarray.h" #include "range_fixture.h" +#include "sparse_tile.h" #include "unit_test_config.h" // using TiledArray::detail::Add; @@ -49,8 +50,7 @@ struct AddFixture : public RangeFixture { }; // AddFixture -BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture, - TA_UT_LABEL_SERIAL) +BOOST_FIXTURE_TEST_SUITE(tile_op_add_suite, AddFixture, TA_UT_LABEL_SERIAL) BOOST_AUTO_TEST_CASE(constructor) { // Check that the constructors can be called without throwing exceptions @@ -398,4 +398,84 @@ BOOST_AUTO_TEST_CASE(binary_add_right_zero_perm_consume_right) { } } +BOOST_AUTO_TEST_CASE(binary_add_heterogeneous) { + TensorD a(RangeFixture::r, [](auto&) { return TiledArray::drand(); }); + EigenSparseTile b(RangeFixture::r); + + ///////////////// + // dense + sparse + ///////////////// + {{// a is persistent + auto c = add(a, b); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), a.range()); + + // Check that a nor b were consumed + BOOST_CHECK_NE(a.data(), nullptr); + BOOST_CHECK_NE(c.data(), a.data()); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], a[i] + b[i]); + } +} +{ // a is consumed + auto a_copy = a.clone(); + if (r.rank() == 3) a.shift({-7, 7, 0}); + auto c = add(std::move(a), std::move(b)); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was consumed + BOOST_CHECK_EQUAL(a.data(), nullptr); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], a_copy[i] + b[i]); + } + a = a_copy; +} +} + +///////////////// +// sparse + dense +///////////////// +{ + { // a is persistent + auto c = add(b, a); + + // Check that the result range is 
correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was not consumed + BOOST_CHECK_NE(a.data(), nullptr); + BOOST_CHECK_NE(c.data(), a.data()); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], b[i] + a[i]); + } + } + { // a is consumed + auto a_copy = a.clone(); + if (r.rank() == 3) a.shift({-7, 7, 0}); + auto c = add(std::move(b), std::move(a)); + + // Check that the result range is correct + BOOST_CHECK_EQUAL(c.range(), b.range()); + + // Check that a was consumed + BOOST_CHECK_EQUAL(a.data(), nullptr); + + // Check that the data in the new tile is correct + for (std::size_t i = 0ul; i < r.volume(); ++i) { + BOOST_CHECK_EQUAL(c[i], b[i] + a_copy[i]); + } + a = a_copy; + } +} +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tile_op_contract_reduce.cpp b/tests/tile_op_contract_reduce.cpp index 5c30a5b491..b50397097d 100644 --- a/tests/tile_op_contract_reduce.cpp +++ b/tests/tile_op_contract_reduce.cpp @@ -101,7 +101,7 @@ BOOST_AUTO_TEST_CASE(permute_empty) { TiledArray::math::blas::Op::NoTrans, TiledArray::math::blas::Op::NoTrans, 1, 2u, 2u, 2u); TensorI t, result; - BOOST_REQUIRE_THROW(result = op(t), TiledArray::Exception); + BOOST_REQUIRE_TA_ASSERT(result = op(t), TiledArray::Exception); } // TODO: Test non-empty permutation diff --git a/tests/tiled_range.cpp b/tests/tiled_range.cpp index 14e47e3557..eb557b761f 100644 --- a/tests/tiled_range.cpp +++ b/tests/tiled_range.cpp @@ -58,6 +58,14 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); } + // construct with ranges containing empty tiles only + { + BOOST_REQUIRE_NO_THROW(TiledRange r1({dims[0], TiledRange1{1, 1, 1}})); + TiledRange r1{dims[0], TiledRange1{1, 1, 1}}; + BOOST_CHECK_EQUAL(r1.tiles_range().area(), dims[0].tile_extent() * 2); + BOOST_CHECK_EQUAL(r1.elements_range().area(), 0); + } + // check initializer list of initializer list constructor { TiledRange r1{ @@ -111,6 +119,7 @@ BOOST_AUTO_TEST_CASE(comparison) { TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r2{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1); // self-comparison BOOST_CHECK(r1 == r2); // check equality operator BOOST_CHECK(!(r1 != r2)); // check not-equal operator BOOST_CHECK( @@ -118,6 +127,18 @@ BOOST_AUTO_TEST_CASE(comparison) { BOOST_CHECK(r1 != r3); } +BOOST_AUTO_TEST_CASE(congruency) { + TiledRange r1{{0, 2, 4, 6, 8, 10}, {0, 2, 4, 6, 8, 10}}; + TiledRange r2{{1, 3, 5, 7, 9, 11}, {2, 4, 6, 8, 10, 12}}; + TiledRange r3{{0, 3, 6, 9, 12, 15}, {0, 3, 6, 9, 12, 15}}; + BOOST_CHECK(r1 == r1 && is_congruent(r1, r1)); // congruent with self + BOOST_CHECK(r1 != r2 && + is_congruent(r1, r2)); // r1 and r2 are not equal but congruent + BOOST_CHECK( + r1 != r3 && + !is_congruent(r1, r3)); // r1 and r3 are not equal and not congruent +} + BOOST_AUTO_TEST_CASE(assignment) { TiledRange r1; @@ -147,6 +168,17 @@ BOOST_AUTO_TEST_CASE(permutation) { r1); // check that the permutation was assigned correctly. 
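  // Hedged illustration (uses only operations already exercised in this
  // suite): permuting a TiledRange permutes its per-mode TiledRange1's, so
  // tiles_range and elements_range stay mutually consistent. A 2-d sanity
  // check:
  {
    TiledRange demo{{0, 2, 4}, {0, 3, 6, 9}};
    TiledRange expected{{0, 3, 6, 9}, {0, 2, 4}};
    BOOST_CHECK_EQUAL(Permutation{1, 0} * demo, expected);
  }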
} +BOOST_AUTO_TEST_CASE(shift) { + TiledRange tr1 = tr; + const auto shift = std::vector(GlobalFixture::dim, 1); + BOOST_CHECK_NO_THROW(tr1.inplace_shift(shift)); + BOOST_CHECK_EQUAL(tr1.tiles_range(), tr.tiles_range()); + BOOST_CHECK_EQUAL(tr1.elements_range(), tr.elements_range().shift(shift)); + TiledRange tr1_copy; + BOOST_CHECK_NO_THROW(tr1_copy = tr.shift(shift)); + BOOST_CHECK_EQUAL(tr1, tr1_copy); +} + BOOST_AUTO_TEST_CASE(make_tiles_range) { tile_index start(GlobalFixture::dim); tile_index finish(GlobalFixture::dim); diff --git a/tests/tiled_range1.cpp b/tests/tiled_range1.cpp index 312389ff84..947142f6dc 100644 --- a/tests/tiled_range1.cpp +++ b/tests/tiled_range1.cpp @@ -32,6 +32,10 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_CHECK_EQUAL(tr1.tiles_range().second, tiles.second); BOOST_CHECK_EQUAL(tr1.elements_range().first, elements.first); BOOST_CHECK_EQUAL(tr1.elements_range().second, elements.second); + BOOST_CHECK_EQUAL(tr1.tile_extent(), tiles.second - tiles.first); + BOOST_CHECK_EQUAL(tr1.extent(), elements.second - elements.first); + BOOST_CHECK_EQUAL(tr1.lobound(), elements.first); + BOOST_CHECK_EQUAL(tr1.upbound(), elements.second); // Check individual tiles for (std::size_t i = 0; i < a.size() - 1; ++i) { @@ -43,12 +47,30 @@ BOOST_AUTO_TEST_CASE(range_accessor) { BOOST_AUTO_TEST_CASE(range_info) { BOOST_CHECK_EQUAL(tr1.tiles_range().first, 0ul); BOOST_CHECK_EQUAL(tr1.tiles_range().second, a.size() - 1); - BOOST_CHECK_EQUAL(tr1.elements_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1.elements_range().first, a.front()); BOOST_CHECK_EQUAL(tr1.elements_range().second, a.back()); + BOOST_CHECK_EQUAL(tr1.tile_extent(), a.size() - 1); + BOOST_CHECK_EQUAL(tr1.extent(), a.back() - a.front()); + BOOST_CHECK_EQUAL(tr1.lobound(), a.front()); + BOOST_CHECK_EQUAL(tr1.upbound(), a.back()); for (std::size_t i = 0; i < a.size() - 1; ++i) { BOOST_CHECK_EQUAL(tr1.tile(i).first, a[i]); BOOST_CHECK_EQUAL(tr1.tile(i).second, a[i + 1]); } + + auto a_base1 = make_hashmarks(1); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().first, 0ul); + BOOST_CHECK_EQUAL(tr1_base1.tiles_range().second, a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().first, a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.elements_range().second, a_base1.back()); + BOOST_CHECK_EQUAL(tr1_base1.tile_extent(), a_base1.size() - 1); + BOOST_CHECK_EQUAL(tr1_base1.extent(), a_base1.back() - a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.lobound(), a_base1.front()); + BOOST_CHECK_EQUAL(tr1_base1.upbound(), a_base1.back()); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(tr1_base1.tile(i).first, a_base1[i]); + BOOST_CHECK_EQUAL(tr1_base1.tile(i).second, a_base1[i + 1]); + } } BOOST_AUTO_TEST_CASE(constructor) { @@ -60,7 +82,26 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r.tiles_range().second, 0ul); BOOST_CHECK_EQUAL(r.elements_range().first, 0ul); BOOST_CHECK_EQUAL(r.elements_range().second, 0ul); - BOOST_CHECK_THROW(r.tile(0), Exception); + BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); + } + + // check construction with single tile boundary (hence zero tiles) + { + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(0)); + TiledRange1 r(0); + BOOST_CHECK_EQUAL(r, TiledRange1{}); + } + { + BOOST_REQUIRE_NO_THROW(TiledRange1 r(1)); + TiledRange1 r(1); + BOOST_CHECK_NE(r, TiledRange1{}); + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 0); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 1); + 
BOOST_CHECK_TA_ASSERT(r.tile(0), Exception); + } } // check construction with a iterators and the range info. @@ -110,6 +151,21 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // check constructor using range of tile boundaries. + { + if (Range1Fixture::ntiles == 5) { + TiledRange1 r(a); + BOOST_CHECK_EQUAL(r.tiles_range().first, tiles.first); + BOOST_CHECK_EQUAL(r.tiles_range().second, tiles.second); + BOOST_CHECK_EQUAL(r.elements_range().first, elements.first); + BOOST_CHECK_EQUAL(r.elements_range().second, elements.second); + for (std::size_t i = 0; i < a.size() - 1; ++i) { + BOOST_CHECK_EQUAL(r.tile(i).first, a[i]); + BOOST_CHECK_EQUAL(r.tile(i).second, a[i + 1]); + } + } + } + // check construction with negative index values #ifdef TA_SIGNED_1INDEX_TYPE { @@ -120,8 +176,8 @@ BOOST_AUTO_TEST_CASE(constructor) { BOOST_CHECK_EQUAL(r.elements_range().second, 28); } #else // TA_SIGNED_1INDEX_TYPE - BOOST_CHECK_THROW(TiledRange1 r({-1, 0, 2, 5, 10, 17, 28}), - TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r({-1, 0, 2, 5, 10, 17, 28}), + TiledArray::Exception); #endif // TA_SIGNED_1INDEX_TYPE // check copy constructor @@ -152,18 +208,44 @@ BOOST_AUTO_TEST_CASE(constructor) { } } + // corner cases + { + // range with 1 empty tile + { + TiledRange1 r{0, 0}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 1); + BOOST_CHECK_EQUAL(r.elements_range().first, 0); + BOOST_CHECK_EQUAL(r.elements_range().second, 0); + BOOST_CHECK(r.tile(0) == Range1(0, 0)); + } + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_EQUAL(r.tiles_range().first, 0); + BOOST_CHECK_EQUAL(r.tiles_range().second, 4); + BOOST_CHECK_EQUAL(r.elements_range().first, 1); + BOOST_CHECK_EQUAL(r.elements_range().second, 5); + // test tiles + BOOST_CHECK(r.tile(0) == Range1(1, 3)); + BOOST_CHECK(r.tile(1) == Range1(3, 3)); + BOOST_CHECK(r.tile(2) == Range1(3, 5)); + BOOST_CHECK(r.tile(3) == Range1(5, 5)); + } + } + // Check that invalid input throws an exception. #ifndef NDEBUG { std::vector boundaries; - BOOST_CHECK_THROW(TiledRange1 r(boundaries.begin(), boundaries.end()), - Exception); - BOOST_CHECK_THROW(TiledRange1 r(a.begin(), a.begin()), Exception); - BOOST_CHECK_THROW(TiledRange1 r(a.begin(), a.begin() + 1), Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), + Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(a.begin(), a.begin()), Exception); + BOOST_CHECK_NO_THROW(TiledRange1 r(a.begin(), a.begin() + 1)); boundaries.push_back(2); boundaries.push_back(0); - BOOST_CHECK_THROW(TiledRange1 r(boundaries.begin(), boundaries.end()), - Exception); + BOOST_CHECK_TA_ASSERT(TiledRange1 r(boundaries.begin(), boundaries.end()), + Exception); } #endif } @@ -195,6 +277,20 @@ BOOST_AUTO_TEST_CASE(element_to_tile) { // Check that the expected and internal element to tile maps match. 
BOOST_CHECK_EQUAL_COLLECTIONS(c.begin(), c.end(), e.begin(), e.end()); + + // corner case: empty tiles + { + // range with some empty tiles + { + TiledRange1 r{1, 3, 3, 5, 5}; + BOOST_CHECK_TA_ASSERT(r.element_to_tile(0), Exception); + BOOST_CHECK_EQUAL(r.element_to_tile(1), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(2), 0); + BOOST_CHECK_EQUAL(r.element_to_tile(3), 2); + BOOST_CHECK_EQUAL(r.element_to_tile(4), 2); + BOOST_CHECK_TA_ASSERT(r.element_to_tile(5), Exception); + } + } } BOOST_AUTO_TEST_CASE(comparison) { @@ -253,4 +349,77 @@ BOOST_AUTO_TEST_CASE(concatenation) { BOOST_CHECK(concat(r2, r1) == (TiledRange1{0, 3, 4, 5, 7, 11, 13})); } +BOOST_AUTO_TEST_CASE(make_uniform) { + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{0, 0}, 0)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{0, 0}, 0) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{1, 1}, 0)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{1, 1}, 0) == TiledRange1{1}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{3, 6}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{3, 6}, 10) == + (TiledRange1{3, 6})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 60}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 60}, 10) == + (TiledRange1{10, 20, 30, 40, 50, 60})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 65}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 65}, 10) == + (TiledRange1{10, 20, 29, 38, 47, 56, 65})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(Range1{10, 69}, 10)); + BOOST_CHECK(TiledRange1::make_uniform(Range1{10, 69}, 10) == + (TiledRange1{10, 20, 30, 40, 50, 60, 69})); + + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 0)); + BOOST_CHECK(TiledRange1::make_uniform(0, 0) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(0, 1)); + BOOST_CHECK(TiledRange1::make_uniform(0, 1) == TiledRange1{}); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(3, 10)); + BOOST_CHECK(TiledRange1::make_uniform(3, 10) == (TiledRange1{0, 3})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(50, 10)); + BOOST_CHECK(TiledRange1::make_uniform(50, 10) == + (TiledRange1{0, 10, 20, 30, 40, 50})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(55, 10)); + BOOST_CHECK(TiledRange1::make_uniform(55, 10) == + (TiledRange1{0, 10, 19, 28, 37, 46, 55})); + BOOST_REQUIRE_NO_THROW(TiledRange1::make_uniform(59, 10)); + BOOST_CHECK(TiledRange1::make_uniform(59, 10) == + (TiledRange1{0, 10, 20, 30, 40, 50, 59})); + + // member versions + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30))); + BOOST_CHECK((TiledRange1{0, 10, 20, 30, 40, 50}.make_uniform(30) == + TiledRange1{0, 25, 50})); + BOOST_REQUIRE_NO_THROW((TiledRange1{0, 40, 50}.make_uniform())); + BOOST_CHECK( + (TiledRange1{0, 40, 50}.make_uniform() == TiledRange1{0, 25, 50})); +} + +BOOST_AUTO_TEST_CASE(shift) { + TiledRange1 r0; + TiledRange1 r0_plus_1; + BOOST_REQUIRE_NO_THROW(r0_plus_1 = r0.shift(1)); + BOOST_CHECK_EQUAL(r0_plus_1, TiledRange1(1)); + BOOST_REQUIRE_NO_THROW(r0_plus_1.inplace_shift(-1)); + BOOST_CHECK_EQUAL(r0_plus_1, r0); + + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::max()}.inplace_shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + TiledRange1{std::numeric_limits::min()}.inplace_shift(-1), + Exception); + TiledRange1 tmp; + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::max()}.shift(1), + Exception); + BOOST_CHECK_TA_ASSERT( + tmp = TiledRange1{std::numeric_limits::min()}.shift(-1), + 
Exception); + + TiledRange1 r1{1, 3, 7, 9}; + TiledRange1 r1_minus_1; + BOOST_REQUIRE_NO_THROW(r1_minus_1 = r1.shift(-1)); + BOOST_CHECK_EQUAL(r1_minus_1, TiledRange1(0, 2, 6, 8)); + BOOST_REQUIRE_NO_THROW(r1_minus_1.inplace_shift(1)); + BOOST_CHECK_EQUAL(r1_minus_1, r1); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/tot_array_fixture.h b/tests/tot_array_fixture.h index 9d46fadcc7..2c27824961 100644 --- a/tests/tot_array_fixture.h +++ b/tests/tot_array_fixture.h @@ -19,10 +19,15 @@ #ifndef TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED #define TILEDARRAY_TEST_TOT_ARRAY_FIXTURE_H__INCLUDED -#include "tiledarray.h" +#include +#include +#include +#include #include "unit_test_config.h" #ifdef TILEDARRAY_HAS_BTAS +#include #include +#include #endif /* Notes: @@ -88,6 +93,699 @@ using input_archive_type = madness::archive::BinaryFstreamInputArchive; // Type of an output archive using output_archive_type = madness::archive::BinaryFstreamOutputArchive; +enum class ShapeComp { True, False }; + +namespace fixture { +namespace { + +template +constexpr bool maps_index_to_range_v{}; + +template +constexpr bool maps_index_to_range_v< + Invocable, + std::enable_if_t>>>{ + true}; + +using il_range = std::initializer_list; +using il_trange = std::initializer_list; + +} // namespace + +/// +/// \tparam T Non cv-qualified TA::Tensor type. +/// \tparam Rng TA::Range should be constructible from Rng type. +/// Eg. TA::Range, std::initializer_list. +/// \param rng The range of the result tensor. +/// TA::Range(rng) will be called explicitly. +/// \return A TA::Tensor of a numeric type with random elements. +/// +template >, + typename = std::enable_if_t>> +auto random_tensor(Rng rng) { + using numeric_type = typename T::numeric_type; + + auto gen = [](auto&&) { + return detail::MakeRandom::generate_value(); + }; + + return T(TA::Range(rng), gen); +} + +/// +/// \tparam T Non cv-qualified +/// TA::Tensor,...> type. +/// \tparam RngO TA::Range should be constructible from RngO type. +/// Eg. TA::Range, std::initializer_list. +/// \tparam RngI TA::Range should be constructible from RngI type. +/// Eg. TA::Range, std::initializer_list. +/// \param rngo The range of the result tensor (ie the outer tensor). +/// TA::Range(rngo) will be called explicitly. +/// \param rngi The range of the inner tensors. Note that ALL inner tensors +/// will have an EQUAL range. TA::Range(rngi) will be +/// called explicitly. +/// \return A TA::Tensor of TA::Tensor with random +/// numeric_type elements. +/// +template < + typename T, typename RngO, typename RngI, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>> +auto random_tensor(RngO rngo, RngI rngi) { + using numeric_type = typename T::numeric_type; + using Inner = typename T::value_type; + + auto gen_inner = [](auto&&) { + return detail::MakeRandom::generate_value(); + }; + + auto gen_outer = [gen_inner, rngi](auto&&) { + return Inner(TA::Range(rngi), gen_inner); + }; + + return T(TA::Range(rngo), gen_outer); +} + +/// +/// \tparam T Non cv-qualified +/// TA::Tensor,...> type. +/// \tparam RngO TA::Range should be constructible from RngO type. +/// Eg. TA::Range, std::initializer_list. +/// \tparam IxMap An invocable type that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \param rngo The range of the result tensor (ie the outer tensor). +/// TA::Range(rngo) will be called explicitly. 
+/// \param ixmap An invocable that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \return A TA::Tensor of TA::Tensor with random +/// numeric_type elements. +/// +template < + typename T, typename RngO, typename IxMap, // + typename = std::enable_if_t>, // + typename = std::enable_if_t>, // + std::enable_if_t, bool> = true> +auto random_tensor(RngO rngo, IxMap ixmap) { + using numeric_type = typename T::numeric_type; + + auto gen_inner = [](auto&&) { + return TA::detail::MakeRandom::generate_value(); + }; + + auto gen_outer = [gen_inner, ixmap](auto const& oix) { + auto inner_rng = TA::Range(ixmap(oix)); + return typename T::value_type(inner_rng, gen_inner); + }; + + return T(TA::Range(rngo), gen_outer); +} + +/// +/// \tparam Array Non cv-qualified TA::DistArray type that has non-nested +/// tile type. Eg. TA::DistArray> +/// \tparam Rng TA::TiledRange should be constructible from Rng type. +/// \param rng The TA::TiledRange of the result TA::DistArray. +/// \return A TA::DistArray of non-nested tile type with random elements. +/// +template < + typename Array, typename Rng = il_trange, + typename = std::enable_if_t == 1>, + typename = std::enable_if_t>> +auto random_array(Rng rng) { + using T = typename Array::value_type; + + auto make_tile = [](auto& tile, auto const& rng) { + tile = random_tensor(rng); + return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), TA::TiledRange(rng), + make_tile); +} + +/// +/// \tparam Array Non cv-qualified TA::DistArray type that has a nested +/// tile type. +/// Eg. TA::DistArray>> +/// \tparam RngO TA::TiledRange should be constructible form RngO type. +/// \tparam RngI TA::Range should be constructible from RngI type. +/// \param rngo The TA::TiledRange of the result TA::DistArray. +/// \param rngi The range of the inner tensors. Note that ALL inner tensors +/// will have an EQUAL range. TA::Range(rngi) will be +/// called explicitly. +/// \return A TA::DistArray of nested tile type with random elements. +/// +template < + typename Array, typename RngO = il_trange, typename RngI = il_range, + typename = std::enable_if_t == 2>, + typename = std::enable_if_t>, + typename = std::enable_if_t>> +auto random_array(RngO rngo, RngI rngi) { + using T = typename Array::value_type; + + auto make_tile = [rngi](auto& tile, auto const& rng) { + tile = random_tensor(rng, rngi); + return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), TA::TiledRange(rngo), + make_tile); +} + +/// +/// \tparam Array Non cv-qualified TA::DistArray type that has a nested +/// tile type. +/// Eg. TA::DistArray>> +/// \tparam RngO TA::TiledRange should be constructible form RngO type. +/// \tparam IxMap An invocable type that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \param rngo The TA::TiledRange of the result TA::DistArray. +/// \param ixmap An invocable that maps the index of an element in the +/// outer tensor to a value, allowing the construction of +/// TA::Range from that value. +/// \return A TA::DistArray of nested tile type with random elements. 
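// Usage sketch (hedged; `ArrayToT` is a hypothetical alias for a DistArray
// of Tensor-of-Tensor tiles): the IxMap overload lets each inner tensor's
// extents depend on its outer element index, e.g.
//
//   auto ixmap = [](auto const& oix) {
//     return std::array<std::size_t, 2>{oix[0] + 1, oix[0] + 1};
//   };
//   auto arr = random_array<ArrayToT>(TA::TiledRange{{0, 2, 4}, {0, 2, 4}},
//                                     ixmap);
//
// Any return value from which a TA::Range can be constructed is accepted.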
+template < + typename Array, typename RngO, typename IxMap, + typename = std::enable_if_t == 2>, + typename = std::enable_if_t>, + std::enable_if_t, bool> = true> +auto random_array(RngO rngo, IxMap ixmap) { + using T = typename Array::value_type; + + auto make_tile = [ixmap](auto& tile, auto const& rng) { + tile = random_tensor(rng, ixmap); + return tile.norm(); + }; + + return TA::make_array(TA::get_default_world(), TA::TiledRange(rngo), + make_tile); +} + +} // namespace fixture + +using fixture::random_array; +using fixture::random_tensor; + +/// +/// Succinctly call TA::detail::tensor_contract +/// +/// \tparam T TA::Tensor type. +/// \param einsum_annot Example annot: 'ik,kj->ij', when @c A is annotated by +/// 'i' and 'k' for its two modes, and @c B is annotated by 'k' and 'j' for the +/// same. The result tensor is rank-2 as well and its modes are annotated by 'i' +/// and 'j'. +/// \return Tensor contraction result. +/// +template , bool> = true> +auto tensor_contract(std::string const& einsum_annot, T const& A, T const& B) { + using ::Einsum::string::split2; + auto [ab, aC] = split2(einsum_annot, "->"); + auto [aA, aB] = split2(ab, ","); + + return TA::detail::tensor_contract(A, aA, B, aB, aC); +} + +using PartialPerm = TA::container::svector>; + +template +PartialPerm partial_perm(::Einsum::index::Index const& from, + ::Einsum::index::Index const& to) { + PartialPerm result; + for (auto i = 0; i < from.size(); ++i) + if (auto found = to.find(from[i]); found != to.end()) + result.emplace_back(i, std::distance(to.begin(), found)); + return result; +} + +template >> +void apply_partial_perm(T& to, T const& from, PartialPerm const& p) { + for (auto [f, t] : p) { + TA_ASSERT(f < from.size() && t < to.size() && "Invalid permutation used"); + to[t] = from[f]; + } +} + +enum struct TensorProduct { General, Dot, Invalid }; + +struct ProductSetup { + TensorProduct product_type{TensorProduct::Invalid}; + + PartialPerm + // - {} index at kth position in C appears at vth position in A + // and so on... 
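// A worked example of the two helpers above (values illustrative): with
// from = Index("ca") and to = Index("abc"), partial_perm returns
// {{0, 2}, {1, 0}}, since 'c' sits at position 0 in `from` and 2 in `to`,
// and 'a' at position 1 in `from` and 0 in `to`. apply_partial_perm then
// copies from[f] into to[t] for every pair:
//
//   std::vector<std::size_t> src{7, 9}, dst(3, 0);
//   apply_partial_perm(dst, src, PartialPerm{{0, 2}, {1, 0}});
//   // dst == {9, 0, 7}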
+ // - {} is sorted by k + C_to_A, + C_to_B, + I_to_A, // 'I' implies for contracted indices + I_to_B; + size_t // + rank_A, // + rank_B, + rank_C, // + rank_H, + rank_E, // + rank_I; + + ProductSetup() = default; + + template >> + ProductSetup(T const& aA, T const& aB, T const& aC) { + using Indices = ::Einsum::index::Index; + + struct { + // A, B, C tensor indices + // H, E, I Hadamard, external, and internal indices + Indices A, B, C, H, E, I; + } const ixs{Indices(aA), Indices(aB), + Indices(aC), (ixs.A & ixs.B & ixs.C), + (ixs.A ^ ixs.B), ((ixs.A & ixs.B) - ixs.H)}; + + rank_A = ixs.A.size(); + rank_B = ixs.B.size(); + rank_C = ixs.C.size(); + rank_H = ixs.H.size(); + rank_E = ixs.E.size(); + rank_I = ixs.I.size(); + + C_to_A = partial_perm(ixs.C, ixs.A); + C_to_B = partial_perm(ixs.C, ixs.B); + I_to_A = partial_perm(ixs.I, ixs.A); + I_to_B = partial_perm(ixs.I, ixs.B); + + using TP = decltype(product_type); + + if (rank_A + rank_B != 0 && rank_C != 0) + product_type = TP::General; + else if (rank_A == rank_B && rank_B != 0 && rank_C == 0) + product_type = TP::Dot; + else + product_type = TP::Invalid; + } + + template >> + ProductSetup(ArrayLike const& arr) + : ProductSetup(std::get<0>(arr), std::get<1>(arr), std::get<2>(arr)) {} + + [[nodiscard]] bool valid() const noexcept { + return product_type != TensorProduct::Invalid; + } +}; + +/// +/// Example: To represent A("ik;ac") * B("kj;cb") -> C("ij;ab") +/// +/// Method 1: +/// --- +/// construct with a single argument std::string("ik;ac,kj;cb->ij;ab"); +/// - the substring ";" +/// annotates a single object (DistArray, Tensor etc.) +/// - "," implies two distinct annotations (for A and B) +/// separated by a comma +/// - the right hand side of '->' annotates the result. +/// - Note: the only use of comma is to separate A's and B's annotations. +/// +/// Method 2: +/// --- +/// construct with three arguments: +/// std::string("i,k;a,c"), std::string("k,j;c,b"), std::string("i,j;a,b") +/// - Note the use of comma.
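/// For instance (illustrative), both of the following describe the same
/// nested contraction, outer "ik,kj->ij" with inner "ac,cb->ab":
///
///   OuterInnerSetup s1("ik;ac,kj;cb->ij;ab");
///   OuterInnerSetup s2("i,k;a,c", "k,j;c,b", "i,j;a,b");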
+/// +class OuterInnerSetup { + ProductSetup outer_; + ProductSetup inner_; + + public: + OuterInnerSetup(std::string const& annot) { + using ::Einsum::string::split2; + using Ix = ::Einsum::index::Index; + + enum { A, B, C }; + std::array O; + std::array I; + + auto [ab, aC] = split2(annot, "->"); + std::tie(O[C], I[C]) = split2(aC, ";"); + + auto [aA, aB] = split2(ab, ","); + std::tie(O[A], I[A]) = split2(aA, ";"); + std::tie(O[B], I[B]) = split2(aB, ";"); + outer_ = ProductSetup(Ix(O[A]), Ix(O[B]), Ix(O[C])); + inner_ = ProductSetup(Ix(I[A]), Ix(I[B]), Ix(I[C])); + } + + template + OuterInnerSetup(const char (&s)[N]) : OuterInnerSetup{std::string(s)} {} + + OuterInnerSetup(std::string const& annotA, std::string const& annotB, + std::string const& annotC) { + using ::Einsum::string::split2; + using Ix = ::Einsum::index::Index; + + enum { A, B, C }; + std::array O; + std::array I; + std::tie(O[A], I[A]) = split2(annotA, ";"); + std::tie(O[B], I[B]) = split2(annotB, ";"); + std::tie(O[C], I[C]) = split2(annotC, ";"); + outer_ = ProductSetup(Ix(O[A]), Ix(O[B]), Ix(O[C])); + inner_ = ProductSetup(Ix(I[A]), Ix(I[B]), Ix(I[C])); + } + + [[nodiscard]] auto const& outer() const noexcept { return outer_; } + + [[nodiscard]] auto const& inner() const noexcept { return inner_; } +}; + +namespace { + +auto make_perm(PartialPerm const& pp) { + TA::container::svector p(pp.size()); + for (auto [k, v] : pp) p[k] = v; + return TA::Permutation(p); +} + +template >> +inline Result general_product(Tensor const& t, typename Tensor::numeric_type s, + ProductSetup const& setup, + Setups const&... args) { + static_assert(std::is_same_v); + static_assert(sizeof...(args) == 0, + "To-Do: Only scalar times once-nested tensor supported now"); + return t.scale(s, make_perm(setup.C_to_A).inv()); +} + +template >> +inline Result general_product(typename Tensor::numeric_type s, Tensor const& t, + ProductSetup const& setup, + Setups const&... args) { + static_assert(std::is_same_v); + static_assert(sizeof...(args) == 0, + "To-Do: Only scalar times once-nested tensor supported now"); + return t.scale(s, make_perm(setup.C_to_B).inv()); +} + +} // namespace + +template < + typename Result, typename TensorA, typename TensorB, typename... Setups, + typename = + std::enable_if_t>> +Result general_product(TensorA const& A, TensorB const& B, + ProductSetup const& setup, Setups const&... 
args) { + using TA::detail::max_nested_rank; + using TA::detail::nested_rank; + + // empty tensors + if (A.empty() || B.empty()) return Result{}; + + static_assert(std::is_same_v); + + static_assert(max_nested_rank == sizeof...(args) + 1); + + TA_ASSERT(setup.valid()); + + constexpr bool is_tot = max_nested_rank > 1; + + if constexpr (std::is_same_v) { + // + // tensor dot product evaluation + // T * T -> scalar + // ToT * ToT -> scalar + // + static_assert(nested_rank == nested_rank); + + TA_ASSERT(setup.rank_C == 0 && + "Attempted to evaluate dot product when the product setup does " + "not allow"); + + Result result{}; + + for (auto&& ix_A : A.range()) { + TA::Range::index_type ix_B(setup.rank_B, 0); + apply_partial_perm(ix_B, ix_A, setup.I_to_B); + + if constexpr (is_tot) { + auto const& lhs = A(ix_A); + auto const& rhs = B(ix_B); + result += general_product(lhs, rhs, args...); + } else + result += A(ix_A) * B(ix_B); + } + + return result; + } else { + // + // general product: + // T * T -> T + // ToT * T -> ToT + // ToT * ToT -> ToT + // ToT * ToT -> T + // + + static_assert(nested_rank <= max_nested_rank, + "Tensor product not supported with increased nested rank in " + "the result"); + + // creating the contracted TA::Range + TA::Range const rng_I = [&setup, &A, &B]() { + TA::container::svector rng1_I(setup.rank_I, TA::Range1{}); + for (auto [f, t] : setup.I_to_A) + // I_to_A implies I[f] == A[t] + rng1_I[f] = A.range().dim(t); + + return TA::Range(rng1_I); + }(); + + // creating the target TA::Range. + TA::Range const rng_C = [&setup, &A, &B]() { + TA::container::svector rng1_C(setup.rank_C, TA::Range1{0, 0}); + for (auto [f, t] : setup.C_to_A) + // C_to_A implies C[f] = A[t] + rng1_C[f] = A.range().dim(t); + + for (auto [f, t] : setup.C_to_B) + // C_to_B implies C[f] = B[t] + rng1_C[f] = B.range().dim(t); + + auto zero_r1 = [](TA::Range1 const& r) { return r == TA::Range1{0, 0}; }; + + TA_ASSERT(std::none_of(rng1_C.begin(), rng1_C.end(), zero_r1)); + + return TA::Range(rng1_C); + }(); + + Result C{rng_C}; + + // do the computation + for (auto ix_C : rng_C) { + // finding corresponding indices of A, and B. + TA::Range::index_type ix_A(setup.rank_A, 0), ix_B(setup.rank_B, 0); + apply_partial_perm(ix_A, ix_C, setup.C_to_A); + apply_partial_perm(ix_B, ix_C, setup.C_to_B); + + if (setup.rank_I == 0) { + if constexpr (is_tot) { + C(ix_C) = general_product( + A(ix_A), B(ix_B), args...); + } else { + TA_ASSERT(!(ix_A.empty() && ix_B.empty())); + C(ix_C) = ix_A.empty() ? B(ix_B) + : ix_B.empty() ? A(ix_A) + : A(ix_A) * B(ix_B); + } + } else { + typename Result::value_type temp{}; + for (auto const& ix_I : rng_I) { + apply_partial_perm(ix_A, ix_I, setup.I_to_A); + apply_partial_perm(ix_B, ix_I, setup.I_to_B); + if constexpr (is_tot) { + auto temp_ = general_product( + A(ix_A), B(ix_B), args...); + if constexpr (TA::detail::is_nested_tensor_v< + typename Result::value_type>) { + if (temp.empty()) + temp = std::move(temp_); + else + temp += temp_; + } else { + temp += temp_; + } + } else { + TA_ASSERT(!(ix_A.empty() || ix_B.empty())); + temp += A(ix_A) * B(ix_B); + } + } + C(ix_C) = temp; + } + } + + return C; + } +} + +template +auto general_product(TA::DistArray A, + TA::DistArray B, + ProductSetup const& setup, Setups const&...
args) { + using TA::detail::max_nested_rank; + using TA::detail::nested_rank; + static_assert(nested_rank <= max_nested_rank); + static_assert(nested_rank != 0); + TA_ASSERT(setup.product_type == TensorProduct::General); + + auto& world = TA::get_default_world(); + + A.make_replicated(); + B.make_replicated(); + world.gop.fence(); + + TA::Tensor tensorA{A.trange().tiles_range()}; + for (auto&& ix : tensorA.range()) tensorA(ix) = A.find_local(ix).get(false); + + TA::Tensor tensorB{B.trange().tiles_range()}; + for (auto&& ix : tensorB.range()) tensorB(ix) = B.find_local(ix).get(false); + + auto result_tensor = general_product>( + tensorA, tensorB, setup, setup, args...); + + TA::TiledRange result_trange; + { + TA::container::svector tr1s(setup.rank_C); + for (auto [t, f] : setup.C_to_A) { + tr1s.at(t) = A.trange().at(f); + } + for (auto [t, f] : setup.C_to_B) { + tr1s.at(t) = B.trange().at(f); + } + result_trange = TiledRange(tr1s); + } + + TA::DistArray C(world, result_trange); + + for (auto it : C) { + if (C.is_local(it.index())) it = result_tensor(it.index()); + } + return C; +} + +template +auto general_product(TA::DistArray A, + TA::DistArray B, + Setups const&... args) { + using TA::detail::nested_rank; + using TileC = std::conditional_t<(nested_rank > nested_rank), + TileB, TileA>; + return general_product(A, B, args...); +} + +template +auto general_product(TA::DistArray A, + TA::DistArray B, + Setups const&... args) { + auto A_dense = to_dense(A); + auto B_dense = to_dense(B); + return TA::to_sparse(general_product(A_dense, B_dense, args...)); +} + +template >> +auto manual_eval(OuterInnerSetup const& setups, ArrayA A, ArrayB B) { + constexpr auto mnr = TA::detail::max_nested_rank; + static_assert(mnr == 1 || mnr == 2); + + auto const& outer = setups.outer(); + auto const& inner = setups.inner(); + + TA_ASSERT(outer.valid()); + + if constexpr (mnr == 2) { + TA_ASSERT(inner.valid()); + if constexpr (DeNestFlag == DeNest::True) { + // reduced nested rank in result + using TA::detail::nested_rank; + static_assert(nested_rank == nested_rank); + TA_ASSERT(inner.rank_C == 0); + using TileC = typename ArrayA::value_type::value_type; + return general_product(A, B, outer, inner); + } else + return general_product(A, B, outer, inner); + } else { + return general_product(A, B, outer); + } +} + +#ifdef TILEDARRAY_HAS_BTAS + +template >> +auto tensor_to_btas_tensor(T const& ta_tensor) { + using value_type = typename T::value_type; + using range_type = typename T::range_type; + + btas::Tensor result{ta_tensor.range()}; + TA::tensor_to_btas_subtensor(ta_tensor, result); + return result; +} + +template >> +auto btas_tensor_to_tensor( + btas::Tensor const& btas_tensor) { + TA::Tensor result{TA::Range(btas_tensor.range())}; + TA::btas_subtensor_to_tensor(btas_tensor, result); + return result; +} + +/// +/// @c einsum_annot pattern example: 'ik,kj->ij'. See tensor_contract function. 
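// Usage sketch for manual_eval above (array types and tilings are
// illustrative; assumes the DeNest template flag defaults to no denesting):
//
//   using ToT = TA::DistArray<TA::Tensor<TA::Tensor<double>>>;
//   auto A = random_array<ToT>(TA::TiledRange{{0, 2}, {0, 2}}, TA::Range{3, 3});
//   auto B = random_array<ToT>(TA::TiledRange{{0, 2}, {0, 2}}, TA::Range{3, 3});
//   // reference result for C("i,j;a,b") = A("i,k;a,c") * B("k,j;c,b")
//   auto C = manual_eval(OuterInnerSetup("ik;ac,kj;cb->ij;ab"), A, B);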
+/// +template , bool> = true> +auto tensor_contract_btas(std::string const& einsum_annot, T const& A, + T const& B) { + using ::Einsum::string::split2; + auto [ab, aC] = split2(einsum_annot, "->"); + auto [aA, aB] = split2(ab, ","); + + using NumericT = typename T::numeric_type; + + struct { + btas::Tensor A, B, C; + } btas_tensor{tensor_to_btas_tensor(A), tensor_to_btas_tensor(B), {}}; + + btas::contract(NumericT{1}, btas_tensor.A, aA, btas_tensor.B, aB, NumericT{0}, + btas_tensor.C, aC); + + return btas_tensor_to_tensor(btas_tensor.C); +} + +/// +/// \tparam T TA::Tensor type +/// \param einsum_annot see tensor_contract +/// \return True when TA::detail::tensor_contract and btas::contract produce +/// the same result. Performs bitwise comparison. +/// +template >> +auto tensor_contract_equal(std::string const& einsum_annot, T const& A, + T const& B) { + T result_ta = tensor_contract(einsum_annot, A, B); + T result_btas = tensor_contract_btas(einsum_annot, A, B); + return result_ta == result_btas; +} + +#endif + /* * * When generating arrays containing tensors of tensors (ToT) we adopt simple @@ -231,15 +929,15 @@ struct ToTArrayFixture { * - Same type * - Either both are initialized or both are not initialized * - Same MPI context - * - Same shape + * - Same shape (unless the template parameter ShapeCompFlag is set to ShapeComp::False) * - Same distribution * - Same tiling * - Components are bit-wise equal (i.e., 3.1400000000 != 3.1400000001) * * TODO: pmap comparisons */ - template + template static bool are_equal(const DistArray& lhs, const DistArray& rhs) { // Same type @@ -254,7 +952,8 @@ struct ToTArrayFixture { if (&lhs.world() != &rhs.world()) return false; // Same shape? - if (lhs.shape() != rhs.shape()) return false; + if constexpr (ShapeCompFlag == ShapeComp::True) + if (lhs.shape() != rhs.shape()) return false; // Same pmap?
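// With the ShapeComp flag introduced above, callers can opt out of comparing
// shapes, e.g. when only tile data is expected to match (sketch, assuming
// two initialized arrays lhs and rhs of the same type):
//
//   bool same = ToTArrayFixture::are_equal<ShapeComp::False>(lhs, rhs);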
// if(*lhs.pmap() != *rhs.pmap()) return false; diff --git a/tests/tot_dist_array_part1.cpp b/tests/tot_dist_array_part1.cpp index e71392ef8c..d95bb050a2 100644 --- a/tests/tot_dist_array_part1.cpp +++ b/tests/tot_dist_array_part1.cpp @@ -331,7 +331,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(begin, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.begin(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.begin(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -344,7 +344,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_begin, TestParam, test_params) { { const tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.begin(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.begin(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -356,7 +356,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_begin, TestParam, test_params) { BOOST_AUTO_TEST_CASE_TEMPLATE(end, TestParam, test_params) { { tensor_type t; - if (m_world.nproc() == 1) BOOST_CHECK_THROW(t.end(), TiledArray::Exception); + if (m_world.nproc() == 1) + BOOST_CHECK_TA_ASSERT(t.end(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -368,7 +369,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(end, TestParam, test_params) { BOOST_AUTO_TEST_CASE_TEMPLATE(const_end, TestParam, test_params) { { const tensor_type t; - if (m_world.nproc() == 1) BOOST_CHECK_THROW(t.end(), TiledArray::Exception); + if (m_world.nproc() == 1) + BOOST_CHECK_TA_ASSERT(t.end(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -391,7 +393,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(find, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.find(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.find(0), TiledArray::Exception); } for (auto tr_t : run_all()) { diff --git a/tests/tot_dist_array_part2.cpp b/tests/tot_dist_array_part2.cpp index b916812884..ffd1883198 100644 --- a/tests/tot_dist_array_part2.cpp +++ b/tests/tot_dist_array_part2.cpp @@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill_local, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.fill_local(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(t.fill_local(inner_type{}), except_t); } } @@ -56,7 +56,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill_local, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(already_set.fill_local(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(already_set.fill_local(inner_type{}), except_t); }*/ // Test we can actually fill tiles @@ -86,7 +86,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.fill(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(t.fill(inner_type{}), except_t); } } @@ -106,7 +106,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(fill, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(already_set.fill(inner_type{}), except_t); + BOOST_CHECK_TA_ASSERT(already_set.fill(inner_type{}), except_t); }*/ // Test we can actually fill tiles @@ -145,7 +145,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_tiles, TestParam, test_params) { tensor_type t; if (m_world.nproc() == 1) { auto l = [](const Range&) { return tile_type{}; }; - BOOST_CHECK_THROW(t.init_tiles(l), except_t); + BOOST_CHECK_TA_ASSERT(t.init_tiles(l), except_t); } } @@ -172,7 +172,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_tiles, TestParam, test_params) { // Test that it throws if a tile 
is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(corr.init_tiles(l), except_t); + BOOST_CHECK_TA_ASSERT(corr.init_tiles(l), except_t); }*/ // Test we can actually fill tiles @@ -200,7 +200,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_elements, TestParam, test_params) { tensor_type t; auto l = [](const index_type&) { return inner_type{}; }; if (m_world.nproc() == 1) { - BOOST_CHECK_THROW(t.init_elements(l), except_t); + BOOST_CHECK_TA_ASSERT(t.init_elements(l), except_t); } } @@ -226,7 +226,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(init_elements, TestParam, test_params) { // Test that it throws if a tile is already set /*{ if(m_world.nproc() == 1) - BOOST_CHECK_THROW(corr.init_elements(l), except_t); + BOOST_CHECK_TA_ASSERT(corr.init_elements(l), except_t); }*/ // Test we can actually fill tiles @@ -245,7 +245,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(trange, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.trange(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.trange(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -259,7 +259,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(tiles_range, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.tiles_range(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.tiles_range(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -274,7 +274,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(elements_range, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.elements_range(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.elements_range(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -289,7 +289,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(size, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.size(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.size(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -303,7 +303,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(world, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.world(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.world(), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -317,7 +317,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(pmap, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.pmap(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.pmap(), TiledArray::Exception); } } @@ -325,7 +325,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(shape, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.shape(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.shape(), TiledArray::Exception); } using shape_type = typename tensor_type::shape_type; for (auto tr_t : run_all()) { @@ -351,9 +351,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(call_operator, TestParam, test_params) { if (m_world.nproc() == 1) { using except_t = TiledArray::Exception; // Throws if no semicolon - BOOST_CHECK_THROW(t(outer_idx), except_t); + BOOST_CHECK_TA_ASSERT(t(outer_idx), except_t); // Throws if wrong outer rank - BOOST_CHECK_THROW(t("i,j,k,l,m;" + inner_idx), except_t); + BOOST_CHECK_TA_ASSERT(t("i,j,k,l,m;" + inner_idx), except_t); } auto vars = outer_idx + ";" + inner_idx; @@ -374,9 +374,9 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(const_call_operator, TestParam, test_params) { if (m_world.nproc() == 1) { using except_t = TiledArray::Exception; // Throws if no semicolon - BOOST_CHECK_THROW(t(outer_idx), except_t); + BOOST_CHECK_TA_ASSERT(t(outer_idx), except_t); // Throws if 
wrong outer rank - BOOST_CHECK_THROW(t("i,j,k,l,m;" + inner_idx), except_t); + BOOST_CHECK_TA_ASSERT(t("i,j,k,l,m;" + inner_idx), except_t); } auto vars = outer_idx + ";" + inner_idx; @@ -399,7 +399,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_dense, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.is_dense(), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.is_dense(), TiledArray::Exception); } using shape_type = typename tensor_type::shape_type; @@ -415,7 +415,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.owner(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.owner(0), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -426,11 +426,11 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner, TestParam, test_params) { const auto& upbound = tr.tiles_range().upbound(); // Test throws if index is out of bounds - BOOST_CHECK_THROW(corr.owner(upbound), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.owner(upbound), TiledArray::Exception); // Throws if index has wrong rank std::vector bad_idx(upbound.size() + 1, 0); - BOOST_CHECK_THROW(corr.owner(bad_idx), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.owner(bad_idx), TiledArray::Exception); } for (auto idx : corr.tiles_range()) { @@ -445,7 +445,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner_init_list, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.owner({0}), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.owner({0}), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -459,13 +459,13 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(owner_init_list, TestParam, test_params) { // Test throws if index is out of bounds if (rank == 1) - BOOST_CHECK_THROW(corr.owner({upbound[0]}), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner({upbound[0]}), except_t); else if (rank == 2) - BOOST_CHECK_THROW(corr.owner({upbound[0], upbound[1]}), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner({upbound[0], upbound[1]}), except_t); // Throws if index has wrong rank std::initializer_list il2{0, 0, 0, 0, 0, 0}; - BOOST_CHECK_THROW(corr.owner(il2), except_t); + BOOST_CHECK_TA_ASSERT(corr.owner(il2), except_t); } for (auto idx : corr.tiles_range()) { @@ -484,7 +484,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local, TestParam, test_params) { { tensor_type t; if (m_world.nproc() == 1) - BOOST_CHECK_THROW(t.is_local(0), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(t.is_local(0), TiledArray::Exception); } for (auto tr_t : run_all()) { @@ -495,7 +495,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(is_local, TestParam, test_params) { const auto& upbound = tr.tiles_range().upbound(); // Test throws if index is out of bounds - BOOST_CHECK_THROW(corr.is_local(upbound), TiledArray::Exception); + BOOST_CHECK_TA_ASSERT(corr.is_local(upbound), TiledArray::Exception); // Throws if index has wrong rank std::vector bad_idx(upbound.size() + 1, 0); diff --git a/tests/type_traits.cpp b/tests/type_traits.cpp index 105ae6ff72..77940bcb6f 100644 --- a/tests/type_traits.cpp +++ b/tests/type_traits.cpp @@ -275,4 +275,32 @@ BOOST_AUTO_TEST_CASE(convertibility) { } } +BOOST_AUTO_TEST_CASE(tensor) { + using TI = TiledArray::Tensor; + using TTI = TiledArray::Tensor>; + using TTTI = TiledArray::Tensor>>; + using TD = TiledArray::Tensor; + using TTD = TiledArray::Tensor>; + using TTTD = + TiledArray::Tensor>>; + + using namespace TiledArray::detail; + BOOST_CHECK((is_tensor_v)); + BOOST_CHECK(!(is_tensor_v)); + BOOST_CHECK((is_tensor_of_tensor_v)); + 
BOOST_CHECK(!(is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((!is_tensor_of_tensor_v)); + BOOST_CHECK((is_nested_tensor_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); + BOOST_CHECK((!tensors_have_equal_nested_rank_v)); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/tests/unit_test_config.h.in b/tests/unit_test_config.h.in index bd23b8414b..59e485634f 100644 --- a/tests/unit_test_config.h.in +++ b/tests/unit_test_config.h.in @@ -61,4 +61,24 @@ #define TA_UT_LABEL_DISTRIBUTED *boost::unit_test::label("distributed") #define TA_UT_LABEL_SERIAL *boost::unit_test::label("serial") +#if (TA_ASSERT_POLICY == TA_ASSERT_THROW) + +#define BOOST_WARN_TA_ASSERT( S, E ) \ + BOOST_WARN_THROW( S, E ) +#define BOOST_CHECK_TA_ASSERT( S, E ) \ + BOOST_CHECK_THROW( S, E ) +#define BOOST_REQUIRE_TA_ASSERT( S, E ) \ + BOOST_REQUIRE_THROW( S, E ) + +#else + +#define BOOST_WARN_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_WARN_TA_ASSERT(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) +#define BOOST_CHECK_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_CHECK_TA_ASSERT(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) +#define BOOST_REQUIRE_TA_ASSERT( S, E ) \ + BOOST_WARN_MESSAGE( false, "Skipped BOOST_REQUIRE_TA_ASSERT(" BOOST_STRINGIZE(S) "," BOOST_STRINGIZE(E) ") due to TA_ASSERT_POLICY != TA_ASSERT_THROW" ) + +#endif + #endif // TILEDARRAY_CONFIG_H__INCLUDED
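These wrappers let TA_ASSERT-dependent checks degrade to warnings when assertions are not configured to throw. A typical call site (sketch; the array type is illustrative):

    TA::TArrayD t;  // default-constructed, uninitialized
    // expands to BOOST_CHECK_THROW when TA_ASSERT_POLICY == TA_ASSERT_THROW,
    // otherwise emits a BOOST_WARN_MESSAGE recording the skipped check
    BOOST_CHECK_TA_ASSERT(t.trange(), TiledArray::Exception);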