diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bfbd06ce6d..c147526452 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -32,6 +32,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2
 
+    - uses: maxim-lobanov/setup-xcode@v1
+      with:
+        xcode-version: '<14'
+
     - name: Host system info
       shell: bash
      run: cmake -P ${{github.workspace}}/ci/host_system_info.cmake
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e8fce70cf..a43b767e03 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -235,12 +235,15 @@ endif()
 set(INTEGER4 TRUE CACHE BOOL "If TRUE, use integer*4 Fortran integers in BLAS calls. Otherwise use integer*8.")
 mark_as_advanced(INTEGER4)
 
-# Set the CPU L1 cache line size.
-set(VECTOR_ALIGNMENT "16" CACHE STRING "Set the vector alignment in memory (DO NOT CHANGE THIS VALUE UNLESS YOU KNOW WHAT YOU ARE DOING)")
-mark_as_advanced(VECTOR_ALIGNMENT)
-set(TILEDARRAY_ALIGNMENT ${VECTOR_ALIGNMENT})
+# Set the align size
+include(DetectAlignSize)
+if (NOT DEFINED CACHE{TA_ALIGN_SIZE})
+  set(TA_ALIGN_SIZE "${TA_ALIGN_SIZE_DETECTED}" CACHE STRING "Set the default alignment of data buffers used by array tiles (DO NOT CHANGE THIS VALUE UNLESS YOU KNOW WHAT YOU ARE DOING)")
+endif()
+mark_as_advanced(TA_ALIGN_SIZE)
+set(TILEDARRAY_ALIGN_SIZE ${TA_ALIGN_SIZE})
 
-# Set the vectory.
+# Set the CPU L1 cache line size.
 set(CACHE_LINE_SIZE "64" CACHE STRING "Set the CPU L1 cache line size in bytes (DO NOT CHANGE THIS VALUE UNLESS YOU KNOW WHAT YOU ARE DOING)")
 mark_as_advanced(CACHE_LINE_SIZE)
 set(TILEDARRAY_CACHELINE_SIZE ${CACHE_LINE_SIZE})
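Note: `TA_ALIGN_SIZE` ends up as the `TILEDARRAY_ALIGN_SIZE` macro consumed by the sources further down. As a rough illustration only (a standalone sketch, not TiledArray code; the 64-byte fallback and the use of `std::aligned_alloc` are assumptions), this is the kind of over-aligned allocation such a compile-time constant enables:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>

#ifndef TILEDARRAY_ALIGN_SIZE
#define TILEDARRAY_ALIGN_SIZE 64  // stand-in only; the real value comes from the generated config header
#endif

int main() {
  constexpr std::size_t n = 1024;
  // std::aligned_alloc requires the total size to be a multiple of the alignment
  void* p = std::aligned_alloc(TILEDARRAY_ALIGN_SIZE, n * sizeof(double));
  assert(reinterpret_cast<std::uintptr_t>(p) % TILEDARRAY_ALIGN_SIZE == 0);
  std::free(p);
}
```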
diff --git a/INSTALL.md b/INSTALL.md
index 8a1e2f81e2..230cb643fb 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -42,7 +42,7 @@ Both methods are supported. However, for most users we _strongly_ recommend to b
 - Boost.Range: header-only, *only used for unit testing*
 - [BTAS](http://github.com/ValeevGroup/BTAS), tag fba66ad9881ab29ea8df49ac6a6006cab3fb3ce5 . If usable BTAS installation is not found, TiledArray will download and compile BTAS from source. *This is the recommended way to compile BTAS for all users*.
-- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 66b199a08bf5f33b1565811fc202a051ec1b0fbb .
+- [MADNESS](https://github.com/m-a-d-n-e-s-s/madness), tag 40d2e38414179a8ebce508c7339fcee21244ffc6 .
   Only the MADworld runtime and BLAS/LAPACK C API component of MADNESS is used by TiledArray. If usable MADNESS installation is not found, TiledArray will download and compile MADNESS from source. *This is the recommended way to compile MADNESS for all users*.
@@ -393,13 +393,13 @@ directory with:
 
 ## Advanced configure options:
 
-The following CMake cache variables are tuning parameters. You should only
-modify these values if you know the values for your patricular system.
+The following CMake cache variables are for performance tuning. You should only
+modify these values if you know the values for your particular system.
 
-* `VECTOR_ALIGNMENT` -- The alignment of memory for Tensor in bytes [Default=16]
-* `CACHE_LINE_SIZE` -- The cache line size in bytes [Default=64]
+* `TA_ALIGN_SIZE` -- The alignment of memory allocated by TA::Tensor (and other artifacts like TA::host_allocator), in bytes. [Default is platform-specific; 64 if no platform-specific value is found]
+* `TA_CACHE_LINE_SIZE` -- The cache line size in bytes [Default=64]
 
-`VECTOR_ALIGNMENT` controls the alignment of Tensor data, and `CACHE_LINE_SIZE`
+`TA_ALIGN_SIZE` controls the alignment of memory allocated for tiles, and `TA_CACHE_LINE_SIZE`
 controls the size of automatic loop unrolling for tensor operations. TiledArray
 does not currently use explicit vector instructions (i.e. intrinsics), but the
 code is written in such a way that compilers can more easily autovectorize
@@ -416,7 +416,7 @@ support may be added.
 * `TA_TTG` -- Set to `ON` to find or fetch the TTG library. [Default=OFF].
 * `TA_SIGNED_1INDEX_TYPE` -- Set to `OFF` to use unsigned 1-index coordinate type (default for TiledArray 1.0.0-alpha.2 and older). The default is `ON`, which enables the use of negative indices in coordinates.
 * `TA_MAX_SOO_RANK_METADATA` -- Specifies the maximum rank for which to use Small Object Optimization (hence, avoid the use of the heap) for metadata. The default is `8`.
-* `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile memory allocations in TA::Tensor.
+* `TA_TENSOR_MEM_PROFILE` -- Set to `ON` to profile host memory allocations used by TA::Tensor. This causes the use of Umpire for host memory allocation. This also enables additional tracing facilities provided by Umpire; these can be controlled via [environment variable `UMPIRE_LOG_LEVEL`](https://umpire.readthedocs.io/en/develop/sphinx/features/logging_and_replay.html), but note that the default is to log Umpire info into a file rather than stdout.
 * `TA_UT_CTEST_TIMEOUT` -- The value (in seconds) of the timeout to use for running the TA unit tests via CTest when building the `check`/`check-tiledarray` targets. The default timeout is 1500s.
 
 # Build TiledArray
diff --git a/cmake/modules/DetectAlignSize.cmake b/cmake/modules/DetectAlignSize.cmake
new file mode 100644
index 0000000000..6830035fd6
--- /dev/null
+++ b/cmake/modules/DetectAlignSize.cmake
@@ -0,0 +1,17 @@
+# see https://stackoverflow.com/a/69952705 and https://gitlab.kitware.com/cmake/cmake/-/blob/master/Modules/CMakeDetermineCompilerABI.cmake
+
+set(BIN "${CMAKE_PLATFORM_INFO_DIR}/cmake/modules/DetectAlignSize.bin")
+try_compile(DETECT_ALIGN_SIZE_COMPILED
+  ${CMAKE_BINARY_DIR}
+  SOURCES ${PROJECT_SOURCE_DIR}/cmake/modules/DetectAlignSize.cpp
+  CMAKE_FLAGS ${CMAKE_CXX_FLAGS}
+  COPY_FILE "${BIN}"
+  COPY_FILE_ERROR copy_error
+  OUTPUT_VARIABLE OUTPUT
+  )
+if (DETECT_ALIGN_SIZE_COMPILED AND NOT copy_error)
+  file(STRINGS "${BIN}" data REGEX "INFO:align_size\\[[^]]*\\]")
+  if (data MATCHES "INFO:align_size\\[0*([^]]*)\\]")
+    set(TA_ALIGN_SIZE_DETECTED "${CMAKE_MATCH_1}" CACHE INTERNAL "")
+  endif()
+endif()
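DetectAlignSize.cmake follows the CMakeDetermineCompilerABI pattern: compile a probe whose binary embeds an `INFO:align_size[NN]` marker, then scan the binary for it. A minimal sketch of that extraction step redone in plain C++, purely for illustration (the file name is a placeholder; the module itself does this with `file(STRINGS ... REGEX ...)`):

```cpp
#include <fstream>
#include <iostream>
#include <iterator>
#include <regex>
#include <string>

int main() {
  // placeholder path; DetectAlignSize.cmake points this at the try_compile output
  std::ifstream bin("DetectAlignSize.bin", std::ios::binary);
  const std::string contents{std::istreambuf_iterator<char>(bin),
                             std::istreambuf_iterator<char>()};
  std::smatch m;
  if (std::regex_search(contents, m,
                        std::regex{R"(INFO:align_size\[0*(\d+)\])"}))
    std::cout << "detected align size = " << m[1] << " bytes\n";
  else
    std::cout << "marker not found\n";
}
```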
diff --git a/cmake/modules/DetectAlignSize.cpp b/cmake/modules/DetectAlignSize.cpp
new file mode 100644
index 0000000000..7af404e4e8
--- /dev/null
+++ b/cmake/modules/DetectAlignSize.cpp
@@ -0,0 +1,39 @@
+//
+// Created by Eduard Valeyev on 10/18/22.
+//
+
+#if defined(__x86_64__)
+#if defined(__AVX512F__)  // test AVX-512 first since it implies __AVX__
+#define PREFERRED_ALIGN_SIZE 64
+#elif defined(__AVX__)
+#define PREFERRED_ALIGN_SIZE 32
+#else  // 64-bit x86 should have SSE
+#define PREFERRED_ALIGN_SIZE 16
+#endif
+#elif (defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM) || \
+       defined(_M_ARM64))
+#define PREFERRED_ALIGN_SIZE 16
+#elif defined(__VECTOR4DOUBLE__)
+#define PREFERRED_ALIGN_SIZE 32
+#endif
+
+// else: default to typical cache line size
+#ifndef PREFERRED_ALIGN_SIZE
+#define PREFERRED_ALIGN_SIZE 64
+#endif
+
+/* Preferred align size, in bytes. */
+const char info_align_size[] = {
+    /* clang-format off */
+    'I', 'N', 'F', 'O', ':', 'a', 'l', 'i', 'g', 'n', '_', 's', 'i', 'z',
+    'e', '[', ('0' + ((PREFERRED_ALIGN_SIZE / 10) % 10)), ('0' + (PREFERRED_ALIGN_SIZE % 10)), ']',
+    '\0'
+    /* clang-format on */
+};
+
+int main(int argc, char* argv[]) {
+  int require = 0;
+  require += info_align_size[argc];
+  (void)argv;
+  return require;
+}
diff --git a/examples/dgemm/ta_dense.cpp b/examples/dgemm/ta_dense.cpp
index 4f05b662ca..85716cf05f 100644
--- a/examples/dgemm/ta_dense.cpp
+++ b/examples/dgemm/ta_dense.cpp
@@ -139,6 +139,18 @@ void gemm_(TiledArray::World& world, const TiledArray::TiledRange& trange,
       world.gop.fence();
       madness::print_meminfo(world.rank(), str);
     }
+#ifdef TA_TENSOR_MEM_PROFILE
+    {
+      world.gop.fence();
+      std::cout << str << ": TA::Tensor allocated "
+                << umpire::ResourceManager::getInstance()
+                       .getAllocator("HOST")
+                       .getHighWatermark()
+                << " bytes and used "
+                << TA::hostEnv::instance()->host_allocator().getHighWatermark()
+                << " bytes" << std::endl;
+    }
+#endif
   };
 
   memtrace("start");
diff --git a/external/versions.cmake b/external/versions.cmake
index f159d92321..e8cf72846a 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -19,8 +19,8 @@ set(TA_INSTALL_EIGEN_PREVIOUS_VERSION 3.3.7)
 set(TA_INSTALL_EIGEN_URL_HASH SHA256=b4c198460eba6f28d34894e3a5710998818515104d6e74e5cc331ce31e46e626)
 set(TA_INSTALL_EIGEN_PREVIOUS_URL_HASH MD5=b9e98a200d2455f06db9c661c5610496)
 
-set(TA_TRACKED_MADNESS_TAG 66b199a08bf5f33b1565811fc202a051ec1b0fbb)
-set(TA_TRACKED_MADNESS_PREVIOUS_TAG c0df7338779d06df7eaff31644d508940a7cfd90)
+set(TA_TRACKED_MADNESS_TAG 40d2e38414179a8ebce508c7339fcee21244ffc6)
+set(TA_TRACKED_MADNESS_PREVIOUS_TAG 66b199a08bf5f33b1565811fc202a051ec1b0fbb)
 set(TA_TRACKED_MADNESS_VERSION 0.10.1)
 set(TA_TRACKED_MADNESS_PREVIOUS_VERSION 0.10.1)
 
@@ -39,6 +39,6 @@ set(TA_TRACKED_SCALAPACKPP_PREVIOUS_TAG bf17a7246af38d34523bd0099b01d9961d06d311
 set(TA_TRACKED_RANGEV3_TAG 2e0591c57fce2aca6073ad6e4fdc50d841827864)
 set(TA_TRACKED_RANGEV3_PREVIOUS_TAG dbdaa247a25a0daa24c68f1286a5693c72ea0006)
 
-set(TA_TRACKED_TTG_URL https://github.com/therault/ttg.git)
-set(TA_TRACKED_TTG_TAG bb5309a5224e2546a5316daf7fc5c143f450f17b)
-set(TA_TRACKED_TTG_PREVIOUS_TAG 5107143b418384c44587c2776a9e87065d33d670)
+set(TA_TRACKED_TTG_URL https://github.com/TESSEorg/ttg)
+set(TA_TRACKED_TTG_TAG 1251bec25e07a74a05e5cd4cdec181a95a9baa66)
+set(TA_TRACKED_TTG_PREVIOUS_TAG bb5309a5224e2546a5316daf7fc5c143f450f17b)
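The ta_dense.cpp hunk above prints the Umpire "HOST" allocator's high watermark alongside the TiledArray host allocator's. A minimal standalone sketch of querying those Umpire statistics (assumes an Umpire installation; the pool name `example_pool` is made up, and `getHighWatermark()`/`getCurrentSize()` rely on Umpire's introspection, which is on by default):

```cpp
#include <iostream>
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"
#include "umpire/strategy/QuickPool.hpp"

int main() {
  auto& rm = umpire::ResourceManager::getInstance();
  auto host = rm.getAllocator("HOST");
  auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("example_pool", host);

  void* p = pool.allocate(1 << 20);  // 1 MiB
  pool.deallocate(p);

  // high watermark = peak bytes ever held; current size = bytes held right now
  std::cout << "HOST high watermark: " << host.getHighWatermark() << " bytes\n"
            << "pool high watermark: " << pool.getHighWatermark() << " bytes\n"
            << "pool current size:   " << pool.getCurrentSize() << " bytes\n";
}
```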
diff --git a/src/TiledArray/config.h.in b/src/TiledArray/config.h.in
index a2e4f06ce4..68f5dc1374 100644
--- a/src/TiledArray/config.h.in
+++ b/src/TiledArray/config.h.in
@@ -66,7 +66,7 @@
 #cmakedefine TILEDARRAY_HAS_LONG_LONG 1
 
 /* Define the default alignment for arrays required by vector operations. */
-#cmakedefine TILEDARRAY_ALIGNMENT @TILEDARRAY_ALIGNMENT@
+#cmakedefine TILEDARRAY_ALIGN_SIZE @TILEDARRAY_ALIGN_SIZE@
 
 /* Define the size of the CPU L1 cache lines. */
 #cmakedefine TILEDARRAY_CACHELINE_SIZE @TILEDARRAY_CACHELINE_SIZE@
@@ -125,11 +125,11 @@
 
 /* Add macro TILEDARRAY_ALIGNED_STORAGE which forces alignment of variables */
 #if defined(__clang__) || defined(__GNUC__) || defined(__PGI) || defined(__IBMCPP__) || defined(__ARMCC_VERSION)
-#define TILEDARRAY_ALIGNED_STORAGE __attribute__((aligned(TILEDARRAY_ALIGNMENT)))
+#define TILEDARRAY_ALIGNED_STORAGE __attribute__((aligned(TILEDARRAY_ALIGN_SIZE)))
 
 #elif (defined _MSC_VER)
-#define TILEDARRAY_ALIGNED_STORAGE __declspec(align(TILEDARRAY_ALIGNMENT))
+#define TILEDARRAY_ALIGNED_STORAGE __declspec(align(TILEDARRAY_ALIGN_SIZE))
 
 #else
diff --git a/src/TiledArray/external/umpire.h b/src/TiledArray/external/umpire.h
index 2aefa3c6d4..9419968585 100644
--- a/src/TiledArray/external/umpire.h
+++ b/src/TiledArray/external/umpire.h
@@ -74,15 +74,20 @@ class umpire_allocator_impl {
 
     TA_ASSERT(umpalloc_);
 
-    result = static_cast<pointer>(umpalloc_->allocate(n * sizeof(T)));
+    // this, instead of umpalloc_->allocate(n*sizeof(T)), profiles memory use
+    // even if introspection is off
+    result = static_cast<pointer>(
+        umpalloc_->getAllocationStrategy()->allocate_internal(n * sizeof(T)));
 
     return result;
   }
 
   /// deallocate um memory using umpire dynamic pool
-  void deallocate(pointer ptr, size_t) {
+  void deallocate(pointer ptr, size_t size) {
     TA_ASSERT(umpalloc_);
-    umpalloc_->deallocate(ptr);
+    // this, instead of umpalloc_->deallocate(ptr, size), profiles memory use
+    // even if introspection is off
+    umpalloc_->getAllocationStrategy()->deallocate_internal(ptr, size);
   }
 
   const umpire::Allocator* umpire_allocator() const { return umpalloc_; }
diff --git a/src/TiledArray/fwd.h b/src/TiledArray/fwd.h
index 94ef77ebda..51e43d9bee 100644
--- a/src/TiledArray/fwd.h
+++ b/src/TiledArray/fwd.h
@@ -64,8 +64,14 @@ class DensePolicy;
 class SparsePolicy;
 
 // TiledArray Tensors
-// can also use host_allocator and std::allocator for A
-template <typename T, typename A = host_allocator<T>>
+// can use any standard-compliant allocator such as std::allocator
+template <typename T, typename A =
+#ifndef TA_TENSOR_MEM_PROFILE
+                          std::allocator<T>
+#else
+                          host_allocator<T>
+#endif
+          >
 class Tensor;
 
 typedef Tensor<double> TensorD;
diff --git a/src/TiledArray/host/allocator.h b/src/TiledArray/host/allocator.h
index efaaeff9c4..6a0eebde30 100644
--- a/src/TiledArray/host/allocator.h
+++ b/src/TiledArray/host/allocator.h
@@ -58,7 +58,7 @@ class host_allocator_impl : public umpire_allocator_impl<T> {
   template <typename T1, typename T2>
   friend bool operator==(const host_allocator_impl<T1>& lhs,
                          const host_allocator_impl<T2>& rhs) noexcept;
-};  // class host_allocator
+};  // class host_allocator_impl
 
 template <typename T1, typename T2>
 bool operator==(const host_allocator_impl<T1>& lhs,
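The umpire.h change routes allocations through `getAllocationStrategy()->allocate_internal()` so that sizes are tracked even with introspection off, and fwd.h now lets `TA::Tensor` default to a standard allocator. A minimal sketch (not TiledArray's `host_allocator`; the class name and the choice of the "HOST" allocator are illustrative assumptions) of a standard-style allocator that forwards element counts, converted to bytes, to an `umpire::Allocator`:

```cpp
#include <cstddef>
#include "umpire/Allocator.hpp"
#include "umpire/ResourceManager.hpp"

template <typename T>
class umpire_std_allocator {
 public:
  using value_type = T;

  umpire_std_allocator() noexcept
      : alloc_(umpire::ResourceManager::getInstance().getAllocator("HOST")) {}

  template <typename U>
  umpire_std_allocator(const umpire_std_allocator<U>&) noexcept
      : umpire_std_allocator() {}

  // the standard allocator contract passes element counts, so convert to bytes
  T* allocate(std::size_t n) {
    return static_cast<T*>(alloc_.allocate(n * sizeof(T)));
  }
  void deallocate(T* p, std::size_t /* n */) { alloc_.deallocate(p); }

 private:
  umpire::Allocator alloc_;  // lightweight handle, copyable
};

template <typename T, typename U>
bool operator==(const umpire_std_allocator<T>&,
                const umpire_std_allocator<U>&) noexcept {
  return true;  // all instances share the same "HOST" allocator
}
template <typename T, typename U>
bool operator!=(const umpire_std_allocator<T>& a,
                const umpire_std_allocator<U>& b) noexcept {
  return !(a == b);
}
```

Such a wrapper can be dropped into `std::vector<double, umpire_std_allocator<double>>` for quick experiments with pool-backed host storage.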
diff --git a/src/TiledArray/host/env.h b/src/TiledArray/host/env.h
index 2ae0bf6930..21c487222d 100644
--- a/src/TiledArray/host/env.h
+++ b/src/TiledArray/host/env.h
@@ -28,6 +28,7 @@
 // for memory management
 #include
+#include <mutex>
 #include
 #include
 #include
 
@@ -42,11 +43,11 @@ namespace TiledArray {
 
 /**
- * hostEnv set up global environment
+ * hostEnv maintains the (host-side, as opposed to device-side) environment,
+ * such as memory allocators
  *
- * Singleton class
+ * \note this is a Singleton
  */
-
 class hostEnv {
  public:
   ~hostEnv() = default;
@@ -56,20 +57,26 @@ class hostEnv {
   hostEnv& operator=(const hostEnv&) = delete;
   hostEnv& operator=(hostEnv&&) = delete;
 
-  /// access the instance, if not initialized will be initialized using default
-  /// params
+  /// access the singleton instance; if not initialized will be
+  /// initialized via hostEnv::initialize() with the default params
   static std::unique_ptr<hostEnv>& instance() {
     if (!instance_accessor()) {
-      initialize(TiledArray::get_default_world());
+      initialize();
     }
     return instance_accessor();
   }
 
   /// initialize the instance using explicit params
-  static void initialize(World& world,
-                         const std::uint64_t max_memory_size = (1ul << 40),
-                         const std::uint64_t page_size = (1ul << 22)) {
-    // initialize only when not initialized
+  /// \param max_memory_size max amount of memory (bytes) that TiledArray
+  ///        can use for storage of TA::Tensor objects (these by default
+  ///        store DistArray tile data and (if sparse) shape) [default=2^40]
+  /// \param page_size memory added to the pool in chunks of at least
+  ///        this size (bytes) [default=2^25]
+  static void initialize(const std::uint64_t max_memory_size = (1ul << 40),
+                         const std::uint64_t page_size = (1ul << 25)) {
+    static std::mutex mtx;  // to make initialize() reentrant
+    std::scoped_lock lock{mtx};
+    // only the winner of the lock race gets to initialize
     if (instance_accessor() == nullptr) {
       // uncomment to debug umpire ops
       //
@@ -80,26 +87,24 @@ class hostEnv {
       auto& rm = umpire::ResourceManager::getInstance();
 
-      // turn off Umpire introspection for non-Debug builds
-#ifndef NDEBUG
-      constexpr auto introspect = true;
-#else
+      // N.B. we don't rely on Umpire introspection (even for profiling)
       constexpr auto introspect = false;
-#endif
 
-      // allocate zero memory for device pool, same grain for subsequent allocs
+      // use QuickPool for host memory allocation, with min grain of 1 page
       auto host_size_limited_alloc =
          rm.makeAllocator<umpire::strategy::SizeLimiter, introspect>(
-              "size_limited_alloc", rm.getAllocator("HOST"), max_memory_size);
+              "SizeLimited_HOST", rm.getAllocator("HOST"), max_memory_size);
       auto host_dynamic_pool =
          rm.makeAllocator<umpire::strategy::QuickPool, introspect>(
-              "HostDynamicPool", host_size_limited_alloc, 0, page_size);
-      auto thread_safe_host_dynamic_pool =
+              "QuickPool_SizeLimited_HOST", host_size_limited_alloc, page_size,
+              page_size, /* alignment */ TILEDARRAY_ALIGN_SIZE);
+      auto thread_safe_host_aligned_dynamic_pool =
          rm.makeAllocator<umpire::strategy::ThreadSafeAllocator, introspect>(
-              "ThreadSafeHostDynamicPool", host_dynamic_pool);
+              "ThreadSafe_QuickPool_SizeLimited_HOST", host_dynamic_pool);
 
       auto host_env = std::unique_ptr<hostEnv>(
-          new hostEnv(world, thread_safe_host_dynamic_pool));
+          new hostEnv(TiledArray::get_default_world(),
+                      thread_safe_host_aligned_dynamic_pool));
       instance_accessor() = std::move(host_env);
     }
   }
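Based on the `initialize()` signature shown in the hunk above, a hypothetical usage sketch (the umbrella header, pool sizes, and placement of the call are assumptions, not prescribed by the PR) that pre-sizes the host pool before any `TA::Tensor` is created and reads back its high watermark at the end:

```cpp
#include <iostream>
#include <tiledarray.h>  // assumed umbrella header

int main(int argc, char** argv) {
  auto& world = TA::initialize(argc, argv);

  // cap the host pool at 8 GiB and grow it in 64 MiB chunks;
  // must run before the first TA::Tensor allocation
  TA::hostEnv::initialize(/* max_memory_size = */ 8ul << 30,
                          /* page_size = */ 64ul << 20);

  // ... create and use DistArrays ...

  if (world.rank() == 0)
    std::cout << "host pool high watermark: "
              << TA::hostEnv::instance()->host_allocator().getHighWatermark()
              << " bytes\n";

  TA::finalize();
  return 0;
}
```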
diff --git a/src/TiledArray/math/linalg/ttg/util.h b/src/TiledArray/math/linalg/ttg/util.h
index 1a9e60b7ba..76e3c023ec 100644
--- a/src/TiledArray/math/linalg/ttg/util.h
+++ b/src/TiledArray/math/linalg/ttg/util.h
@@ -229,7 +229,7 @@ auto make_writer_ttg(
   auto keymap2 = [pmap = A.pmap_shared(),
                   range = A.trange().tiles_range()](const Key2& key) {
-    const auto IJ = range.ordinal({key.I, key.J});
+    const auto IJ = range.ordinal({key[0], key[1]});
     return pmap->owner(IJ);
   };
 
@@ -239,8 +239,8 @@
                    (Layout == lapack::Layout::ColMajor ? tile.rows()
                                                        : tile.cols()));
     // the code below only works if tile's LD == rows
-    const int I = key.I;
-    const int J = key.J;
+    const int I = key[0];
+    const int J = key[1];
     auto rng = A.trange().make_tile_range({I, J});
     if constexpr (Uplo != lapack::Uplo::General) {
       if (I != J &&
diff --git a/src/TiledArray/tensor/tensor.h b/src/TiledArray/tensor/tensor.h
index 5fe2a3cb21..ed0500d7e7 100644
--- a/src/TiledArray/tensor/tensor.h
+++ b/src/TiledArray/tensor/tensor.h
@@ -40,13 +40,6 @@ void gemm(Alpha alpha, const Tensor& A, const Tensor& B,
 
 namespace detail {
 
-#ifdef TA_TENSOR_MEM_PROFILE
-inline static std::mutex
-    ta_tensor_mem_profile_mtx;  // protects the following statics
-inline static std::uint64_t nbytes_allocated = 0;
-inline static std::uint64_t max_nbytes_allocated = 0;
-#endif  // TA_TENSOR_MEM_PROFILE
-
 /// Signals that we can take the trace of a Tensor (for numeric \c T)
 template <typename T, typename A>
 struct TraceIsDefined<Tensor<T, A>, enable_if_numeric_t<T>> : std::true_type {};
@@ -97,45 +90,6 @@ class Tensor {
   template <typename X>
   using numeric_t = typename TiledArray::detail::numeric_type<X>::type;
 
-#ifdef TA_TENSOR_MEM_PROFILE
-  enum class MemOp { Alloc, Dealloc };
-  void alloc_record(std::uint64_t n, MemOp action) {
-    const double to_MiB =
-        1 / (1024.0 * 1024.0); /* Convert from bytes to MiB */
-    const auto nbytes = n * sizeof(value_type);
-    {
-      std::scoped_lock lock(detail::ta_tensor_mem_profile_mtx);
-      if (action == MemOp::Alloc) {
-        detail::nbytes_allocated += nbytes;
-        detail::max_nbytes_allocated =
-            std::max(detail::nbytes_allocated, detail::max_nbytes_allocated);
-      } else
-        detail::nbytes_allocated -= nbytes;
-    }
-    char buf[1024];
-    auto value_type_str = []() {
-      if constexpr (std::is_same_v<value_type, double>)
-        return "double";
-      else if constexpr (std::is_same_v<value_type, float>)
-        return "float";
-      else if constexpr (std::is_same_v<value_type, std::complex<double>>)
-        return "zdouble";
-      else if constexpr (std::is_same_v<value_type, std::complex<float>>)
-        return "zfloat";
-      else
-        return "";
-    };
-    std::snprintf(
-        buf, 1023,
-        "TA::Tensor<%s>: %sallocated %lf MiB [wm = %lf MiB hwm = %lf MiB]\n",
-        value_type_str(), (action == MemOp::Dealloc ? "de" : " "),
-        nbytes * to_MiB, detail::nbytes_allocated * to_MiB,
-        detail::max_nbytes_allocated * to_MiB);
-    auto& os = madness::print_meminfo_ostream();
-    os << buf;
-    os.flush();
-  }
-#endif
-
   template <typename... Ts>
   struct is_tensor {
     static constexpr bool value = detail::is_tensor<Ts...>::value ||
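The keymap change above addresses `Key2` by index and maps a tile's row-major ordinal to an owning rank through the process map. A self-contained toy sketch of that ordinal-to-owner pattern (hypothetical types and a round-robin "pmap"; not the TTG `Key2` or TiledArray `Pmap`):

```cpp
#include <array>
#include <cstddef>
#include <iostream>

int main() {
  const std::array<std::size_t, 2> tiles_per_dim{4, 5};  // a 4x5 tile grid
  const std::size_t nproc = 6;

  // row-major ordinal, analogous to Range::ordinal({i, j})
  auto ordinal = [&](std::array<std::size_t, 2> key) {
    return key[0] * tiles_per_dim[1] + key[1];
  };
  // round-robin ownership, standing in for pmap->owner(ordinal)
  auto owner = [&](std::size_t ord) { return ord % nproc; };

  const std::array<std::size_t, 2> key{2, 3};
  std::cout << "tile {2,3} -> ordinal " << ordinal(key) << " -> owner rank "
            << owner(ordinal(key)) << "\n";
}
```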
"de" : " "), - nbytes * to_MiB, detail::nbytes_allocated * to_MiB, - detail::max_nbytes_allocated * to_MiB); - auto& os = madness::print_meminfo_ostream(); - os << buf; - os.flush(); - } -#endif - template struct is_tensor { static constexpr bool value = detail::is_tensor::value || @@ -149,9 +103,6 @@ class Tensor { size_t size = range_.volume() * batch_size; allocator_type allocator; auto* ptr = allocator.allocate(size); -#ifdef TA_TENSOR_MEM_PROFILE - alloc_record(size, MemOp::Alloc); -#endif if (default_construct) { std::uninitialized_default_construct_n(ptr, size); // std::uninitialized_value_construct_n(ptr, size); @@ -160,9 +111,6 @@ class Tensor { size](auto&& ptr) mutable { std::destroy_n(ptr, size); allocator.deallocate(ptr, size); -#ifdef TA_TENSOR_MEM_PROFILE - alloc_record(size, MemOp::Dealloc); -#endif }; this->data_ = std::shared_ptr(ptr, std::move(deleter)); } @@ -172,9 +120,6 @@ class Tensor { size_t size = range_.volume() * batch_size; allocator_type allocator; auto* ptr = allocator.allocate(size); -#ifdef TA_TENSOR_MEM_PROFILE - alloc_record(size, MemOp::Alloc); -#endif if (default_construct) { std::uninitialized_default_construct_n(ptr, size); // std::uninitialized_value_construct_n(ptr, size); @@ -182,10 +127,7 @@ class Tensor { auto deleter = [this, allocator = std::move(allocator), size](auto&& ptr) mutable { std::destroy_n(ptr, size); - allocator.deallocate(ptr, size); -#ifdef TA_TENSOR_MEM_PROFILE - alloc_record(size, MemOp::Dealloc); -#endif + allocator.deallocate(ptr, size * sizeof(T)); }; this->data_ = std::shared_ptr(ptr, std::move(deleter)); } @@ -2055,8 +1997,8 @@ class Tensor { }; // class Tensor -template -Tensor operator*(const Permutation &p, const Tensor &t) { +template +Tensor operator*(const Permutation& p, const Tensor& t) { return t.permute(p); } diff --git a/src/TiledArray/tile.h b/src/TiledArray/tile.h index 99be71851c..53bb56f444 100644 --- a/src/TiledArray/tile.h +++ b/src/TiledArray/tile.h @@ -557,8 +557,8 @@ class Tile { // Serialization ----------------------------------------------------------- template >::type* = nullptr> + typename std::enable_if< + madness::is_output_archive_v>::type* = nullptr> void serialize(Archive& ar) const { // Serialize data for empty tile check bool empty = !static_cast(pimpl_); @@ -570,8 +570,8 @@ class Tile { } template >::type* = nullptr> + typename std::enable_if< + madness::is_input_archive_v>::type* = nullptr> void serialize(Archive& ar) { // Check for empty tile bool empty = false; @@ -1150,7 +1150,7 @@ inline Tile& inplace_binary(Tile& left, const Tile& right, // Scaling operations -------------------------------------------------------- -/// Scalar the tile argument +/// Scale the tile argument /// \tparam Arg The tile argument type /// \param arg The left-hand argument to be scaled @@ -1614,10 +1614,10 @@ inline std::ostream& operator<<(std::ostream& os, const Tile& tile) { template struct Cast< TiledArray::Tensor, Tile, - std::void_t, T>>()( - std::declval()))>> { + std::void_t< + decltype(std::declval, + T>>()(std::declval()))>> { auto operator()(const Tile& arg) const { return TiledArray::Cast< TiledArray::Tensor, T>{}( diff --git a/src/TiledArray/val_array.h b/src/TiledArray/val_array.h index 65c2b72785..9a8620443d 100644 --- a/src/TiledArray/val_array.h +++ b/src/TiledArray/val_array.h @@ -26,12 +26,9 @@ #ifndef TILEDARRAY_SHARED_BUFFER_H__INCLUDED #define TILEDARRAY_SHARED_BUFFER_H__INCLUDED +#include #include -#ifndef TILEDARRAY_DEFAULT_ALIGNMENT -#define TILEDARRAY_DEFAULT_ALIGNMENT 16 -#endif // 
diff --git a/src/TiledArray/val_array.h b/src/TiledArray/val_array.h
index 65c2b72785..9a8620443d 100644
--- a/src/TiledArray/val_array.h
+++ b/src/TiledArray/val_array.h
@@ -26,12 +26,9 @@
 #ifndef TILEDARRAY_SHARED_BUFFER_H__INCLUDED
 #define TILEDARRAY_SHARED_BUFFER_H__INCLUDED
 
+#include <TiledArray/config.h>
 #include <TiledArray/size_array.h>
 
-#ifndef TILEDARRAY_DEFAULT_ALIGNMENT
-#define TILEDARRAY_DEFAULT_ALIGNMENT 16
-#endif  // TILEDARRAY_ALIGNMENT
-
 namespace TiledArray {
 namespace detail {
@@ -58,7 +55,7 @@ class ValArray : private SizeArray<T> {
   typedef typename SizeArray<T>::const_iterator
       const_iterator;  ///< Const iterator type
 
-  static const std::size_t alignment = TILEDARRAY_DEFAULT_ALIGNMENT;
+  static const std::size_t alignment = TILEDARRAY_ALIGN_SIZE;
 
  private:
  /// The pointer to reference counter
@@ -453,8 +450,7 @@ class ValArray : private SizeArray<T> {
  /// \tparam Archive An output archive type
  /// \param[out] ar an Archive object
  template <typename Archive,
-            typename = std::enable_if_t<
-                madness::archive::is_output_archive<Archive>::value>>
+            typename = std::enable_if_t<madness::is_output_archive_v<Archive>>>
  void serialize(Archive& ar) const {
    // need to write size first to be able to init when deserializing
    ar& size() & madness::archive::wrap(data(), size());
@@ -465,8 +461,7 @@ class ValArray : private SizeArray<T> {
  /// \tparam Archive An input archive type
  /// \param[out] ar an Archive object
  template <typename Archive,
-            typename = std::enable_if_t<
-                madness::archive::is_input_archive<Archive>::value>>
+            typename = std::enable_if_t<madness::is_input_archive_v<Archive>>>
  void serialize(Archive& ar) {
    size_t sz = 0;
    ar& sz;
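The serialization changes here and in tile.h swap `madness::archive::is_*_archive<A>::value` for the `madness::is_*_archive_v<A>` shortcuts. A generic sketch of that trait/`_v`/SFINAE pattern with stand-in types (not MADNESS's actual definitions):

```cpp
#include <iostream>
#include <type_traits>

struct BinaryOutputArchive {};  // stand-ins for real archive types
struct BinaryInputArchive {};

template <typename A>
struct is_output_archive : std::is_same<A, BinaryOutputArchive> {};

// the _v variable template is just shorthand for ...::value
template <typename A>
inline constexpr bool is_output_archive_v = is_output_archive<A>::value;

// overload participates only for output archives
template <typename Archive,
          typename = std::enable_if_t<is_output_archive_v<Archive>>>
void serialize(Archive&) {
  std::cout << "serializing to an output archive\n";
}

int main() {
  BinaryOutputArchive ar;
  serialize(ar);  // OK
  // BinaryInputArchive in; serialize(in);  // would not compile: SFINAE removes the overload
}
```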