XRPLF · nbougalis · Jun 8, 2022 · scottschurr · Jun 11, 2022 · nbougalis
diff --git a/Builds/CMake/RippledCore.cmake b/Builds/CMake/RippledCore.cmake
@@ -156,6 +156,7 @@ install (
     src/ripple/basics/MathUtilities.h
     src/ripple/basics/safe_cast.h
     src/ripple/basics/Slice.h
+    src/ripple/basics/spinlock.h
     src/ripple/basics/StringUtilities.h
     src/ripple/basics/ToString.h
     src/ripple/basics/UnorderedContainers.h

diff --git a/src/ripple/basics/spinlock.h b/src/ripple/basics/spinlock.h
@@ -0,0 +1,223 @@
+/*
+    This file is part of rippled: https://github.com/ripple/rippled
+    Copyright 2022, Nikolaos D. Bougalis <[email protected]>
+
+    Permission to use, copy, modify, and/or distribute this software for any
+    purpose  with  or without fee is hereby granted, provided that the above
+    copyright notice and this permission notice appear in all copies.
+
+    THE  SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+    WITH  REGARD  TO  THIS  SOFTWARE  INCLUDING  ALL  IMPLIED  WARRANTIES  OF
+    MERCHANTABILITY  AND  FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+    ANY  SPECIAL ,  DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+    WHATSOEVER  RESULTING  FROM  LOSS  OF USE, DATA OR PROFITS, WHETHER IN AN
+    ACTION  OF  CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+*/
+
+#ifndef RIPPLE_BASICS_SPINLOCK_H_INCLUDED
+#define RIPPLE_BASICS_SPINLOCK_H_INCLUDED
+
+#include <atomic>
+#include <cassert>
+#include <limits>
+#include <type_traits>
+
+#ifndef __aarch64__
+#include <immintrin.h>
+#endif
+
+namespace ripple {
+
+namespace detail {
+/** Inform the processor that we are in a tight spin-wait loop.
+
+    Spinlocks caught in tight loops can result in the processor's pipeline
+    filling up with comparison operations, resulting in a misprediction at
+    the time the lock is finally acquired, necessitating pipeline flushing
+    which is ridiculously expensive and results in very high latency.
+
+    This function instructs the processor to "pause" for some architecture
+    specific amount of time, to prevent this.
+ */
+inline void
+spin_pause() noexcept
+{
+#ifdef __aarch64__
+    asm volatile("yield");
+#else
+    _mm_pause();
+#endif
+}
+
+}  // namespace detail
+
+/** @{ */
+/** Classes to handle arrays of spinlocks packed into a single atomic integer:
+
+    Packed spinlocks allow for tremendously space-efficient lock-sharding
+    but they come at a cost.
+
+    First, the implementation is necessarily low-level and uses advanced
+    features like memory ordering and highly platform-specific tricks to
+    maximize performance. This imposes a significant and ongoing cost to
+    developers.
+
+    Second, and perhaps most important, is that the packing of multiple
+    locks into a single integer which, albeit space-efficient, also has
+    performance implications stemming from data dependencies, increased
+    cache-coherency traffic between processors and heavier loads on the
+    processor's load/store units.
+
+    To be sure, these locks can have advantages but they are definitely
+    not general purpose locks and should not be thought of or used that
+    way. The use cases for them are likely few and far between; without
+    a compelling reason to use them, backed by profiling data, it might
+    be best to use one of the standard locking primitives instead. Note
+    that in most common platforms, `std::mutex` is so heavily optimized
+    that it can, usually, outperform spinlocks.
+
+    @tparam T An unsigned integral type (e.g. std::uint16_t)
+ */
+
+/** A class that grabs a single packed spinlock from an atomic integer.
+
+    This class meets the requirements of Lockable:
+        https://en.cppreference.com/w/cpp/named_req/Lockable
+ */
+template <class T>
+class packed_spinlock
+{
+    // clang-format off
+    static_assert(std::is_unsigned_v<T>);
+    static_assert(std::atomic<T>::is_always_lock_free);
+    static_assert(
+        std::is_same_v<decltype(std::declval<std::atomic<T>&>().fetch_or(0)), T> &&
+        std::is_same_v<decltype(std::declval<std::atomic<T>&>().fetch_and(0)), T>,
+        "std::atomic<T>::fetch_and(T) and std::atomic<T>::fetch_and(T) are required by packed_spinlock");
+    // clang-format on
+
+private:
+    std::atomic<T>& bits_;
+    T const mask_;
+
+public:
+    packed_spinlock(packed_spinlock const&) = delete;
+    packed_spinlock&
+    operator=(packed_spinlock const&) = delete;
+
+    /** A single spinlock packed inside the specified atomic
+
+        @param lock The atomic integer inside which the spinlock is packed.
+        @param index The index of the spinlock this object acquires.
+
+        @note For performance reasons, you should strive to have `lock` be
+              on a cacheline by itself.
+     */
+    packed_spinlock(std::atomic<T>& lock, int index)
+        : bits_(lock), mask_(static_cast<T>(1) << index)
+    {
+        assert(index >= 0 && (mask_ != 0));
+    }
+
+    [[nodiscard]] bool
+    try_lock()
+    {
+        return (bits_.fetch_or(mask_, std::memory_order_acquire) & mask_) == 0;
+    }
+
+    void
+    lock()
+    {
+        while (!try_lock())
+        {
+            // The use of relaxed memory ordering here is intentional and
+            // serves to help reduce cache coherency traffic during times
+            // of contention by avoiding writes that would definitely not
+            // result in the lock being acquired.
+            while ((bits_.load(std::memory_order_relaxed) & mask_) != 0)
+                detail::spin_pause();
+        }
+    }
+
+    void
+    unlock()
+    {
+        bits_.fetch_and(~mask_, std::memory_order_release);
+    }
+};
+
+/** A spinlock implemented on top of an atomic integer.
+
+    @note Using `packed_spinlock` and `spinlock` against the same underlying
+          atomic integer can result in `spinlock` not being able to actually
+          acquire the lock during periods of high contention, because of how
+          the two locks operate: `spinlock` will spin trying to grab all the
+          bits at once, whereas any given `packed_spinlock` will only try to
+          grab one bit at a time. Caveat emptor.
+
+    This class meets the requirements of Lockable:
+        https://en.cppreference.com/w/cpp/named_req/Lockable
+ */
+template <class T>
+class spinlock
+{
+    static_assert(std::is_unsigned_v<T>);
+    static_assert(std::atomic<T>::is_always_lock_free);
+
+private:
+    std::atomic<T>& lock_;
+
+public:
+    spinlock(spinlock const&) = delete;
+    spinlock&
+    operator=(spinlock const&) = delete;
+
+    /** Grabs the
+
+        @param lock The atomic integer to spin against.
+
+        @note For performance reasons, you should strive to have `lock` be
+              on a cacheline by itself.
+     */
+    spinlock(std::atomic<T>& lock) : lock_(lock)
+    {
+    }
+
+    [[nodiscard]] bool
+    try_lock()
+    {
+        T expected = 0;
+
+        return lock_.compare_exchange_weak(
+            expected,
+            std::numeric_limits<T>::max(),
+            std::memory_order_acquire,
+            std::memory_order_relaxed);
+    }
+
+    void
+    lock()
+    {
+        while (!try_lock())
+        {
+            // The use of relaxed memory ordering here is intentional and
+            // serves to help reduce cache coherency traffic during times
+            // of contention by avoiding writes that would definitely not
+            // result in the lock being acquired.
+            while (lock_.load(std::memory_order_relaxed) != 0)
+                detail::spin_pause();
+        }
+    }
+
+    void
+    unlock()
+    {
+        lock_.store(0, std::memory_order_release);
+    }
+};
+/** @} */
+
+}  // namespace ripple
+
+#endif
diff --git a/src/ripple/shamap/impl/SHAMapInnerNode.cpp b/src/ripple/shamap/impl/SHAMapInnerNode.cpp
@@ -22,102 +22,19 @@
 #include <ripple/basics/Log.h>
 #include <ripple/basics/Slice.h>
 #include <ripple/basics/contract.h>
+#include <ripple/basics/spinlock.h>
 #include <ripple/beast/core/LexicalCast.h>
 #include <ripple/protocol/HashPrefix.h>
 #include <ripple/protocol/digest.h>
 #include <ripple/shamap/SHAMapTreeNode.h>
 #include <ripple/shamap/impl/TaggedPointer.ipp>
 
-#include <openssl/sha.h>
-
 #include <algorithm>
 #include <iterator>
 #include <utility>
 
-#ifndef __aarch64__
-// This is used for the _mm_pause instruction:
-#include <immintrin.h>
-#endif
-
 namespace ripple {
 
-/** A specialized 16-way spinlock used to protect inner node branches.
-
-    This class packs 16 separate spinlocks into a single 16-bit value. It makes
-    it possible to lock any one lock at once or, alternatively, all together.
-
-    The implementation tries to use portable constructs but has to be low-level
-    for performance.
- */
-class SpinBitlock
-{
-private:
-    std::atomic<std::uint16_t>& bits_;
-    std::uint16_t mask_;
-
-public:
-    SpinBitlock(std::atomic<std::uint16_t>& lock) : bits_(lock), mask_(0xFFFF)
-    {
-    }
-
-    SpinBitlock(std::atomic<std::uint16_t>& lock, int index)
-        : bits_(lock), mask_(1 << index)
-    {
-        assert(index >= 0 && index < 16);
-    }
-
-    [[nodiscard]] bool
-    try_lock()
-    {
-        // If we want to grab all the individual bitlocks at once we cannot
-        // use `fetch_or`! To see why, imagine that `lock_ == 0x0020` which
-        // means that the `fetch_or` would return `0x0020` but all the bits
-        // would already be (incorrectly!) set. Oops!
-        std::uint16_t expected = 0;
-
-        if (mask_ != 0xFFFF)
-            return (bits_.fetch_or(mask_, std::memory_order_acquire) & mask_) ==
-                expected;
-
-        return bits_.compare_exchange_weak(
-            expected,
-            mask_,
-            std::memory_order_acquire,
-            std::memory_order_relaxed);
-    }
-
-    void
-    lock()
-    {
-        // Testing suggests that 99.9999% of the time this will succeed, so
-        // we try to optimize the fast path.
-        if (try_lock())
-            return;
-
-        do
-        {
-            // We try to spin for a few times:
-            for (int i = 0; i != 100; ++i)
-            {
-                if (try_lock())
-                    return;
-
-#ifndef __aarch64__
-                _mm_pause();
-#endif
-            }
-
-            std::this_thread::yield();
-        } while ((bits_.load(std::memory_order_relaxed) & mask_) == 0);
-    }
-
-    void
-    unlock()
-    {
-        bits_.fetch_and(~mask_, std::memory_order_release);
-    }
-};
-
 SHAMapInnerNode::SHAMapInnerNode(
     std::uint32_t cowid,
     std::uint8_t numAllocatedChildren)
@@ -185,7 +102,7 @@ SHAMapInnerNode::clone(std::uint32_t cowid) const
         });
     }
 
-    SpinBitlock sl(lock_);
+    spinlock sl(lock_);
     std::lock_guard lock(sl);
 
     if (thisIsSparse)
@@ -422,7 +339,7 @@ SHAMapInnerNode::getChildPointer(int branch)
 
     auto const index = *getChildIndex(branch);
 
-    SpinBitlock sl(lock_, index);
+    packed_spinlock sl(lock_, index);
     std::lock_guard lock(sl);
     return hashesAndChildren_.getChildren()[index].get();
 }
@@ -435,7 +352,7 @@ SHAMapInnerNode::getChild(int branch)
 
     auto const index = *getChildIndex(branch);
 
-    SpinBitlock sl(lock_, index);
+    packed_spinlock sl(lock_, index);
     std::lock_guard lock(sl);
     return hashesAndChildren_.getChildren()[index];
 }
@@ -462,7 +379,7 @@ SHAMapInnerNode::canonicalizeChild(
     auto [_, hashes, children] = hashesAndChildren_.getHashesAndChildren();
     assert(node->getHash() == hashes[childIndex]);
 
-    SpinBitlock sl(lock_, childIndex);
+    packed_spinlock sl(lock_, childIndex);
     std::lock_guard lock(sl);
 
     if (children[childIndex])