Correct a technical flaw with the spinlock locking:
The existing spinlock code, used to protect SHAMapInnerNode child lists, has a mistake that can allow the same child to be repeatedly locked under some circumstances. The bug was in the `SpinBitLock::lock` loop condition check and would result in the loop terminating early. This commit fixes this and further simplifies the lock loop, making the correctness of the code easier to verify without sacrificing performance.

It also promotes the spinlock class from an implementation detail to a more general purpose, easier to use lock class with clearer semantics. Two different lock types now allow developers to easily grab either a single spinlock from a group of spinlocks (packed in an unsigned integer) or to grab all of the spinlocks at once.

While this commit makes spinlocks more widely available to developers, they are rarely the best tool for the job. Use them judiciously and only after careful consideration.
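To make the new semantics concrete, here is a minimal usage sketch (not part of the commit). It assumes the header is reachable as <ripple/basics/spinlock.h>, matching the include guard below, and uses a hypothetical 16-bit atomic in the spirit of the SHAMapInnerNode child-lock use case. Both classes meet the Lockable requirements, so std::lock_guard works with them.

#include <ripple/basics/spinlock.h>  // assumed install path for the new header

#include <atomic>
#include <cstdint>
#include <mutex>

// One 16-bit word packs sixteen independent locks (hypothetical example).
std::atomic<std::uint16_t> childLocks{0};

void
useOneChild(int slot)
{
    // packed_spinlock: grab only the bit for `slot`; other slots stay free.
    ripple::packed_spinlock<std::uint16_t> sl(childLocks, slot);
    std::lock_guard lock(sl);
    // ... work on the single child guarded by `slot` ...
}

void
useAllChildren()
{
    // spinlock: grab every bit at once, excluding all packed_spinlock users.
    ripple::spinlock<std::uint16_t> sl(childLocks);
    std::lock_guard lock(sl);
    // ... work that needs exclusive access to the whole group ...
}

As the header's own note warns, mixing the two lock types on the same atomic can starve the all-bits spinlock under heavy contention, since it only succeeds when every bit is clear at once.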
Showing 3 changed files with 229 additions and 88 deletions.
@@ -0,0 +1,223 @@
/*
    This file is part of rippled: https://github.com/ripple/rippled
    Copyright 2022, Nikolaos D. Bougalis <[email protected]>
    Permission to use, copy, modify, and/or distribute this software for any
    purpose with or without fee is hereby granted, provided that the above
    copyright notice and this permission notice appear in all copies.
    THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/

#ifndef RIPPLE_BASICS_SPINLOCK_H_INCLUDED
#define RIPPLE_BASICS_SPINLOCK_H_INCLUDED

#include <atomic>
#include <cassert>
#include <limits>
#include <type_traits>

#ifndef __aarch64__
#include <immintrin.h>
#endif

namespace ripple {

namespace detail {
/** Inform the processor that we are in a tight spin-wait loop.
    Spinlocks caught in tight loops can result in the processor's pipeline
    filling up with comparison operations, resulting in a misprediction at
    the time the lock is finally acquired, necessitating pipeline flushing
    which is ridiculously expensive and results in very high latency.
    This function instructs the processor to "pause" for some architecture
    specific amount of time, to prevent this.
 */
inline void
spin_pause() noexcept
{
#ifdef __aarch64__
    asm volatile("yield");
#else
    _mm_pause();
#endif
}

} // namespace detail

/** @{ */
/** Classes to handle arrays of spinlocks packed into a single atomic integer:
    Packed spinlocks allow for tremendously space-efficient lock-sharding
    but they come at a cost.
    First, the implementation is necessarily low-level and uses advanced
    features like memory ordering and highly platform-specific tricks to
    maximize performance. This imposes a significant and ongoing cost to
    developers.
    Second, and perhaps most important, is that the packing of multiple
    locks into a single integer which, albeit space-efficient, also has
    performance implications stemming from data dependencies, increased
    cache-coherency traffic between processors and heavier loads on the
    processor's load/store units.
    To be sure, these locks can have advantages but they are definitely
    not general purpose locks and should not be thought of or used that
    way. The use cases for them are likely few and far between; without
    a compelling reason to use them, backed by profiling data, it might
    be best to use one of the standard locking primitives instead. Note
    that in most common platforms, `std::mutex` is so heavily optimized
    that it can, usually, outperform spinlocks.
    @tparam T An unsigned integral type (e.g. std::uint16_t)
 */

/** A class that grabs a single packed spinlock from an atomic integer.
    This class meets the requirements of Lockable:
    https://en.cppreference.com/w/cpp/named_req/Lockable
 */
template <class T>
class packed_spinlock
{
    // clang-format off
    static_assert(std::is_unsigned_v<T>);
    static_assert(std::atomic<T>::is_always_lock_free);
    static_assert(
        std::is_same_v<decltype(std::declval<std::atomic<T>&>().fetch_or(0)), T> &&
        std::is_same_v<decltype(std::declval<std::atomic<T>&>().fetch_and(0)), T>,
        "std::atomic<T>::fetch_or(T) and std::atomic<T>::fetch_and(T) are required by packed_spinlock");
    // clang-format on

private:
    std::atomic<T>& bits_;
    T const mask_;

public:
    packed_spinlock(packed_spinlock const&) = delete;
    packed_spinlock&
    operator=(packed_spinlock const&) = delete;

    /** A single spinlock packed inside the specified atomic
        @param lock The atomic integer inside which the spinlock is packed.
        @param index The index of the spinlock this object acquires.
        @note For performance reasons, you should strive to have `lock` be
              on a cacheline by itself.
     */
    packed_spinlock(std::atomic<T>& lock, int index)
        : bits_(lock), mask_(static_cast<T>(1) << index)
    {
        assert(index >= 0 && (mask_ != 0));
    }

    [[nodiscard]] bool
    try_lock()
    {
        return (bits_.fetch_or(mask_, std::memory_order_acquire) & mask_) == 0;
    }

    void
    lock()
    {
        while (!try_lock())
        {
            // The use of relaxed memory ordering here is intentional and
            // serves to help reduce cache coherency traffic during times
            // of contention by avoiding writes that would definitely not
            // result in the lock being acquired.
            while ((bits_.load(std::memory_order_relaxed) & mask_) != 0)
                detail::spin_pause();
        }
    }

    void
    unlock()
    {
        bits_.fetch_and(~mask_, std::memory_order_release);
    }
};

/** A spinlock implemented on top of an atomic integer.
    @note Using `packed_spinlock` and `spinlock` against the same underlying
          atomic integer can result in `spinlock` not being able to actually
          acquire the lock during periods of high contention, because of how
          the two locks operate: `spinlock` will spin trying to grab all the
          bits at once, whereas any given `packed_spinlock` will only try to
          grab one bit at a time. Caveat emptor.
    This class meets the requirements of Lockable:
    https://en.cppreference.com/w/cpp/named_req/Lockable
 */
template <class T>
class spinlock
{
    static_assert(std::is_unsigned_v<T>);
    static_assert(std::atomic<T>::is_always_lock_free);

private:
    std::atomic<T>& lock_;

public:
    spinlock(spinlock const&) = delete;
    spinlock&
    operator=(spinlock const&) = delete;

    /** A spinlock over the specified atomic integer.
        @param lock The atomic integer to spin against.
        @note For performance reasons, you should strive to have `lock` be
              on a cacheline by itself.
     */
    spinlock(std::atomic<T>& lock) : lock_(lock)
    {
    }

    [[nodiscard]] bool
    try_lock()
    {
        T expected = 0;

        return lock_.compare_exchange_weak(
            expected,
            std::numeric_limits<T>::max(),
            std::memory_order_acquire,
            std::memory_order_relaxed);
    }

    void
    lock()
    {
        while (!try_lock())
        {
            // The use of relaxed memory ordering here is intentional and
            // serves to help reduce cache coherency traffic during times
            // of contention by avoiding writes that would definitely not
            // result in the lock being acquired.
            while (lock_.load(std::memory_order_relaxed) != 0)
                detail::spin_pause();
        }
    }

    void
    unlock()
    {
        lock_.store(0, std::memory_order_release);
    }
};
/** @} */

} // namespace ripple

#endif
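The comment block in the header frames packed spinlocks as a lock-sharding tool. A short, hypothetical sketch of that pattern follows; the key-to-slot hashing policy, the counters array, and the include path are illustrative assumptions, not code from rippled.

#include <ripple/basics/spinlock.h>  // assumed install path

#include <atomic>
#include <cstdint>
#include <functional>
#include <mutex>
#include <string>

// Sixteen shards protected by sixteen lock bits in a single atomic word.
std::atomic<std::uint16_t> shardLocks{0};
int counters[16] = {};

void
bump(std::string const& key)
{
    // Hash the key onto one of the 16 lock bits (illustrative policy).
    int const slot = static_cast<int>(std::hash<std::string>{}(key) % 16);

    ripple::packed_spinlock<std::uint16_t> sl(shardLocks, slot);
    std::lock_guard guard(sl);
    ++counters[slot];
}

Per the header's own guidance, a plain std::mutex is usually the better default; a sharded packed_spinlock only earns its keep when profiling shows the space savings and contention profile justify it.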