From 76ee8d69c9dea6e77333f386bc3f83c0961b8253 Mon Sep 17 00:00:00 2001 From: seelabs Date: Fri, 12 Apr 2024 17:36:34 -0400 Subject: [PATCH] fix: resolve database deadlock: The `rotateWithLock` function holds a lock while it calls a callback function that's passed in by the caller. This is a problematic design that needs to be used very carefully. In this case, at least one caller passed in a callback that eventually relocks the mutex on the same thread, causing UB (a deadlock was observed). The caller was from SHAMapStoreImpl, and it called `clearCaches`. This `clearCaches` can potentially call `fetchNodeObject`, which tried to relock the mutex. This patch resolves the issue by changing the mutex type to a `recursive_mutex`. Ideally, the code should be rewritten so it doesn't hold the mutex during the callback and the mutex should be changed back to a regular mutex. --- src/ripple/nodestore/impl/DatabaseRotatingImp.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ripple/nodestore/impl/DatabaseRotatingImp.h b/src/ripple/nodestore/impl/DatabaseRotatingImp.h index b2807eeabce..3a9fd302b4f 100644 --- a/src/ripple/nodestore/impl/DatabaseRotatingImp.h +++ b/src/ripple/nodestore/impl/DatabaseRotatingImp.h @@ -22,6 +22,8 @@ #include +#include + namespace ripple { namespace NodeStore { @@ -82,7 +84,13 @@ class DatabaseRotatingImp : public DatabaseRotating private: std::shared_ptr writableBackend_; std::shared_ptr archiveBackend_; - mutable std::mutex mutex_; + // This needs to be a recursive mutex because callbacks in `rotateWithLock` + // can call function that also lock the mutex. A current example of this is + // a callback from SHAMapStoreImp, which calls `clearCaches`. This + // `clearCaches` call eventually calls `fetchNodeObject` which tries to + // relock the mutex. It would be desirable to rewrite the code so the lock + // was not held during a callback. + mutable std::recursive_mutex mutex_; std::shared_ptr fetchNodeObject(