From cea43099d291e409d58f84312d5e56d690cfc478 Mon Sep 17 00:00:00 2001 From: Mark Travis Date: Fri, 22 Mar 2024 13:22:29 -0700 Subject: [PATCH] Don't reach consensus as quickly if no other proposals seen: (#4763) This fixes a case where a peer can desync under a certain timing circumstance--if it reaches a certain point in consensus before it receives proposals. This was noticed under high transaction volumes, namely when we arrive at the point of deciding whether consensus is reached after the minimum establish phase duration but before having received any proposals. This could be caused by finishing the previous round slightly faster and/or having some delay in receiving proposals. Existing behavior arrives at consensus immediately after the minimum establish duration with no proposals. This causes us to desync because we then close a non-validated ledger. The change in this PR causes us to wait for a configured threshold before making the decision to arrive at consensus with no proposals. This allows validators to catch up and for brief delays in receiving proposals to be absorbed. There should be no drawback since, with no proposals coming in, we needn't be in a huge rush to jump ahead. --- src/ripple/consensus/Consensus.cpp | 39 ++++++++++++++++++++++----- src/ripple/consensus/Consensus.h | 2 +- src/ripple/consensus/ConsensusParms.h | 2 +- src/test/consensus/Consensus_test.cpp | 9 +++++-- 4 files changed, 42 insertions(+), 10 deletions(-) diff --git a/src/ripple/consensus/Consensus.cpp b/src/ripple/consensus/Consensus.cpp index 1b08859c889..cc1f84270e7 100644 --- a/src/ripple/consensus/Consensus.cpp +++ b/src/ripple/consensus/Consensus.cpp @@ -87,11 +87,24 @@ checkConsensusReached( std::size_t agreeing, std::size_t total, bool count_self, - std::size_t minConsensusPct) + std::size_t minConsensusPct, + bool reachedMax) { - // If we are alone, we have a consensus + // If we are alone for too long, we have consensus. 
+ // Delaying consensus like this avoids a circumstance where a peer + // gets ahead of proposers insofar as it has not received any proposals. + // This could happen if there's a slowdown in receiving proposals. Reaching + // consensus prematurely in this way means that the peer will likely desync. + // The check for reachedMax should allow plenty of time for proposals to + // arrive, and there should be no downside. If a peer is truly not + // receiving any proposals, then there should be no hurry. There's + // really nowhere to go. if (total == 0) - return true; + { + if (reachedMax) + return true; + return false; + } if (count_self) { @@ -120,7 +133,13 @@ checkConsensus( << prevProposers << " agree=" << currentAgree << " validated=" << currentFinished << " time=" << currentAgreeTime.count() << "/" - << previousAgreeTime.count(); + << previousAgreeTime.count() << " proposing? " << proposing + << " minimum duration to reach consensus: " + << parms.ledgerMIN_CONSENSUS.count() << "ms" + << " max consensus time " + << parms.ledgerMAX_CONSENSUS.count() << "s" + << " minimum consensus percentage: " + << parms.minCONSENSUS_PCT; if (currentAgreeTime <= parms.ledgerMIN_CONSENSUS) return ConsensusState::No; @@ -139,7 +158,11 @@ checkConsensus( // Have we, together with the nodes on our UNL list, reached the threshold // to declare consensus? if (checkConsensusReached( - currentAgree, currentProposers, proposing, parms.minCONSENSUS_PCT)) + currentAgree, + currentProposers, + proposing, + parms.minCONSENSUS_PCT, + currentAgreeTime > parms.ledgerMAX_CONSENSUS)) { JLOG(j.debug()) << "normal consensus"; return ConsensusState::Yes; @@ -148,7 +171,11 @@ checkConsensus( // Have sufficient nodes on our UNL list moved on and reached the threshold // to declare consensus? 
if (checkConsensusReached( - currentFinished, currentProposers, false, parms.minCONSENSUS_PCT)) + currentFinished, + currentProposers, + false, + parms.minCONSENSUS_PCT, + currentAgreeTime > parms.ledgerMAX_CONSENSUS)) { JLOG(j.warn()) << "We see no consensus, but 80% of nodes have moved on"; return ConsensusState::MovedOn; diff --git a/src/ripple/consensus/Consensus.h b/src/ripple/consensus/Consensus.h index ea88e3232ee..248bbdc4a1b 100644 --- a/src/ripple/consensus/Consensus.h +++ b/src/ripple/consensus/Consensus.h @@ -1157,7 +1157,7 @@ Consensus::shouldPause() const std::size_t const offline = trustedKeys.size(); std::stringstream vars; - vars << " (working seq: " << previousLedger_.seq() << ", " + vars << " consensuslog (working seq: " << previousLedger_.seq() << ", " << "validated seq: " << adaptor_.getValidLedgerIndex() << ", " << "am validator: " << adaptor_.validator() << ", " << "have validated: " << adaptor_.haveValidated() << ", " diff --git a/src/ripple/consensus/ConsensusParms.h b/src/ripple/consensus/ConsensusParms.h index 542b3644b42..a0b6c6be8d4 100644 --- a/src/ripple/consensus/ConsensusParms.h +++ b/src/ripple/consensus/ConsensusParms.h @@ -86,7 +86,7 @@ struct ConsensusParms * validators don't appear to be offline that are merely waiting for * laggards. */ - std::chrono::milliseconds ledgerMAX_CONSENSUS = std::chrono::seconds{10}; + std::chrono::milliseconds ledgerMAX_CONSENSUS = std::chrono::seconds{15}; //! 
Minimum number of seconds to wait to ensure others have computed the LCL std::chrono::milliseconds ledgerMIN_CLOSE = std::chrono::seconds{2}; diff --git a/src/test/consensus/Consensus_test.cpp b/src/test/consensus/Consensus_test.cpp index 1c19ff0708d..5c7dc2626fe 100644 --- a/src/test/consensus/Consensus_test.cpp +++ b/src/test/consensus/Consensus_test.cpp @@ -109,10 +109,15 @@ class Consensus_test : public beast::unit_test::suite ConsensusState::MovedOn == checkConsensus(10, 2, 1, 8, 3s, 10s, p, true, journal_)); - // No peers makes it easy to agree + // If no peers, don't agree until time has passed. BEAST_EXPECT( - ConsensusState::Yes == + ConsensusState::No == checkConsensus(0, 0, 0, 0, 3s, 10s, p, true, journal_)); + + // Agree if no peers and enough time has passed. + BEAST_EXPECT( + ConsensusState::Yes == + checkConsensus(0, 0, 0, 0, 3s, 16s, p, true, journal_)); } void