Skip to content

Commit

Permalink
Instead of abending, wait until server stabilizes after
Browse files Browse the repository at this point in the history
failing online delete healch check.
  • Loading branch information
mtrippled committed Apr 7, 2022
1 parent cdd37a2 commit f2143de
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 112 deletions.
15 changes: 4 additions & 11 deletions cfg/rippled-example.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -1140,17 +1140,10 @@
# The online delete process checks periodically
# that rippled is still in sync with the network,
# and that the validated ledger is less than
# 'age_threshold_seconds' old. By default, if it
# is not the online delete process aborts and
# tries again later. If 'recovery_wait_seconds'
# is set and rippled is out of sync, but likely to
# recover quickly, then online delete will wait
# this number of seconds for rippled to get back
# into sync before it aborts.
# Set this value if the node is otherwise staying
# in sync, or recovering quickly, but the online
# delete process is unable to finish.
# Default is unset.
# 'age_threshold_seconds' old. If not, then continue
# sleeping for this number of seconds and
# checking until healthy.
# Default is 5.
#
# Optional keys for Cassandra:
#
Expand Down
118 changes: 34 additions & 84 deletions src/ripple/app/misc/SHAMapStoreImp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ SHAMapStoreImp::SHAMapStoreImp(
if (get_if_exists(section, "age_threshold_seconds", temp))
ageThreshold_ = std::chrono::seconds{temp};
if (get_if_exists(section, "recovery_wait_seconds", temp))
recoveryWaitTime_.emplace(std::chrono::seconds{temp});
recoveryWaitTime_ = std::chrono::seconds{temp};

get_if_exists(section, "advisory_delete", advisoryDelete_);

Expand Down Expand Up @@ -268,7 +268,7 @@ SHAMapStoreImp::copyNode(std::uint64_t& nodeCount, SHAMapTreeNode const& node)
true);
if (!(++nodeCount % checkHealthInterval_))
{
if (health())
if (stopping())
return false;
}

Expand Down Expand Up @@ -326,7 +326,7 @@ SHAMapStoreImp::run()

bool const readyToRotate =
validatedSeq >= lastRotated + deleteInterval_ &&
canDelete_ >= lastRotated - 1 && !health();
canDelete_ >= lastRotated - 1 && !stopping();

// Make sure we don't delete ledgers currently being
// imported into the ShardStore
Expand Down Expand Up @@ -358,15 +358,8 @@ SHAMapStoreImp::run()
<< ledgerMaster_->getValidatedLedgerAge().count() << 's';

clearPrior(lastRotated);
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;

JLOG(journal_.debug()) << "copying ledger " << validatedSeq;
std::uint64_t nodeCount = 0;
Expand All @@ -375,30 +368,16 @@ SHAMapStoreImp::run()
this,
std::ref(nodeCount),
std::placeholders::_1));
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << "copied ledger " << validatedSeq
<< " nodecount " << nodeCount;

JLOG(journal_.debug()) << "freshening caches";
freshenCaches();
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;
// Only log if we completed without a "health" abort
JLOG(journal_.debug()) << validatedSeq << " freshened caches";

Expand All @@ -408,15 +387,8 @@ SHAMapStoreImp::run()
<< validatedSeq << " new backend " << newBackend->getName();

clearCaches(validatedSeq);
switch (health())
{
case Health::stopping:
return;
case Health::unhealthy:
continue;
case Health::ok:
default:;
}
if (stopping())
return;

lastRotated = validatedSeq;

Expand Down Expand Up @@ -580,7 +552,7 @@ SHAMapStoreImp::clearSql(
min = *m;
}

if (min > lastRotated || health() != Health::ok)
if (min > lastRotated || stopping())
return;
if (min == lastRotated)
{
Expand All @@ -601,11 +573,11 @@ SHAMapStoreImp::clearSql(
JLOG(journal_.trace())
<< "End: Delete up to " << deleteBatch_ << " rows with LedgerSeq < "
<< min << " from: " << TableName;
if (health())
if (stopping())
return;
if (min < lastRotated)
std::this_thread::sleep_for(backOff_);
if (health())
if (stopping())
return;
}
JLOG(journal_.debug()) << "finished deleting from: " << TableName;
Expand Down Expand Up @@ -645,7 +617,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
ledgerMaster_->clearPriorLedgers(lastRotated);
JLOG(journal_.trace()) << "End: Clear internal ledgers up to "
<< lastRotated;
if (health())
if (stopping())
return;

RelationalDBInterfaceSqlite* iface =
Expand All @@ -661,7 +633,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;

if (!app_.config().useTxTables())
Expand All @@ -676,7 +648,7 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;

clearSql(
Expand All @@ -688,52 +660,30 @@ SHAMapStoreImp::clearPrior(LedgerIndex lastRotated)
[&iface](LedgerIndex min) -> void {
iface->deleteAccountTransactionsBeforeLedgerSeq(min);
});
if (health())
if (stopping())
return;
}

SHAMapStoreImp::Health
SHAMapStoreImp::health()
bool
SHAMapStoreImp::stopping()
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
std::unique_lock lock(mutex_);
while (!stop_ && (mode != OperatingMode::FULL || age > ageThreshold_))
{
std::lock_guard lock(mutex_);
if (stop_)
return Health::stopping;
}
if (!netOPs_)
return Health::ok;
assert(deleteInterval_);

if (healthy_)
{
auto age = ledgerMaster_->getValidatedLedgerAge();
OperatingMode mode = netOPs_->getOperatingMode();
if (recoveryWaitTime_ && mode == OperatingMode::SYNCING &&
age < ageThreshold_)
{
JLOG(journal_.warn())
<< "Waiting " << recoveryWaitTime_->count()
<< "s for node to get back into sync with network. state: "
<< app_.getOPs().strOperatingMode(mode, false) << ". age "
<< age.count() << 's';
std::this_thread::sleep_for(*recoveryWaitTime_);

age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
}
if (mode != OperatingMode::FULL || age > ageThreshold_)
{
JLOG(journal_.warn()) << "Not deleting. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
healthy_ = false;
}
lock.unlock();
JLOG(journal_.warn()) << "Waiting " << recoveryWaitTime_.count()
<< "s for node to stabilize. state: "
<< app_.getOPs().strOperatingMode(mode, false)
<< ". age " << age.count() << 's';
std::this_thread::sleep_for(recoveryWaitTime_);
age = ledgerMaster_->getValidatedLedgerAge();
mode = netOPs_->getOperatingMode();
lock.lock();
}

if (healthy_)
return Health::ok;
else
return Health::unhealthy;
return stop_;
}

void
Expand Down
31 changes: 14 additions & 17 deletions src/ripple/app/misc/SHAMapStoreImp.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@ class NetworkOPs;
class SHAMapStoreImp : public SHAMapStore
{
private:
enum Health : std::uint8_t { ok = 0, stopping, unhealthy };

class SavedStateDB
{
public:
Expand Down Expand Up @@ -106,12 +104,12 @@ class SHAMapStoreImp : public SHAMapStore
std::uint32_t deleteBatch_ = 100;
std::chrono::milliseconds backOff_{100};
std::chrono::seconds ageThreshold_{60};
/// If set, and the node is out of sync during an
/// If the node is out of sync during an
/// online_delete health check, sleep the thread
/// for this time and check again so the node can
/// recover.
/// for this time, and continue checking until
/// recovery.
/// See also: "recovery_wait_seconds" in rippled-example.cfg
std::optional<std::chrono::seconds> recoveryWaitTime_;
std::chrono::seconds recoveryWaitTime_{5};

// these do not exist upon SHAMapStore creation, but do exist
// as of run() or before
Expand Down Expand Up @@ -201,7 +199,7 @@ class SHAMapStoreImp : public SHAMapStore
{
dbRotating_->fetchNodeObject(
key, 0, NodeStore::FetchType::synchronous, true);
if (!(++check % checkHealthInterval_) && health())
if (!(++check % checkHealthInterval_) && stopping())
return true;
}

Expand All @@ -225,16 +223,15 @@ class SHAMapStoreImp : public SHAMapStore
void
clearPrior(LedgerIndex lastRotated);

// If rippled is not healthy, defer rotate-delete.
// If already unhealthy, do not change state on further check.
// Assume that, once unhealthy, a necessary step has been
// aborted, so the online-delete process needs to restart
// at next ledger.
// If recoveryWaitTime_ is set, this may sleep to give rippled
// time to recover, so never call it from any thread other than
// the main "run()".
Health
health();
/**
* This is a health check for online deletion that waits until rippled is
* stable until returning. If the server is stopping, then it returns
* "true" to inform the caller to allow the server to stop.
*
* @return Whether the server is stopping.
*/
bool
stopping();

public:
void
Expand Down

0 comments on commit f2143de

Please sign in to comment.