Skip to content

Commit

Permalink
Work around a known block timeout bug by using a shorter timeout
Browse files Browse the repository at this point in the history
  • Loading branch information
teor2345 committed Oct 11, 2022
1 parent 7a926b1 commit 74b29be
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 8 deletions.
12 changes: 12 additions & 0 deletions zebrad/src/components/sync.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,17 @@ pub(super) const BLOCK_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
/// the state, so we allow double that time here.
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(30 * 60);

/// A shorter timeout used for the first few blocks after the final checkpoint.
///
/// This is a workaround for bug #5125, where the first fully validated blocks
/// after the final checkpoint fail with a timeout, due to a UTXO race condition.
const FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(5 * 60);

/// The number of blocks after the final checkpoint that get the shorter timeout.
///
/// We've only seen this error on the first few blocks after the final checkpoint.
const FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT_LIMIT: i32 = 100;

/// Controls how long we wait to restart syncing after finishing a sync run.
///
/// This delay should be long enough to:
Expand Down Expand Up @@ -388,6 +399,7 @@ where
checkpoint_verify_concurrency_limit,
full_verify_concurrency_limit,
),
max_checkpoint_height,
)),
state,
latest_chain_tip,
Expand Down
42 changes: 34 additions & 8 deletions zebrad/src/components/sync/downloads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@
use std::{
collections::HashMap,
convert::TryFrom,
convert::{self, TryFrom},
pin::Pin,
sync::Arc,
task::{Context, Poll},
};

use futures::{
future::TryFutureExt,
future::{FutureExt, TryFutureExt},
ready,
stream::{FuturesUnordered, Stream},
};
use pin_project::pin_project;
use thiserror::Error;
use tokio::{sync::oneshot, task::JoinHandle};
use tokio::{sync::oneshot, task::JoinHandle, time::timeout};
use tower::{hedge, Service, ServiceExt};
use tracing_futures::Instrument;

Expand All @@ -26,6 +26,10 @@ use zebra_chain::{
use zebra_network as zn;
use zebra_state as zs;

use crate::components::sync::{
FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT, FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT_LIMIT,
};

type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;

/// A multiplier used to calculate the extra number of blocks we allow in the
Expand Down Expand Up @@ -166,6 +170,9 @@ where
/// The configured lookahead limit, after applying the minimum limit.
lookahead_limit: usize,

/// The largest block height for the checkpoint verifier, based on the current config.
max_checkpoint_height: Height,

// Internal downloads state
/// A list of pending block download and verify tasks.
#[pin]
Expand Down Expand Up @@ -238,18 +245,28 @@ where
ZSTip: ChainTip + Clone + Send + 'static,
{
/// Initialize a new download stream with the provided `network` and
/// `verifier` services. Uses the `latest_chain_tip` and `lookahead_limit`
/// to drop blocks that are too far ahead of the current state tip.
/// `verifier` services.
///
/// Uses the `latest_chain_tip` and `lookahead_limit` to drop blocks
/// that are too far ahead of the current state tip.
/// Uses `max_checkpoint_height` to work around a known block timeout (#5125).
///
/// The [`Downloads`] stream is agnostic to the network policy, so retry and
/// timeout limits should be applied to the `network` service passed into
/// this constructor.
pub fn new(network: ZN, verifier: ZV, latest_chain_tip: ZSTip, lookahead_limit: usize) -> Self {
pub fn new(
network: ZN,
verifier: ZV,
latest_chain_tip: ZSTip,
lookahead_limit: usize,
max_checkpoint_height: Height,
) -> Self {
Self {
network,
verifier,
latest_chain_tip,
lookahead_limit,
max_checkpoint_height,
pending: FuturesUnordered::new(),
cancel_handles: HashMap::new(),
}
Expand Down Expand Up @@ -290,6 +307,7 @@ where
let mut verifier = self.verifier.clone();
let latest_chain_tip = self.latest_chain_tip.clone();
let lookahead_limit = self.lookahead_limit;
let max_checkpoint_height = self.max_checkpoint_height;

let task = tokio::spawn(
async move {
Expand Down Expand Up @@ -418,9 +436,17 @@ where
};

// Verify the block.
let rsp = verifier
let mut rsp = verifier
.map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })?
.call(block);
.call(block).boxed();

// Add a shorter timeout to workaround a known bug (#5125)
let short_timeout_max = (max_checkpoint_height + FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT_LIMIT).expect("checkpoint block height is in valid range");
if block_height >= max_checkpoint_height && block_height <= short_timeout_max {
rsp = timeout(FINAL_CHECKPOINT_BLOCK_VERIFY_TIMEOUT, rsp)
.map_err(|timeout| format!("initial fully verified block timed out: retrying: {:?}", timeout).into())
.map(|nested_result| nested_result.and_then(convert::identity)).boxed();
}

let verification = tokio::select! {
biased;
Expand Down

0 comments on commit 74b29be

Please sign in to comment.