sigp · realbigsean · May 6, 2024 · May 3, 2024 · May 3, 2024 · May 6, 2024
diff --git a/beacon_node/http_api/src/metrics.rs b/beacon_node/http_api/src/metrics.rs
@@ -31,7 +31,13 @@ lazy_static::lazy_static! {
     );
     pub static ref HTTP_API_BLOCK_BROADCAST_DELAY_TIMES: Result<HistogramVec> = try_create_histogram_vec(
         "http_api_block_broadcast_delay_times",
-        "Time between start of the slot and when the block was broadcast",
+        "Time between start of the slot and when the block completed broadcast and processing",
+        &["provenance"]
+    );
+    pub static ref HTTP_API_BLOCK_GOSSIP_TIMES: Result<HistogramVec> = try_create_histogram_vec_with_buckets(
+        "http_api_block_gossip_times",
+        "Time between receiving the block on HTTP and publishing it on gossip",
+        decimal_buckets(-2, 2),
         &["provenance"]
     );
     pub static ref HTTP_API_BLOCK_PUBLISHED_LATE_TOTAL: Result<IntCounter> = try_create_int_counter(

diff --git a/beacon_node/http_api/src/publish_blocks.rs b/beacon_node/http_api/src/publish_blocks.rs
@@ -60,6 +60,11 @@ pub async fn publish_block<T: BeaconChainTypes, B: IntoGossipVerifiedBlockConten
         ProvenancedBlock::Local(block_contents, _) => (block_contents, true),
         ProvenancedBlock::Builder(block_contents, _) => (block_contents, false),
     };
+    let provenance = if is_locally_built_block {
+        "local"
+    } else {
+        "builder"
+    };
     let block = block_contents.inner_block().clone();
     let delay = get_block_delay_ms(seen_timestamp, block.message(), &chain.slot_clock);
     debug!(log, "Signed block received in HTTP API"; "slot" => block.slot());
@@ -75,7 +80,18 @@ pub async fn publish_block<T: BeaconChainTypes, B: IntoGossipVerifiedBlockConten
             .checked_sub(seen_timestamp)
             .unwrap_or_else(|| Duration::from_secs(0));
 
-        info!(log, "Signed block published to network via HTTP API"; "slot" => block.slot(), "publish_delay" => ?publish_delay);
+        metrics::observe_timer_vec(
+            &metrics::HTTP_API_BLOCK_GOSSIP_TIMES,
+            &[provenance],
+            publish_delay,
+        );
+
+        info!(
+            log,
+            "Signed block published to network via HTTP API";
+            "slot" => block.slot(),
+            "publish_delay_ms" => publish_delay.as_millis()
+        );
 
         match block.as_ref() {
             SignedBeaconBlock::Base(_)

diff --git a/beacon_node/lighthouse_network/src/rpc/handler.rs b/beacon_node/lighthouse_network/src/rpc/handler.rs
@@ -352,6 +352,31 @@ where
         !matches!(self.state, HandlerState::Deactivated)
     }
 
+    // NOTE: This function gets polled to completion upon a connection close.
+    fn poll_close(&mut self, _: &mut Context<'_>) -> Poll<Option<Self::ToBehaviour>> {
+        // Inform the network behaviour of any failed requests
+
+        while let Some(substream_id) = self.outbound_substreams.keys().next().cloned() {
+            let outbound_info = self
+                .outbound_substreams
+                .remove(&substream_id)
+                .expect("The value must exist for a key");
+            // If the state of the substream is closing, we do not need to report this case to
+            // the behaviour, as the connection has just closed non-gracefully
+            if matches!(outbound_info.state, OutboundSubstreamState::Closing(_)) {
+                continue;
+            }
+
+            // Register this request as an RPC Error
+            return Poll::Ready(Some(HandlerEvent::Err(HandlerErr::Outbound {
+                error: RPCError::Disconnected,
+                proto: outbound_info.proto,
+                id: outbound_info.req_id,
+            })));
+        }
+        Poll::Ready(None)
+    }
+
     fn poll(
         &mut self,
         cx: &mut Context<'_>,

diff --git a/beacon_node/lighthouse_network/src/service/mod.rs b/beacon_node/lighthouse_network/src/service/mod.rs
@@ -972,6 +972,12 @@ impl<AppReqId: ReqId, E: EthSpec> Network<AppReqId, E> {
             .goodbye_peer(peer_id, reason, source);
     }
 
+    /// Hard (ungraceful) disconnect for testing purposes only
+    /// Use goodbye_peer for disconnections, do not use this function.
+    pub fn __hard_disconnect_testing_only(&mut self, peer_id: PeerId) {
+        let _ = self.swarm.disconnect_peer_id(peer_id);
+    }
+
     /// Returns an iterator over all enr entries in the DHT.
     pub fn enr_entries(&self) -> Vec<Enr> {
         self.discovery().table_entries_enr()
@@ -1373,12 +1379,18 @@ impl<AppReqId: ReqId, E: EthSpec> Network<AppReqId, E> {
         let peer_id = event.peer_id;
 
         if !self.peer_manager().is_connected(&peer_id) {
-            debug!(
-                self.log,
-                "Ignoring rpc message of disconnecting peer";
-                event
-            );
-            return None;
+            // Sync expects an RPCError::Disconnected to drop associated lookups with this peer.
+            // Silencing this event breaks the API contract with RPC where every request ends with
+            // - A stream termination event, or
+            // - An RPCError event
+            if !matches!(event.event, HandlerEvent::Err(HandlerErr::Outbound { .. })) {
+                debug!(
+                    self.log,
+                    "Ignoring rpc message of disconnecting peer";
+                    event
+                );
+                return None;
+            }
         }
 
         let handler_id = event.conn_id;

diff --git a/beacon_node/lighthouse_network/tests/rpc_tests.rs b/beacon_node/lighthouse_network/tests/rpc_tests.rs
@@ -3,7 +3,7 @@
 mod common;
 
 use common::Protocol;
-use lighthouse_network::rpc::methods::*;
+use lighthouse_network::rpc::{methods::*, RPCError};
 use lighthouse_network::{rpc::max_rpc_size, NetworkEvent, ReportSource, Request, Response};
 use slog::{debug, warn, Level};
 use ssz::Encode;
@@ -996,6 +996,96 @@ fn test_tcp_blocks_by_root_chunked_rpc_terminates_correctly() {
     })
 }
 
+#[test]
+fn test_disconnect_triggers_rpc_error() {
+    // Set up the logging: the log level and whether logging is enabled
+    let log_level = Level::Debug;
+    let enable_logging = false;
+
+    let log = common::build_log(log_level, enable_logging);
+    let spec = E::default_spec();
+
+    let rt = Arc::new(Runtime::new().unwrap());
+    // get sender/receiver
+    rt.block_on(async {
+        let (mut sender, mut receiver) = common::build_node_pair(
+            Arc::downgrade(&rt),
+            &log,
+            ForkName::Base,
+            &spec,
+            Protocol::Tcp,
+        )
+        .await;
+
+        // BlocksByRoot Request
+        let rpc_request = Request::BlocksByRoot(BlocksByRootRequest::new(
+            // Must have at least one root for the request to create a stream
+            vec![Hash256::from_low_u64_be(0)],
+            &spec,
+        ));
+
+        // build the sender future
+        let sender_future = async {
+            loop {
+                match sender.next_event().await {
+                    NetworkEvent::PeerConnectedOutgoing(peer_id) => {
+                        // Send the BlocksByRoot request
+                        debug!(log, "Sending RPC");
+                        sender.send_request(peer_id, 42, rpc_request.clone());
+                    }
+                    NetworkEvent::RPCFailed { error, id: 42, .. } => match error {
+                        RPCError::Disconnected => return,
+                        other => panic!("received unexpected error {:?}", other),
+                    },
+                    other => {
+                        warn!(log, "Ignoring other event {:?}", other);
+                    }
+                }
+            }
+        };
+
+        // Peer the receiver should hard-disconnect. Set when a request from it is received and
+        // taken (cleared) once the disconnect has been issued
+        let mut sending_peer = None;
+        let receiver_future = async {
+            loop {
+                // Wait for the next network event, but for at most one second, before checking
+                // below whether a peer needs to be disconnected
+                match futures::future::select(
+                    Box::pin(receiver.next_event()),
+                    Box::pin(tokio::time::sleep(Duration::from_secs(1))),
+                )
+                .await
+                {
+                    futures::future::Either::Left((ev, _)) => match ev {
+                        NetworkEvent::RequestReceived { peer_id, .. } => {
+                            sending_peer = Some(peer_id);
+                        }
+                        other => {
+                            warn!(log, "Ignoring other event {:?}", other);
+                        }
+                    },
+                    futures::future::Either::Right((_, _)) => {} // The timeout hit, send messages if required
+                }
+
+                // If a request was received, hard-disconnect the requesting peer here
+                if let Some(peer_id) = sending_peer.take() {
+                    warn!(log, "Receiver got request, disconnecting peer");
+                    receiver.__hard_disconnect_testing_only(peer_id);
+                }
+            }
+        };
+
+        tokio::select! {
+            _ = sender_future => {}
+            _ = receiver_future => {}
+            _ = sleep(Duration::from_secs(30)) => {
+                panic!("Future timed out");
+            }
+        }
+    })
+}
+
 /// Establishes a pair of nodes and disconnects the pair based on the selected protocol via an RPC
 /// Goodbye message.
 fn goodbye_test(log_level: Level, enable_logging: bool, protocol: Protocol) {

diff --git a/beacon_node/network/src/sync/backfill_sync/mod.rs b/beacon_node/network/src/sync/backfill_sync/mod.rs
@@ -307,49 +307,15 @@ impl<T: BeaconChainTypes> BackFillSync<T> {
     /// A peer has disconnected.
     /// If the peer has active batches, those are considered failed and re-requested.
     #[must_use = "A failure here indicates the backfill sync has failed and the global sync state should be updated"]
-    pub fn peer_disconnected(
-        &mut self,
-        peer_id: &PeerId,
-        network: &mut SyncNetworkContext<T>,
-    ) -> Result<(), BackFillError> {
+    pub fn peer_disconnected(&mut self, peer_id: &PeerId) -> Result<(), BackFillError> {
         if matches!(
             self.state(),
             BackFillState::Failed | BackFillState::NotRequired
         ) {
             return Ok(());
         }
 
-        if let Some(batch_ids) = self.active_requests.remove(peer_id) {
-            // fail the batches
-            for id in batch_ids {
-                if let Some(batch) = self.batches.get_mut(&id) {
-                    match batch.download_failed(false) {
-                        Ok(BatchOperationOutcome::Failed { blacklist: _ }) => {
-                            self.fail_sync(BackFillError::BatchDownloadFailed(id))?;
-                        }
-                        Ok(BatchOperationOutcome::Continue) => {}
-                        Err(e) => {
-                            self.fail_sync(BackFillError::BatchInvalidState(id, e.0))?;
-                        }
-                    }
-                    // If we have run out of peers in which to retry this batch, the backfill state
-                    // transitions to a paused state.
-                    // We still need to reset the state for all the affected batches, so we should not
-                    // short circuit early
-                    if self.retry_batch_download(network, id).is_err() {
-                        debug!(
-                            self.log,
-                            "Batch could not be retried";
-                            "batch_id" => id,
-                            "error" => "no synced peers"
-                        );
-                    }
-                } else {
-                    debug!(self.log, "Batch not found while removing peer";
-                        "peer" => %peer_id, "batch" => id)
-                }
-            }
-        }
+        self.active_requests.remove(peer_id);
 
         // Remove the peer from the participation list
         self.participating_peers.remove(peer_id);

diff --git a/beacon_node/network/src/sync/block_lookups/common.rs b/beacon_node/network/src/sync/block_lookups/common.rs
@@ -1,9 +1,7 @@
 use crate::sync::block_lookups::single_block_lookup::{
     LookupRequestError, SingleBlockLookup, SingleLookupRequestState,
 };
-use crate::sync::block_lookups::{
-    BlobRequestState, BlockRequestState, PeerId, SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS,
-};
+use crate::sync::block_lookups::{BlobRequestState, BlockRequestState, PeerId};
 use crate::sync::manager::{BlockProcessType, Id, SLOT_IMPORT_TOLERANCE};
 use crate::sync::network_context::SyncNetworkContext;
 use beacon_chain::block_verification_types::RpcBlock;
@@ -26,11 +24,6 @@ pub enum ResponseType {
 /// is further back than the most recent head slot.
 pub(crate) const PARENT_DEPTH_TOLERANCE: usize = SLOT_IMPORT_TOLERANCE * 2;
 
-/// Wrapper around bool to prevent mixing this argument with `BlockIsProcessed`
-pub(crate) struct AwaitingParent(pub bool);
-/// Wrapper around bool to prevent mixing this argument with `AwaitingParent`
-pub(crate) struct BlockIsProcessed(pub bool);
-
 /// This trait unifies common single block lookup functionality across blocks and blobs. This
 /// includes making requests, verifying responses, and handling processing results. A
 /// `SingleBlockLookup` includes both a `BlockRequestState` and a `BlobRequestState`, this trait is
@@ -43,52 +36,6 @@ pub trait RequestState<T: BeaconChainTypes> {
     /// The type created after validation.
     type VerifiedResponseType: Clone;
 
-    /// Potentially makes progress on this request if it's in a progress-able state
-    fn continue_request(
-        &mut self,
-        id: Id,
-        awaiting_parent: AwaitingParent,
-        downloaded_block_expected_blobs: Option<usize>,
-        block_is_processed: BlockIsProcessed,
-        cx: &mut SyncNetworkContext<T>,
-    ) -> Result<(), LookupRequestError> {
-        // Attempt to progress awaiting downloads
-        if self.get_state().is_awaiting_download() {
-            // Verify the current request has not exceeded the maximum number of attempts.
-            let request_state = self.get_state();
-            if request_state.failed_attempts() >= SINGLE_BLOCK_LOOKUP_MAX_ATTEMPTS {
-                let cannot_process = request_state.more_failed_processing_attempts();
-                return Err(LookupRequestError::TooManyAttempts { cannot_process });
-            }
-
-            let peer_id = self
-                .get_state_mut()
-                .use_rand_available_peer()
-                .ok_or(LookupRequestError::NoPeers)?;
-
-            // make_request returns true only if a request needs to be made
-            if self.make_request(id, peer_id, downloaded_block_expected_blobs, cx)? {
-                self.get_state_mut().on_download_start()?;
-            } else {
-                self.get_state_mut().on_completed_request()?;
-            }
-
-        // Otherwise, attempt to progress awaiting processing
-        // If this request is awaiting a parent lookup to be processed, do not send for processing.
-        // The request will be rejected with unknown parent error.
-        } else if !awaiting_parent.0
-            && (block_is_processed.0 || matches!(Self::response_type(), ResponseType::Block))
-        {
-            // maybe_start_processing returns Some if state == AwaitingProcess. This pattern is
-            // useful to conditionally access the result data.
-            if let Some(result) = self.get_state_mut().maybe_start_processing() {
-                return Self::send_for_processing(id, result, cx);
-            }
-        }
-
-        Ok(())
-    }
-
     /// Request the network context to prepare a request of a component of `block_root`. If the
     /// request is not necessary because the component is already known / processed, return false.
     /// Return true if it sent a request and we can expect an event back from the network.