Skip to content

Commit

Permalink
Monitoring service api (sigp#2251)
Browse files Browse the repository at this point in the history
## Issue Addressed

N/A

## Proposed Changes

Adds a client side api for collecting system and process metrics and pushing it to a monitoring service.
  • Loading branch information
pawanjay176 committed May 26, 2021
1 parent 55aada0 commit fdaeec6
Show file tree
Hide file tree
Showing 30 changed files with 1,103 additions and 60 deletions.
24 changes: 24 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ members = [
"common/validator_dir",
"common/warp_utils",
"common/fallback",
"common/monitoring_api",

"consensus/cached_tree_hash",
"consensus/int_to_bytes",
Expand Down
1 change: 1 addition & 0 deletions beacon_node/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ hyper = "0.14.4"
lighthouse_version = { path = "../common/lighthouse_version" }
hex = "0.4.2"
slasher = { path = "../slasher" }
monitoring_api = { path = "../common/monitoring_api" }
sensitive_url = { path = "../common/sensitive_url" }
1 change: 1 addition & 0 deletions beacon_node/client/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,4 @@ http_api = { path = "../http_api" }
http_metrics = { path = "../http_metrics" }
slasher = { path = "../../slasher" }
slasher_service = { path = "../../slasher/service" }
monitoring_api = {path = "../../common/monitoring_api"}
17 changes: 17 additions & 0 deletions beacon_node/client/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use environment::RuntimeContext;
use eth1::{Config as Eth1Config, Service as Eth1Service};
use eth2_libp2p::NetworkGlobals;
use genesis::{interop_genesis_state, Eth1GenesisService};
use monitoring_api::{MonitoringHttpClient, ProcessType};
use network::{NetworkConfig, NetworkMessage, NetworkService};
use slasher::Slasher;
use slasher_service::SlasherService;
Expand Down Expand Up @@ -374,6 +375,22 @@ where
SlasherService::new(beacon_chain, network_send).run(&context.executor)
}

/// Start the explorer client which periodically sends beacon
/// and system metrics to the configured endpoint.
pub fn monitoring_client(self, config: &monitoring_api::Config) -> Result<Self, String> {
let context = self
.runtime_context
.as_ref()
.ok_or("monitoring_client requires a runtime_context")?
.service_context("monitoring_client".into());
let monitoring_client = MonitoringHttpClient::new(config, context.log().clone())?;
monitoring_client.auto_update(
context.executor,
vec![ProcessType::BeaconNode, ProcessType::System],
);
Ok(self)
}

/// Immediately starts the service that periodically logs information each slot.
pub fn notifier(self) -> Result<Self, String> {
let context = self
Expand Down
2 changes: 2 additions & 0 deletions beacon_node/client/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ pub struct Config {
pub eth1: eth1::Config,
pub http_api: http_api::Config,
pub http_metrics: http_metrics::Config,
pub monitoring_api: Option<monitoring_api::Config>,
pub slasher: Option<slasher::Config>,
}

Expand All @@ -87,6 +88,7 @@ impl Default for Config {
graffiti: Graffiti::default(),
http_api: <_>::default(),
http_metrics: <_>::default(),
monitoring_api: None,
slasher: None,
validator_monitor_auto: false,
validator_monitor_pubkeys: vec![],
Expand Down
10 changes: 10 additions & 0 deletions beacon_node/client/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ lazy_static! {
"sync_slots_per_second",
"The number of blocks being imported per second"
);

pub static ref IS_SYNCED: Result<IntGauge> = try_create_int_gauge(
"sync_eth2_synced",
"Metric to check if the beacon chain is synced to head. 0 if not synced and non-zero if synced"
);

pub static ref NOTIFIER_HEAD_SLOT: Result<IntGauge> = try_create_int_gauge(
"notifier_head_slot",
"The head slot sourced from the beacon chain notifier"
);
}
6 changes: 6 additions & 0 deletions beacon_node/client/src/notifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
};

let head_slot = head_info.slot;

metrics::set_gauge(&metrics::NOTIFIER_HEAD_SLOT, head_slot.as_u64() as i64);

let current_slot = match beacon_chain.slot() {
Ok(slot) => slot,
Err(e) => {
Expand Down Expand Up @@ -123,6 +126,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(

// Log if we are syncing
if sync_state.is_syncing() {
metrics::set_gauge(&metrics::IS_SYNCED, 0);
let distance = format!(
"{} slots ({})",
head_distance.as_u64(),
Expand Down Expand Up @@ -151,6 +155,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
);
}
} else if sync_state.is_synced() {
metrics::set_gauge(&metrics::IS_SYNCED, 1);
let block_info = if current_slot > head_slot {
" … empty".to_string()
} else {
Expand All @@ -167,6 +172,7 @@ pub fn spawn_notifier<T: BeaconChainTypes>(
"slot" => current_slot,
);
} else {
metrics::set_gauge(&metrics::IS_SYNCED, 0);
info!(
log,
"Searching for peers";
Expand Down
19 changes: 19 additions & 0 deletions beacon_node/eth1/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,23 @@ lazy_static! {
pub static ref ENDPOINT_REQUESTS: Result<IntCounterVec> = try_create_int_counter_vec(
"eth1_endpoint_requests", "The number of eth1 requests for each endpoint", &["endpoint"]
);

/*
* Eth1 rpc connection
*/

pub static ref ETH1_CONNECTED: Result<IntGauge> = try_create_int_gauge(
"sync_eth1_connected", "Set to 1 if connected to an eth1 node, otherwise set to 0"
);

pub static ref ETH1_FALLBACK_CONFIGURED: Result<IntGauge> = try_create_int_gauge(
"sync_eth1_fallback_configured", "Number of configured eth1 fallbacks"
);

// Note: This metric only checks if an eth1 fallback is configured, not if it is connected and synced.
// Checking for liveness of the fallback would require moving away from lazy checking of fallbacks.
pub static ref ETH1_FALLBACK_CONNECTED: Result<IntGauge> = try_create_int_gauge(
"eth1_sync_fallback_connected", "Set to 1 if an eth1 fallback is connected, otherwise set to 0"
);

}
13 changes: 13 additions & 0 deletions beacon_node/eth1/src/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,9 @@ impl EndpointsCache {
&crate::metrics::ENDPOINT_ERRORS,
&[&endpoint.0.to_string()],
);
crate::metrics::set_gauge(&metrics::ETH1_CONNECTED, 0);
} else {
crate::metrics::set_gauge(&metrics::ETH1_CONNECTED, 1);
}
state
}
Expand Down Expand Up @@ -730,13 +733,23 @@ impl Service {

let mut interval = interval_at(Instant::now(), update_interval);

let num_fallbacks = self.config().endpoints.len() - 1;
let update_future = async move {
loop {
interval.tick().await;
self.do_update(update_interval).await.ok();
}
};

// Set the number of configured eth1 servers
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONFIGURED, num_fallbacks as i64);
// Since we lazily update eth1 fallbacks, it's not possible to know connection status of fallback.
// Hence, we set it to 1 if we have atleast one configured fallback.
if num_fallbacks > 0 {
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONNECTED, 1);
} else {
metrics::set_gauge(&metrics::ETH1_FALLBACK_CONNECTED, 0);
}
handle.spawn(update_future, "eth1");
}

Expand Down
17 changes: 17 additions & 0 deletions beacon_node/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,23 @@ pub fn cli_app<'a, 'b>() -> App<'a, 'b> {
.takes_value(true),
)

/*
* Monitoring metrics
*/

.arg(
Arg::with_name("monitoring-endpoint")
.long("monitoring-endpoint")
.value_name("ADDRESS")
.help("Enables the monitoring service for sending system metrics to a remote endpoint. \
This can be used to monitor your setup on certain services (e.g. beaconcha.in). \
This flag sets the endpoint where the beacon node metrics will be sent. \
Note: This will send information to a remote sever which may identify and associate your \
validators, IP address and other personal information. Always use a HTTPS connection \
and never provide an untrusted URL.")
.takes_value(true),
)

/*
* Standard staking flags
*/
Expand Down
11 changes: 11 additions & 0 deletions beacon_node/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,17 @@ pub fn get_config<E: EthSpec>(
client_config.http_metrics.allow_origin = Some(allow_origin.to_string());
}

/*
* Explorer metrics
*/
if let Some(monitoring_endpoint) = cli_args.value_of("monitoring-endpoint") {
client_config.monitoring_api = Some(monitoring_api::Config {
db_path: None,
freezer_db_path: None,
monitoring_endpoint: monitoring_endpoint.to_string(),
});
}

// Log a warning indicating an open HTTP server if it wasn't specified explicitly
// (e.g. using the --staking flag).
if cli_args.is_present("staking") {
Expand Down
12 changes: 10 additions & 2 deletions beacon_node/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,14 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
let log = context.log().clone();
let datadir = client_config.create_data_dir()?;
let db_path = client_config.create_db_path()?;
let freezer_db_path_res = client_config.create_freezer_db_path();
let freezer_db_path = client_config.create_freezer_db_path()?;
let executor = context.executor.clone();

let builder = ClientBuilder::new(context.eth_spec_instance.clone())
.runtime_context(context)
.chain_spec(spec)
.http_api_config(client_config.http_api.clone())
.disk_store(&datadir, &db_path, &freezer_db_path_res?, store_config)?;
.disk_store(&datadir, &db_path, &freezer_db_path, store_config)?;

let builder = if let Some(slasher_config) = client_config.slasher.clone() {
let slasher = Arc::new(
Expand All @@ -82,6 +82,14 @@ impl<E: EthSpec> ProductionBeaconNode<E> {
builder
};

let builder = if let Some(monitoring_config) = &mut client_config.monitoring_api {
monitoring_config.db_path = Some(db_path);
monitoring_config.freezer_db_path = Some(freezer_db_path);
builder.monitoring_client(monitoring_config)?
} else {
builder
};

let builder = builder
.beacon_chain_builder(client_genesis, client_config.clone())
.await?;
Expand Down
2 changes: 1 addition & 1 deletion beacon_node/store/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ mod impls;
mod leveldb_store;
mod memory_store;
pub mod metadata;
mod metrics;
pub mod metrics;
mod partial_beacon_state;

pub mod iter;
Expand Down
43 changes: 31 additions & 12 deletions book/src/api-lighthouse.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,39 @@ curl -X GET "http://localhost:5052/lighthouse/health" -H "accept: application/j
```json
{
"data": {
"pid": 1728254,
"pid_num_threads": 47,
"pid_mem_resident_set_size": 510054400,
"pid_mem_virtual_memory_size": 3963158528,
"sys_virt_mem_total": 16715530240,
"sys_virt_mem_available": 4065374208,
"sys_virt_mem_used": 11383402496,
"sys_virt_mem_free": 1368662016,
"sys_virt_mem_percent": 75.67906,
"sys_loadavg_1": 4.92,
"sys_loadavg_5": 5.53,
"sys_loadavg_15": 5.58
"sys_virt_mem_total": 16671133696,
"sys_virt_mem_available": 8273715200,
"sys_virt_mem_used": 7304818688,
"sys_virt_mem_free": 2998190080,
"sys_virt_mem_percent": 50.37101,
"sys_virt_mem_cached": 5013975040,
"sys_virt_mem_buffers": 1354149888,
"sys_loadavg_1": 2.29,
"sys_loadavg_5": 3.48,
"sys_loadavg_15": 3.72,
"cpu_cores": 4,
"cpu_threads": 8,
"system_seconds_total": 5728,
"user_seconds_total": 33680,
"iowait_seconds_total": 873,
"idle_seconds_total": 177530,
"cpu_time_total": 217447,
"disk_node_bytes_total": 358443397120,
"disk_node_bytes_free": 70025089024,
"disk_node_reads_total": 1141863,
"disk_node_writes_total": 1377993,
"network_node_bytes_total_received": 2405639308,
"network_node_bytes_total_transmit": 328304685,
"misc_node_boot_ts_seconds": 1620629638,
"misc_os": "linux",
"pid": 4698,
"pid_num_threads": 25,
"pid_mem_resident_set_size": 783757312,
"pid_mem_virtual_memory_size": 2564665344,
"pid_process_seconds_total": 22
}
}

```

### `/lighthouse/syncing`
Expand Down
Loading

0 comments on commit fdaeec6

Please sign in to comment.