diff --git a/rs/consensus/dkg/src/lib.rs b/rs/consensus/dkg/src/lib.rs index 7a48cb98eef7..569980635f28 100644 --- a/rs/consensus/dkg/src/lib.rs +++ b/rs/consensus/dkg/src/lib.rs @@ -12,10 +12,7 @@ use ic_interfaces::{ use ic_interfaces_registry::RegistryClient; use ic_interfaces_state_manager::StateReader; use ic_logger::{ReplicaLogger, error, info}; -use ic_metrics::{ - MetricsRegistry, - buckets::{decimal_buckets, linear_buckets}, -}; +use ic_metrics::MetricsRegistry; use ic_replicated_state::ReplicatedState; use ic_types::{ Height, NodeId, ReplicaVersion, SubnetId, @@ -25,7 +22,6 @@ use ic_types::{ threshold_sig::ni_dkg::{NiDkgId, NiDkgTargetSubnet, config::NiDkgConfig}, }, }; -use prometheus::Histogram; use rayon::prelude::*; use std::{ collections::BTreeMap, @@ -33,12 +29,14 @@ use std::{ }; pub mod dkg_key_manager; +pub mod metrics; pub mod payload_builder; pub mod payload_validator; pub(crate) mod remote; use crate::remote::{build_callback_id_config_map, merge_configs}; pub use crate::utils::get_vetkey_public_keys; +use metrics::DkgClientMetrics; #[cfg(test)] mod test_utils; @@ -61,11 +59,6 @@ const MAX_REMOTE_DKG_ATTEMPTS: u32 = 5; /// Generic error string for failed remote DKG requests. const REMOTE_DKG_REPEATED_FAILURE_ERROR: &str = "Attempts to run this DKG repeatedly failed"; -struct Metrics { - on_state_change_duration: Histogram, - on_state_change_processed: Histogram, -} - /// `DkgImpl` is responsible for holding DKG dependencies and for responding to /// changes in the consensus and DKG pool. pub struct DkgImpl { @@ -77,7 +70,7 @@ pub struct DkgImpl { consensus_cache: Arc, dkg_key_manager: Arc>, logger: ReplicaLogger, - metrics: Metrics, + metrics: DkgClientMetrics, } impl DkgImpl { @@ -102,21 +95,7 @@ impl DkgImpl { consensus_cache, dkg_key_manager, logger, - metrics: Metrics { - on_state_change_duration: metrics_registry.histogram( - "consensus_dkg_on_state_change_duration_seconds", - "The time it took to execute on_state_change(), in seconds", - // 0.1ms, 0.2ms, 0.5ms, 1ms, 2ms, 5ms, 10ms, 20ms, 50ms, 100ms, 200ms, 500ms, - // 1s, 2s, 5s, 10s, 20s, 50s, 100s, 200s, 500s - decimal_buckets(-4, 2), - ), - on_state_change_processed: metrics_registry.histogram( - "consensus_dkg_on_state_change_processed", - "Number of entries processed by on_state_change()", - // 0 - 100 - linear_buckets(0.0, 1.0, 100), - ), - }, + metrics: DkgClientMetrics::new(metrics_registry), } } @@ -2040,6 +2019,7 @@ mod tests { &parent, callback_id_map, &no_op_logger(), + None, ) .unwrap(); diff --git a/rs/consensus/dkg/src/metrics.rs b/rs/consensus/dkg/src/metrics.rs new file mode 100644 index 000000000000..81fa62d3f266 --- /dev/null +++ b/rs/consensus/dkg/src/metrics.rs @@ -0,0 +1,136 @@ +use ic_metrics::{ + MetricsRegistry, + buckets::{decimal_buckets, linear_buckets}, +}; +use ic_types::{ + consensus::BlockPayload, + crypto::threshold_sig::ni_dkg::{NiDkgTag, NiDkgTargetSubnet}, +}; +use prometheus::{Histogram, HistogramTimer, HistogramVec, IntCounterVec}; +use std::collections::BTreeMap; + +pub(crate) struct DkgClientMetrics { + pub(crate) on_state_change_duration: Histogram, + pub(crate) on_state_change_processed: Histogram, +} + +impl DkgClientMetrics { + pub(crate) fn new(metrics_registry: MetricsRegistry) -> Self { + Self { + on_state_change_duration: metrics_registry.histogram( + "consensus_dkg_on_state_change_duration_seconds", + "The time it took to execute on_state_change(), in seconds", + // 0.1ms, 0.2ms, 0.5ms, 1ms, 2ms, 5ms, 10ms, 20ms, 50ms, 100ms, 200ms, 500ms, + // 1s, 2s, 5s, 10s, 20s, 50s, 100s, 200s, 500s + decimal_buckets(-4, 2), + ), + on_state_change_processed: metrics_registry.histogram( + "consensus_dkg_on_state_change_processed", + "Number of entries processed by on_state_change()", + // 0 - 100 + linear_buckets(0.0, 1.0, 100), + ), + } + } +} + +pub struct DkgPayloadMetrics { + payload_errors: IntCounterVec, + pub(crate) payload_duration: HistogramVec, +} + +impl DkgPayloadMetrics { + pub fn new(metrics_registry: MetricsRegistry) -> Self { + Self { + payload_errors: metrics_registry.int_counter_vec( + "dkg_payload_errors", + "DKG payload related errors", + &["type"], + ), + payload_duration: metrics_registry.histogram_vec( + "dkg_payload_creation_seconds", + "Time taken to create a DKG payload, in seconds", + decimal_buckets(-4, 2), + &["type"], + ), + } + } + + pub fn payload_errors_inc(&self, label: &str) { + self.payload_errors.with_label_values(&[label]).inc(); + } + + pub(crate) fn payload_creation_timer(&self, label: &str) -> HistogramTimer { + self.payload_duration + .with_label_values(&[label]) + .start_timer() + } +} + +pub(crate) trait DkgPayloadMetricsOptionExt { + fn payload_errors_inc(self, label: &str); + fn payload_creation_timer(self, label: &str) -> Option; +} + +impl DkgPayloadMetricsOptionExt for Option<&DkgPayloadMetrics> { + fn payload_errors_inc(self, label: &str) { + if let Some(metrics) = self { + metrics.payload_errors_inc(label); + } + } + + fn payload_creation_timer(self, label: &str) -> Option { + self.map(|metrics| metrics.payload_creation_timer(label)) + } +} + +pub struct DkgPayloadStats { + pub remote_dkg_attempts_map_size: Option, + pub remote_dkg_attempts_map_sum: Option, + pub dealings_included: BTreeMap<(NiDkgTag, NiDkgTargetSubnet), usize>, + pub remote_transcripts_delivered: BTreeMap, +} + +impl From<&BlockPayload> for DkgPayloadStats { + fn from(payload: &BlockPayload) -> Self { + let mut dealings_included = BTreeMap::new(); + let mut remote_transcripts_delivered = BTreeMap::new(); + let (remote_dkg_attempts_map_size, remote_dkg_attempts_map_sum) = match payload { + BlockPayload::Summary(summary_payload) => ( + Some(summary_payload.dkg.initial_dkg_attempts.len()), + Some( + summary_payload + .dkg + .initial_dkg_attempts + .values() + .map(|attempts| *attempts as u64) + .sum(), + ), + ), + BlockPayload::Data(data_payload) => { + for message in &data_payload.dkg.messages { + let dkg_id = &message.content.dkg_id; + dealings_included + .entry((dkg_id.dkg_tag.clone(), dkg_id.target_subnet)) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + for (dkg_id, _, _) in &data_payload.dkg.transcripts_for_remote_subnets { + remote_transcripts_delivered + .entry(dkg_id.dkg_tag.clone()) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + (None, None) + } + }; + Self { + remote_dkg_attempts_map_size, + remote_dkg_attempts_map_sum, + dealings_included, + remote_transcripts_delivered, + } + } +} diff --git a/rs/consensus/dkg/src/payload_builder.rs b/rs/consensus/dkg/src/payload_builder.rs index 546efc02dce7..1ec735798198 100644 --- a/rs/consensus/dkg/src/payload_builder.rs +++ b/rs/consensus/dkg/src/payload_builder.rs @@ -1,5 +1,6 @@ use crate::{ MAX_EARLY_REMOTE_TRANSCRIPTS, MAX_REMOTE_DKGS_PER_INTERVAL, + metrics::{DkgPayloadMetrics, DkgPayloadMetricsOptionExt}, remote::{ ConfigResult, build_callback_id_config_map, get_updated_remote_dkg_attempts, merge_configs, }, @@ -56,6 +57,7 @@ pub fn create_payload( validation_context: &ValidationContext, logger: ReplicaLogger, max_dealings_per_block: usize, + dkg_payload_metrics: Option<&DkgPayloadMetrics>, ) -> Result { let height = parent.height.increment(); // Get the last summary from the chain. @@ -78,6 +80,7 @@ pub fn create_payload( state_reader, validation_context, logger, + dkg_payload_metrics, ) .map(DkgPayload::Summary) } else { @@ -96,11 +99,13 @@ pub fn create_payload( state_reader, validation_context, logger, + dkg_payload_metrics, ) .map(DkgPayload::Data) } } +#[allow(clippy::too_many_arguments)] fn create_data_payload( this_subnet_id: SubnetId, registry_client: &dyn RegistryClient, @@ -114,7 +119,10 @@ fn create_data_payload( state_reader: &dyn StateReader, validation_context: &ValidationContext, logger: ReplicaLogger, + dkg_payload_metrics: Option<&DkgPayloadMetrics>, ) -> Result { + let _timer = dkg_payload_metrics.payload_creation_timer("data"); + // Get all existing dealer ids from the chain. let dealers_from_chain = utils::get_dealers_from_chain(pool_reader, parent); @@ -148,6 +156,7 @@ fn create_data_payload( parent, remote_config_results, &logger, + dkg_payload_metrics, )?; if !remote_dkg_transcripts.is_empty() { @@ -174,6 +183,7 @@ pub(crate) fn create_early_remote_transcripts( parent: &Block, callback_id_map: BTreeMap, logger: &ReplicaLogger, + dkg_payload_metrics: Option<&DkgPayloadMetrics>, ) -> Result)>, DkgPayloadCreationError> { // Since this function is relatively expensive, we simply return if there are no outstanding DKG contexts if callback_id_map.is_empty() { @@ -194,6 +204,7 @@ pub(crate) fn create_early_remote_transcripts( for (dkg_id, err) in errs { // Skip requests for which we already have a transcript on chain. if !completed_dkgs.contains(&dkg_id) { + dkg_payload_metrics.payload_errors_inc("remote_config_creation_failed"); error!( logger, "Failed to create remote transcript config for dkg id {:?} at height {}: {}", @@ -230,6 +241,8 @@ pub(crate) fn create_early_remote_transcripts( // For each config, try to build the necessary (dkg_id, callback_id, transcript_result) triple for config in configs.iter() { let dealings = all_dealings.remove(config.dkg_id()).unwrap_or_else(|| { + dkg_payload_metrics + .payload_errors_inc("remote_dealings_missing_after_capacity_check"); error!( logger, "We checked that all configs have enough dealings above. This is a bug." @@ -248,6 +261,7 @@ pub(crate) fn create_early_remote_transcripts( // Note that we handled the reproducible error case of not having enough dealings // already beforehand. Err(err) if err.is_reproducible() => { + dkg_payload_metrics.payload_errors_inc("remote_transcript_reproducible_error"); // Including the error in the payload will cause the context to receive // a reject response. let error_message = format!( @@ -260,6 +274,7 @@ pub(crate) fn create_early_remote_transcripts( Err(error_message) } Err(err) => { + dkg_payload_metrics.payload_errors_inc("remote_transcript_transient_error"); // Return on transient crypto errors return Err(DkgPayloadCreationError::DkgCreateTranscriptError(err)); } @@ -408,7 +423,10 @@ pub(super) fn create_summary_payload( state_reader: &dyn StateReader, validation_context: &ValidationContext, logger: ReplicaLogger, + dkg_payload_metrics: Option<&DkgPayloadMetrics>, ) -> Result { + let _timer = dkg_payload_metrics.payload_creation_timer("summary"); + let (mut all_dealings, completed_dkgs) = utils::get_dkg_dealings(pool_reader, parent); let mut next_transcripts = BTreeMap::new(); // Try to create transcripts from the last round. @@ -431,12 +449,14 @@ pub(super) fn create_summary_payload( } } Err(err) if err.is_reproducible() => { + dkg_payload_metrics.payload_errors_inc("summary_transcript_reproducible_error"); warn!( logger, "Failed to create transcript for dkg id {:?}: {:?}", dkg_id, err ); } Err(err) => { + dkg_payload_metrics.payload_errors_inc("summary_transcript_transient_error"); return Err(DkgPayloadCreationError::DkgCreateTranscriptError(err)); } }; @@ -1263,6 +1283,7 @@ mod tests { time: UNIX_EPOCH, }, no_op_logger(), + None, ) .unwrap() }; diff --git a/rs/consensus/dkg/src/payload_validator.rs b/rs/consensus/dkg/src/payload_validator.rs index 2f3848aa710f..9192df239247 100644 --- a/rs/consensus/dkg/src/payload_validator.rs +++ b/rs/consensus/dkg/src/payload_validator.rs @@ -72,6 +72,7 @@ pub fn validate_payload( state_reader, validation_context, log.clone(), + None, )?; if summary_payload.dkg != expected_summary { return Err(InvalidDkgPayloadReason::MismatchedDkgSummary( @@ -230,6 +231,7 @@ fn validate_dealings_payload( parent, remote_config_results, log, + None, )?; if dealings.transcripts_for_remote_subnets != expected_transcripts { diff --git a/rs/consensus/idkg/src/metrics.rs b/rs/consensus/idkg/src/metrics.rs index c3012e45e0a9..80582be96c82 100644 --- a/rs/consensus/idkg/src/metrics.rs +++ b/rs/consensus/idkg/src/metrics.rs @@ -228,7 +228,7 @@ impl IDkgPayloadMetrics { .inc(); } - pub(crate) fn payload_errors_inc(&self, label: &str) { + pub fn payload_errors_inc(&self, label: &str) { self.payload_errors.with_label_values(&[label]).inc(); } } diff --git a/rs/consensus/src/consensus/block_maker.rs b/rs/consensus/src/consensus/block_maker.rs index c1e9cc42c0d3..ad75c90b5c54 100755 --- a/rs/consensus/src/consensus/block_maker.rs +++ b/rs/consensus/src/consensus/block_maker.rs @@ -4,7 +4,9 @@ use crate::consensus::{ metrics::BlockMakerMetrics, status::{self, Status}, }; -use ic_consensus_dkg::payload_builder::create_payload as create_dkg_payload; +use ic_consensus_dkg::{ + metrics::DkgPayloadMetrics, payload_builder::create_payload as create_dkg_payload, +}; use ic_consensus_idkg::{self as idkg, metrics::IDkgPayloadMetrics}; use ic_consensus_utils::{ find_lowest_ranked_non_disqualified_proposals, get_notarization_delay_settings, @@ -74,6 +76,7 @@ pub(crate) struct BlockMaker { idkg_pool: Arc>, state_reader: Arc>, metrics: BlockMakerMetrics, + dkg_payload_metrics: DkgPayloadMetrics, idkg_payload_metrics: IDkgPayloadMetrics, log: ReplicaLogger, // The minimal age of the registry version we want to use for the validation context of a new @@ -111,6 +114,7 @@ impl BlockMaker { state_reader, log, metrics: BlockMakerMetrics::new(metrics_registry.clone()), + dkg_payload_metrics: DkgPayloadMetrics::new(metrics_registry.clone()), idkg_payload_metrics: IDkgPayloadMetrics::new(metrics_registry), stable_registry_version_age, } @@ -306,8 +310,13 @@ impl BlockMaker { &context, self.log.clone(), max_dealings_per_block, + Some(&self.dkg_payload_metrics), ) - .map_err(|err| warn!(self.log, "Payload construction has failed: {:?}", err)) + .map_err(|err| { + warn!(self.log, "Payload construction has failed: {:?}", err); + self.dkg_payload_metrics + .payload_errors_inc("create_data_payload"); + }) .ok()?; let payload = Payload::new( @@ -325,9 +334,12 @@ impl BlockMaker { Some(&self.idkg_payload_metrics), &self.log, ) - .map_err(|err| warn!(self.log, "Payload construction has failed: {:?}", err)) - .ok() - .flatten(); + .map_err(|err| { + warn!(self.log, "Payload construction has failed: {:?}", err); + self.idkg_payload_metrics + .payload_errors_inc("create_summary_payload"); + }) + .ok()?; BlockPayload::Summary(SummaryPayload { dkg: summary, @@ -373,10 +385,11 @@ impl BlockMaker { &self.log, ) .map_err(|err| { - warn!(self.log, "Payload construction has failed: {:?}", err) + warn!(self.log, "Payload construction has failed: {:?}", err); + self.idkg_payload_metrics + .payload_errors_inc("create_data_payload"); }) - .ok() - .flatten(); + .ok()?; (batch_payload, dkg, idkg_data) } diff --git a/rs/consensus/src/consensus/metrics.rs b/rs/consensus/src/consensus/metrics.rs index ff2c4e39df54..5cc42dc3d328 100644 --- a/rs/consensus/src/consensus/metrics.rs +++ b/rs/consensus/src/consensus/metrics.rs @@ -1,3 +1,4 @@ +use ic_consensus_dkg::metrics::DkgPayloadStats; use ic_consensus_idkg::{ metrics::{CounterPerMasterPublicKeyId, IDkgPayloadStats, KEY_ID_LABEL, key_id_label}, utils::CRITICAL_ERROR_IDKG_RESOLVE_TRANSCRIPT_REFS, @@ -12,6 +13,7 @@ use ic_types::{ Height, Time, batch::BatchPayload, consensus::{Block, BlockPayload, BlockProposal, ConsensusMessageHashable, HasHeight, HasRank}, + crypto::threshold_sig::ni_dkg::NiDkgTargetSubnet, }; use prometheus::{ GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, @@ -114,6 +116,7 @@ pub(crate) struct BlockStats { pub block_height: u64, pub block_time: Time, pub block_context_certified_height: u64, + pub dkg_stats: DkgPayloadStats, pub idkg_stats: Option, } @@ -124,6 +127,7 @@ impl From<&Block> for BlockStats { block_height: block.height().get(), block_time: block.context.time, block_context_certified_height: block.context.certified_height.get(), + dkg_stats: DkgPayloadStats::from(block.payload.as_ref()), idkg_stats: block.payload.as_ref().as_idkg().map(IDkgPayloadStats::from), } } @@ -168,6 +172,10 @@ pub(crate) struct FinalizerMetrics { pub ingress_message_bytes_delivered: Histogram, pub xnet_bytes_delivered: Histogram, pub finalization_certified_state_difference: IntGauge, + pub dkg_remote_transcript_attempts_size: IntGauge, + pub dkg_remote_transcript_attempts_sum: IntGauge, + pub dkg_dealings_included: IntCounterVec, + pub dkg_remote_transcripts_delivered: IntCounterVec, // idkg payload related metrics pub master_key_transcripts_created: IntCounterVec, pub idkg_available_pre_signatures: IntGaugeVec, @@ -212,6 +220,24 @@ impl FinalizerMetrics { "consensus_finalization_certified_state_difference", "The height difference between the finalized tip and the referenced certified state", ), + dkg_remote_transcript_attempts_size: metrics_registry.int_gauge( + "consensus_dkg_remote_transcript_attempts_size", + "Size of remote DKG attempts map for delivered summary payloads", + ), + dkg_remote_transcript_attempts_sum: metrics_registry.int_gauge( + "consensus_dkg_remote_transcript_attempts_sum", + "Sum of remote DKG attempts map values for delivered summary payloads", + ), + dkg_dealings_included: metrics_registry.int_counter_vec( + "consensus_dkg_dealings_included_total", + "Number of DKG dealings delivered in data payloads by tag and target subnet", + &["tag", "target_subnet"], + ), + dkg_remote_transcripts_delivered: metrics_registry.int_counter_vec( + "consensus_dkg_remote_transcripts_delivered_total", + "Number of remote DKG transcripts included in delivered data payloads by tag", + &["tag"], + ), ingress_messages_delivered: metrics_registry.histogram( "consensus_ingress_messages_delivered", "The number of the ingress messages delivered to Message Routing", @@ -303,6 +329,28 @@ impl FinalizerMetrics { self.finalization_certified_state_difference.set( block_stats.block_height as i64 - block_stats.block_context_certified_height as i64, ); + let dkg = &block_stats.dkg_stats; + if let Some(size) = dkg.remote_dkg_attempts_map_size { + self.dkg_remote_transcript_attempts_size.set(size as i64); + } + if let Some(total) = dkg.remote_dkg_attempts_map_sum { + self.dkg_remote_transcript_attempts_sum.set(total as i64); + } + for ((tag, target_subnet), count) in &dkg.dealings_included { + let target_str = match target_subnet { + NiDkgTargetSubnet::Local => "local".to_string(), + NiDkgTargetSubnet::Remote(_) => "remote".to_string(), + }; + self.dkg_dealings_included + .with_label_values(&[format!("{tag:?}"), target_str]) + .inc_by(*count as u64); + } + for (tag, count) in &dkg.remote_transcripts_delivered { + self.dkg_remote_transcripts_delivered + .with_label_values(&[format!("{tag:?}")]) + .inc_by(*count as u64); + } + self.canister_http_success_delivered .with_label_values(&["fully_replicated"]) .inc_by(batch_stats.canister_http.responses as u64); diff --git a/rs/test_utilities/artifact_pool/src/consensus_pool.rs b/rs/test_utilities/artifact_pool/src/consensus_pool.rs index da6ed4f77a27..0e62e1e84349 100644 --- a/rs/test_utilities/artifact_pool/src/consensus_pool.rs +++ b/rs/test_utilities/artifact_pool/src/consensus_pool.rs @@ -163,6 +163,7 @@ fn dkg_payload_builder_fn( validation_context, no_op_logger(), MAX_DEALINGS_PER_BLOCK, + None, ) .unwrap_or_else(|err| panic!("Couldn't create the payload: {err:?}")) })