1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// Copyright (c) The Diem Core Contributors
// SPDX-License-Identifier: Apache-2.0

#![forbid(unsafe_code)]

use crate::{
    cluster::Cluster,
    health::{HealthCheck, HealthCheckContext},
    instance::Instance,
};
use async_trait::async_trait;
use futures::future::join_all;
use once_cell::sync::Lazy;
use std::{collections::HashMap, env};

/// Maximum number of committed versions a fullnode may trail its validator by
/// before the health check reports a failure.
///
/// Read once, on first access, from the `FULL_NODE_HEALTH_THRESHOLD`
/// environment variable; falls back to `15000` when the variable cannot be read.
///
/// # Panics
///
/// Panics on first access if the variable is set but is not a valid `i64`.
pub static THRESHOLD: Lazy<i64> = Lazy::new(|| {
    match env::var("FULL_NODE_HEALTH_THRESHOLD") {
        // A present-but-malformed value is a configuration error: fail loudly.
        Ok(raw) => raw
            .parse()
            .expect("Failed to parse FULL_NODE_HEALTH_THRESHOLD"),
        Err(_) => 15000_i64,
    }
});

/// Health check that compares each fullnode's committed state-sync version
/// against the validator in the same validator group.
pub struct FullNodeHealthCheck {
    /// Cluster whose validator and fullnode instances are queried on each check.
    cluster: Cluster,
}

impl FullNodeHealthCheck {
    pub fn new(cluster: Cluster) -> FullNodeHealthCheck {
        Self { cluster }
    }
}

/// Queries `instance` for its `diem_state_sync_version{type=committed}` metric
/// through the debug interface client.
///
/// Returns the instance paired with the reported value. This is best-effort:
/// a failed request yields `0`, and a successful response without a value
/// yields the default produced by `unwrap_or_default`.
async fn get_version(instance: &Instance) -> (&Instance, i64) {
    let response = instance
        .debug_interface_client()
        .get_node_metric("diem_state_sync_version{type=committed}")
        .await;
    // Collapse every failure mode to 0 so one unreachable node does not abort the check.
    let version = response.map_or(0i64, |metric| metric.unwrap_or_default());
    (instance, version)
}

#[async_trait]
impl HealthCheck for FullNodeHealthCheck {
    /// Compares every fullnode's committed state-sync version with the version
    /// of the validator sharing its validator group index.
    ///
    /// A failure is reported through `ctx` under the key `val-<index>` when:
    /// * the validator's version exceeds the fullnode's by more than
    ///   [`THRESHOLD`], or
    /// * no validator was found for the fullnode's validator group, so there
    ///   is nothing to compare against.
    ///
    /// Versions come from `get_version`, which yields `0` for nodes whose
    /// metric could not be fetched.
    async fn verify(&mut self, ctx: &mut HealthCheckContext) {
        let validators = self.cluster.validator_instances();
        let fullnodes = self.cluster.fullnode_instances();

        // Query all validators concurrently, then index their versions by
        // validator group so fullnodes can look up their counterpart.
        let futures = validators.iter().map(get_version);
        let val_latest_versions = join_all(futures).await;
        let val_latest_versions: HashMap<_, _> = val_latest_versions
            .into_iter()
            .map(|(instance, version)| (instance.validator_group().index, version))
            .collect();

        let futures = fullnodes.iter().map(get_version);
        let fullnode_latest_versions = join_all(futures).await;

        for (fullnode, fullnode_version) in fullnode_latest_versions {
            let index = fullnode.validator_group().index;
            // A fullnode without a matching validator must not panic the whole
            // health check; surface it as a failure and keep checking the rest.
            let val_version = match val_latest_versions.get(&index) {
                Some(version) => *version,
                None => {
                    ctx.report_failure(
                        format!("val-{}", index),
                        format!(
                            "fullnode {} has no validator in its validator group to compare against",
                            fullnode.peer_name(),
                        ),
                    );
                    continue;
                }
            };
            if val_version - fullnode_version > *THRESHOLD {
                ctx.report_failure(
                    format!("val-{}", index),
                    format!(
                        "fullnode {} state sync committed version: {} is behind validator: {}",
                        fullnode.peer_name(),
                        fullnode_version,
                        val_version,
                    ),
                );
            }
        }
    }

    /// Returns the fixed identifier of this health check.
    fn name(&self) -> &'static str {
        "fullnode_check"
    }
}