WeaveNet exposes the following Prometheus metrics
Do the alerts below look correct to alert on? At what values of these metrics should we raise alerts to monitor weave-net health?
I made Grafana dashboards on top of the Weave metrics. Here are the dashboards:
Here are the useful metrics on which Weave Net should be monitored. The alerts below are in JSON format.
{
"groups": [
{
"name": "nodeagent",
"rules": [
{
"alert": "UnhealthyNodes",
"expr": "changes(central_nodeagent:node_route_unhealthy_count[3m]) > 0",
"for": "1m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "Unhealthy nodes in the cluster. Go to the below prometheus link for details.",
"description": "Actionable: Find why the node(s) are unhealthy and fix it."
}
}
]
},
{
"name": "weave-net",
"rules": [
{
"alert": "WeaveNetIPAMSPlitBrain",
"expr": "max(weave_ipam_unreachable_percentage) - min(weave_ipam_unreachable_percentage) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNetIPAM has a split brain. Go to the below prometheus link for details.",
"description": "Actionable: Every node should see the same unreachability percentage. Please check and fix why it is not so."
}
},
{
"alert": "WeaveNetIPAMUnreachable",
"expr": "weave_ipam_unreachable_percentage > 25",
"for": "10m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNetIPAM unreachability percentage is above threshold. Go to the below prometheus link for details.",
"description": "Actionable: Find why the unreachability percentage has risen above the threshold and fix it. WeaveNet is responsible for keeping it under control. A weave rm peer deployment can help clean things up."
}
},
{
"alert": "WeaveNetIPAMPendingAllocates",
"expr": "sum(weave_ipam_pending_allocates) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNet IPAM has pending allocates. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason for IPAM allocates to be in pending state and fix it."
}
},
{
"alert": "WeaveNetIPAMPendingClaims",
"expr": "sum(weave_ipam_pending_claims) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNet IPAM has pending claims. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason for IPAM claims to be in pending state and fix it."
}
},
{
"alert": "WeaveNetFastDPFlowsLow",
"expr": "sum(weave_flows) < 15000",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNet total FastDP flows is below threshold. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason for fast dp flows dropping below the threshold."
}
},
{
"alert": "WeaveNetFastDPFlowsOff",
"expr": "sum(weave_flows == bool 0) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "WeaveNet FastDP flows are absent on some or all nodes. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason for fast dp being off."
}
},
{
"alert": "WeaveNetHighConnectionTerminationRate",
"expr": "rate(weave_connection_terminations_total[5m]) > 0.1",
"for": "5m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "A lot of connections are getting terminated. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason for high connection termination rate and fix it."
}
},
{
"alert": "WeaveNetConnectionsConnecting",
"expr": "sum(weave_connections{state='connecting'}) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "A lot of connections are in connecting state. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason and fix it."
}
},
{
"alert": "WeaveNetConnectionsRetying",
"expr": "sum(weave_connections{state='retrying'}) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "A lot of connections are in retrying state. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason and fix it."
}
},
{
"alert": "WeaveNetConnectionsPending",
"expr": "sum(weave_connections{state='pending'}) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "A lot of connections are in pending state. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason and fix it."
}
},
{
"alert": "WeaveNetConnectionsFailed",
"expr": "sum(weave_connections{state='failed'}) > 0",
"for": "3m",
"labels": {
"severity": "critical"
},
"annotations": {
"summary": "A lot of connections are in failed state. Go to the below prometheus link for details.",
"description": "Actionable: Find the reason and fix it."
}
}
]
}
]
}