Skip to content

Commit

Permalink
cluster_has_replica: fix the way a healthy replica is detected
Browse files Browse the repository at this point in the history
For patroni >= version 3.0.4:
* the role is replica or sync_standby
* the state is streaming
* the lag is lower or equal to max_lag

For prior versions:
* the role is replica or sync_standby
* the state is running and with the same timeline as the leader
* the lag is lower or equal to max_lag
  • Loading branch information
blogh committed Sep 27, 2023
1 parent de8b3da commit c108093
Show file tree
Hide file tree
Showing 4 changed files with 48 additions and 9 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

### Fixed

* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)

### Misc

## check_patroni 1.0.0 - 2023-08-28
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ Usage: check_patroni cluster_has_replica [OPTIONS]
Check if the cluster has healthy replicas and/or if some are sync standbies
A healthy replica:
* is in running or streaming state (V3.0.4)
* has a replica or sync_standby role
* is in running state with the same timeline as the leader (patroni < V3.0.4)
* is in streaming state (patroni >= V3.0.4)
* has a lag lower or equal to max_lag
Check:
Expand Down
3 changes: 2 additions & 1 deletion check_patroni/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,8 +343,9 @@ def cluster_has_replica(
\b
A healthy replica:
* is in running or streaming state (V3.0.4)
* has a replica or sync_standby role
* is in running state with the same timeline as the leader (patroni < V3.0.4)
* is in streaming state (patroni >= V3.0.4)
* has a lag lower or equal to max_lag
\b
Expand Down
49 changes: 42 additions & 7 deletions check_patroni/cluster.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import hashlib
import json
from collections import Counter
from packaging import version
from typing import Iterable, Union

import nagiosplugin
Expand Down Expand Up @@ -87,20 +88,54 @@ def __init__(
self.max_lag = max_lag

def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]:
item_dict = self.rest_api("cluster")
# get patroni's version to findout if the streaming state is available
patroni_item_dict = self.rest_api("patroni")
has_streaming_state: bool
if version.parse(patroni_item_dict["patroni"]["version"]) >= version.parse("3.0.4"):
has_streaming_state = True
else:
has_streaming_state = False

# get the cluster info
cluster_item_dict = self.rest_api("cluster")

replicas = []
healthy_replica = 0
unhealthy_replica = 0
sync_replica = 0
for member in item_dict["members"]:
# FIXME are there other acceptable states
leader_tl = None

# find the leader's tl if we don't have the streaming state (version < 3.0.4)
if not has_streaming_state:
for member in cluster_item_dict["members"]:
if member["role"] == "leader":
leader_tl = member["timeline"]
break

_log.debug(
"Patroni's version is %(version)s, leader_tl is %(leader_tl)s",
{
"version": patroni_item_dict["patroni"]["version"],
"leader_tl": leader_tl,
},
)
else:
_log.debug(
"Patroni's version is %(version)s, the streaming status is available",
{"version": patroni_item_dict["patroni"]["version"]},
)

# Look for replicas
for member in cluster_item_dict["members"]:
if member["role"] in ["replica", "sync_standby"]:
# patroni 3.0.4 changed the standby state from running to streaming
if (
member["state"] in ["running", "streaming"]
and member["lag"] != "unknown"
):
(has_streaming_state and member["state"] == "streaming")
or (
not has_streaming_state
and member["state"] == "running"
and member["timeline"] == leader_tl
)
) and member["lag"] != "unknown":
replicas.append(
{
"name": member["name"],
Expand Down

0 comments on commit c108093

Please sign in to comment.