diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ad4036..6d02cf5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ ### Fixed +* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck) + ### Misc ## check_patroni 1.0.0 - 2023-08-28 diff --git a/README.md b/README.md index 7cfe808..3f2aa3b 100644 --- a/README.md +++ b/README.md @@ -191,8 +191,9 @@ Usage: check_patroni cluster_has_replica [OPTIONS] Check if the cluster has healthy replicas and/or if some are sync standbies A healthy replica: - * is in running or streaming state (V3.0.4) * has a replica or sync_standby role + * is in running state with the same timeline as the leader (patroni < V3.0.4) + * is in streaming state (patroni >= V3.0.4) * has a lag lower or equal to max_lag Check: diff --git a/check_patroni/cli.py b/check_patroni/cli.py index d249219..c42fa70 100644 --- a/check_patroni/cli.py +++ b/check_patroni/cli.py @@ -343,8 +343,9 @@ def cluster_has_replica( \b A healthy replica: - * is in running or streaming state (V3.0.4) * has a replica or sync_standby role + * is in running state with the same timeline as the leader (patroni < V3.0.4) + * is in streaming state (patroni >= V3.0.4) * has a lag lower or equal to max_lag \b diff --git a/check_patroni/cluster.py b/check_patroni/cluster.py index eed5325..147a410 100644 --- a/check_patroni/cluster.py +++ b/check_patroni/cluster.py @@ -1,6 +1,7 @@ import hashlib import json from collections import Counter +from packaging import version from typing import Iterable, Union import nagiosplugin @@ -87,20 +88,54 @@ def __init__( self.max_lag = max_lag def probe(self: "ClusterHasReplica") -> Iterable[nagiosplugin.Metric]: - item_dict = self.rest_api("cluster") + # get patroni's version to find out if the streaming state is available + patroni_item_dict = self.rest_api("patroni") + has_streaming_state: bool + if version.parse(patroni_item_dict["patroni"]["version"]) >= version.parse("3.0.4"): + has_streaming_state = True + 
else: + has_streaming_state = False + + # get the cluster info + cluster_item_dict = self.rest_api("cluster") replicas = [] healthy_replica = 0 unhealthy_replica = 0 sync_replica = 0 - for member in item_dict["members"]: - # FIXME are there other acceptable states + leader_tl = None + + # find the leader's tl if we don't have the streaming state (version < 3.0.4) + if not has_streaming_state: + for member in cluster_item_dict["members"]: + if member["role"] == "leader": + leader_tl = member["timeline"] + break + + _log.debug( + "Patroni's version is %(version)s, leader_tl is %(leader_tl)s", + { + "version": patroni_item_dict["patroni"]["version"], + "leader_tl": leader_tl, + }, + ) + else: + _log.debug( + "Patroni's version is %(version)s, the streaming status is available", + {"version": patroni_item_dict["patroni"]["version"]}, + ) + + # Look for replicas + for member in cluster_item_dict["members"]: if member["role"] in ["replica", "sync_standby"]: - # patroni 3.0.4 changed the standby state from running to streaming if ( - member["state"] in ["running", "streaming"] - and member["lag"] != "unknown" - ): + (has_streaming_state and member["state"] == "streaming") + or ( + not has_streaming_state + and member["state"] == "running" + and member["timeline"] == leader_tl + ) + ) and member["lag"] != "unknown": replicas.append( { "name": member["name"],