Fix cluster_node_count's management of replication states #66

Merged 2 commits on Jan 9, 2024
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@

## Unreleased

### Changed

* In `cluster_node_count`, a healthy standby, sync replica, or standby leader cannot be "in
archive recovery", because this service doesn't check for lag and timelines.

### Added

* Add the timeline in the `cluster_has_replica` perfstats. (#50)
@@ -15,6 +20,7 @@
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)
* Fix `cluster_node_count` to correctly manage replication states. (#50, reported by @mbanck)

### Misc

29 changes: 20 additions & 9 deletions README.md
@@ -299,26 +299,37 @@ Usage: check_patroni cluster_node_count [OPTIONS]

Count the number of nodes in the cluster.

The role refers to the role of the server in the cluster. Possible values
are:
* master or leader
* replica
* standby_leader
* sync_standby
* demoted
* promoted
* uninitialized

The state refers to the state of PostgreSQL. Possible values are:
* initializing new cluster, initdb failed
* running custom bootstrap script, custom bootstrap failed
* starting, start failed
* restarting, restart failed
* running, streaming, in archive recovery
* stopping, stopped, stop failed
* creating replica
* crashed

The "healthy" checks only ensures that:
* a leader has the running state
* a standby_leader has the running or streaming (V3.0.4) state
* a replica or sync-standby has the running or streaming (V3.0.4) state

Since we don't check the lag or timeline, "in archive recovery" is not
considered a valid state for this service. See cluster_has_leader and
cluster_has_replica for specialized checks.

Check:
* Compares the number of nodes against the normal and healthy nodes warning and critical thresholds.
* `OK`: If they are not provided.

Perfdata:
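For illustration, one way to exercise this check is from Python with click's test runner, mirroring the style of the project's own tests below. This is a hedged sketch: the endpoint and thresholds are made up, and it assumes `main` is the click entry point in `check_patroni.cli` and that `--healthy-warning`/`--healthy-critical` accept standard Nagios ranges, as the README's option list suggests.

```python
from click.testing import CliRunner

from check_patroni.cli import main  # assumed entry point, as used in the tests

# Hypothetical invocation: warn when healthy members drop below 2,
# go critical below 1. The endpoint is illustrative.
runner = CliRunner()
result = runner.invoke(
    main,
    [
        "-e", "https://10.20.199.3:8008",
        "cluster_node_count",
        "--healthy-warning", "2:",
        "--healthy-critical", "1:",
    ],
)
print(result.stdout)  # e.g. "CLUSTERNODECOUNT OK - members is 3 | ..."
```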
29 changes: 20 additions & 9 deletions check_patroni/cli.py
@@ -226,29 +226,40 @@ def cluster_node_count(
    ) -> None:
        """Count the number of nodes in the cluster.

        \b
        The role refers to the role of the server in the cluster. Possible values
        are:
        * master or leader
        * replica
        * standby_leader
        * sync_standby
        * demoted
        * promoted
        * uninitialized

        \b
        The state refers to the state of PostgreSQL. Possible values are:
        * initializing new cluster, initdb failed
        * running custom bootstrap script, custom bootstrap failed
        * starting, start failed
        * restarting, restart failed
        * running, streaming, in archive recovery
        * stopping, stopped, stop failed
        * creating replica
        * crashed

        \b
        The "healthy" check only ensures that:
        * a leader has the running state
        * a standby_leader has the running or streaming (V3.0.4) state
        * a replica or sync_standby has the running or streaming (V3.0.4) state

        Since we don't check the lag or timeline, "in archive recovery" is not considered a valid state
        for this service. See cluster_has_leader and cluster_has_replica for specialized checks.

        \b
        Check:
        * Compares the number of nodes against the normal and healthy nodes warning and critical thresholds.
        * `OK`: If they are not provided.

        \b
39 changes: 33 additions & 6 deletions check_patroni/cluster.py
@@ -15,24 +15,51 @@ def replace_chars(text: str) -> str:

class ClusterNodeCount(PatroniResource):
    def probe(self) -> Iterable[nagiosplugin.Metric]:
        def debug_member(member: Any, health: str) -> None:
            _log.debug(
                "Node %(node_name)s is %(health)s: role %(role)s state %(state)s.",
                {
                    "node_name": member["name"],
                    "health": health,
                    "role": member["role"],
                    "state": member["state"],
                },
            )

        # get the cluster info
        item_dict = self.rest_api("cluster")

        role_counters: Counter[str] = Counter()
        roles = []
        status_counters: Counter[str] = Counter()
        statuses = []
        healthy_member = 0

        for member in item_dict["members"]:
            state, role = member["state"], member["role"]
            roles.append(replace_chars(role))
            statuses.append(replace_chars(state))

            # A leader is healthy as long as PostgreSQL is running.
            if role == "leader" and state == "running":
                healthy_member += 1
                debug_member(member, "healthy")
                continue

            # Standbys are healthy when "streaming" on Patroni versions with
            # detailed replica states, or "running" on older versions.
            # "in archive recovery" never counts as healthy here.
            if role in ["standby_leader", "replica", "sync_standby"] and (
                (self.has_detailed_states() and state == "streaming")
                or (not self.has_detailed_states() and state == "running")
            ):
                healthy_member += 1
                debug_member(member, "healthy")
                continue

            debug_member(member, "unhealthy")

        role_counters.update(roles)
        status_counters.update(statuses)

        # The actual check: members, healthy_members
        yield nagiosplugin.Metric("members", len(item_dict["members"]))
        yield nagiosplugin.Metric("healthy_members", healthy_member)

        # The performance data: role
        for role in role_counters:
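The health predicate above hinges on `self.has_detailed_states()`, which this diff doesn't show. As a rough mental model only, it distinguishes Patroni versions that report detailed replica states ("streaming", "in archive recovery") from older ones that only report "running". A hypothetical sketch, not the actual check_patroni implementation:

```python
from packaging.version import Version

def has_detailed_states(patroni_version: str) -> bool:
    # Hypothetical sketch: Patroni 3.0.4 introduced detailed replica
    # states ("streaming", "in archive recovery") in place of "running".
    return Version(patroni_version) >= Version("3.0.4")
```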
2 changes: 1 addition & 1 deletion tests/__init__.py
@@ -55,7 +55,7 @@ def cluster_api_set_replica_running(in_json: Path, target_dir: Path) -> Path:
    with in_json.open() as f:
        js = json.load(f)
    for node in js["members"]:
        if node["role"] in ["replica", "sync_standby", "standby_leader"]:
            if node["state"] in ["streaming", "in archive recovery"]:
                node["state"] = "running"
    assert target_dir.is_dir()
33 changes: 33 additions & 0 deletions tests/json/cluster_node_count_ko_in_archive_recovery.json
@@ -0,0 +1,33 @@
{
  "members": [
    {
      "name": "srv1",
      "role": "standby_leader",
      "state": "in archive recovery",
      "api_url": "https://10.20.199.3:8008/patroni",
      "host": "10.20.199.3",
      "port": 5432,
      "timeline": 51
    },
    {
      "name": "srv2",
      "role": "replica",
      "state": "in archive recovery",
      "api_url": "https://10.20.199.4:8008/patroni",
      "host": "10.20.199.4",
      "port": 5432,
      "timeline": 51,
      "lag": 0
    },
    {
      "name": "srv3",
      "role": "replica",
      "state": "streaming",
      "api_url": "https://10.20.199.5:8008/patroni",
      "host": "10.20.199.5",
      "port": 5432,
      "timeline": 51,
      "lag": 0
    }
  ]
}
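To see why this fixture is a "ko" case, here is a small standalone sketch of the new counting rule applied to it (assuming detailed states are available, i.e. Patroni 3.0.4+). Only srv3 counts as healthy:

```python
import json

with open("tests/json/cluster_node_count_ko_in_archive_recovery.json") as f:
    members = json.load(f)["members"]

# New rule: a leader must be "running"; standby leaders, replicas and
# sync standbys must be "streaming" (with detailed states). "in archive
# recovery" is never healthy, since lag and timeline are not checked.
healthy = sum(
    1
    for m in members
    if (m["role"] == "leader" and m["state"] == "running")
    or (
        m["role"] in ("standby_leader", "replica", "sync_standby")
        and m["state"] == "streaming"
    )
)
print(healthy)  # 1: srv1 and srv2 are "in archive recovery", only srv3 streams
```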
19 changes: 13 additions & 6 deletions tests/test_cluster_has_leader.py
@@ -122,11 +122,18 @@ def cluster_has_leader_ko_standby_leader_archiving(

@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
def test_cluster_has_leader_ko_standby_leader_archiving(
    runner: CliRunner, patroni_api: PatroniAPI, old_replica_state: bool
) -> None:
    result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
    if old_replica_state:
        # Without detailed states (Patroni < 3.0.4), the standby leader reports
        # "running", so the check cannot tell it is only archiving and passes.
        assert (
            result.stdout
            == "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
        )
        assert result.exit_code == 0
    else:
        assert (
            result.stdout
            == "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
        )
        assert result.exit_code == 1