Skip to content

Commit

Permalink
[PLAT-16463][dr] Create an alert for safetime
Browse files Browse the repository at this point in the history
Summary:
This diff adds the alert for the xCluster transactional safe time lag.

It also removes the extra status `ReplicationError` and uses `Error` instead.

Test Plan: Slowed the network connection between the source and target universes' nodes and observed that the safe time lag alert is being fired.

Reviewers: #yba-api-review!, cwang, amalyshev, vbansal, nbhatia

Reviewed By: amalyshev

Subscribers: sanketh, yugaware

Differential Revision: https://phorge.dev.yugabyte.com/D41385
  • Loading branch information
shahrooz1997 committed Jan 27, 2025
1 parent 3dcce62 commit f6042c9
Show file tree
Hide file tree
Showing 11 changed files with 65 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ private Metric buildMetricTemplate(
XClusterTableConfig.Status tableStatus = xClusterTableConfig.getStatus();
if (xClusterTableConfig.getStatus().equals(XClusterTableConfig.Status.Running)) {
if (!xClusterTableConfig.getReplicationStatusErrors().isEmpty()) {
tableStatus = XClusterTableConfig.Status.ReplicationError;
tableStatus = XClusterTableConfig.Status.Error;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,5 +85,6 @@ public enum AlertTemplate {
XCLUSTER_CONFIG_TABLE_BAD_STATE,
NODE_CLOCK_DRIFT,
UNIVERSE_UNEXPECTED_MASTERS_RUNNING,
UNIVERSE_UNEXPECTED_TSERVERS_RUNNING;
UNIVERSE_UNEXPECTED_TSERVERS_RUNNING,
SAFETIME_LAG;
}
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,6 @@ public static XClusterTableConfig.Status dbStatusToTableStatus(
switch (namespaceStatus) {
case Failed:
return XClusterTableConfig.Status.Failed;
case Error:
return XClusterTableConfig.Status.Error;
case Warning:
return XClusterTableConfig.Status.Warning;
case Updating:
Expand All @@ -118,7 +116,7 @@ public static XClusterTableConfig.Status dbStatusToTableStatus(
case Running:
return XClusterTableConfig.Status.Running;
default:
return XClusterTableConfig.Status.ReplicationError;
return XClusterTableConfig.Status.Error;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,7 @@ public enum Status {
DroppedFromSource("DroppedFromSource", -5), // Not stored in YBA DB.
DroppedFromTarget("DroppedFromTarget", -6), // Not stored in YBA DB.
ExtraTableOnSource("ExtraTableOnSource", -7), // Not stored in YBA DB.
ExtraTableOnTarget("ExtraTableOnTarget", -8), // Not stored in YBA DB.
ReplicationError("ReplicationError", -9); // Not stored in YBA DB.
ExtraTableOnTarget("ExtraTableOnTarget", -8); // Not stored in YBA DB.

private final String status;
@Getter private final int code;
Expand Down
30 changes: 30 additions & 0 deletions managed/src/main/resources/alert/alert_templates.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1280,6 +1280,36 @@ templates:
XCluster config tables are in bad state for universe '{{ $labels.source_name }}'
Affected XCluster configs: {{ $labels.affected_xcluster_configs }}
SAFETIME_LAG:
name: XCluster Transactional Safe Time Lag
description: Max xCluster transactional safe time lag for 10 minutes in ms on the target universe is above threshold; Safe time on the target universe has not progressed for a long time
queryTemplate: max by (universe_uuid) (max_over_time(consumer_safe_time_lag{universe_uuid="__universeUuid__"}[10m])) {{ query_condition }} {{ query_threshold }}
defaultThresholdMap:
SEVERE:
paramName: yb.alert.safetime_lag_ms
targetType: UNIVERSE
defaultThresholdCondition: GREATER_THAN
defaultThresholdUnit: MILLISECOND
thresholdUnitName: ms
labels:
affected_node_names: >-
{{ range $index, $element := query
"max by (universe_uuid, node_name)
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
{{ query_condition }} {{ query_threshold }})" }}
{{if $index}},{{end}}{{ $element.Labels.node_name }}{{ end }}
affected_namespace_ids: >-
{{ range $index, $element := query
"max by (universe_uuid, namespace_id)
(max_over_time(consumer_safe_time_lag{universe_uuid='{{ $labels.universe_uuid }}'}[10m])
{{ query_condition }} {{ query_threshold }})" }}
{{if $index}},{{end}}{{ $element.Labels.namespace_id }}{{ end }}
annotations:
summary: >-
Max xCluster transactional safe time lag for the target universe '{{ $labels.source_name }}' is above
{{ $labels.threshold }} ms. Current value is {{ $value | printf "%.0f" }} ms
PRIVATE_ACCESS_KEY_STATUS:
name: Private access key permission status
description: Change in universe private access keys file permissions
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Copyright (c) YugaByte, Inc.

-- XCluster transactional safe time lag alert.
-- Adds one alert_configuration row per row in the customer table, bound to the
-- SAFETIME_LAG template, active, and targeting all universes ({"all":true}).
INSERT INTO alert_configuration
(uuid, customer_uuid, name, description, create_time, target_type, target, thresholds, threshold_unit, template, active, default_destination)
SELECT
gen_random_uuid(),
uuid,
'XCluster Transactional Safe Time Lag',
'Max xCluster transactional safe time lag for 10 minutes in ms on the target universe is above threshold; Safe time on the target universe has not progressed for a long time',
current_timestamp,
'UNIVERSE',
'{"all":true}',
-- SEVERE fires when the value is greater than 180000 (unit below: MILLISECOND).
'{"SEVERE":{"condition":"GREATER_THAN", "threshold":180000.0}}',
'MILLISECOND',
'SAFETIME_LAG',
true,
true
FROM customer;

-- Invokes create_universe_alert_definitions with the configuration name inserted above.
-- NOTE(review): the function is defined outside this script; presumably it creates the
-- per-universe alert definitions for that configuration -- confirm against its definition.
SELECT create_universe_alert_definitions('XCluster Transactional Safe Time Lag');
2 changes: 2 additions & 0 deletions managed/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,8 @@ yb {
max_clock_skew_ms = 250
# Value of maximum allowed replication lag before an alert is generated (in ms).
replication_lag_ms = 180000
# Value of the maximum allowed safe time lag before an alert is generated (in ms).
safetime_lag_ms = 180000
# Value of maximum allowed percents of used memory on nodes.
max_memory_cons_pct = 90
# Alert rules configuration sync interval in seconds.
Expand Down
6 changes: 3 additions & 3 deletions managed/src/main/resources/swagger-strict.json
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@
},
"template" : {
"description" : "Template name",
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING" ],
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING", 
"SAFETIME_LAG" ],
"type" : "string"
},
"thresholdUnit" : {
Expand Down Expand Up @@ -727,7 +727,7 @@
"type" : "string"
},
"template" : {
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING" ],
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING", 
"SAFETIME_LAG" ],
"type" : "string"
},
"uuids" : {
Expand Down Expand Up @@ -888,7 +888,7 @@
},
"template" : {
"description" : "Template name",
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING" ],
"enum" : [ "REPLICATION_LAG", "CLOCK_SKEW", "CLOCK_SYNC_CHECK_FAILED", "MEMORY_CONSUMPTION", "HEALTH_CHECK_ERROR", "HEALTH_CHECK_NOTIFICATION_ERROR", "UNIVERSE_METRIC_COLLECTION_FAILURE", "BACKUP_FAILURE", "BACKUP_DELETION_FAILURE", "BACKUP_SCHEDULE_FAILURE", "INACTIVE_CRON_NODES", "ALERT_QUERY_FAILED", "ALERT_CONFIG_WRITING_FAILED", "ALERT_NOTIFICATION_ERROR", "ALERT_NOTIFICATION_CHANNEL_ERROR", "NODE_DOWN", "NODE_RESTART", "NODE_CPU_USAGE", "NODE_DISK_USAGE", "NODE_SYSTEM_DISK_USAGE", "NODE_FILE_DESCRIPTORS_USAGE", "NODE_OOM_KILLS", "DB_VERSION_MISMATCH", "DB_INSTANCE_DOWN", "DB_INSTANCE_RESTART", "DB_FATAL_LOGS", "DB_ERROR_LOGS", "DB_CORE_FILES", "DB_YSQL_CONNECTION", "DB_YCQL_CONNECTION", "DB_REDIS_CONNECTION", "DB_MEMORY_OVERLOAD", "DB_COMPACTION_OVERLOAD", "DB_QUEUES_OVERFLOW", "DB_DRIVE_FAILURE", "DB_WRITE_READ_TEST_ERROR", "DDL_ATOMICITY_CHECK", "NODE_TO_NODE_CA_CERT_EXPIRY", "NODE_TO_NODE_CERT_EXPIRY", "CLIENT_TO_NODE_CA_CERT_EXPIRY", "CLIENT_TO_NODE_CERT_EXPIRY", "ENCRYPTION_AT_REST_CONFIG_EXPIRY", "SSH_KEY_EXPIRY", "SSH_KEY_ROTATION_FAILURE", "PITR_CONFIG_FAILURE", "YSQL_OP_AVG_LATENCY", "YCQL_OP_AVG_LATENCY", "YSQL_OP_P99_LATENCY", "YCQL_OP_P99_LATENCY", "HIGH_NUM_YSQL_CONNECTIONS", "HIGH_NUM_YCQL_CONNECTIONS", "HIGH_NUM_YEDIS_CONNECTIONS", "YSQL_THROUGHPUT", "YCQL_THROUGHPUT", "YCQL_MICROSECOND_TIMESTAMPS_DETECTED", "MASTER_LEADER_MISSING", "MASTER_UNDER_REPLICATED", "LEADERLESS_TABLETS", "UNDER_REPLICATED_TABLETS", "PRIVATE_ACCESS_KEY_STATUS", "UNIVERSE_OS_UPDATE_REQUIRED", "DB_YCQL_WEB_SERVER_DOWN", "DB_YSQL_WEB_SERVER_DOWN", "INCREASED_REMOTE_BOOTSTRAPS", "TABLET_SERVER_AVG_READ_LATENCY", "TABLET_SERVER_AVG_WRITE_LATENCY", "REACTOR_DELAYS", "RPC_QUEUE_SIZE", "LOG_CACHE_SIZE", "CACHE_MISS", "HA_STANDBY_SYNC", "NODE_AGENT_DOWN", "UNIVERSE_RELEASE_FILES_STATUS", "HA_VERSION_MISMATCH", "TABLET_PEERS_GUARDRAIL", "XCLUSTER_CONFIG_TABLE_BAD_STATE", "NODE_CLOCK_DRIFT", "UNIVERSE_UNEXPECTED_MASTERS_RUNNING", "UNIVERSE_UNEXPECTED_TSERVERS_RUNNING", 
"SAFETIME_LAG" ],
"type" : "string"
},
"thresholdConditionReadOnly" : {
Expand Down
Loading

0 comments on commit f6042c9

Please sign in to comment.