From 6b3af191e79057013c409c665d895f212215af4f Mon Sep 17 00:00:00 2001 From: David Gogl <1381862+kengou@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:13:31 +0200 Subject: [PATCH] add prometheus-kubernetes-rules --- prometheus-kubernetes-rules/Chart.yaml | 5 + prometheus-kubernetes-rules/README.md | 14 ++ .../aggregations/frontend/aggregation.rules | 10 ++ .../aggregations/frontend/cluster.rules | 107 ++++++++++++ .../aggregations/frontend/utilization.rules | 46 +++++ .../alerts/apiserver.alerts.tpl | 79 +++++++++ .../alerts/certificate.alerts.tpl | 23 +++ .../alerts/health.alerts.tpl | 150 ++++++++++++++++ .../alerts/kubelet.alerts.tpl | 111 ++++++++++++ .../alerts/maintenance.alerts.tpl | 54 ++++++ .../alerts/node.alerts.tpl | 165 ++++++++++++++++++ .../alerts/pod.alerts.tpl.disabled | 114 ++++++++++++ .../ci/test-values.yaml | 2 + .../templates/_helpers.tpl | 25 +++ .../collector/_resource.rules.tpl | 44 +++++ .../templates/alerts.yaml | 23 +++ .../templates/collector-aggregations.yaml | 14 ++ .../templates/frontend-aggregations.yaml | 19 ++ prometheus-kubernetes-rules/values.yaml | 32 ++++ 19 files changed, 1037 insertions(+) create mode 100644 prometheus-kubernetes-rules/Chart.yaml create mode 100644 prometheus-kubernetes-rules/README.md create mode 100644 prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules create mode 100644 prometheus-kubernetes-rules/aggregations/frontend/cluster.rules create mode 100644 prometheus-kubernetes-rules/aggregations/frontend/utilization.rules create mode 100644 prometheus-kubernetes-rules/alerts/apiserver.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/certificate.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/health.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/kubelet.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/maintenance.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/node.alerts.tpl create mode 100644 prometheus-kubernetes-rules/alerts/pod.alerts.tpl.disabled create mode 100644 prometheus-kubernetes-rules/ci/test-values.yaml create mode 100644 prometheus-kubernetes-rules/templates/_helpers.tpl create mode 100644 prometheus-kubernetes-rules/templates/aggregations/collector/_resource.rules.tpl create mode 100644 prometheus-kubernetes-rules/templates/alerts.yaml create mode 100644 prometheus-kubernetes-rules/templates/collector-aggregations.yaml create mode 100644 prometheus-kubernetes-rules/templates/frontend-aggregations.yaml create mode 100644 prometheus-kubernetes-rules/values.yaml diff --git a/prometheus-kubernetes-rules/Chart.yaml b/prometheus-kubernetes-rules/Chart.yaml new file mode 100644 index 0000000..49a0457 --- /dev/null +++ b/prometheus-kubernetes-rules/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +appVersion: "1.0" +description: A collection of Prometheus alerting and aggregation rules for Kubernetes. +name: prometheus-kubernetes-rules +version: 1.9.11 diff --git a/prometheus-kubernetes-rules/README.md b/prometheus-kubernetes-rules/README.md new file mode 100644 index 0000000..923cd63 --- /dev/null +++ b/prometheus-kubernetes-rules/README.md @@ -0,0 +1,14 @@ +Kube rules +---------------- + +This chart is a collection of Prometheus alerting and aggregation rules for Kubernetes. + +## Configuration + +The following table provides an overview of configurable parameters of this chart and their defaults. +See the [values.yaml](./values.yaml) for more details. 
+ +| Parameter | Description | Default | +|----------------------------------------|-------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------| +| `prometheusName` | Name of the Prometheus to which the rules should be assigned to. | `""` | +| `prometheusCollectorName` | Optional name of the Prometheus collector instance. Only required if the collector -> frontend federation pattern (metrics being pulled by frontend) is used. | `""` | diff --git a/prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules b/prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules new file mode 100644 index 0000000..7ba2283 --- /dev/null +++ b/prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules @@ -0,0 +1,10 @@ +groups: +- name: skydns + rules: + - record: skydns_skydns_dns_error_count_total_irate + expr: irate(skydns_skydns_dns_error_count_total[5m]) + +- name: etcd + rules: + - record: instance:fd_utilization + expr: process_open_fds{component="etcd"} / process_max_fds{component="etcd"} diff --git a/prometheus-kubernetes-rules/aggregations/frontend/cluster.rules b/prometheus-kubernetes-rules/aggregations/frontend/cluster.rules new file mode 100644 index 0000000..6f5b9a8 --- /dev/null +++ b/prometheus-kubernetes-rules/aggregations/frontend/cluster.rules @@ -0,0 +1,107 @@ +groups: +- name: cluster.rules + rules: + - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes + expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:spec_cpu_shares + expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:cpu_usage:rate + expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:memory_usage:bytes + expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:memory_working_set:bytes + expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:memory_rss:bytes + expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:memory_cache:bytes + expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name) + + - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate + expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, 
container_name, scope, type) + + - record: cluster_namespace_controller_pod_container:memory_oom:rate + expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+")) + BY (namespace, controller, pod_name, container_name, scope, type) + + - record: cluster:memory_allocation:percent + expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) / sum(machine_memory_bytes) + + - record: cluster:memory_used:percent + expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) / sum(machine_memory_bytes) + + - record: cluster:cpu_allocation:percent + expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) / sum(container_spec_cpu_shares{id="/"} * ON(instance) group_left machine_cpu_cores) + + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06 + labels: + quantile: "0.9" + + - record: cluster_resource_verb:apiserver_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06 + labels: + quantile: "0.5" + + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.99" + + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, region)) / 1e+06 + labels: + quantile: "0.9" + + - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.5" + + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.99" + + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.9" + + - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.5" + + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.99" + + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.9" + + - record: cluster:scheduler_binding_latency:quantile_seconds + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06 + labels: + quantile: "0.5" diff --git a/prometheus-kubernetes-rules/aggregations/frontend/utilization.rules b/prometheus-kubernetes-rules/aggregations/frontend/utilization.rules new file mode 100644 index 0000000..9c227e0 --- /dev/null +++ b/prometheus-kubernetes-rules/aggregations/frontend/utilization.rules @@ 
-0,0 +1,46 @@ +# vim: set ft=yaml: +groups: + - name: utilization + rules: + # These rules are intermediate results to keep the final aggregation rules more readable. + # In both cases, we take the maximum of momentary resource usage and resource request. + # + # Unfortunately, there is no straight-forward way to take a maximum of two separate sets of + # timeseries. The builtin max() function can only be used to reduce cardinality on a single + # set of timeseries. As a workaround, we use the pattern `(X >= Y) or Y` to mean `max(X, Y)`. + # + # Note that `(X >= Y) or Y` only works if Y is fully populated. Therefore we need to put the + # resource requests in X (since those metrics may be missing for some containers) and the + # cadvisor metrics in Y (since those metrics are always present for all containers). + # + # Ref: + + - record: container_memory_effective_utilization_bytes + expr: | + max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="memory"}) + >= max by (namespace, pod, container) (container_memory_working_set_bytes) + or max by (namespace, pod, container) (container_memory_working_set_bytes) + + - record: container_cpu_effective_utilization_average + expr: | + max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="cpu"}) + >= max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m])) + or max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m])) + + - name: utilization-final + rules: + # These rules summarize various types of resource utilization metrics into one timeseries per + # owner (as defined by the ccloud/support-group and ccloud/service labels on the respective + # Kubernetes objects). + + - record: by_owner:container_memory_effective_utilization_bytes + expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_memory_effective_utilization_bytes * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))' + + - record: by_owner:container_cpu_effective_utilization_average + expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_cpu_effective_utilization_average * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))' + + - record: by_owner:persistentvolume_used_bytes + expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_used_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)' + + - record: by_owner:persistentvolume_capacity_bytes + expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_capacity_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)' diff --git a/prometheus-kubernetes-rules/alerts/apiserver.alerts.tpl b/prometheus-kubernetes-rules/alerts/apiserver.alerts.tpl new file mode 100644 index 0000000..e9e57c3 --- /dev/null +++ b/prometheus-kubernetes-rules/alerts/apiserver.alerts.tpl @@ -0,0 +1,79 @@ +# vi:syntax=yaml +groups: +- name: apiserver.alerts + rules: + - alert: KubernetesApiServerAllDown + expr: count(up{job="kubernetes-apiserver"} == 0) == count(up{job="kubernetes-apiserver"}) + for: 5m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service 
missing" .Values.service }} + severity: warning + context: apiserver + meta: "{{`{{ $labels.instance }}`}}" + dashboard: kubernetes-health + playbook: docs/support/playbook/kubernetes/k8s_apiserver_down + annotations: + description: Kubernetes API is unavailable! + summary: All apiservers are down. Kubernetes API is unavailable! + + - alert: KubernetesApiServerDown + expr: up{job="kubernetes-apiserver"} == 0 + for: 15m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: apiserver + meta: "{{`{{ $labels.instance }}`}}" + dashboard: nodes?var-server={{`{{$labels.instance}}`}} + playbook: docs/support/playbook/kubernetes/k8s_apiserver_down + annotations: + description: ApiServer on {{`{{ $labels.instance }}`}} is DOWN. + summary: An ApiServer is DOWN + + - alert: KubernetesApiServerScrapeMissing + expr: up{job=~".*apiserver.*"} == 0 or absent(up{job=~".*apiserver.*"}) + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: info + context: apiserver + dashboard: kubernetes-health + annotations: + description: ApiServer cannot be scraped + summary: ApiServers failed to be scraped + + - alert: KubernetesApiServerLatency + expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|LIST",subresource!="log"}[5m])) by (resource, le)) / 1e6 > 2 + for: 30m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + support_group: containers + severity: info + context: apiserver + dashboard: kubernetes-apiserver + annotations: + description: ApiServerLatency for {{`{{ $labels.resource }}`}} is higher then usual for the past 15 minutes. Inspect apiserver logs for the root cause. + summary: ApiServerLatency is unusally high + + - alert: KubeAggregatedAPIDown + # We have to filter by job here because somehow the kubelet is also exporting this metric ?! and in admin/virtual/kubernikus we also scape apiservers in the + # kubernikus namespace + expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice{job="kubernetes-apiserver"}[10m]))) * 100 < 85 + for: 5m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: apiserver + annotations: + description: "Kubernetes aggregated API {{`{{ $labels.namespace }}`}}/{{`{{ $labels.name }}`}} has been only {{`{{ $value | humanize }}`}}% available over the last 10m. Run `kubectl get apiservice | grep -v Local` and confirm the services of aggregated APIs have active endpoints." + summary: Kubernetes aggregated API is down. 
diff --git a/prometheus-kubernetes-rules/alerts/certificate.alerts.tpl b/prometheus-kubernetes-rules/alerts/certificate.alerts.tpl new file mode 100644 index 0000000..26b8ac2 --- /dev/null +++ b/prometheus-kubernetes-rules/alerts/certificate.alerts.tpl @@ -0,0 +1,23 @@ +groups: +- name: certificate.alerts + rules: + # Per default cert-manager renews at 33% remaining lifetime + # https://cert-manager.io/docs/usage/certificate/#reissuance-triggered-by-expiry-renewal + - alert: X509CertificateLifetimeUnder20Percent + # basically we are calculating: + # remaining/lifetime < 0.2 + # to be able to get a proper duration from $value we have to reorder to: + # remaining < 0.2 * lifetime + # this means both sides need to join the support group + # also we clamp to 30 days, to get notified months in advance for long-lived certs + expr: (x509_cert_not_after - time()) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)") < clamp_max(0.2 * (x509_cert_not_after - x509_cert_not_before) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)"), 30*60*60*24) + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + context: availability + service: {{ required ".Values.service missing" .Values.service }} + severity: info + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + annotations: + description: The certificate for {{`{{ $labels.subject_CN }}`}} expires in {{`{{ $value | humanizeDuration }}`}}. See secret {{`{{ $labels.secret_namespace }}`}}/{{`{{ $labels.secret_name }}`}}, key {{`{{ $labels.secret_key }}`}}. 
+ summary: Certificate expires diff --git a/prometheus-kubernetes-rules/alerts/health.alerts.tpl b/prometheus-kubernetes-rules/alerts/health.alerts.tpl new file mode 100644 index 0000000..186ace4 --- /dev/null +++ b/prometheus-kubernetes-rules/alerts/health.alerts.tpl @@ -0,0 +1,150 @@ +# vi:syntax=yaml +groups: +- name: kubernetes.alerts + rules: + - alert: KubernetesNodeManyNotReady + expr: count((kube_node_status_condition{condition="Ready",status="true"} unless on (node) (kube_node_labels{label_cloud_sap_maintenance_state="in-maintenance"} or kube_node_labels{label_kubernetes_cloud_sap_role="storage"})) == 0) > 4 + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: critical + context: node + meta: "{{`{{ $value }}`}} nodes NotReady" + dashboard: kubernetes-health + playbook: docs/support/playbook/kubernetes/k8s_node_not_ready + annotations: + summary: Many Nodes are NotReady + description: "{{`{{ $value }}`}} nodes are NotReady for more than an hour" + + - alert: KubernetesNodeNotReady + expr: sum by(node) (kube_node_status_condition{condition="Ready",status="true"} == 0) + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + meta: "{{`{{ $labels.node }}`}} is NotReady" + dashboard: nodes?var-server={{`{{$labels.node}}`}} + playbook: docs/support/playbook/kubernetes/k8s_node_not_ready + inhibited_by: node-maintenance + annotations: + summary: Node status is NotReady + description: Node {{`{{ $labels.node }}`}} is NotReady for more than an hour + + - alert: KubernetesNodeNotReadyFlapping + expr: changes(kube_node_status_condition{condition="Ready",status="true"}[15m]) > 2 + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + meta: "{{`{{ $labels.node }}`}}" + dashboard: "nodes?var-server={{`{{$labels.node}}`}}" + annotations: + summary: Node readiness is flapping + description: Node {{`{{ $labels.node }}`}} is flapping between Ready and NotReady + + - alert: KubernetesKubeStateMetricsScrapeFailed + expr: up{job=~".*kube-state-metrics.*"} == 0 or absent(up{job=~".*kube-state-metrics.*"}) + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + dashboard: kubernetes-health + annotations: + description: Failed to scrape kube-state-metrics. Metrics on the cluster state might be outdated. Check the kube-monitoring/kube-state-metrics deployment. 
+ summary: Kube state metrics scrape failed + + - alert: KubernetesPodRestartingTooMuch + expr: (sum by(pod, namespace, container) (rate(kube_pod_container_status_restarts_total[15m]))) * on (pod) group_left(label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) (max without (uid) (kube_pod_labels)) > 0 + for: 1h + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" "k8s" }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: warning + context: pod + meta: "Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} is restarting constantly" + playbook: docs/support/playbook/kubernetes/k8s_pod_restarting + annotations: + description: Container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} is restarting constantly.{{`{{ if eq $labels.support_group "containers"}}`}} Is `owner-info` set --> Contact respective service owner! If not, try finding him/her and make sure, `owner-info` is set!{{`{{ end }}`}} + summary: Pod is in a restart loop + + - alert: KubernetesPodCannotPullImage + expr: label_replace((sum by(pod_name, namespace) (rate(kube_pod_image_pull_backoff_total[15m]))), "pod", "$1", "pod_name", "(.*)") * on (pod) group_left(label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) (max without (uid) (kube_pod_labels)) > 0 + for: 1h + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" "k8s" }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: warning + context: pod + meta: "Pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} cannot pull all images" + annotations: + description: The pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} cannot pull all images.{{`{{ if eq $labels.support_group "containers"}}`}} Is `owner-info` set --> Contact respective service owner! 
If not, try finding him/her and make sure `owner-info` is set!{{`{{ end }}`}}
+      summary: Pod cannot pull all images
+
+  - alert: KubernetesTooManyOpenFiles
+    expr: 100*process_open_fds{job=~"kubernetes-kubelet|kubernetes-apiserver"} / process_max_fds > 50
+    for: 10m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: system
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: nodes?var-server={{`{{$labels.node}}`}}
+    annotations:
+      description: "{{`{{ $labels.job }}`}} on {{`{{ $labels.node }}`}} is using {{`{{ $value }}`}}% of the available file/socket descriptors"
+      summary: Too many open file descriptors
+
+  - alert: KubernetesDeploymentInsufficientReplicas
+    expr: (sum(kube_deployment_status_replicas) by (namespace,deployment) < sum(kube_deployment_spec_replicas) by (namespace,deployment)) * on (namespace, deployment) group_left(label_ccloud_support_group, label_ccloud_service) (kube_deployment_labels)
+    for: 10m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      service: {{ include "serviceFromLabelsOrDefault" "k8s" }}
+      support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }}
+      severity: warning
+      context: deployment
+      meta: "{{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} has insufficient replicas"
+    annotations:
+      description: Deployment {{`{{ $labels.namespace }}`}}/{{`{{ $labels.deployment }}`}} has only {{`{{ $value }}`}} replicas available, which is fewer than desired
+      summary: Deployment has had fewer replicas than desired for 10 minutes
+
+  - alert: PodNotReady
+    # alert on pods that are not ready but in the Running phase on a Ready node
+    expr: (kube_pod_status_phase_normalized{phase="Running"} * on(pod, node, namespace) kube_pod_status_ready_normalized{condition="false"} * on(node) group_left() sum by(node) (kube_node_status_condition{condition="Ready",status="true"}) == 1) * on(pod) group_left(label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) (max without(uid) (kube_pod_labels))
+    for: 2h
+    labels:
+      tier: {{ include "alertTierLabelOrDefault" .Values.tier }}
+      service: {{ include "serviceFromLabelsOrDefault" "k8s" }}
+      support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }}
+      severity: info
+    annotations:
+      description: "The pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} is not ready for more than 2h."
+      summary: "Pod not ready for a long time"
+
+  - alert: PrometheusMultiplePodScrapes
+    expr: sum by(pod, namespace, label_alert_service, label_alert_tier, label_ccloud_service, label_ccloud_support_group) (label_replace((up * on(instance) group_left() (sum by(instance) (up{job=~".*pod-sd"}) > 1)* on(pod) group_left(label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) (max without(uid) (kube_pod_labels))) , "pod", "$1", "kubernetes_pod_name", "(.*)-[0-9a-f]{8,10}-[a-z0-9]{5}"))
+    for: 30m
+    labels:
+      tier: {{ include "alertTierLabelOrDefault" .Values.tier }}
+      service: {{ include "serviceFromLabelsOrDefault" "k8s" }}
+      support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }}
+      severity: warning
+      playbook: docs/support/playbook/kubernetes/target_scraped_multiple_times
+      meta: 'Prometheus is scraping {{`{{ $labels.pod }}`}} pods more than once.'
+    annotations:
+      description: Prometheus is scraping `{{`{{ $labels.pod }}`}}` pods in namespace `{{`{{ $labels.namespace }}`}}` multiple times. This is likely caused by incorrectly placed scrape annotations.
+      summary: Prometheus scrapes pods multiple times
diff --git a/prometheus-kubernetes-rules/alerts/kubelet.alerts.tpl b/prometheus-kubernetes-rules/alerts/kubelet.alerts.tpl
new file mode 100644
index 0000000..512c5ca
--- /dev/null
+++ b/prometheus-kubernetes-rules/alerts/kubelet.alerts.tpl
@@ -0,0 +1,111 @@
+# vi:syntax=yaml
+groups:
+- name: kubelet.alerts
+  rules:
+  - alert: ManyKubeletDown
+    expr: count(count(up{job="kubernetes-kubelet"} unless on (node) (kube_node_labels{label_cloud_sap_maintenance_state="in-maintenance"} or kube_node_labels{label_kubernetes_cloud_sap_role="storage"})) - sum(up{job="kubernetes-kubelet"} unless on (node) (kube_node_labels{label_cloud_sap_maintenance_state="in-maintenance"} or kube_node_labels{label_kubernetes_cloud_sap_role="storage"}))) > 4
+    for: 10m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: critical
+      context: kubelet
+      dashboard: kubernetes-health
+      playbook: docs/support/playbook/kubernetes/k8s_node_not_ready
+    annotations:
+      description: Many Kubelets are DOWN
+      summary: More than 4 Kubelets are DOWN
+
+  - alert: KubeletDown
+    expr: up{job="kubernetes-kubelet"} == 0
+    for: 10m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: kubernetes-health
+      playbook: docs/support/playbook/kubernetes/k8s_node_not_ready
+      inhibited_by: node-maintenance
+    annotations:
+      description: Kubelet on {{`{{ $labels.node }}`}} is DOWN.
+      summary: A Kubelet is DOWN
+
+  - alert: KubeletTooManyPods
+    expr: kubelet_running_pod_count > 225
+    for: 1h
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: nodes?var-server={{`{{$labels.node}}`}}
+    annotations:
+      description: Kubelet is close to pod limit
+      summary: Kubelet {{`{{ $labels.node }}`}} is running {{`{{ $value }}`}} pods, close to the limit of 250
+
+  - alert: KubeletFull
+    expr: kubelet_running_pod_count >= 250
+    for: 1h
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: nodes?var-server={{`{{ $labels.node }}`}}
+    annotations:
+      description: Kubelet is full
+      summary: Kubelet {{`{{$labels.node}}`}} is running {{`{{ $value }}`}} pods. That's too much!
+
+  - alert: KubeletHighNumberOfGoRoutines
+    expr: go_goroutines{job="kubernetes-kubelet"} > {{ default "5000" .Values.kubelet.goroutinesHighCount }}
+    for: 5m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "{{`{{ $labels.node }}`}}"
+    annotations:
+      description: Kubelet on {{`{{ $labels.node }}`}} might be unresponsive due to a high number of Go routines
+      summary: High number of Go routines
+
+  - alert: KubeletPredictHighNumberOfGoRoutines
+    expr: abs(predict_linear(go_goroutines{job="kubernetes-kubelet"}[1h], 2*3600)) > {{ default "10000" .Values.kubelet.goroutinesPredictHighCount }}
+    for: 5m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "{{`{{ $labels.node }}`}}"
+    annotations:
+      description: Kubelet on {{`{{$labels.node}}`}} might become unresponsive due to a high number of Go routines within 2 hours. Take a look at the node and wait to see if it stabilizes.
+      summary: Predicting high number of Go routines
+
+  - alert: KubeletManyRequestErrors
+    expr: |
+      (sum(rate(rest_client_requests_total{code=~"5.*", component="kubelet"}[5m])) by (node)
+      /
+      sum(rate(rest_client_requests_total{component="kubelet"}[5m])) by (node))
+      * 100 > 1
+    for: 10m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: kubelet
+      meta: "Many 5xx responses for Kubelet on {{`{{ $labels.node }}`}}"
+    annotations:
+      description: "{{`{{ printf \"%0.0f\" $value }}`}}% of requests from the kubelet on {{`{{ $labels.node }}`}} are failing with 5xx responses"
+      summary: Many 5xx responses for Kubelet
diff --git a/prometheus-kubernetes-rules/alerts/maintenance.alerts.tpl b/prometheus-kubernetes-rules/alerts/maintenance.alerts.tpl
new file mode 100644
index 0000000..cdb9f90
--- /dev/null
+++ b/prometheus-kubernetes-rules/alerts/maintenance.alerts.tpl
@@ -0,0 +1,54 @@
+### Maintenance inhibition alerts ###
+
+groups:
+- name: maintenance.alerts
+  rules:
+  - alert: NodeInMaintenance
+    expr: max by (node) (kube_node_labels{label_cloud_sap_maintenance_state="in-maintenance"}) == 1
+    for: 2m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: none
+      context: node
+      meta: "Node {{`{{ $labels.node }}`}} is in maintenance."
+    annotations:
+      summary: Node in maintenance
+      description: "Node {{`{{ $labels.node }}`}} is in scheduled maintenance.
Add the label `inhibited_by: node-maintenance` to alerts that should be inhibited while a node is in maintenance" + +### Maintenance stuck alerts ### + + - alert: NodeStuckInMaintenance +{{- if eq .Values.global.clusterType "metal" }} + expr: kube_node_labels{label_cloud_sap_esx_in_maintenance="false",label_cloud_sap_maintenance_state="in-maintenance"} * on (node) group_left() (kube_node_status_condition{condition="Ready",status="true"} == 0) +{{- else }} + expr: kube_node_labels{label_cloud_sap_maintenance_state="in-maintenance"} * on (node) group_left() (kube_node_status_condition{condition="Ready",status="true"} == 0) +{{- end }} + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: maintenance-controller + meta: "Node {{`{{ $labels.node }}`}} is stuck in maintenance for 1 hour." + annotations: + summary: Node stuck in maintenance + description: "Node {{`{{ $labels.node }}`}} is stuck on reboot after OS upgrade or hardware maintenance. Check node console." + +### Flatcar version disparity ### + - alert: FlatcarVersionDisparity + expr: count by (cluster) (count by (label_flatcar_linux_update_v1_flatcar_linux_net_version,cluster) (kube_node_labels)) > 2 + for: 1h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: maintenance-controller + meta: "Cluster {{`{{ $labels.cluster }}`}} has a disparity in flatcar versions." + playbook: docs/support/playbook/kubernetes/flatcar_version_disparity + annotations: + summary: More than 2 flatcar versions + description: "Cluster {{`{{ $labels.cluster }}`}} has a disparity in flatcar versions. This indicates some issue with the maintenance-controller." 
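The `severity: none` NodeInMaintenance alert above exists purely as an inhibition source: alerts that carry `inhibited_by: node-maintenance` (for example KubeletDown and KubernetesNodeNotReady) are meant to be muted while the node is in maintenance. The Alertmanager configuration itself is not part of this chart; the following is only a rough sketch of the kind of inhibit rule this labelling convention assumes, with the matcher layout and the `node` equality being assumptions rather than something taken from this repository:

```yaml
# Hypothetical Alertmanager inhibit rule -- not included in this chart.
inhibit_rules:
  - source_matchers:
      - alertname = "NodeInMaintenance"
    target_matchers:
      - inhibited_by = "node-maintenance"
    # Only inhibit alerts for the node that is actually in maintenance.
    equal: ["node"]
```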
diff --git a/prometheus-kubernetes-rules/alerts/node.alerts.tpl b/prometheus-kubernetes-rules/alerts/node.alerts.tpl new file mode 100644 index 0000000..709f2e3 --- /dev/null +++ b/prometheus-kubernetes-rules/alerts/node.alerts.tpl @@ -0,0 +1,165 @@ +# vi:syntax=yaml +### General node health ### + +groups: +- name: node.alerts + rules: + - alert: NodeHostHighCPUUsage + expr: 100 - (avg by (node) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90 + for: 6h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + meta: "High CPU usage on {{`{{ $labels.node }}`}}" + dashboard: kubernetes-node?var-server={{`{{$labels.node}}`}} + playbook: docs/support/playbook/kubernetes/k8s_node_host_high_cpu_usage + annotations: + summary: High CPU load on node + description: "Node {{`{{ $labels.node }}`}} has more than {{`{{ humanize $value }}`}}% CPU load for 6h" + + - alert: NodeKernelDeadlock + expr: kube_node_status_condition_normalized{condition="KernelDeadlock", status="true"} == 1 + for: 96h + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: info + context: availability + meta: "Kernel deadlock on {{`{{ $labels.node }}`}}" + playbook: docs/support/playbook/k8s_node_safe_rebooting + annotations: + description: Node kernel has deadlock + summary: Permanent kernel deadlock on {{`{{ $labels.node }}`}}. Please drain and reboot node + + - alert: NodeDiskPressure + expr: kube_node_status_condition_normalized{condition="DiskPressure",status="true"} == 1 + for: 5m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + meta: "Disk pressure on {{`{{ $labels.node }}`}}" + annotations: + description: Insufficient disk space + summary: Node {{`{{ $labels.node }}`}} under pressure due to insufficient available disk space + + - alert: NodeMemoryPressure + expr: kube_node_status_condition_normalized{condition="MemoryPressure",status="true"} == 1 + for: 5m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: node + meta: "Memory pressure on {{`{{ $labels.node }}`}}" + annotations: + description: Insufficient memory + summary: Node {{`{{ $labels.node }}`}} under pressure due to insufficient available memory + + - alert: NodeDiskUsagePercentage + expr: (100 - 100 * sum(node_filesystem_avail_bytes{device!~"/dev/mapper/usr|tmpfs|by-uuid",fstype=~"xfs|ext|ext4"} / node_filesystem_size_bytes{device!~"/dev/mapper/usr|tmpfs|by-uuid",fstype=~"xfs|ext|ext4"}) BY (node,device)) > 85 + for: 5m + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: info + context: node + meta: "Node disk usage above 85% on {{`{{ $labels.node }}`}} device {{`{{ $labels.device }}`}}" + 
annotations:
+      description: "Node disk usage above 85%"
+      summary: "Disk usage on target {{`{{ $labels.node }}`}} at {{`{{ $value }}`}}%"
+
+  ### Network health ###
+
+  - alert: NodeHighNumberOfOpenConnections
+    expr: node_netstat_Tcp_CurrEstab > 20000
+    for: 15m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: availability
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: "nodes?var-server={{`{{ $labels.node }}`}}"
+    annotations:
+      description: High number of open TCP connections
+      summary: The node {{`{{ $labels.node }}`}} has more than 20000 active TCP connections. The maximum possible number is 32768 connections
+
+  - alert: NodeHighRiseOfOpenConnections
+    expr: predict_linear(node_netstat_Tcp_CurrEstab[20m], 3600) > 32768
+    for: 15m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: availability
+      meta: "{{`{{ $labels.node }}`}}"
+      dashboard: "nodes?var-server={{`{{ $labels.node }}`}}"
+      playbook: "docs/support/playbook/kubernetes/k8s_high_tcp_connections"
+    annotations:
+      description: High number of open TCP connections
+      summary: The node {{`{{ $labels.node }}`}} will likely reach 32768 active TCP connections within the next hour. If that happens, it cannot accept any new connections
+
+  - alert: NodeContainerOOMKilled
+    expr: sum by (node) (changes(node_vmstat_oom_kill[24h])) > 3
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: info
+      context: memory
+    annotations:
+      description: More than 3 OOM killed pods on a node within 24h
+      summary: More than 3 OOM killed pods on node {{`{{ $labels.node }}`}} within 24h
+
+  - alert: NodeHighNumberOfThreads
+    expr: node_processes_threads > 31000
+    for: 1h
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: critical
+      context: threads
+      meta: "Very high number of threads on {{`{{ $labels.node }}`}}. Forking problems are imminent."
+      playbook: "docs/support/playbook/kubernetes/k8s_high_threads"
+    annotations:
+      description: "Very high number of threads on {{`{{ $labels.node }}`}}. Forking problems are imminent."
+      summary: Very high number of threads
+
+  - alert: NodeReadOnlyRootFilesystem
+    expr: sum by (node) (node_filesystem_readonly{mountpoint="/"}) > 0
+    for: 15m
+    labels:
+      tier: {{ required ".Values.tier missing" .Values.tier }}
+      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
+      service: {{ required ".Values.service missing" .Values.service }}
+      severity: warning
+      context: availability
+      meta: "Node {{`{{ $labels.node }}`}} has a read-only root filesystem."
+    annotations:
+      description: Node {{`{{ $labels.node }}`}} has a read-only root filesystem. This could lead to unforeseeable problems. A reboot of the node is advised to fix the issue.
+ summary: Read-only root filesystem on node + + - alert: NodeRebootsTooFast + expr: max by (node) (changes(node_boot_time_seconds[1h])) > 2 + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }} + service: {{ required ".Values.service missing" .Values.service }} + severity: warning + context: availability + meta: "The node {{`{{ $labels.node }}`}} rebooted at least 3 times in the last hour" + annotations: + description: "The node {{`{{ $labels.node }}`}} rebooted {{`{{ $value }}`}} times in the past hour. It could be stuck in a reboot/panic loop." + summary: Node rebooted multiple times diff --git a/prometheus-kubernetes-rules/alerts/pod.alerts.tpl.disabled b/prometheus-kubernetes-rules/alerts/pod.alerts.tpl.disabled new file mode 100644 index 0000000..8f6432e --- /dev/null +++ b/prometheus-kubernetes-rules/alerts/pod.alerts.tpl.disabled @@ -0,0 +1,114 @@ +# vi:syntax=yaml + +### Pod resource usage ### +groups: +- name: pod.alerts + rules: + - alert: ContainerLowMemoryUsage + expr: | + sum by (pod, namespace, container, label_alert_service, label_alert_tier, label_ccloud_service, label_ccloud_support_group) ( + ( + floor( + sum by (namespace, pod, container) (container_memory_working_set_bytes{pod!=""}) + / on (namespace, pod, container) + sum by (namespace, pod, container) (kube_pod_container_resource_requests_memory_bytes > 0) + * + 100 + ) + < + 10 + ) + * on (pod) group_left (label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) + (max without (uid) (kube_pod_labels)) + ) + for: 1d + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" "k8s" }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: info + context: container + meta: "Low RAM usage on {{`{{ $labels.container }}`}}" + playbook: docs/support/playbook/kubernetes/k8s_container_pod_resources/#low-ram-usage + annotations: + summary: Low RAM usage on container + description: "Memory usage for the container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} is under `10%` in the last 24h compared to the requested memory resources. 
Consider reducing `resources.requests.memory`" + - alert: ContainerHighMemoryUsage + expr: | + sum by (pod, namespace, container, label_alert_service, label_alert_tier, label_ccloud_service, label_ccloud_support_group) ( + ( + ceil( + sum by (namespace, pod, container) (container_memory_working_set_bytes{pod!=""}) + / on (namespace, pod, container) + sum by (namespace, pod, container) (kube_pod_container_resource_requests_memory_bytes > 0) + * + 100 + ) + > + 250 + ) + * on (pod) group_left (label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) + (max without (uid) (kube_pod_labels)) + ) + for: 1d + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" "k8s" }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: info + context: container + meta: "High RAM usage on {{`{{ $labels.container }}`}}" + playbook: docs/support/playbook/kubernetes/k8s_container_pod_resources/#high-ram-usage + annotations: + summary: High RAM usage on container + description: "Memory usage for the container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} is over `250%` in the last 24h compared to the requested memory resources. Consider raising `resources.requests.memory`" + - alert: PodWithoutConfiguredMemoryRequests + expr: | + sum by (namespace, pod, container, label_alert_service, label_alert_tier, label_ccloud_service, label_ccloud_support_group) ( + ( + count by (namespace, pod, container) ( + sum by (namespace, pod, container) (kube_pod_container_info{container!=""}) + unless + sum by (namespace, pod, container) (kube_pod_container_resource_requests{resource="ram"}) + ) + ) + * on (pod) group_left (label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) + (max without (uid) (kube_pod_labels)) + ) + for: 1d + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" "k8s" }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: info + context: container + meta: "No RAM requests configured for {{`{{ $labels.container }}`}}" + playbook: docs/support/playbook/kubernetes/k8s_container_pod_resources/#no-ram-requests-configured + annotations: + summary: No RAM requests configured for container + description: "The container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} has no `resources.requests.memory` configured." 
+ - alert: PodWithoutConfiguredCPURequests + expr: | + sum by (namespace, pod, container, label_alert_service, label_alert_tier, label_ccloud_service, label_ccloud_support_group) ( + ( + count by (namespace, pod, container) ( + sum by (namespace, pod, container) (kube_pod_container_info{container!=""}) + unless + sum by (namespace, pod, container) (kube_pod_container_resource_requests{resource="cpu"}) + ) + ) + * on (pod) group_left (label_alert_tier, label_alert_service, label_ccloud_support_group, label_ccloud_service) + (max without (uid) (kube_pod_labels)) + ) + for: 1d + labels: + tier: {{ include "alertTierLabelOrDefault" .Values.tier }} + service: {{ include "serviceFromLabelsOrDefault" .Values.service }} + support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }} + severity: info + context: container + meta: "No CPU requests configured for {{`{{ $labels.container }}`}}" + playbook: docs/support/playbook/kubernetes/k8s_container_pod_resources/#no-cpu-requests-configured + annotations: + summary: No CPU requests configured for container + description: "The container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.namespace }}`}}/{{`{{ $labels.pod }}`}} has no `resources.requests.cpu` configured." diff --git a/prometheus-kubernetes-rules/ci/test-values.yaml b/prometheus-kubernetes-rules/ci/test-values.yaml new file mode 100644 index 0000000..7f36882 --- /dev/null +++ b/prometheus-kubernetes-rules/ci/test-values.yaml @@ -0,0 +1,2 @@ +global: + clusterType: test diff --git a/prometheus-kubernetes-rules/templates/_helpers.tpl b/prometheus-kubernetes-rules/templates/_helpers.tpl new file mode 100644 index 0000000..e902de4 --- /dev/null +++ b/prometheus-kubernetes-rules/templates/_helpers.tpl @@ -0,0 +1,25 @@ +{{/* If the collector is enabled metrics are aggregated and prefixed, so they can be federated easily. */}} +{{- define "prefix" -}} +{{ if .Values.prometheusCollectorName -}}aggregated:{{- end }} +{{- end -}} + +{{- /* +Use the 'label_alert_tier', if it exists on the time series, otherwise use the given default. +Note: The pods define the 'alert-tier' label but Prometheus replaces the hyphen with an underscore. +*/}} +{{- define "alertTierLabelOrDefault" -}} +"{{`{{ if $labels.label_alert_tier }}{{ $labels.label_alert_tier}}{{ else }}`}}{{ required "default value is missing" . }}{{`{{ end }}`}}" +{{- end -}} + +{{- /* +Use the 'label_alert_service', if it exists on the time series, otherwise use the given default. +Note: The pods define the 'alert-service' label but Prometheus replaces the hyphen with an underscore. +*/}} + +{{- define "serviceFromLabelsOrDefault" -}} +"{{`{{ if $labels.label_ccloud_service }}{{ $labels.label_ccloud_service }}{{ else }}{{ if $labels.label_alert_service }}{{ $labels.label_alert_service }}{{ else }}`}}{{ . }}{{`{{ end }}{{ end }}`}}" +{{- end -}} + +{{- define "supportGroupFromLabelsOrDefault" -}} +"{{`{{ if $labels.label_ccloud_support_group }}{{ $labels.label_ccloud_support_group }}{{ else }}`}}{{ required "default support_group is missing" . 
}}{{`{{ end }}`}}" +{{- end -}} diff --git a/prometheus-kubernetes-rules/templates/aggregations/collector/_resource.rules.tpl b/prometheus-kubernetes-rules/templates/aggregations/collector/_resource.rules.tpl new file mode 100644 index 0000000..17627fe --- /dev/null +++ b/prometheus-kubernetes-rules/templates/aggregations/collector/_resource.rules.tpl @@ -0,0 +1,44 @@ +groups: +- name: cpu + rules: + - record: container_cpu_saturation_ratio + expr: | + sum(irate(container_cpu_cfs_throttled_seconds_total[5m])) by (namespace, pod_name, container_name) + / + (sum(irate(container_cpu_usage_seconds_total[5m])) by (namespace, pod_name, container_name) + sum(irate(container_cpu_cfs_throttled_seconds_total[5m])) by (namespace, pod_name, container_name)) + + - record: container_cpu_utilization_ratio + expr: | + ( sum(irate(container_cpu_cfs_throttled_seconds_total[5m])) by (namespace, pod_name, container_name) + sum(irate(container_cpu_usage_seconds_total[5m])) by (namespace, pod_name, container_name) ) + / + sum(label_join(label_join(kube_pod_container_resource_requests{resource="cpu"}, "container_name", "", "container"), "pod_name", "", "pod")) by (namespace, pod_name, container_name) + + - record: container_cpu_usage_seconds_average + expr: | + avg( + rate( + container_cpu_usage_seconds_total{container_name!="",instance!=""}[10m] + ) + ) by (namespace, container_name, pod_name, node, cluster) + +- name: memory + rules: + - record: container_memory_saturation_ratio + expr: | + sum(container_memory_working_set_bytes) by (namespace, pod_name, container_name) + / + sum(label_join(label_join(kube_pod_container_resource_limits{resource="memory"}, "container_name", "", "container"), "pod_name", "", "pod")) by (namespace, pod_name, container_name) + + - record: container_memory_utilization_ratio + expr: | + sum(container_memory_working_set_bytes) by (namespace, pod_name, container_name) + / + sum(label_join(label_join(kube_pod_container_resource_requests{resource="memory"}, "container_name", "", "container"), "pod_name", "", "pod")) by (namespace, pod_name, container_name) + + - record: container_memory_usage_average + expr: | + avg( + count_over_time(container_memory_working_set_bytes{container_name!="",instance!=""}[10m]) + * + avg_over_time(container_memory_working_set_bytes{container_name!="",instance!=""}[10m]) + ) by (namespace, container_name, pod_name, node, cluster) diff --git a/prometheus-kubernetes-rules/templates/alerts.yaml b/prometheus-kubernetes-rules/templates/alerts.yaml new file mode 100644 index 0000000..03a5b4b --- /dev/null +++ b/prometheus-kubernetes-rules/templates/alerts.yaml @@ -0,0 +1,23 @@ +{{- $root := . -}} +{{- $values := .Values -}} +{{- range $path, $bytes := .Files.Glob "alerts/*.alerts.tpl" }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule + +metadata: + name: {{ printf "kubernetes-%s" $path | replace "/" "-" | trimSuffix ".tpl" }} + labels: + tier: {{ required ".Values.tier missing" $values.tier }} + type: alerting-rules + prometheus: {{ required ".Values.prometheusName missing" $values.prometheusName }} + {{- range $i, $target := $.Values.ruleSelectors }} + {{ $target.name | required (printf "$.Values.ruleSelector.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.ruleSelector.[%v].value missing" $i)) $ }} + {{- end }} + +spec: +{{- with $root -}} +{{- $content := printf "%s" $bytes }} +{{ tpl $content . 
| indent 2 }} +{{- end }} +--- +{{- end }} diff --git a/prometheus-kubernetes-rules/templates/collector-aggregations.yaml b/prometheus-kubernetes-rules/templates/collector-aggregations.yaml new file mode 100644 index 0000000..6cd4745 --- /dev/null +++ b/prometheus-kubernetes-rules/templates/collector-aggregations.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule + +metadata: + name: kubernetes-resource-aggregation-rules + labels: + tier: {{ required ".Values.tier missing" .Values.tier }} + prometheus: {{ if .Values.prometheusCollectorName -}}{{- .Values.prometheusCollectorName -}}{{- else -}}{{- .Values.prometheusName -}}{{- end }} + {{- range $i, $target := .Values.ruleSelectors }} + {{ $target.name | required (printf "$.Values.ruleSelector.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.ruleSelector.[%v].value missing" $i)) $ }} + {{- end }} + +spec: +{{ include (print .Template.BasePath "/aggregations/collector/_resource.rules.tpl") . | indent 2 }} diff --git a/prometheus-kubernetes-rules/templates/frontend-aggregations.yaml b/prometheus-kubernetes-rules/templates/frontend-aggregations.yaml new file mode 100644 index 0000000..641dd38 --- /dev/null +++ b/prometheus-kubernetes-rules/templates/frontend-aggregations.yaml @@ -0,0 +1,19 @@ +{{- $values := .Values }} +{{- range $path, $bytes := .Files.Glob "aggregations/frontend/*.rules" }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule + +metadata: + name: kubernetes-{{ printf "%s" $path | replace "/" "-" }} + labels: + tier: {{ required ".Values.tier missing" $values.tier }} + prometheus: {{ required ".Values.prometheusName missing" $values.prometheusName }} + {{- range $i, $target := $.Values.ruleSelectors }} + {{ $target.name | required (printf "$.Values.ruleSelector.[%v].name missing" $i) }}: {{ tpl ($target.value | required (printf "$.Values.ruleSelector.[%v].value missing" $i)) $ }} + {{- end }} + +spec: +{{ printf "%s" $bytes | indent 2 }} + +{{- end }} diff --git a/prometheus-kubernetes-rules/values.yaml b/prometheus-kubernetes-rules/values.yaml new file mode 100644 index 0000000..14ceac3 --- /dev/null +++ b/prometheus-kubernetes-rules/values.yaml @@ -0,0 +1,32 @@ +global: + # TODO: Remove the clusterType field. + clusterType: kubernetes + +# Name of the Prometheus to which the rules should be assigned to. +prometheusName: kubernetes + +# Additional label selectors for the Prometheus rules. +ruleSelectors: + # - name: plugin + # value: kube-monitoring + +# Optional name of the Prometheus collector instance. +# Only required if the collector -> frontend pattern is used. +# If not given collector aggregation rules are assigned to the frontend instead. +# prometheusCollectorName: + +kubelet: + goroutinesHighCount: + goroutinesPredictHighCount: + +# Alert routing is primarily based on the following labels. +# Tier for Prometheus alert and aggregation rules. +tier: k8s + +# The responsible support group for the alerts +# Alerts might override this if the service label is present on the underlying metric. +supportGroup: containers + +# The service for the Prometheus alert. +# Alerts might override this if the service label is present on the underlying metric. +service: k8s
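To round off the chart's configuration surface: each `ruleSelectors` entry is rendered as an additional `name: value` label on every generated PrometheusRule (see templates/alerts.yaml above), presumably so a Prometheus operator rule selector can pick them up, and the `kubelet` thresholds feed the goroutine alerts, which default to 5000 and 10000 in kubelet.alerts.tpl. A sketch of a values override a deployment might use; the collector name and the concrete numbers are example inputs, not defaults shipped with the chart:

```yaml
# Illustrative values override for prometheus-kubernetes-rules.
prometheusName: kubernetes
prometheusCollectorName: kubernetes-collector  # only when the collector -> frontend federation pattern is used
ruleSelectors:
  - name: plugin
    value: kube-monitoring
kubelet:
  goroutinesHighCount: 6000          # overrides the 5000 default in kubelet.alerts.tpl
  goroutinesPredictHighCount: 12000  # overrides the 10000 default
tier: k8s
supportGroup: containers
service: k8s
```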