add prometheus-kubernetes-rules
kengou committed Aug 7, 2024
1 parent f720791 commit 6b3af19
Showing 19 changed files with 1,037 additions and 0 deletions.
5 changes: 5 additions & 0 deletions prometheus-kubernetes-rules/Chart.yaml
@@ -0,0 +1,5 @@
apiVersion: v1
appVersion: "1.0"
description: A collection of Prometheus alerting and aggregation rules for Kubernetes.
name: prometheus-kubernetes-rules
version: 1.9.11
14 changes: 14 additions & 0 deletions prometheus-kubernetes-rules/README.md
@@ -0,0 +1,14 @@
Kube rules
----------------

This chart is a collection of Prometheus alerting and aggregation rules for Kubernetes.

## Configuration

The following table provides an overview of configurable parameters of this chart and their defaults.
See the [values.yaml](./values.yaml) for more details.

| Parameter | Description | Default |
|----------------------------------------|-------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------|
| `prometheusName`                        | Name of the Prometheus to which the rules should be assigned.                                                             | `""`                                                 |
| `prometheusCollectorName`               | Optional name of the Prometheus collector instance. Only required if the collector -> frontend federation pattern (metrics being pulled by the frontend) is used. | `""`                                                 |
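
For reference, a minimal sketch of a `values.yaml` that sets these parameters (all values below are placeholders, not chart defaults):

```yaml
# Example only: "infra" and "infra-collector" are placeholder names.
prometheusName: infra                      # Prometheus instance the rules are assigned to
prometheusCollectorName: infra-collector   # only needed with the collector -> frontend federation pattern

# The alert templates additionally require tier, supportGroup and service to be set
# (each is referenced via `required` in the alert .tpl files); placeholder values shown.
tier: k8s
supportGroup: containers
service: kubernetes
```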
@@ -0,0 +1,10 @@
groups:
- name: skydns
rules:
- record: skydns_skydns_dns_error_count_total_irate
expr: irate(skydns_skydns_dns_error_count_total[5m])

- name: etcd
rules:
- record: instance:fd_utilization
expr: process_open_fds{component="etcd"} / process_max_fds{component="etcd"}
107 changes: 107 additions & 0 deletions prometheus-kubernetes-rules/aggregations/frontend/cluster.rules
@@ -0,0 +1,107 @@
groups:
- name: cluster.rules
rules:
- record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""},"controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:spec_cpu_shares
expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:cpu_usage:rate
expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:memory_usage:bytes
expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:memory_working_set:bytes
expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:memory_rss:bytes
expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:memory_cache:bytes
expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name)

- record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name, scope, type)

- record: cluster_namespace_controller_pod_container:memory_oom:rate
expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
BY (namespace, controller, pod_name, container_name, scope, type)

- record: cluster:memory_allocation:percent
expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) / sum(machine_memory_bytes)

- record: cluster:memory_used:percent
expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) / sum(machine_memory_bytes)

- record: cluster:cpu_allocation:percent
expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) / sum(container_spec_cpu_shares{id="/"} * ON(instance) group_left machine_cpu_cores)

- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
labels:
quantile: "0.99"
- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
labels:
quantile: "0.9"

- record: cluster_resource_verb:apiserver_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
labels:
quantile: "0.5"

- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.99"

- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.9"

- record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.5"

- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.99"

- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.9"

- record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.5"

- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.99"

- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.9"

- record: cluster:scheduler_binding_latency:quantile_seconds
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
labels:
quantile: "0.5"
@@ -0,0 +1,46 @@
# vim: set ft=yaml:
groups:
- name: utilization
rules:
# These rules are intermediate results to keep the final aggregation rules more readable.
# In both cases, we take the maximum of momentary resource usage and resource request.
#
# Unfortunately, there is no straight-forward way to take a maximum of two separate sets of
# timeseries. The builtin max() function can only be used to reduce cardinality on a single
# set of timeseries. As a workaround, we use the pattern `(X >= Y) or Y` to mean `max(X, Y)`.
#
# Note that `(X >= Y) or Y` only works if Y is fully populated. Therefore we need to put the
# resource requests in X (since those metrics may be missing for some containers) and the
# cadvisor metrics in Y (since those metrics are always present for all containers).
#
# Ref: <https://stackoverflow.com/a/7335140>
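#
# For example, a container requesting 1 GiB of memory but using only 200 MiB is reported at
# 1 GiB, while a container using 1.5 GiB against a 1 GiB request (or with no request at all)
# is reported at its actual working set.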

- record: container_memory_effective_utilization_bytes
expr: |
max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="memory"})
>= max by (namespace, pod, container) (container_memory_working_set_bytes)
or max by (namespace, pod, container) (container_memory_working_set_bytes)
- record: container_cpu_effective_utilization_average
expr: |
max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="cpu"})
>= max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m]))
or max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m]))
- name: utilization-final
rules:
# These rules summarize various types of resource utilization metrics into one timeseries per
# owner (as defined by the ccloud/support-group and ccloud/service labels on the respective
# Kubernetes objects).
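#
# For example, a pod labeled ccloud/support-group=containers and ccloud/service=k8s
# (hypothetical values) appears in kube_pod_labels as label_ccloud_support_group="containers"
# and label_ccloud_service="k8s"; the group_left joins below copy these labels onto the
# utilization series before summing per owner.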

- record: by_owner:container_memory_effective_utilization_bytes
expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_memory_effective_utilization_bytes * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))'

- record: by_owner:container_cpu_effective_utilization_average
expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_cpu_effective_utilization_average * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))'

- record: by_owner:persistentvolume_used_bytes
expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_used_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)'

- record: by_owner:persistentvolume_capacity_bytes
expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_capacity_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)'
79 changes: 79 additions & 0 deletions prometheus-kubernetes-rules/alerts/apiserver.alerts.tpl
@@ -0,0 +1,79 @@
# vi:syntax=yaml
groups:
- name: apiserver.alerts
rules:
- alert: KubernetesApiServerAllDown
expr: count(up{job="kubernetes-apiserver"} == 0) == count(up{job="kubernetes-apiserver"})
for: 5m
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
service: {{ required ".Values.service missing" .Values.service }}
severity: warning
context: apiserver
meta: "{{`{{ $labels.instance }}`}}"
dashboard: kubernetes-health
playbook: docs/support/playbook/kubernetes/k8s_apiserver_down
annotations:
description: Kubernetes API is unavailable!
summary: All apiservers are down. Kubernetes API is unavailable!

- alert: KubernetesApiServerDown
expr: up{job="kubernetes-apiserver"} == 0
for: 15m
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
service: {{ required ".Values.service missing" .Values.service }}
severity: warning
context: apiserver
meta: "{{`{{ $labels.instance }}`}}"
dashboard: nodes?var-server={{`{{$labels.instance}}`}}
playbook: docs/support/playbook/kubernetes/k8s_apiserver_down
annotations:
description: ApiServer on {{`{{ $labels.instance }}`}} is DOWN.
summary: An ApiServer is DOWN

- alert: KubernetesApiServerScrapeMissing
expr: up{job=~".*apiserver.*"} == 0 or absent(up{job=~".*apiserver.*"})
for: 1h
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
service: {{ required ".Values.service missing" .Values.service }}
severity: info
context: apiserver
dashboard: kubernetes-health
annotations:
description: ApiServer cannot be scraped
summary: ApiServers failed to be scraped

- alert: KubernetesApiServerLatency
expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|LIST",subresource!="log"}[5m])) by (resource, le)) / 1e6 > 2
for: 30m
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
service: {{ required ".Values.service missing" .Values.service }}
severity: info
context: apiserver
dashboard: kubernetes-apiserver
annotations:
description: ApiServerLatency for {{`{{ $labels.resource }}`}} is higher than usual for the past 30 minutes. Inspect apiserver logs for the root cause.
summary: ApiServerLatency is unusually high

- alert: KubeAggregatedAPIDown
# We have to filter by job here because the kubelet also exports this metric, and in admin/virtual/kubernikus
# we also scrape apiservers in the kubernikus namespace
expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice{job="kubernetes-apiserver"}[10m]))) * 100 < 85
for: 5m
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
service: {{ required ".Values.service missing" .Values.service }}
severity: warning
context: apiserver
annotations:
description: "Kubernetes aggregated API {{`{{ $labels.namespace }}`}}/{{`{{ $labels.name }}`}} has been only {{`{{ $value | humanize }}`}}% available over the last 10m. Run `kubectl get apiservice | grep -v Local` and confirm the services of aggregated APIs have active endpoints."
summary: Kubernetes aggregated API is down.
23 changes: 23 additions & 0 deletions prometheus-kubernetes-rules/alerts/certificate.alerts.tpl
@@ -0,0 +1,23 @@
groups:
- name: certificate.alerts
rules:
# By default, cert-manager renews certificates at 33% remaining lifetime:
# https://cert-manager.io/docs/usage/certificate/#reissuance-triggered-by-expiry-renewal
- alert: X509CertificateLifetimeUnder20Percent
# basically we are calculating:
# remaining/lifetime < 0.2
# to get a proper duration from $value, we have to reorder this to:
# remaining < 0.2 * lifetime
# which means both sides of the comparison need the support-group label join
# we also clamp the threshold at 30 days, so long-lived certs do not alert months in advance
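# for example, a 90-day certificate alerts once less than 18 days remain, while a one-year
# certificate (0.2 * lifetime ≈ 73 days) is clamped down to the 30-day threshold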
expr: (x509_cert_not_after - time()) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)") < clamp_max(0.2 * (x509_cert_not_after - x509_cert_not_before) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)"), 30*60*60*24)
for: 1h
labels:
tier: {{ required ".Values.tier missing" .Values.tier }}
context: availability
service: {{ required ".Values.service missing" .Values.service }}
severity: info
support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }}
annotations:
description: The certificate for {{`{{ $labels.subject_CN }}`}} expires in {{`{{ $value | humanizeDuration }}`}}. See secret {{`{{ $labels.secret_namespace }}`}}/{{`{{ $labels.secret_name }}`}}, key {{`{{ $labels.secret_key }}`}}.
summary: Certificate expires