Showing 19 changed files with 1,037 additions and 0 deletions.
5 changes: 5 additions & 0 deletions
prometheus-kubernetes-rules/Chart.yaml
apiVersion: v1
appVersion: "1.0"
description: A collection of Prometheus alerting and aggregation rules for Kubernetes.
name: prometheus-kubernetes-rules
version: 1.9.11
14 changes: 14 additions & 0 deletions
prometheus-kubernetes-rules/README.md
Kube rules
----------------

This chart is a collection of Prometheus alerting and aggregation rules for Kubernetes.

## Configuration

The following table provides an overview of the configurable parameters of this chart and their defaults.
See the [values.yaml](./values.yaml) for more details.

| Parameter | Description | Default |
|---|---|---|
| `prometheusName` | Name of the Prometheus instance to which the rules should be assigned. | `""` |
| `prometheusCollectorName` | Optional name of the Prometheus collector instance. Only required if the collector -> frontend federation pattern (metrics being pulled by the frontend) is used. | `""` |
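
For example, a release could be installed with both parameters set; a minimal sketch, assuming the chart is installed from this directory (the `frontend` and `collector` names are placeholder values, not defaults shipped with the chart):

```sh
# install/upgrade the rules chart against a specific Prometheus instance
helm upgrade --install prometheus-kubernetes-rules . \
  --set prometheusName=frontend \
  --set prometheusCollectorName=collector
```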
10 changes: 10 additions & 0 deletions
prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules
groups:
- name: skydns
  rules:
  - record: skydns_skydns_dns_error_count_total_irate
    expr: irate(skydns_skydns_dns_error_count_total[5m])

- name: etcd
  rules:
  - record: instance:fd_utilization
    expr: process_open_fds{component="etcd"} / process_max_fds{component="etcd"}
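
The plain rule files in this chart (those without Helm templating, like the one above) can be sanity-checked locally; a sketch using Prometheus' bundled `promtool` (templated alert files would need to be rendered with `helm template` first):

```sh
# validate rule-file syntax and the record/expr fields
promtool check rules prometheus-kubernetes-rules/aggregations/frontend/aggregation.rules
```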
107 changes: 107 additions & 0 deletions
prometheus-kubernetes-rules/aggregations/frontend/cluster.rules
groups:
- name: cluster.rules
  rules:
  - record: cluster_namespace_controller_pod_container:spec_memory_limit_bytes
    expr: sum(label_replace(container_spec_memory_limit_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:spec_cpu_shares
    expr: sum(label_replace(container_spec_cpu_shares{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:cpu_usage:rate
    expr: sum(label_replace(irate(container_cpu_usage_seconds_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:memory_usage:bytes
    expr: sum(label_replace(container_memory_usage_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:memory_working_set:bytes
    expr: sum(label_replace(container_memory_working_set_bytes{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:memory_rss:bytes
    expr: sum(label_replace(container_memory_rss{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:memory_cache:bytes
    expr: sum(label_replace(container_memory_cache{container_name!=""}, "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name)

  - record: cluster_namespace_controller_pod_container:memory_pagefaults:rate
    expr: sum(label_replace(irate(container_memory_failures_total{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name, scope, type)

  - record: cluster_namespace_controller_pod_container:memory_oom:rate
    expr: sum(label_replace(irate(container_memory_failcnt{container_name!=""}[5m]), "controller", "$1", "pod_name", "^(.*)-[a-z0-9]+"))
      BY (namespace, controller, pod_name, container_name, scope, type)

  - record: cluster:memory_allocation:percent
    expr: 100 * sum(container_spec_memory_limit_bytes{pod_name!=""}) / sum(machine_memory_bytes)

  - record: cluster:memory_used:percent
    expr: 100 * sum(container_memory_usage_bytes{pod_name!=""}) / sum(machine_memory_bytes)

  - record: cluster:cpu_allocation:percent
    expr: 100 * sum(container_spec_cpu_shares{pod_name!=""}) / sum(container_spec_cpu_shares{id="/"} * ON(instance) group_left machine_cpu_cores)

  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.99"

  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.9"

  - record: cluster_resource_verb:apiserver_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(apiserver_request_latencies_bucket) BY (le, job, resource, verb)) / 1e+06
    labels:
      quantile: "0.5"

  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.99"

  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le, region)) / 1e+06
    labels:
      quantile: "0.9"

  - record: cluster:scheduler_e2e_scheduling_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.5"

  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.99"

  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.9"

  - record: cluster:scheduler_scheduling_algorithm_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.5"

  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.99"

  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.9"

  - record: cluster:scheduler_binding_latency:quantile_seconds
    expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) BY (le)) / 1e+06
    labels:
      quantile: "0.5"
46 changes: 46 additions & 0 deletions
prometheus-kubernetes-rules/aggregations/frontend/utilization.rules
# vim: set ft=yaml:
groups:
- name: utilization
  rules:
  # These rules are intermediate results to keep the final aggregation rules more readable.
  # In both cases, we take the maximum of momentary resource usage and resource request.
  #
  # Unfortunately, there is no straightforward way to take a maximum of two separate sets of
  # timeseries. The builtin max() function can only be used to reduce cardinality on a single
  # set of timeseries. As a workaround, we use the pattern `(X >= Y) or Y` to mean `max(X, Y)`.
  #
  # Note that `(X >= Y) or Y` only works if Y is fully populated. Therefore we need to put the
  # resource requests in X (since those metrics may be missing for some containers) and the
  # cadvisor metrics in Y (since those metrics are always present for all containers).
  #
  # Ref: <https://stackoverflow.com/a/7335140>
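  #
  # A worked toy example of the pattern (values invented for illustration): with
  # X = {a: 5} (request only set for container a) and Y = {a: 3, b: 2},
  # `X >= Y` keeps {a: 5}, and `or Y` fills in the missing {b: 2},
  # so the result {a: 5, b: 2} is the elementwise maximum of both sets.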

  - record: container_memory_effective_utilization_bytes
    expr: |
      max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="memory"})
        >= max by (namespace, pod, container) (container_memory_working_set_bytes)
      or max by (namespace, pod, container) (container_memory_working_set_bytes)
  - record: container_cpu_effective_utilization_average
    expr: |
      max by (namespace, pod, container) (kube_pod_container_resource_requests{resource="cpu"})
        >= max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m]))
      or max by (namespace, pod, container) (rate(container_cpu_usage_seconds_total[5m]))
- name: utilization-final
  rules:
  # These rules summarize various types of resource utilization metrics into one timeseries per
  # owner (as defined by the ccloud/support-group and ccloud/service labels on the respective
  # Kubernetes objects).

  - record: by_owner:container_memory_effective_utilization_bytes
    expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_memory_effective_utilization_bytes * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))'

  - record: by_owner:container_cpu_effective_utilization_average
    expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (container_cpu_effective_utilization_average * on (namespace, pod) group_left (label_ccloud_support_group, label_ccloud_service) (kube_pod_labels))'

  - record: by_owner:persistentvolume_used_bytes
    expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_used_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)'

  - record: by_owner:persistentvolume_capacity_bytes
    expr: 'sum by (label_ccloud_support_group, label_ccloud_service) (kubelet_volume_stats_capacity_bytes * on (namespace, persistentvolumeclaim) group_left (label_ccloud_support_group, label_ccloud_service) kube_persistentvolumeclaim_labels)'
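
Downstream dashboards can then consume one timeseries per owner; for instance, a volume fill-level query might look like this (a sketch, assuming the support-group and service labels above are populated):

```promql
# percentage of provisioned PV capacity in use, per support group and service
100 * by_owner:persistentvolume_used_bytes / by_owner:persistentvolume_capacity_bytes
```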
79 changes: 79 additions & 0 deletions
# vi:syntax=yaml
groups:
- name: apiserver.alerts
  rules:
  - alert: KubernetesApiServerAllDown
    expr: count(up{job="kubernetes-apiserver"} == 0) == count(up{job="kubernetes-apiserver"})
    for: 5m
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
      service: {{ required ".Values.service missing" .Values.service }}
      severity: warning
      context: apiserver
      meta: "{{`{{ $labels.instance }}`}}"
      dashboard: kubernetes-health
      playbook: docs/support/playbook/kubernetes/k8s_apiserver_down
    annotations:
      description: Kubernetes API is unavailable!
      summary: All apiservers are down. Kubernetes API is unavailable!

  - alert: KubernetesApiServerDown
    expr: up{job="kubernetes-apiserver"} == 0
    for: 15m
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
      service: {{ required ".Values.service missing" .Values.service }}
      severity: warning
      context: apiserver
      meta: "{{`{{ $labels.instance }}`}}"
      dashboard: nodes?var-server={{`{{$labels.instance}}`}}
      playbook: docs/support/playbook/kubernetes/k8s_apiserver_down
    annotations:
      description: ApiServer on {{`{{ $labels.instance }}`}} is DOWN.
      summary: An ApiServer is DOWN

  - alert: KubernetesApiServerScrapeMissing
    expr: up{job=~".*apiserver.*"} == 0 or absent(up{job=~".*apiserver.*"})
    for: 1h
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
      service: {{ required ".Values.service missing" .Values.service }}
      severity: info
      context: apiserver
      dashboard: kubernetes-health
    annotations:
      description: ApiServer cannot be scraped
      summary: ApiServers failed to be scraped

  - alert: KubernetesApiServerLatency
    expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|LIST",subresource!="log"}[5m])) by (resource, le)) / 1e6 > 2
    for: 30m
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
      service: {{ required ".Values.service missing" .Values.service }}
      severity: info
      context: apiserver
      dashboard: kubernetes-apiserver
    annotations:
      description: ApiServerLatency for {{`{{ $labels.resource }}`}} is higher than usual for the past 30 minutes. Inspect apiserver logs for the root cause.
      summary: ApiServerLatency is unusually high

  - alert: KubeAggregatedAPIDown
    # We have to filter by job here because somehow the kubelet is also exporting this metric,
    # and in admin/virtual/kubernikus we also scrape apiservers in the kubernikus namespace.
    expr: (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice{job="kubernetes-apiserver"}[10m]))) * 100 < 85
    for: 5m
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      support_group: {{ required ".Values.supportGroup missing" .Values.supportGroup }}
      service: {{ required ".Values.service missing" .Values.service }}
      severity: warning
      context: apiserver
    annotations:
      description: "Kubernetes aggregated API {{`{{ $labels.namespace }}`}}/{{`{{ $labels.name }}`}} has been only {{`{{ $value | humanize }}`}}% available over the last 10m. Run `kubectl get apiservice | grep -v Local` and confirm the services of aggregated APIs have active endpoints."
      summary: Kubernetes aggregated API is down.
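
When `KubeAggregatedAPIDown` fires, the check suggested in the annotation could look like this (the `v1beta1.metrics.k8s.io` APIService and `metrics-server` names are illustrative examples, not part of this chart):

```sh
# list aggregated (non-local) APIServices and their availability
kubectl get apiservice | grep -v Local
# for an unavailable one, inspect the APIService and the endpoints behind its service
kubectl describe apiservice v1beta1.metrics.k8s.io
kubectl -n kube-system get endpoints metrics-server
```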
23 changes: 23 additions & 0 deletions
groups:
- name: certificate.alerts
  rules:
  # By default, cert-manager renews at 33% remaining lifetime:
  # https://cert-manager.io/docs/usage/certificate/#reissuance-triggered-by-expiry-renewal
  - alert: X509CertificateLifetimeUnder20Percent
    # Basically we are calculating:
    #   remaining/lifetime < 0.2
    # To be able to get a proper duration from $value, we have to reorder this to:
    #   remaining < 0.2 * lifetime
    # This means both sides of the comparison need the support-group label join.
    # We also clamp the threshold to 30 days, so that long-lived certs do not alert months in advance.
    expr: (x509_cert_not_after - time()) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)") < clamp_max(0.2 * (x509_cert_not_after - x509_cert_not_before) * on(secret_name, secret_namespace) group_left(label_ccloud_support_group) label_replace(label_replace(kube_secret_labels, "secret_name", "$1", "secret", "(.*)"), "secret_namespace", "$1", "namespace", "(.*)"), 30*60*60*24)
    for: 1h
    labels:
      tier: {{ required ".Values.tier missing" .Values.tier }}
      context: availability
      service: {{ required ".Values.service missing" .Values.service }}
      severity: info
      support_group: {{ include "supportGroupFromLabelsOrDefault" .Values.supportGroup }}
    annotations:
      description: The certificate for {{`{{ $labels.subject_CN }}`}} expires in {{`{{ $value | humanizeDuration }}`}}. See secret {{`{{ $labels.secret_namespace }}`}}/{{`{{ $labels.secret_name }}`}}, key {{`{{ $labels.secret_key }}`}}.
      summary: Certificate expires
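
To make the clamped threshold concrete, here is the arithmetic for two hypothetical certificates (durations invented for illustration):

```promql
# 90-day cert:  0.2 * 90d  = 18d threshold; cert-manager's 33% renewal (~30d) should fire
#               first, so this alert only appears if renewal is stuck.
# 2-year cert:  0.2 * 730d = 146d, clamped by clamp_max(..., 30*60*60*24) to a 30d threshold.
```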