Skip to content

Commit

Permalink
Resolve conflicts
Browse files (browse the repository at this point in the history)
  • Loading branch information
diasbro committed Oct 2, 2023
2 parents 52ed118 + 80705ac commit 4ae3cd1
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 40 deletions.
14 changes: 7 additions & 7 deletions alerts/apps_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,11 @@
// label exists for 2 values. This avoids "many-to-many matching
// not allowed" errors when joining with kube_pod_status_phase.
expr: |||
sum by (namespace, pod, %(clusterLabel)s) (
max by(namespace, pod, %(clusterLabel)s) (
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown|Failed"}
) * on(namespace, pod, %(clusterLabel)s) group_left(owner_kind) topk by(namespace, pod, %(clusterLabel)s) (
1, max by(namespace, pod, owner_kind, %(clusterLabel)s) (kube_pod_owner{owner_kind!="Job"})
sum by (%(clusterLabel)s, namespace, pod) (
max by(%(clusterLabel)s, namespace, pod) (
kube_pod_status_phase{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, phase=~"Pending|Unknown"}
) * on(%(clusterLabel)s, namespace, pod) group_left(owner_kind) topk by(%(clusterLabel)s, namespace, pod) (
1, max by(%(clusterLabel)s, namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
||| % $._config,
Expand Down Expand Up @@ -208,7 +208,7 @@
},
{
expr: |||
sum by (namespace, pod, container, %(clusterLabel)s) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
sum by (%(clusterLabel)s, namespace, pod, container) (kube_pod_container_status_waiting_reason{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}) > 0
||| % $._config,
labels: {
severity: 'warning',
Expand Down Expand Up @@ -253,7 +253,7 @@
{
alert: 'KubeJobNotCompleted',
expr: |||
time() - max by(namespace, job_name, %(clusterLabel)s) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
time() - max by(%(clusterLabel)s, namespace, job_name) (kube_job_status_start_time{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s}
and
kube_job_status_active{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s} > 0) > %(kubeJobTimeoutDuration)s
||| % $._config,
Expand Down
4 changes: 2 additions & 2 deletions alerts/resource_alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@
{
alert: 'CPUThrottlingHigh',
expr: |||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", %(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (container, pod, namespace)
sum(increase(container_cpu_cfs_periods_total{%(cpuThrottlingSelector)s}[5m])) by (%(clusterLabel)s, container, pod, namespace)
> ( %(cpuThrottlingPercent)s / 100 )
||| % $._config,
'for': '15m',
Expand Down
4 changes: 2 additions & 2 deletions dashboards/apiserver.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ local singlestat = grafana.singlestat;
format='s',
description='How many seconds is the 99th percentile for reading (LIST|GET) a given resource?',
)
.addTarget(prometheus.target('cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="read", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));

local writeAvailability =
singlestat.new(
Expand Down Expand Up @@ -130,7 +130,7 @@ local singlestat = grafana.singlestat;
format='s',
description='How many seconds is the 99th percentile for writing (POST|PUT|PATCH|DELETE) a given resource?',
)
.addTarget(prometheus.target('cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));
.addTarget(prometheus.target('cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{verb="write", %(clusterLabel)s="$cluster"}' % $._config, legendFormat='{{ resource }}'));

local workQueueAddRate =
graphPanel.new(
Expand Down
58 changes: 29 additions & 29 deletions rules/kube_apiserver.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
kubeApiserverReadSelector: 'verb=~"LIST|GET"',
kubeApiserverWriteSelector: 'verb=~"POST|PUT|PATCH|DELETE"',
kubeApiserverNonStreamingSelector: 'subresource!~"proxy|attach|log|exec|portforward"',
// These are buckets that exist on the apiserver_request_slo_duration_seconds_bucket histogram.
// These are buckets that exist on the apiserver_request_duration_seconds_bucket histogram.
// They are what the Kubernetes SIG Scalability is using to measure availability of Kubernetes clusters.
// If you want to change these, make sure the "le" buckets exist on the histogram!
kubeApiserverReadResourceLatency: '1',
Expand All @@ -31,18 +31,18 @@
(
(
# too slow
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
-
(
(
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"}[%(window)s]))
or
vector(0)
)
+
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"}[%(window)s]))
+
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverReadSelector)s,%(kubeApiserverNonStreamingSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"}[%(window)s]))
)
)
+
Expand Down Expand Up @@ -79,9 +79,9 @@
(
(
# too slow
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s}[%(window)s]))
-
sum by (%(clusterLabel)s) (rate(apiserver_request_slo_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s]))
sum by (%(clusterLabel)s) (rate(apiserver_request_duration_seconds_bucket{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,%(kubeApiserverNonStreamingSelector)s,le="%(kubeApiserverWriteLatency)s"}[%(window)s]))
)
+
sum by (%(clusterLabel)s) (rate(apiserver_request_total{%(kubeApiserverSelector)s,%(kubeApiserverWriteSelector)s,code=~"5.."}[%(window)s]))
Expand Down Expand Up @@ -114,9 +114,9 @@
rules:
[
{
record: 'cluster_quantile:apiserver_request_slo_duration_seconds:histogram_quantile',
record: 'cluster_quantile:apiserver_request_duration_seconds:histogram_quantile',
expr: |||
histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_slo_duration_seconds_bucket{%s}[5m]))) > 0
histogram_quantile(0.99, sum by (%s, le, resource) (rate(apiserver_request_duration_seconds_bucket{%s}[5m]))) > 0
||| % [$._config.clusterLabel, std.join(',', [$._config.kubeApiserverSelector, verb.selector, $._config.kubeApiserverNonStreamingSelector])],
labels: {
verb: verb.type,
Expand Down Expand Up @@ -149,27 +149,27 @@
for verb in verbs
] + [
{
record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h',
record: 'cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h',
expr: |||
sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_slo_duration_seconds_count{%(kubeApiserverSelector)s}[1h]))
sum by (%(clusterLabel)s, verb, scope) (increase(apiserver_request_duration_seconds_count{%(kubeApiserverSelector)s}[1h]))
||| % $._config,
},
{
record: 'cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%s' % SLODays,
record: 'cluster_verb_scope:apiserver_request_duration_seconds_count:increase%s' % SLODays,
expr: |||
sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[%s]) * 24 * %s)
sum by (%s, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_duration_seconds_count:increase1h[%s]) * 24 * %s)
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
},
{
record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h',
record: 'cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h',
expr: |||
sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))
sum by (%(clusterLabel)s, verb, scope, le) (increase(apiserver_request_duration_seconds_bucket[1h]))
||| % $._config,
},
{
record: 'cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%s' % SLODays,
record: 'cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%s' % SLODays,
expr: |||
sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[%s]) * 24 * %s)
sum by (%s, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase1h[%s]) * 24 * %s)
||| % [$._config.clusterLabel, SLODays, $._config.SLOs.apiserver.days],
},
{
Expand All @@ -178,24 +178,24 @@
1 - (
(
# write too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
-
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
) +
(
# read too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
-
(
(
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
or
vector(0)
)
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
)
) +
# errors
Expand All @@ -212,19 +212,19 @@
record: 'apiserver_request:availability%s' % SLODays,
expr: |||
1 - (
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverReadSelector)s})
-
(
# too slow
(
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope=~"resource|",le="%(kubeApiserverReadResourceLatency)s"})
or
vector(0)
)
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="namespace",le="%(kubeApiserverReadNamespaceLatency)s"})
+
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverReadSelector)s,scope="cluster",le="%(kubeApiserverReadClusterLatency)s"})
)
+
# errors
Expand All @@ -243,9 +243,9 @@
1 - (
(
# too slow
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
sum by (%(clusterLabel)s) (cluster_verb_scope:apiserver_request_duration_seconds_count:increase%(SLODays)s{%(kubeApiserverWriteSelector)s})
-
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
sum by (%(clusterLabel)s) (cluster_verb_scope_le:apiserver_request_duration_seconds_bucket:increase%(SLODays)s{%(kubeApiserverWriteSelector)s,le="%(kubeApiserverWriteLatency)s"})
)
+
# errors
Expand Down

0 comments on commit 4ae3cd1

Please sign in to comment.