diff --git a/kubernetes/controllers/servicelevelobjective_test.go b/kubernetes/controllers/servicelevelobjective_test.go
index bbade9d88..6474f7b18 100644
--- a/kubernetes/controllers/servicelevelobjective_test.go
+++ b/kubernetes/controllers/servicelevelobjective_test.go
@@ -99,7 +99,7 @@ func Test_makePrometheusRule(t *testing.T) {
 						{
 							Alert: "SLOMetricAbsent",
 							Expr:  intstr.FromString(`absent(http_requests_total{job="app"}) == 1`),
-							For:   monitoringDuration("2m"),
+							For:   monitoringDuration("6m"),
 							Annotations: map[string]string{
 								"description": "foo",
 							},
@@ -210,7 +210,7 @@ func Test_makeConfigMap(t *testing.T) {
         annotations:
           description: foo
         expr: absent(http_requests_total{job="app"}) == 1
-        for: 2m
+        for: 6m
         labels:
           job: app
           severity: critical
diff --git a/slo/rules.go b/slo/rules.go
index ac469fac5..8069984de 100644
--- a/slo/rules.go
+++ b/slo/rules.go
@@ -713,11 +713,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
 		}.replace(expr)
 
 		rules = append(rules, monitoringv1.Rule{
-			Alert: o.AlertNameAbsent(),
-			Expr:  intstr.FromString(expr.String()),
-			For: monitoringDuration(model.Duration(
-				(time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-			).String()),
+			Alert:       o.AlertNameAbsent(),
+			Expr:        intstr.FromString(expr.String()),
+			For:         monitoringDuration(o.AbsentDuration().String()),
 			Labels:      alertLabels,
 			Annotations: o.commonRuleAnnotations(),
 		})
@@ -755,11 +753,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
 		}.replace(expr)
 
 		rules = append(rules, monitoringv1.Rule{
-			Alert: o.AlertNameAbsent(),
-			Expr:  intstr.FromString(expr.String()),
-			For: monitoringDuration(model.Duration(
-				(time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-			).String()),
+			Alert:       o.AlertNameAbsent(),
+			Expr:        intstr.FromString(expr.String()),
+			For:         monitoringDuration(o.AbsentDuration().String()),
 			Labels:      alertLabels,
 			Annotations: o.commonRuleAnnotations(),
 		})
@@ -867,11 +863,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
 		alertLabels["severity"] = string(critical)
 
 		rules = append(rules, monitoringv1.Rule{
-			Alert: o.AlertNameAbsent(),
-			Expr:  intstr.FromString(expr.String()),
-			For: monitoringDuration(model.Duration(
-				(time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-			).String()),
+			Alert:       o.AlertNameAbsent(),
+			Expr:        intstr.FromString(expr.String()),
+			For:         monitoringDuration(o.AbsentDuration().String()),
 			Labels:      alertLabels,
 			Annotations: o.commonRuleAnnotations(),
 		})
@@ -894,11 +888,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
 		alertLabelsLe["severity"] = string(critical)
 
 		rules = append(rules, monitoringv1.Rule{
-			Alert: o.AlertNameAbsent(),
-			Expr:  intstr.FromString(expr.String()),
-			For: monitoringDuration(model.Duration(
-				(time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-			).String()),
+			Alert:       o.AlertNameAbsent(),
+			Expr:        intstr.FromString(expr.String()),
+			For:         monitoringDuration(o.AbsentDuration().String()),
 			Labels:      alertLabelsLe,
 			Annotations: o.commonRuleAnnotations(),
 		})
@@ -1037,11 +1029,9 @@ func (o Objective) IncreaseRules() (monitoringv1.RuleGroup, error) {
 		alertLabels["severity"] = string(critical)
 
 		rules = append(rules, monitoringv1.Rule{
-			Alert: o.AlertNameAbsent(),
-			Expr:  intstr.FromString(expr.String()),
-			For: monitoringDuration(model.Duration(
-				(time.Duration(o.Window) / (28 * 24 * (60 / 2))).Round(time.Minute),
-			).String()),
+			Alert:       o.AlertNameAbsent(),
+			Expr:        intstr.FromString(expr.String()),
+			For:         monitoringDuration(o.AbsentDuration().String()),
 			Labels:      alertLabels,
 			Annotations: o.commonRuleAnnotations(),
 		})
diff --git a/slo/rules_test.go b/slo/rules_test.go
index 65d617e58..bb84e65fd 100644
--- a/slo/rules_test.go
+++ b/slo/rules_test.go
@@ -1263,7 +1263,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("10m"),
 				Labels: map[string]string{"job": "thanos-receive-default", "slo": "monitoring-http-errors", "severity": "critical"},
 			}},
 		},
@@ -1280,7 +1280,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_requests_total{job="thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("10m"),
 				Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
 			}},
 		},
@@ -1297,7 +1297,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_requests_total{handler=~"/api.*",job="thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("10m"),
 				Labels: map[string]string{"slo": "monitoring-http-errors", "severity": "critical"},
 			}},
 		},
@@ -1314,7 +1314,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("3m"),
 				Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "job": "api", "slo": "monitoring-grpc-errors", "severity": "critical"},
 			}},
 		},
@@ -1331,7 +1331,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handled_total{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("3m"),
 				Labels: map[string]string{"grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "slo": "monitoring-grpc-errors", "severity": "critical"},
 			}},
 		},
@@ -1352,12 +1352,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
 			}},
 		},
@@ -1375,7 +1375,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 				Record: "http_request_duration_seconds:increase4w",
 				Expr:   intstr.FromString(`histogram_fraction(0, 1, increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w])) * histogram_count(increase(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}[4w]))`),
 				Labels: map[string]string{"job": "metrics-service-thanos-receive-default", "slo": "monitoring-http-latency", "le": "1"},
-				//}, {
+				// }, {
 				//	Alert: "SLOMetricAbsent",
 				//	Expr:  intstr.FromString(`absent(http_request_duration_seconds{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
 				//	For:   monitoringDuration("2m"),
@@ -1399,12 +1399,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",job="metrics-service-thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
 			}},
 		},
@@ -1425,12 +1425,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_count{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"slo": "monitoring-http-latency", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(http_request_duration_seconds_bucket{code=~"2..",handler=~"/api.*",job="metrics-service-thanos-receive-default",le="1"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("6m"),
 				Labels: map[string]string{"slo": "monitoring-http-latency", "le": "1", "severity": "critical"},
 			}},
 		},
@@ -1451,12 +1451,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("2m"),
 				Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("2m"),
 				Labels: map[string]string{"slo": "monitoring-grpc-latency", "job": "api", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
 			}},
 		},
@@ -1477,12 +1477,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handling_seconds_count{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("2m"),
 				Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(grpc_server_handling_seconds_bucket{grpc_method="Write",grpc_service="conprof.WritableProfileStore",job="api",le="0.6"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("2m"),
 				Labels: map[string]string{"slo": "monitoring-grpc-latency", "grpc_method": "Write", "grpc_service": "conprof.WritableProfileStore", "le": "0.6", "severity": "critical"},
 			}},
 		},
@@ -1499,7 +1499,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
 			}, {
 				Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1508,7 +1508,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
 			}},
 		},
@@ -1525,7 +1525,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(prometheus_operator_reconcile_operations_total) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
 			}, {
 				Record: "prometheus_operator_reconcile_errors:increase2w",
@@ -1534,7 +1534,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(prometheus_operator_reconcile_errors_total) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"slo": "monitoring-prometheus-operator-errors", "severity": "critical"},
 			}},
 		},
@@ -1551,7 +1551,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "APIServerMetricAbsent",
 				Expr:   intstr.FromString(`absent(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"job": "apiserver", "slo": "apiserver-write-response-errors", "severity": "critical"},
 			}},
 		},
@@ -1572,12 +1572,12 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(apiserver_request_duration_seconds_count{job="apiserver",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "severity": "critical"},
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(apiserver_request_duration_seconds_bucket{job="apiserver",le="0.1",resource=~"resource|",verb=~"LIST|GET"}) == 1`),
-				For:    monitoringDuration("1m"),
+				For:    monitoringDuration("5m"),
 				Labels: map[string]string{"job": "apiserver", "slo": "apiserver-read-resource-latency", "le": "0.1", "severity": "critical"},
 			}},
 		},
@@ -1598,7 +1598,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(up) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("10m"),
 				Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
 			}},
 		},
@@ -1619,7 +1619,7 @@ func TestObjective_IncreaseRules(t *testing.T) {
 			}, {
 				Alert:  "SLOMetricAbsent",
 				Expr:   intstr.FromString(`absent(up{instance!~"(127.0.0.1|localhost).*"}) == 1`),
-				For:    monitoringDuration("2m"),
+				For:    monitoringDuration("10m"),
 				Labels: map[string]string{"severity": "critical", "slo": "up-targets"},
 			}},
 		},
diff --git a/slo/slo.go b/slo/slo.go
index 6cc2a0eb5..045779936 100644
--- a/slo/slo.go
+++ b/slo/slo.go
@@ -55,6 +55,21 @@ func (o Objective) Exhausts(factor float64) model.Duration {
 	return model.Duration(time.Second * time.Duration(time.Duration(o.Window).Seconds()/factor))
 }
 
+// AbsentDuration calculates how long the absent alerts should wait before firing.
+// The idea is as follows: use the most critical of the multi burn rate alerts.
+// For that alert to fire, both the short AND the long window have to be above the threshold,
+// and the long window takes the longest to get there.
+// Assuming absence of the metric means a 100% error rate,
+// the time to fire is the time the long window needs to exceed its threshold (factor * (1 - target)).
+// Finally, we add the "for" duration of that multi burn rate alert.
+func (o Objective) AbsentDuration() model.Duration {
+	mostCritical := o.Windows()[0]
+	mostCriticalThreshold := mostCritical.Factor * (1 - o.Target)
+	mostCriticalDuration := time.Duration(mostCriticalThreshold*mostCritical.Long.Seconds()) * time.Second
+	mostCriticalDuration += mostCritical.For
+	return model.Duration(mostCriticalDuration.Round(time.Minute))
+}
+
 type IndicatorType int
 
 const (
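For illustration, here is a minimal, self-contained sketch of the calculation the new AbsentDuration helper performs. It is not the library code itself; the window values (factor 14, long window 1h, "for" 2m) are assumptions about the most critical default burn rate window for a 4-week, 99% objective, which is what the updated "10m" test expectations imply.

// absent_duration_example.go: worked example of the absent-alert "for" calculation.
package main

import (
	"fmt"
	"time"
)

func main() {
	target := 0.99                 // 99% objective
	factor := 14.0                 // burn rate factor of the most critical alert (assumed)
	long := time.Hour              // long window of that alert (assumed)
	forDuration := 2 * time.Minute // "for" of that alert (assumed)

	// Threshold the long window has to exceed: factor * (1 - target) = 0.14.
	threshold := factor * (1 - target)

	// At a 100% error rate the long window crosses the threshold after
	// threshold * long = 0.14 * 1h ≈ 8m24s; then add the alert's own "for" of 2m.
	d := time.Duration(threshold*long.Seconds())*time.Second + forDuration

	fmt.Println(d.Round(time.Minute)) // 10m0s, matching the new "10m" expectations
}

Plugging a 99.5% target into the same formula gives roughly 0.07 * 1h + 2m ≈ 6m, which lines up with the latency test changes above.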