From fdb8aa49b4da94050a7e2d791be84f0047410ff7 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 22 Oct 2022 00:51:53 +0200 Subject: [PATCH 01/14] Move pyrra.libsonnet to core components --- example.jsonnet | 16 +- .../kube-prometheus/addons/pyrra.libsonnet | 623 ------------------ .../components/pyrra.libsonnet | 541 +++++++++++++++ jsonnet/kube-prometheus/main.libsonnet | 9 + 4 files changed, 562 insertions(+), 627 deletions(-) create mode 100644 jsonnet/kube-prometheus/components/pyrra.libsonnet diff --git a/example.jsonnet b/example.jsonnet index 8974158bcd..5f9f1eae98 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -7,7 +7,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -19,17 +18,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 'prometheusRule' && + name != 'sloHTTPErrors' && + name != 'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': 
kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/jsonnet/kube-prometheus/addons/pyrra.libsonnet b/jsonnet/kube-prometheus/addons/pyrra.libsonnet index 1980b22004..e69de29bb2 100644 --- a/jsonnet/kube-prometheus/addons/pyrra.libsonnet +++ b/jsonnet/kube-prometheus/addons/pyrra.libsonnet @@ -1,623 +0,0 @@ -{ - values+:: { - common+: { - versions+: { - pyrra: error 'must provide version', - } + (import '../versions.json'), - images+: { - pyrra+: 'ghcr.io/pyrra-dev/pyrra:v' + $.values.common.versions.pyrra, - }, - }, - pyrra+: { - namespace: $.values.common.namespace, - version: $.values.common.versions.pyrra, - image: $.values.common.images.pyrra, - }, - }, - - local defaults = { - local defaults = self, - - name:: 'pyrra', - namespace:: error 'must provide namespace', - version:: error 'must provide version', - image: error 'must provide image', - replicas:: 1, - port:: 9099, - - commonLabels:: { - 'app.kubernetes.io/name': 'pyrra', - 'app.kubernetes.io/version': defaults.version, - 'app.kubernetes.io/part-of': 'kube-prometheus', - }, - }, - - local pyrra = function(params) { 
- local pyrra = self, - _config:: defaults + params, - - crd: ( - import 'github.com/pyrra-dev/pyrra/config/crd/bases/pyrra.dev_servicelevelobjectives.json' - ), - - - _apiMetadata:: { - name: pyrra._config.name + '-api', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 'app.kubernetes.io/component': 'api', - }, - }, - apiSelectorLabels:: { - [labelName]: pyrra._apiMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._apiMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - apiService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._apiMetadata, - spec: { - ports: [ - { name: 'http', targetPort: pyrra._config.port, port: pyrra._config.port }, - ], - selector: pyrra.apiSelectorLabels, - }, - }, - - apiDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'api', - '--api-url=http://%s.%s.svc.cluster.local:9444' % [pyrra.kubernetesService.metadata.name, pyrra.kubernetesService.metadata.namespace], - '--prometheus-url=http://prometheus-k8s.%s.svc.cluster.local:9090' % pyrra._config.namespace, - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._apiMetadata, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.apiSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._apiMetadata.labels }, - spec: { - containers: [c], - // serviceAccountName: $.serviceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - _kubernetesMetadata:: { - name: pyrra._config.name + '-kubernetes', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 
'app.kubernetes.io/component': 'kubernetes', - }, - }, - kubernetesSelectorLabels:: { - [labelName]: pyrra._kubernetesMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._kubernetesMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - kubernetesServiceAccount: { - apiVersion: 'v1', - kind: 'ServiceAccount', - metadata: pyrra._kubernetesMetadata, - }, - - kubernetesClusterRole: { - apiVersion: 'rbac.authorization.k8s.io/v1', - kind: 'ClusterRole', - metadata: pyrra._kubernetesMetadata, - rules: [{ - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], - }, { - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules/status'], - verbs: ['get'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives/status'], - verbs: ['get', 'patch', 'update'], - }], - }, - - kubernetesClusterRoleBinding: { - apiVersion: 'rbac.authorization.k8s.io/v1', - kind: 'ClusterRoleBinding', - metadata: pyrra._kubernetesMetadata, - roleRef: { - apiGroup: 'rbac.authorization.k8s.io', - kind: 'ClusterRole', - name: pyrra.kubernetesClusterRole.metadata.name, - }, - subjects: [{ - kind: 'ServiceAccount', - name: pyrra.kubernetesServiceAccount.metadata.name, - namespace: pyrra._config.namespace, - }], - }, - - kubernetesService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._kubernetesMetadata, - spec: { - ports: [ - { name: 'http', targetPort: 9444, port: 9444 }, - ], - selector: pyrra.kubernetesSelectorLabels, - }, - }, - - kubernetesDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'kubernetes', - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - 
allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._kubernetesMetadata { - name: pyrra._config.name + '-kubernetes', - }, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.kubernetesSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._kubernetesMetadata.labels }, - spec: { - containers: [c], - serviceAccountName: pyrra.kubernetesServiceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - // Most of these should eventually be moved to the components themselves. - // For now, this is a good start to have everything in one place. - 'slo-apiserver-read-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{component="apiserver",verb=~"LIST|GET",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{component="apiserver",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-write-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-write-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE"}', - }, - }, - }, - }, - }, - - 
'slo-apiserver-read-resource-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-resource-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"resource|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-namespace-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-namespace-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-cluster-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-cluster-latency', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - metric: 'apiserver_request_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"}', - }, - total: { - metric: 'apiserver_request_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-kubelet-request-errors': { - 
apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-request-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'rest_client_requests_total{job="kubelet",code=~"5.."}', - }, - total: { - metric: 'rest_client_requests_total{job="kubelet"}', - }, - }, - }, - }, - }, - - 'slo-kubelet-runtime-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-runtime-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'kubelet_runtime_operations_errors_total{job="kubelet"}', - }, - total: { - metric: 'kubelet_runtime_operations_total{job="kubelet"}', - }, - }, - }, - }, - }, - - 'slo-coredns-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'coredns-response-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"}', - }, - total: { - metric: 'coredns_dns_responses_total{job="kube-dns"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-operator-reconcile-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-operator-reconcile-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '95', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 
'prometheus_operator_reconcile_errors_total{job="prometheus-operator"}', - }, - total: { - metric: 'prometheus_operator_reconcile_operations_total{job="prometheus-operator"}', - }, - grouping: ['controller'], - }, - }, - }, - }, - - 'slo-prometheus-operator-http-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-operator-http-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.5', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",status_code=~"5.."}', - }, - total: { - metric: 'prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-rule-evaluation-failures': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-rule-evaluation-failures', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99.99', - window: '2w', - description: 'Rule and alerting rules are being evaluated every few seconds. 
This needs to work for recording rules to be created and most importantly for alerts to be evaluated.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_rule_evaluation_failures_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_rule_evaluations_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-sd-kubernetes-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-sd-kubernetes-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: 'If there are too many errors Prometheus is having a bad time discovering new Kubernetes services.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"}', - }, - total: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-query-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-query-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - grouping: ['handler'], - errors: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."}', - }, - total: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-notification-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-notification-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: 
{ - errors: { - metric: 'prometheus_notifications_errors_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_notifications_sent_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - }, - - pyrra: pyrra($.values.pyrra), -} diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet b/jsonnet/kube-prometheus/components/pyrra.libsonnet new file mode 100644 index 0000000000..4f8d41a596 --- /dev/null +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -0,0 +1,541 @@ +local defaults = { + local defaults = self, + + name:: 'pyrra', + namespace:: error 'must provide namespace', + version:: error 'must provide version', + image: error 'must provide image', + replicas:: 1, + port:: 9099, + + commonLabels:: { + 'app.kubernetes.io/name': 'pyrra', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, +}; + +function(params) { + local pyrra = self, + _config:: defaults + params, + + crd: ( + import 'github.com/pyrra-dev/pyrra/config/crd/bases/pyrra.dev_servicelevelobjectives.json' + ), + + _apiMetadata:: { + name: pyrra._config.name + '-api', + namespace: pyrra._config.namespace, + labels: pyrra._config.commonLabels { + 'app.kubernetes.io/component': 'api', + }, + }, + apiSelectorLabels:: { + [labelName]: pyrra._apiMetadata.labels[labelName] + for labelName in std.objectFields(pyrra._apiMetadata.labels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + + apiService: { + apiVersion: 'v1', + kind: 'Service', + metadata: pyrra._apiMetadata, + spec: { + ports: [ + { name: 'http', targetPort: pyrra._config.port, port: pyrra._config.port }, + ], + selector: pyrra.apiSelectorLabels, + }, + }, + + apiDeployment: + local c = { + name: pyrra._config.name, + image: pyrra._config.image, + args: [ + 'api', + '--api-url=http://%s.%s.svc.cluster.local:9444' % [pyrra.kubernetesService.metadata.name, pyrra.kubernetesService.metadata.namespace], + 
'--prometheus-url=http://prometheus-k8s.%s.svc.cluster.local:9090' % pyrra._config.namespace, + ], + // resources: pyrra._config.resources, + ports: [{ containerPort: pyrra._config.port }], + securityContext: { + allowPrivilegeEscalation: false, + readOnlyRootFilesystem: true, + }, + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: pyrra._apiMetadata, + spec: { + replicas: pyrra._config.replicas, + selector: { + matchLabels: pyrra.apiSelectorLabels, + }, + strategy: { + rollingUpdate: { + maxSurge: 1, + maxUnavailable: 1, + }, + }, + template: { + metadata: { labels: pyrra._apiMetadata.labels }, + spec: { + containers: [c], + // serviceAccountName: $.serviceAccount.metadata.name, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + }, + }, + }, + }, + + _kubernetesMetadata:: { + name: pyrra._config.name + '-kubernetes', + namespace: pyrra._config.namespace, + labels: pyrra._config.commonLabels { + 'app.kubernetes.io/component': 'kubernetes', + }, + }, + kubernetesSelectorLabels:: { + [labelName]: pyrra._kubernetesMetadata.labels[labelName] + for labelName in std.objectFields(pyrra._kubernetesMetadata.labels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + + kubernetesServiceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: pyrra._kubernetesMetadata, + }, + + kubernetesClusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: pyrra._kubernetesMetadata, + rules: [{ + apiGroups: ['monitoring.coreos.com'], + resources: ['prometheusrules'], + verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], + }, { + apiGroups: ['monitoring.coreos.com'], + resources: ['prometheusrules/status'], + verbs: ['get'], + }, { + apiGroups: ['pyrra.dev'], + resources: ['servicelevelobjectives'], + verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], + }, { + apiGroups: ['pyrra.dev'], + resources: ['servicelevelobjectives/status'], + verbs: ['get', 'patch', 
'update'], + }], + }, + + kubernetesClusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: pyrra._kubernetesMetadata, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: pyrra.kubernetesClusterRole.metadata.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: pyrra.kubernetesServiceAccount.metadata.name, + namespace: pyrra._config.namespace, + }], + }, + + kubernetesService: { + apiVersion: 'v1', + kind: 'Service', + metadata: pyrra._kubernetesMetadata, + spec: { + ports: [ + { name: 'http', targetPort: 9444, port: 9444 }, + ], + selector: pyrra.kubernetesSelectorLabels, + }, + }, + + kubernetesDeployment: + local c = { + name: pyrra._config.name, + image: pyrra._config.image, + args: [ + 'kubernetes', + ], + // resources: pyrra._config.resources, + ports: [{ containerPort: pyrra._config.port }], + securityContext: { + allowPrivilegeEscalation: false, + readOnlyRootFilesystem: true, + }, + }; + + { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: pyrra._kubernetesMetadata { + name: pyrra._config.name + '-kubernetes', + }, + spec: { + replicas: pyrra._config.replicas, + selector: { + matchLabels: pyrra.kubernetesSelectorLabels, + }, + strategy: { + rollingUpdate: { + maxSurge: 1, + maxUnavailable: 1, + }, + }, + template: { + metadata: { labels: pyrra._kubernetesMetadata.labels }, + spec: { + containers: [c], + serviceAccountName: pyrra.kubernetesServiceAccount.metadata.name, + nodeSelector: { 'kubernetes.io/os': 'linux' }, + }, + }, + }, + }, + + // Most of these should eventually be moved to the components themselves. + // For now, this is a good start to have everything in one place. 
+ 'slo-apiserver-read-response-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'apiserver-read-response-errors', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'apiserver', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + errors: { + metric: 'apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}', + }, + total: { + metric: 'apiserver_request_total{job="apiserver",verb=~"LIST|GET"}', + }, + }, + }, + }, + }, + + 'slo-apiserver-write-response-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'apiserver-write-response-errors', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'apiserver', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + errors: { + metric: 'apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}', + }, + total: { + metric: 'apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}', + }, + }, + }, + }, + }, + + 'slo-apiserver-read-resource-latency': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'apiserver-read-resource-latency', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'apiserver', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + latency: { + success: { + // metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"}', + metric: 'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="0.1"}', + }, + total: { + // metric: 
'apiserver_request_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET"}', + metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', + }, + }, + }, + }, + }, + + 'slo-apiserver-read-namespace-latency': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'apiserver-read-namespace-latency', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'apiserver', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + latency: { + success: { + //metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"}', + metric: 'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"}', + }, + total: { + //metric: 'apiserver_request_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET"}', + metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', + }, + }, + }, + }, + }, + + 'slo-apiserver-read-cluster-latency': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'apiserver-read-cluster-latency', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'apiserver', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + latency: { + success: { + //metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"}', + metric: 'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"}', + }, + total: { + //metric: 
'apiserver_request_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET"}', + metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', + }, + }, + }, + }, + }, + + 'slo-kubelet-request-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'kubelet-request-errors', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{job="kubelet",code=~"5.."}', + }, + total: { + metric: 'rest_client_requests_total{job="kubelet"}', + }, + }, + }, + }, + }, + + 'slo-kubelet-runtime-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'kubelet-runtime-errors', + namespace: 'kube-system', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + errors: { + metric: 'kubelet_runtime_operations_errors_total{job="kubelet"}', + }, + total: { + metric: 'kubelet_runtime_operations_total{job="kubelet"}', + }, + }, + }, + }, + }, + + 'slo-prometheus-rule-evaluation-failures': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'prometheus-rule-evaluation-failures', + namespace: pyrra._config.namespace, + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'prometheus', + }, + }, + spec: { + target: '99.99', + window: '2w', + description: 'Rule and alerting rules are being evaluated every few seconds. 
This needs to work for recording rules to be created and most importantly for alerts to be evaluated.', + indicator: { + ratio: { + errors: { + metric: 'prometheus_rule_evaluation_failures_total{job="prometheus-k8s"}', + }, + total: { + metric: 'prometheus_rule_evaluations_total{job="prometheus-k8s"}', + }, + }, + }, + }, + }, + + 'slo-prometheus-sd-kubernetes-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'prometheus-sd-kubernetes-errors', + namespace: pyrra._config.namespace, + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'prometheus', + }, + }, + spec: { + target: '99', + window: '2w', + description: 'If there are too many errors Prometheus is having a bad time discovering new Kubernetes services.', + indicator: { + ratio: { + errors: { + metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"}', + }, + total: { + metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"}', + }, + }, + }, + }, + }, + + 'slo-prometheus-query-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'prometheus-query-errors', + namespace: pyrra._config.namespace, + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 'pyrra.dev/component': 'prometheus', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + grouping: ['handler'], + errors: { + metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."}', + }, + total: { + metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"}', + }, + }, + }, + }, + }, + + 'slo-prometheus-notification-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: { + name: 'prometheus-notification-errors', + namespace: pyrra._config.namespace, + labels: { + prometheus: 'k8s', + role: 'alert-rules', + 
'pyrra.dev/component': 'prometheus', + }, + }, + spec: { + target: '99', + window: '2w', + description: '', + indicator: { + ratio: { + errors: { + metric: 'prometheus_notifications_errors_total{job="prometheus-k8s"}', + }, + total: { + metric: 'prometheus_notifications_sent_total{job="prometheus-k8s"}', + }, + }, + }, + }, + }, +} + +// 1000 * histogram_quantile(0.99, rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type="list_containers"}[5m])) +// rate(kubelet_runtime_operations_duration_seconds_count[5m]) + +// etcd! +// kube-proxy? +// kube-scheduler diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index 3405c8f3e3..a8365dbdf0 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -8,6 +8,7 @@ local nodeExporter = import './components/node-exporter.libsonnet'; local prometheusAdapter = import './components/prometheus-adapter.libsonnet'; local prometheusOperator = import './components/prometheus-operator.libsonnet'; local prometheus = import './components/prometheus.libsonnet'; +local pyrra = import './components/pyrra.libsonnet'; local platformPatch = import './platforms/platforms.libsonnet'; @@ -35,6 +36,7 @@ local utils = import './lib/utils.libsonnet'; prometheusOperator: error 'must provide version', kubeRbacProxy: error 'must provide version', configmapReload: error 'must provide version', + pyrra: error 'must provide version', } + (import 'versions.json'), images: { alertmanager: 'quay.io/prometheus/alertmanager:v' + $.values.common.versions.alertmanager, @@ -48,6 +50,7 @@ local utils = import './lib/utils.libsonnet'; prometheusOperatorReloader: 'quay.io/prometheus-operator/prometheus-config-reloader:v' + $.values.common.versions.prometheusOperator, kubeRbacProxy: 'quay.io/brancz/kube-rbac-proxy:v' + $.values.common.versions.kubeRbacProxy, configmapReload: 'jimmidyson/configmap-reload:v' + $.values.common.versions.configmapReload, + pyrra: 
'ghcr.io/pyrra-dev/pyrra:v' + $.values.common.versions.pyrra, }, }, alertmanager: { @@ -128,6 +131,11 @@ local utils = import './lib/utils.libsonnet'; namespace: $.values.common.namespace, mixin+: { ruleLabels: $.values.common.ruleLabels }, }, + pyrra: { + namespace: $.values.common.namespace, + version: $.values.common.versions.nodeExporter, + image: $.values.common.images.nodeExporter, + }, }, alertmanager: alertmanager($.values.alertmanager), @@ -138,6 +146,7 @@ local utils = import './lib/utils.libsonnet'; prometheus: prometheus($.values.prometheus), prometheusAdapter: prometheusAdapter($.values.prometheusAdapter), prometheusOperator: prometheusOperator($.values.prometheusOperator), + pyrra: pyrra($.values.pyrra), kubernetesControlPlane: kubernetesControlPlane($.values.kubernetesControlPlane), kubePrometheus: customMixin( { From ede49489bcb0a85762c60587bfd8af61a742c2a4 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 22 Oct 2022 01:08:24 +0200 Subject: [PATCH 02/14] Add SLOs to prometheus-operator component --- .../components/prometheus-operator.libsonnet | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet index 5c0c96c69c..079ea79993 100644 --- a/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet +++ b/jsonnet/kube-prometheus/components/prometheus-operator.libsonnet @@ -43,6 +43,16 @@ local defaults = { runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/%s', }, }, + slos: { + reconcileErrors: { + target: '95', + window: '2w', + }, + HTTPErrors: { + target: '99.5', + window: '2w', + }, + }, }; function(params) @@ -168,4 +178,63 @@ function(params) }, }, }, + + sloReconcileErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: po.service.metadata { + name: po._config.name + '-reconcile-errors', + labels: 
po._config.commonLabels + po._config.mixin.ruleLabels + { + 'pyrra.dev/component': po._config.name, + }, + }, + spec: { + target: po._config.slos.reconcileErrors.target, + window: po._config.slos.reconcileErrors.window, + description: ||| + The Prometheus Operator reconciles the controllers object to have the underlying resource in the desired state. + If this is firing the object may not be running correctly. + |||, + indicator: { + ratio: { + errors: { + metric: 'prometheus_operator_reconcile_errors_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + total: { + metric: 'prometheus_operator_reconcile_operations_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + grouping: ['controller'], + }, + }, + }, + }, + + sloHTTPErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: po.service.metadata { + name: po._config.name + '-http-errors', + labels: po._config.commonLabels + po._config.mixin.ruleLabels + { + 'pyrra.dev/component': po._config.name, + }, + }, + spec: { + target: po._config.slos.HTTPErrors.target, + window: po._config.slos.HTTPErrors.window, + description: ||| + The Prometheus Operator makes HTTP requests to the Kubernetes API server to read and write the objects. + If this firing the Prometheus Operator might not be able read and write the latest objects. 
+ |||, + indicator: { + ratio: { + errors: { + metric: 'prometheus_operator_kubernetes_client_http_requests_total{%s,status_code=~"5.."}' % po._config.mixin._config.prometheusOperatorSelector, + }, + total: { + metric: 'prometheus_operator_kubernetes_client_http_requests_total{%s}' % po._config.mixin._config.prometheusOperatorSelector, + }, + }, + }, + }, + }, } From ab26f8d6842110b3aadcee37e16aa1c18345ca50 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 22 Oct 2022 01:08:43 +0200 Subject: [PATCH 03/14] Add CoreDNS SLOs --- .../components/k8s-control-plane.libsonnet | 96 ++++++++++++++++++- 1 file changed, 92 insertions(+), 4 deletions(-) diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index a771e95dbe..6bf249f117 100644 --- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -19,6 +19,7 @@ local defaults = { kubeSchedulerSelector: 'job="kube-scheduler"', kubeControllerManagerSelector: 'job="kube-controller-manager"', kubeApiserverSelector: 'job="apiserver"', + coreDNSSelector: 'job="coredns"', podLabel: 'pod', runbookURLPattern: 'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s', diskDeviceSelector: 'device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"', @@ -26,6 +27,20 @@ local defaults = { }, }, kubeProxy:: false, + coredns: { + name: 'coredns', + slos: { + responseErrors: { + target: '99.99', + window: '2w', + }, + responseLatency: { + target: '99', + latency: '0.032', // must exist as le label + window: '2w', + }, + }, + }, }; function(params) { @@ -313,17 +328,17 @@ function(params) { }, - serviceMonitorCoreDNS: { + 'coredns-ServiceMonitor': { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { - name: 'coredns', - labels+: { 'app.kubernetes.io/name': 'coredns' }, + name: k8s._config.coredns.name, + 
labels+: { 'app.kubernetes.io/name': k8s._config.coredns.name }, }, spec: { jobLabel: 'app.kubernetes.io/name', selector: { - matchLabels: { 'k8s-app': 'kube-dns' }, + matchLabels: { 'k8s-app': k8s._config.coredns.name }, }, namespaceSelector: { matchNames: ['kube-system'], @@ -347,5 +362,78 @@ function(params) { }, }, + 'coredns-slo-response-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: k8s._config.coredns.name + '-response-errors', + labels+: { + 'app.kubernetes.io/name': k8s._config.coredns.name, + 'app.kubernetes.io/component': 'controller', + prometheus: 'k8s', // TODO + 'pyrra.dev/component': k8s._config.coredns.name, + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.coredns.slos.responseErrors.target, + window: k8s._config.coredns.slos.responseErrors.window, + description: ||| + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forward external requests. + If CoreDNS fails to answer requests applications might be unable to make requests. 
+ |||, + indicator: { + ratio: { + errors: { + metric: 'coredns_dns_responses_total{%s,rcode="SERVFAIL"}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + total: { + metric: 'coredns_dns_responses_total{%s}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + }, + }, + }, + }, + 'coredns-slo-response-latency': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: k8s._config.coredns.name + '-response-latency', + labels+: { + 'app.kubernetes.io/name': 'coredns', + 'app.kubernetes.io/component': 'controller', + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'coredns', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.coredns.slos.responseLatency.target, + window: k8s._config.coredns.slos.responseLatency.window, + description: ||| + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forward external requests. + If CoreDNS gets too slow it might have an impact on the latency of other applications in this cluster. 
+ |||, + indicator: { + latency: { + success: { + metric: 'coredns_dns_request_duration_seconds_bucket{%s,le="%s"}' % [ + k8s._config.mixin._config.coreDNSSelector, + k8s._config.coredns.slos.responseLatency.latency, + ], + }, + total: { + metric: 'coredns_dns_request_duration_seconds_count{%s}' % [ + k8s._config.mixin._config.coreDNSSelector, + ], + }, + }, + }, + }, + }, } From 19b222e59d138743790860a0a5c23c5eeed01f8f Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 22 Oct 2022 01:42:36 +0200 Subject: [PATCH 04/14] Add Kubelet SLOs to the component --- .../components/k8s-control-plane.libsonnet | 87 ++++++++++++++++++- .../components/pyrra.libsonnet | 58 ------------- jsonnet/kube-prometheus/main.libsonnet | 2 +- 3 files changed, 87 insertions(+), 60 deletions(-) diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index 6bf249f117..9386207a02 100644 --- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -27,6 +27,18 @@ local defaults = { }, }, kubeProxy:: false, + kubelet: { + slos: { + requestErrors: { + target: '99', + window: '2w', + }, + runtimeErrors: { + target: '99.5', + window: '2w', + }, + }, + }, coredns: { name: 'coredns', slos: { @@ -102,7 +114,7 @@ function(params) { }, }, - serviceMonitorKubelet: { + kubeletServiceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { @@ -199,6 +211,79 @@ function(params) { }, }, + 'kubelet-slo-request-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kubelet-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kubelet', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: k8s._config.kubelet.slos.requestErrors.target, + window: 
k8s._config.kubelet.slos.requestErrors.window, + description: ||| + The kubelet is the primary “node agent” that runs on each node. + The kubelet ensures that the containers are running and healthy. + If these requests are failing the Kubelet might not know what to run exactly. + |||, + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5.."}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + }, + }, + }, + }, + + 'kubelet-slo-runtime-errors': { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kubelet-runtime-errors', + labels+: { + 'app.kubernetes.io/name': 'kubelet', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kubelet', + }, + }, + spec: { + target: k8s._config.kubelet.slos.runtimeErrors.target, + window: k8s._config.kubelet.slos.runtimeErrors.window, + description: ||| + The kubelet is the primary “node agent” that runs on each node. + If there are runtime errors the kubelet might be unable to check the containers are running and healthy. 
+ |||, + indicator: { + ratio: { + errors: { + metric: 'kubelet_runtime_operations_errors_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + total: { + metric: 'kubelet_runtime_operations_total{%s}' % [ + k8s._config.mixin._config.kubeletSelector, + ], + }, + }, + }, + }, + }, + serviceMonitorKubeControllerManager: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet b/jsonnet/kube-prometheus/components/pyrra.libsonnet index 4f8d41a596..30598816e0 100644 --- a/jsonnet/kube-prometheus/components/pyrra.libsonnet +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -357,64 +357,6 @@ function(params) { }, }, - 'slo-kubelet-request-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-request-errors', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'kubelet', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'rest_client_requests_total{job="kubelet",code=~"5.."}', - }, - total: { - metric: 'rest_client_requests_total{job="kubelet"}', - }, - }, - }, - }, - }, - - 'slo-kubelet-runtime-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'kubelet-runtime-errors', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'kubelet', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'kubelet_runtime_operations_errors_total{job="kubelet"}', - }, - total: { - metric: 'kubelet_runtime_operations_total{job="kubelet"}', - }, - }, - }, - }, - }, - 'slo-prometheus-rule-evaluation-failures': { apiVersion: 'pyrra.dev/v1alpha1', kind: 'ServiceLevelObjective', diff --git a/jsonnet/kube-prometheus/main.libsonnet 
b/jsonnet/kube-prometheus/main.libsonnet index a8365dbdf0..a19b9b524c 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -115,7 +115,7 @@ local utils = import './lib/utils.libsonnet'; image: $.values.common.images.prometheusAdapter, prometheusURL: 'http://prometheus-' + $.values.prometheus.name + '.' + $.values.prometheus.namespace + '.svc:9090/', rangeIntervals+: { - kubelet: utils.rangeInterval($.kubernetesControlPlane.serviceMonitorKubelet.spec.endpoints[0].interval), + kubelet: utils.rangeInterval($.kubernetesControlPlane.kubeletServiceMonitor.spec.endpoints[0].interval), nodeExporter: utils.rangeInterval($.nodeExporter.serviceMonitor.spec.endpoints[0].interval), }, }, From 60e2609a02ada7a6bace5405bbc6ce37c38a5d8f Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 22 Oct 2022 03:26:01 +0200 Subject: [PATCH 05/14] Add SLOs for kubeControllerManager and kubeProxy --- example.jsonnet | 3 + .../components/k8s-control-plane.libsonnet | 139 +++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index 5f9f1eae98..f530474183 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -12,6 +12,9 @@ local kp = common+: { namespace: 'monitoring', }, + kubernetesControlPlane+: { + kubeProxy:true, + }, }, }; diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index 9386207a02..0ce7ef65f1 100644 --- a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -19,6 +19,7 @@ local defaults = { kubeSchedulerSelector: 'job="kube-scheduler"', kubeControllerManagerSelector: 'job="kube-controller-manager"', kubeApiserverSelector: 'job="apiserver"', + kubeProxySelector: 'job="kube-proxy"', coreDNSSelector: 'job="coredns"', podLabel: 'pod', runbookURLPattern: 
'https://runbooks.prometheus-operator.dev/runbooks/kubernetes/%s', @@ -26,7 +27,6 @@ local defaults = { hostNetworkInterfaceSelector: 'device!~"veth.+"', }, }, - kubeProxy:: false, kubelet: { slos: { requestErrors: { @@ -39,6 +39,28 @@ local defaults = { }, }, }, + kubeControllerManager: { + slos: { + requestErrors: { + target: '99', + window: '2w', + }, + }, + }, + kubeProxy: false, + kubeProxyConfig: { // different name for backwards compatibility + slos: { + syncRulesLatency: { + target: '90', + latency: '0.512', // must exist as le label + window: '2w', + }, + requestErrors: { + target: '90', // kube-proxy makes very few requests + window: '2w', + }, + }, + }, coredns: { name: 'coredns', slos: { @@ -234,7 +256,7 @@ function(params) { indicator: { ratio: { errors: { - metric: 'rest_client_requests_total{%s,code=~"5.."}' % [ + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ k8s._config.mixin._config.kubeletSelector, ], }, @@ -284,7 +306,7 @@ function(params) { }, }, - serviceMonitorKubeControllerManager: { + kubeControllerManagerServiceMonitor: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', metadata: k8s._metadata { @@ -318,6 +340,43 @@ function(params) { }, }, + kubeControllerManagerSLORequestErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-controller-manager-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kube-controller-manager', + prometheus: 'k8s', //TODO + role: 'alert-rules', + 'pyrra.dev/component': 'kube-controller-manager', + }, + }, + spec: { + target: k8s._config.kubeControllerManager.slos.requestErrors.target, + window: k8s._config.kubeControllerManager.slos.requestErrors.window, + description: ||| + The Kubernetes controller manager is a daemon that embeds the core control loops shipped with Kubernetes. + In applications of robotics and automation, a control loop is a non-terminating loop that regulates the state of the system. 
+ In Kubernetes, a controller is a control loop that watches the shared state of the cluster through the apiserver and makes changes attempting to move the current state towards the desired state. Examples of controllers that ship with Kubernetes today are the replication controller, endpoints controller, namespace controller, and serviceaccounts controller. + |||, + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ + k8s._config.mixin._config.kubeControllerManagerSelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeControllerManagerSelector, + ], + }, + }, + }, + }, + }, + serviceMonitorApiserver: { apiVersion: 'monitoring.coreos.com/v1', kind: 'ServiceMonitor', @@ -412,6 +471,80 @@ function(params) { }, }, + [if (defaults + params).kubeProxy then 'kubeProxySLOSyncRulesLatency']: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-proxy-sync-rules-latency', + labels+: { + 'app.kubernetes.io/name': 'kube-proxy', + 'app.kubernetes.io/component': 'controller', //TODO + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'kube-proxy', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.kubeProxyConfig.slos.syncRulesLatency.target, + window: k8s._config.kubeProxyConfig.slos.syncRulesLatency.window, + description: ||| + The Kubernetes network proxy runs on each node. + This reflects services as defined in the Kubernetes API on each node and can do simple TCP, UDP + stream forwarding or round robin TCP,UDP forwarding across a set of backends. + + If this is firing the networks might not be synchronized fast enough and services might be unable to reach the containers they want to reach. 
+ |||, + indicator: { + latency: { + success: { + metric: 'kubeproxy_sync_proxy_rules_duration_seconds_bucket{%s,le="%s"}' % [ + k8s._config.mixin._config.kubeProxySelector, + k8s._config.kubeProxyConfig.slos.syncRulesLatency.latency, + ], + }, + total: { + metric: 'kubeproxy_sync_proxy_rules_duration_seconds_count{%s}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + }, + }, + }, + }, + + kubeProxySLORequestErrors: { + apiVersion: 'pyrra.dev/v1alpha1', + kind: 'ServiceLevelObjective', + metadata: k8s._metadata { + name: 'kube-proxy-request-errors', + labels+: { + 'app.kubernetes.io/name': 'kube-proxy', + 'app.kubernetes.io/component': 'controller', //TODO + prometheus: 'k8s', // TODO + 'pyrra.dev/component': 'kube-proxy', + role: 'alert-rules', + }, + }, + spec: { + target: k8s._config.kubeProxyConfig.slos.requestErrors.target, + window: k8s._config.kubeProxyConfig.slos.requestErrors.window, + description: '', + indicator: { + ratio: { + errors: { + metric: 'rest_client_requests_total{%s,code=~"5..|"}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + total: { + metric: 'rest_client_requests_total{%s}' % [ + k8s._config.mixin._config.kubeProxySelector, + ], + }, + }, + }, + }, + }, 'coredns-ServiceMonitor': { apiVersion: 'monitoring.coreos.com/v1', From 611196f59fe169f939f94d464fc0ce4146c43af4 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sun, 9 Jul 2023 16:05:47 +0200 Subject: [PATCH 06/14] Run Pyrra with generic rules This should allow Grafana dashboards to access the SLO time series. 
--- .../components/pyrra.libsonnet | 1 + jsonnet/kube-prometheus/main.libsonnet | 4 +- kustomization.yaml | 31 +++- ...sControlPlane-coredns-ServiceMonitor.yaml} | 2 +- ...trolPlane-coredns-slo-response-errors.yaml | 24 +++ ...rolPlane-coredns-slo-response-latency.yaml | 24 +++ ...kubeControllerManagerSLORequestErrors.yaml | 21 +++ ...-kubeControllerManagerServiceMonitor.yaml} | 0 ...ontrolPlane-kubeProxySLORequestErrors.yaml | 22 +++ ...ntrolPlane-kubelet-slo-request-errors.yaml | 24 +++ ...ntrolPlane-kubelet-slo-runtime-errors.yaml | 23 +++ ...esControlPlane-kubeletServiceMonitor.yaml} | 0 .../prometheusOperator-sloHTTPErrors.yaml | 23 +++ ...prometheusOperator-sloReconcileErrors.yaml | 27 +++ manifests/pyrra-apiDeployment.yaml | 43 +++++ manifests/pyrra-apiService.yaml | 19 ++ manifests/pyrra-kubernetesClusterRole.yaml | 49 ++++++ .../pyrra-kubernetesClusterRoleBinding.yaml | 18 ++ manifests/pyrra-kubernetesDeployment.yaml | 43 +++++ manifests/pyrra-kubernetesService.yaml | 19 ++ manifests/pyrra-kubernetesServiceAccount.yaml | 10 ++ ...ra-slo-apiserver-read-cluster-latency.yaml | 19 ++ ...-slo-apiserver-read-namespace-latency.yaml | 19 ++ ...a-slo-apiserver-read-resource-latency.yaml | 19 ++ ...ra-slo-apiserver-read-response-errors.yaml | 19 ++ ...a-slo-apiserver-write-response-errors.yaml | 19 ++ ...ra-slo-prometheus-notification-errors.yaml | 19 ++ .../pyrra-slo-prometheus-query-errors.yaml | 21 +++ ...o-prometheus-rule-evaluation-failures.yaml | 19 ++ ...a-slo-prometheus-sd-kubernetes-errors.yaml | 19 ++ manifests/setup/crd.yaml | 162 ++++++++++++++++++ 31 files changed, 756 insertions(+), 6 deletions(-) rename manifests/{kubernetesControlPlane-serviceMonitorCoreDNS.yaml => kubernetesControlPlane-coredns-ServiceMonitor.yaml} (95%) create mode 100644 manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml create mode 100644 manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml create mode 100644 
manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml rename manifests/{kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml => kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml} (100%) create mode 100644 manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml create mode 100644 manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml create mode 100644 manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml rename manifests/{kubernetesControlPlane-serviceMonitorKubelet.yaml => kubernetesControlPlane-kubeletServiceMonitor.yaml} (100%) create mode 100644 manifests/prometheusOperator-sloHTTPErrors.yaml create mode 100644 manifests/prometheusOperator-sloReconcileErrors.yaml create mode 100644 manifests/pyrra-apiDeployment.yaml create mode 100644 manifests/pyrra-apiService.yaml create mode 100644 manifests/pyrra-kubernetesClusterRole.yaml create mode 100644 manifests/pyrra-kubernetesClusterRoleBinding.yaml create mode 100644 manifests/pyrra-kubernetesDeployment.yaml create mode 100644 manifests/pyrra-kubernetesService.yaml create mode 100644 manifests/pyrra-kubernetesServiceAccount.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-cluster-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-namespace-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-resource-latency.yaml create mode 100644 manifests/pyrra-slo-apiserver-read-response-errors.yaml create mode 100644 manifests/pyrra-slo-apiserver-write-response-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-notification-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-query-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml create mode 100644 manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml create mode 100644 manifests/setup/crd.yaml diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet 
b/jsonnet/kube-prometheus/components/pyrra.libsonnet index 30598816e0..e70be27eab 100644 --- a/jsonnet/kube-prometheus/components/pyrra.libsonnet +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -167,6 +167,7 @@ function(params) { image: pyrra._config.image, args: [ 'kubernetes', + '--generic-rules', ], // resources: pyrra._config.resources, ports: [{ containerPort: pyrra._config.port }], diff --git a/jsonnet/kube-prometheus/main.libsonnet b/jsonnet/kube-prometheus/main.libsonnet index a19b9b524c..fb76ec1f11 100644 --- a/jsonnet/kube-prometheus/main.libsonnet +++ b/jsonnet/kube-prometheus/main.libsonnet @@ -133,8 +133,8 @@ local utils = import './lib/utils.libsonnet'; }, pyrra: { namespace: $.values.common.namespace, - version: $.values.common.versions.nodeExporter, - image: $.values.common.images.nodeExporter, + version: $.values.common.versions.pyrra, + image: $.values.common.images.pyrra, }, }, diff --git a/kustomization.yaml b/kustomization.yaml index e0d8039ed7..e51683a9e5 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -36,12 +36,18 @@ resources: - ./manifests/kubeStateMetrics-service.yaml - ./manifests/kubeStateMetrics-serviceAccount.yaml - ./manifests/kubeStateMetrics-serviceMonitor.yaml +- ./manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml +- ./manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml +- ./manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml +- ./manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml +- ./manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml +- ./manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml +- ./manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml +- ./manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml +- ./manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml - ./manifests/kubernetesControlPlane-prometheusRule.yaml - 
./manifests/kubernetesControlPlane-serviceMonitorApiserver.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml - ./manifests/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml -- ./manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml - ./manifests/nodeExporter-clusterRole.yaml - ./manifests/nodeExporter-clusterRoleBinding.yaml - ./manifests/nodeExporter-daemonset.yaml @@ -85,6 +91,24 @@ resources: - ./manifests/prometheusOperator-service.yaml - ./manifests/prometheusOperator-serviceAccount.yaml - ./manifests/prometheusOperator-serviceMonitor.yaml +- ./manifests/prometheusOperator-sloHTTPErrors.yaml +- ./manifests/prometheusOperator-sloReconcileErrors.yaml +- ./manifests/pyrra-apiDeployment.yaml +- ./manifests/pyrra-apiService.yaml +- ./manifests/pyrra-kubernetesClusterRole.yaml +- ./manifests/pyrra-kubernetesClusterRoleBinding.yaml +- ./manifests/pyrra-kubernetesDeployment.yaml +- ./manifests/pyrra-kubernetesService.yaml +- ./manifests/pyrra-kubernetesServiceAccount.yaml +- ./manifests/pyrra-slo-apiserver-read-cluster-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-namespace-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-resource-latency.yaml +- ./manifests/pyrra-slo-apiserver-read-response-errors.yaml +- ./manifests/pyrra-slo-apiserver-write-response-errors.yaml +- ./manifests/pyrra-slo-prometheus-notification-errors.yaml +- ./manifests/pyrra-slo-prometheus-query-errors.yaml +- ./manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml +- ./manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml - ./manifests/setup/0alertmanagerConfigCustomResourceDefinition.yaml - ./manifests/setup/0alertmanagerCustomResourceDefinition.yaml - ./manifests/setup/0podmonitorCustomResourceDefinition.yaml @@ -95,4 +119,5 @@ resources: - ./manifests/setup/0scrapeconfigCustomResourceDefinition.yaml - ./manifests/setup/0servicemonitorCustomResourceDefinition.yaml - 
./manifests/setup/0thanosrulerCustomResourceDefinition.yaml +- ./manifests/setup/crd.yaml - ./manifests/setup/namespace.yaml diff --git a/manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml b/manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml similarity index 95% rename from manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml rename to manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml index f3313d6da4..bdea0bf4e8 100644 --- a/manifests/kubernetesControlPlane-serviceMonitorCoreDNS.yaml +++ b/manifests/kubernetesControlPlane-coredns-ServiceMonitor.yaml @@ -22,4 +22,4 @@ spec: - kube-system selector: matchLabels: - k8s-app: kube-dns + k8s-app: coredns diff --git a/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml b/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml new file mode 100644 index 0000000000..f28a2fff9a --- /dev/null +++ b/manifests/kubernetesControlPlane-coredns-slo-response-errors.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: coredns + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: coredns + role: alert-rules + name: coredns-response-errors + namespace: monitoring +spec: + description: | + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forward external requests. + If CoreDNS fails to answer requests applications might be unable to make requests. 
+ indicator: + ratio: + errors: + metric: coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"} + total: + metric: coredns_dns_responses_total{job="coredns"} + target: "99.99" + window: 2w diff --git a/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml b/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml new file mode 100644 index 0000000000..c3eee8bf10 --- /dev/null +++ b/manifests/kubernetesControlPlane-coredns-slo-response-latency.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: coredns + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: coredns + role: alert-rules + name: coredns-response-latency + namespace: monitoring +spec: + description: | + CoreDNS runs within a Kubernetes cluster and resolves internal requests and forward external requests. + If CoreDNS gets too slow it might have an impact on the latency of other applications in this cluster. 
+ indicator: + latency: + success: + metric: coredns_dns_request_duration_seconds_bucket{job="coredns",le="0.032"} + total: + metric: coredns_dns_request_duration_seconds_count{job="coredns"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml b/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml new file mode 100644 index 0000000000..dd066b421f --- /dev/null +++ b/manifests/kubernetesControlPlane-kubeControllerManagerSLORequestErrors.yaml @@ -0,0 +1,21 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kube-controller-manager + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kube-controller-manager + role: alert-rules + name: kube-controller-manager-request-errors + namespace: monitoring +spec: + description: "The Kubernetes controller manager is a daemon that embeds the core control loops shipped with Kubernetes. \nIn applications of robotics and automation, a control loop is a non-terminating loop that regulates the state of the system. \nIn Kubernetes, a controller is a control loop that watches the shared state of the cluster through the apiserver and makes changes attempting to move the current state towards the desired state. 
Examples of controllers that ship with Kubernetes today are the replication controller, endpoints controller, namespace controller, and serviceaccounts controller.\n" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kube-controller-manager",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kube-controller-manager"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml b/manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml similarity index 100% rename from manifests/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml rename to manifests/kubernetesControlPlane-kubeControllerManagerServiceMonitor.yaml diff --git a/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml b/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml new file mode 100644 index 0000000000..a69f01710b --- /dev/null +++ b/manifests/kubernetesControlPlane-kubeProxySLORequestErrors.yaml @@ -0,0 +1,22 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: kube-proxy + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kube-proxy + role: alert-rules + name: kube-proxy-request-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kube-proxy",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kube-proxy"} + target: "90" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml b/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml new file mode 100644 index 0000000000..e1424b4fb0 --- /dev/null +++ b/manifests/kubernetesControlPlane-kubelet-slo-request-errors.yaml @@ -0,0 +1,24 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kubelet 
+ app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kubelet + role: alert-rules + name: kubelet-request-errors + namespace: monitoring +spec: + description: | + The kubelet is the primary “node agent” that runs on each node. + The kubelet ensures that the containers are running and healthy. + If these requests are failing the Kubelet might not know what to run exactly. + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kubelet", metrics_path="/metrics",code=~"5..|"} + total: + metric: rest_client_requests_total{job="kubelet", metrics_path="/metrics"} + target: "99" + window: 2w diff --git a/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml b/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml new file mode 100644 index 0000000000..079f7a953b --- /dev/null +++ b/manifests/kubernetesControlPlane-kubelet-slo-runtime-errors.yaml @@ -0,0 +1,23 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/name: kubelet + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + pyrra.dev/component: kubelet + role: alert-rules + name: kubelet-runtime-errors + namespace: monitoring +spec: + description: | + The kubelet is the primary “node agent” that runs on each node. + If there are runtime errors the kubelet might be unable to check the containers are running and healthy. 
+ indicator: + ratio: + errors: + metric: kubelet_runtime_operations_errors_total{job="kubelet", metrics_path="/metrics"} + total: + metric: kubelet_runtime_operations_total{job="kubelet", metrics_path="/metrics"} + target: "99.5" + window: 2w diff --git a/manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml b/manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml similarity index 100% rename from manifests/kubernetesControlPlane-serviceMonitorKubelet.yaml rename to manifests/kubernetesControlPlane-kubeletServiceMonitor.yaml diff --git a/manifests/prometheusOperator-sloHTTPErrors.yaml b/manifests/prometheusOperator-sloHTTPErrors.yaml new file mode 100644 index 0000000000..b183af08ad --- /dev/null +++ b/manifests/prometheusOperator-sloHTTPErrors.yaml @@ -0,0 +1,23 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.66.0 + prometheus: k8s + pyrra.dev/component: prometheus-operator + role: alert-rules + name: prometheus-operator-http-errors + namespace: monitoring +spec: + description: "The Prometheus Operator makes HTTP requests to the Kubernetes API server to read and write the objects.\nIf this firing the Prometheus Operator might not be able read and write the latest objects. 
\n" + indicator: + ratio: + errors: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",namespace="monitoring",status_code=~"5.."} + total: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",namespace="monitoring"} + target: "99.5" + window: 2w diff --git a/manifests/prometheusOperator-sloReconcileErrors.yaml b/manifests/prometheusOperator-sloReconcileErrors.yaml new file mode 100644 index 0000000000..fd8506e4d3 --- /dev/null +++ b/manifests/prometheusOperator-sloReconcileErrors.yaml @@ -0,0 +1,27 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.66.0 + prometheus: k8s + pyrra.dev/component: prometheus-operator + role: alert-rules + name: prometheus-operator-reconcile-errors + namespace: monitoring +spec: + description: | + The Prometheus Operator reconciles the controllers object to have the underlying resource in the desired state. + If this is firing the object may not be running correctly. 
+ indicator: + ratio: + errors: + metric: prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"} + grouping: + - controller + total: + metric: prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="monitoring"} + target: "95" + window: 2w diff --git a/manifests/pyrra-apiDeployment.yaml b/manifests/pyrra-apiDeployment.yaml new file mode 100644 index 0000000000..dab7f7b683 --- /dev/null +++ b/manifests/pyrra-apiDeployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-api + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + spec: + containers: + - args: + - api + - --api-url=http://pyrra-kubernetes.monitoring.svc.cluster.local:9444 + - --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090 + image: ghcr.io/pyrra-dev/pyrra:v0.6.3 + name: pyrra + ports: + - containerPort: 9099 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + nodeSelector: + kubernetes.io/os: linux diff --git a/manifests/pyrra-apiService.yaml b/manifests/pyrra-apiService.yaml new file mode 100644 index 0000000000..4e731d90e1 --- /dev/null +++ b/manifests/pyrra-apiService.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-api + namespace: 
monitoring +spec: + ports: + - name: http + port: 9099 + targetPort: 9099 + selector: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/pyrra-kubernetesClusterRole.yaml b/manifests/pyrra-kubernetesClusterRole.yaml new file mode 100644 index 0000000000..8f3cb9f8cd --- /dev/null +++ b/manifests/pyrra-kubernetesClusterRole.yaml @@ -0,0 +1,49 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-kubernetes + namespace: monitoring +rules: +- apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules/status + verbs: + - get +- apiGroups: + - pyrra.dev + resources: + - servicelevelobjectives + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - pyrra.dev + resources: + - servicelevelobjectives/status + verbs: + - get + - patch + - update diff --git a/manifests/pyrra-kubernetesClusterRoleBinding.yaml b/manifests/pyrra-kubernetesClusterRoleBinding.yaml new file mode 100644 index 0000000000..0102293d48 --- /dev/null +++ b/manifests/pyrra-kubernetesClusterRoleBinding.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-kubernetes + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: pyrra-kubernetes +subjects: +- kind: ServiceAccount + name: pyrra-kubernetes + namespace: monitoring diff --git 
a/manifests/pyrra-kubernetesDeployment.yaml b/manifests/pyrra-kubernetesDeployment.yaml new file mode 100644 index 0000000000..caaa30ff9c --- /dev/null +++ b/manifests/pyrra-kubernetesDeployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-kubernetes + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 1 + template: + metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + spec: + containers: + - args: + - kubernetes + - --generic-rules + image: ghcr.io/pyrra-dev/pyrra:v0.6.3 + name: pyrra + ports: + - containerPort: 9099 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: pyrra-kubernetes diff --git a/manifests/pyrra-kubernetesService.yaml b/manifests/pyrra-kubernetesService.yaml new file mode 100644 index 0000000000..fe7dac2f7f --- /dev/null +++ b/manifests/pyrra-kubernetesService.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-kubernetes + namespace: monitoring +spec: + ports: + - name: http + port: 9444 + targetPort: 9444 + selector: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/pyrra-kubernetesServiceAccount.yaml b/manifests/pyrra-kubernetesServiceAccount.yaml new 
file mode 100644 index 0000000000..ced6637c3f --- /dev/null +++ b/manifests/pyrra-kubernetesServiceAccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.6.3 + name: pyrra-kubernetes + namespace: monitoring diff --git a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml new file mode 100644 index 0000000000..bb7ff787dc --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: apiserver + role: alert-rules + name: apiserver-read-cluster-latency + namespace: kube-system +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"} + total: + metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml new file mode 100644 index 0000000000..a0b1172f26 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: apiserver + role: alert-rules + name: apiserver-read-namespace-latency + namespace: kube-system +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"} + total: + 
metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml new file mode 100644 index 0000000000..bc38ee57b4 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: apiserver + role: alert-rules + name: apiserver-read-resource-latency + namespace: kube-system +spec: + description: "" + indicator: + latency: + success: + metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="0.1"} + total: + metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-response-errors.yaml b/manifests/pyrra-slo-apiserver-read-response-errors.yaml new file mode 100644 index 0000000000..d9dc2f5fe6 --- /dev/null +++ b/manifests/pyrra-slo-apiserver-read-response-errors.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: apiserver + role: alert-rules + name: apiserver-read-response-errors + namespace: kube-system +spec: + description: "" + indicator: + ratio: + errors: + metric: apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."} + total: + metric: apiserver_request_total{job="apiserver",verb=~"LIST|GET"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-apiserver-write-response-errors.yaml b/manifests/pyrra-slo-apiserver-write-response-errors.yaml new file mode 100644 index 0000000000..e7c9774ece --- /dev/null +++ 
b/manifests/pyrra-slo-apiserver-write-response-errors.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: apiserver + role: alert-rules + name: apiserver-write-response-errors + namespace: kube-system +spec: + description: "" + indicator: + ratio: + errors: + metric: apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."} + total: + metric: apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-notification-errors.yaml b/manifests/pyrra-slo-prometheus-notification-errors.yaml new file mode 100644 index 0000000000..6e29e8314c --- /dev/null +++ b/manifests/pyrra-slo-prometheus-notification-errors.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: prometheus + role: alert-rules + name: prometheus-notification-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_notifications_errors_total{job="prometheus-k8s"} + total: + metric: prometheus_notifications_sent_total{job="prometheus-k8s"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-query-errors.yaml b/manifests/pyrra-slo-prometheus-query-errors.yaml new file mode 100644 index 0000000000..3de4fbc592 --- /dev/null +++ b/manifests/pyrra-slo-prometheus-query-errors.yaml @@ -0,0 +1,21 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: prometheus + role: alert-rules + name: prometheus-query-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."} + grouping: + - handler + total: + metric: 
prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml new file mode 100644 index 0000000000..d28e7fe003 --- /dev/null +++ b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: prometheus + role: alert-rules + name: prometheus-rule-evaluation-failures + namespace: monitoring +spec: + description: Rule and alerting rules are being evaluated every few seconds. This needs to work for recording rules to be created and most importantly for alerts to be evaluated. + indicator: + ratio: + errors: + metric: prometheus_rule_evaluation_failures_total{job="prometheus-k8s"} + total: + metric: prometheus_rule_evaluations_total{job="prometheus-k8s"} + target: "99.99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml new file mode 100644 index 0000000000..a48781c58d --- /dev/null +++ b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml @@ -0,0 +1,19 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + pyrra.dev/component: prometheus + role: alert-rules + name: prometheus-sd-kubernetes-errors + namespace: monitoring +spec: + description: If there are too many errors Prometheus is having a bad time discovering new Kubernetes services. 
+ indicator: + ratio: + errors: + metric: prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"} + total: + metric: prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"} + target: "99" + window: 2w diff --git a/manifests/setup/crd.yaml b/manifests/setup/crd.yaml new file mode 100644 index 0000000000..cfe5ad848b --- /dev/null +++ b/manifests/setup/crd.yaml @@ -0,0 +1,162 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.11.1 + creationTimestamp: null + name: servicelevelobjectives.pyrra.dev +spec: + group: pyrra.dev + names: + kind: ServiceLevelObjective + listKind: ServiceLevelObjectiveList + plural: servicelevelobjectives + shortNames: + - slo + singular: servicelevelobjective + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: ServiceLevelObjective is the Schema for the ServiceLevelObjectives API. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: ServiceLevelObjectiveSpec defines the desired state of ServiceLevelObjective. + properties: + alerting: + description: Alerting customizes the alerting rules generated by Pyrra. + properties: + disabled: + description: Disabled is used to disable the generation of alerts. 
Recording rules are still generated. + type: boolean + name: + description: Name is used as the name of the alert generated by Pyrra. Defaults to "ErrorBudgetBurn". + type: string + type: object + description: + description: Description describes the ServiceLevelObjective in more detail and gives extra context for engineers that might not directly work on the service. + type: string + indicator: + description: ServiceLevelIndicator is the underlying data source that indicates how the service is doing. This will be a Prometheus metric with specific selectors for your service. + properties: + bool_gauge: + description: BoolGauge is the indicator that measures wheter a boolean gauge is successul. + properties: + grouping: + description: Total is the metric that returns how many requests there are in total. + items: + type: string + type: array + metric: + type: string + required: + - grouping + - metric + type: object + latency: + description: Latency is the indicator that measures a certain percentage to be faster than. + properties: + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. + items: + type: string + type: array + success: + description: Success is the metric that returns how many errors there are. + properties: + metric: + type: string + required: + - metric + type: object + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - success + - total + type: object + latencyNative: + description: LatencyNative is the indicator that measures a certain percentage to be faster than the expected latency. This uses the new native histograms in Prometheus. + properties: + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. 
+ items: + type: string + type: array + latency: + description: Latency the requests should be faster than. + format: int64 + type: integer + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - latency + - total + type: object + ratio: + description: Ratio is the indicator that measures against errors / total events. + properties: + errors: + description: Errors is the metric that returns how many errors there are. + properties: + metric: + type: string + required: + - metric + type: object + grouping: + description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. + items: + type: string + type: array + total: + description: Total is the metric that returns how many requests there are in total. + properties: + metric: + type: string + required: + - metric + type: object + required: + - errors + - total + type: object + type: object + target: + description: 'Target is a string that''s casted to a float64 between 0 - 100. It represents the desired availability of the service in the given window. float64 are not supported: https://github.com/kubernetes-sigs/controller-tools/issues/245' + type: string + window: + description: Window within which the Target is supposed to be kept. Usually something like 1d, 7d or 28d. + type: string + required: + - indicator + - target + - window + type: object + status: + description: ServiceLevelObjectiveStatus defines the observed state of ServiceLevelObjective. 
+ type: object + type: object + served: true + storage: true From 1008ef97132fa21f3150a6ef3b4096cea0325cdb Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 15 Sep 2023 16:30:37 +0200 Subject: [PATCH 07/14] Use upstream pyrra jsonnet as foundation for component --- .../components/pyrra.libsonnet | 487 +----------------- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- 2 files changed, 15 insertions(+), 474 deletions(-) diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet b/jsonnet/kube-prometheus/components/pyrra.libsonnet index e70be27eab..7ca367c735 100644 --- a/jsonnet/kube-prometheus/components/pyrra.libsonnet +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -1,484 +1,25 @@ +local pyrra = import 'github.com/pyrra-dev/pyrra/jsonnet/pyrra/kubernetes.libsonnet'; + local defaults = { local defaults = self, name:: 'pyrra', namespace:: error 'must provide namespace', version:: error 'must provide version', - image: error 'must provide image', - replicas:: 1, - port:: 9099, - - commonLabels:: { - 'app.kubernetes.io/name': 'pyrra', - 'app.kubernetes.io/version': defaults.version, - 'app.kubernetes.io/part-of': 'kube-prometheus', + image:: error 'must provide image', + resources:: { + limits: { cpu: '200m', memory: '512Mi' }, + requests: { cpu: '100m', memory: '100Mi' }, }, }; -function(params) { - local pyrra = self, - _config:: defaults + params, - - crd: ( - import 'github.com/pyrra-dev/pyrra/config/crd/bases/pyrra.dev_servicelevelobjectives.json' - ), - - _apiMetadata:: { - name: pyrra._config.name + '-api', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 'app.kubernetes.io/component': 'api', - }, - }, - apiSelectorLabels:: { - [labelName]: pyrra._apiMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._apiMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - apiService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._apiMetadata, - spec: { - 
ports: [ - { name: 'http', targetPort: pyrra._config.port, port: pyrra._config.port }, - ], - selector: pyrra.apiSelectorLabels, - }, - }, - - apiDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'api', - '--api-url=http://%s.%s.svc.cluster.local:9444' % [pyrra.kubernetesService.metadata.name, pyrra.kubernetesService.metadata.namespace], - '--prometheus-url=http://prometheus-k8s.%s.svc.cluster.local:9090' % pyrra._config.namespace, - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._apiMetadata, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.apiSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._apiMetadata.labels }, - spec: { - containers: [c], - // serviceAccountName: $.serviceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - _kubernetesMetadata:: { - name: pyrra._config.name + '-kubernetes', - namespace: pyrra._config.namespace, - labels: pyrra._config.commonLabels { - 'app.kubernetes.io/component': 'kubernetes', - }, - }, - kubernetesSelectorLabels:: { - [labelName]: pyrra._kubernetesMetadata.labels[labelName] - for labelName in std.objectFields(pyrra._kubernetesMetadata.labels) - if !std.setMember(labelName, ['app.kubernetes.io/version']) - }, - - kubernetesServiceAccount: { - apiVersion: 'v1', - kind: 'ServiceAccount', - metadata: pyrra._kubernetesMetadata, - }, - - kubernetesClusterRole: { - apiVersion: 'rbac.authorization.k8s.io/v1', - kind: 'ClusterRole', - metadata: pyrra._kubernetesMetadata, - rules: [{ - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 
'update', 'watch'], - }, { - apiGroups: ['monitoring.coreos.com'], - resources: ['prometheusrules/status'], - verbs: ['get'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives'], - verbs: ['create', 'delete', 'get', 'list', 'patch', 'update', 'watch'], - }, { - apiGroups: ['pyrra.dev'], - resources: ['servicelevelobjectives/status'], - verbs: ['get', 'patch', 'update'], - }], - }, - - kubernetesClusterRoleBinding: { - apiVersion: 'rbac.authorization.k8s.io/v1', - kind: 'ClusterRoleBinding', - metadata: pyrra._kubernetesMetadata, - roleRef: { - apiGroup: 'rbac.authorization.k8s.io', - kind: 'ClusterRole', - name: pyrra.kubernetesClusterRole.metadata.name, - }, - subjects: [{ - kind: 'ServiceAccount', - name: pyrra.kubernetesServiceAccount.metadata.name, - namespace: pyrra._config.namespace, - }], - }, - - kubernetesService: { - apiVersion: 'v1', - kind: 'Service', - metadata: pyrra._kubernetesMetadata, - spec: { - ports: [ - { name: 'http', targetPort: 9444, port: 9444 }, - ], - selector: pyrra.kubernetesSelectorLabels, - }, - }, - - kubernetesDeployment: - local c = { - name: pyrra._config.name, - image: pyrra._config.image, - args: [ - 'kubernetes', - '--generic-rules', - ], - // resources: pyrra._config.resources, - ports: [{ containerPort: pyrra._config.port }], - securityContext: { - allowPrivilegeEscalation: false, - readOnlyRootFilesystem: true, - }, - }; - - { - apiVersion: 'apps/v1', - kind: 'Deployment', - metadata: pyrra._kubernetesMetadata { - name: pyrra._config.name + '-kubernetes', - }, - spec: { - replicas: pyrra._config.replicas, - selector: { - matchLabels: pyrra.kubernetesSelectorLabels, - }, - strategy: { - rollingUpdate: { - maxSurge: 1, - maxUnavailable: 1, - }, - }, - template: { - metadata: { labels: pyrra._kubernetesMetadata.labels }, - spec: { - containers: [c], - serviceAccountName: pyrra.kubernetesServiceAccount.metadata.name, - nodeSelector: { 'kubernetes.io/os': 'linux' }, - }, - }, - }, - }, - - // Most of these 
should eventually be moved to the components themselves. - // For now, this is a good start to have everything in one place. - 'slo-apiserver-read-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-response-errors', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'apiserver', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{job="apiserver",verb=~"LIST|GET"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-write-response-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-write-response-errors', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'apiserver', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}', - }, - total: { - metric: 'apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}', - }, - }, - }, +function(params) + local config = defaults { + values+:: { + pyrra+: params, }, - }, - - 'slo-apiserver-read-resource-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-resource-latency', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'apiserver', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - // metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"}', - metric: 
'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="0.1"}', - }, - total: { - // metric: 'apiserver_request_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET"}', - metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-namespace-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-namespace-latency', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'apiserver', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - //metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"}', - metric: 'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"}', - }, - total: { - //metric: 'apiserver_request_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET"}', - metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', - }, - }, - }, - }, - }, - - 'slo-apiserver-read-cluster-latency': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'apiserver-read-cluster-latency', - namespace: 'kube-system', - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'apiserver', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - latency: { - success: { - //metric: 'apiserver_request_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"}', - metric: 
'apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"}', - }, - total: { - //metric: 'apiserver_request_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET"}', - metric: 'apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-rule-evaluation-failures': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-rule-evaluation-failures', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'prometheus', - }, - }, - spec: { - target: '99.99', - window: '2w', - description: 'Rule and alerting rules are being evaluated every few seconds. This needs to work for recording rules to be created and most importantly for alerts to be evaluated.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_rule_evaluation_failures_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_rule_evaluations_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-sd-kubernetes-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-sd-kubernetes-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'prometheus', - }, - }, - spec: { - target: '99', - window: '2w', - description: 'If there are too many errors Prometheus is having a bad time discovering new Kubernetes services.', - indicator: { - ratio: { - errors: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s",status_code=~"5..|"}', - }, - total: { - metric: 'prometheus_sd_kubernetes_http_request_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-query-errors': { - apiVersion: 
'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-query-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'prometheus', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - grouping: ['handler'], - errors: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*",code=~"5.."}', - }, - total: { - metric: 'prometheus_http_requests_total{job="prometheus-k8s",handler=~"/api/v1/query.*"}', - }, - }, - }, - }, - }, - - 'slo-prometheus-notification-errors': { - apiVersion: 'pyrra.dev/v1alpha1', - kind: 'ServiceLevelObjective', - metadata: { - name: 'prometheus-notification-errors', - namespace: pyrra._config.namespace, - labels: { - prometheus: 'k8s', - role: 'alert-rules', - 'pyrra.dev/component': 'prometheus', - }, - }, - spec: { - target: '99', - window: '2w', - description: '', - indicator: { - ratio: { - errors: { - metric: 'prometheus_notifications_errors_total{job="prometheus-k8s"}', - }, - total: { - metric: 'prometheus_notifications_sent_total{job="prometheus-k8s"}', - }, - }, - }, - }, - }, -} - -// 1000 * histogram_quantile(0.99, rate(kubelet_runtime_operations_duration_seconds_bucket{operation_type="list_containers"}[5m])) -// rate(kubelet_runtime_operations_duration_seconds_count[5m]) + }; + // Safety check + assert std.isObject(config.resources); -// etcd! -// kube-proxy? 
-// kube-scheduler + (pyrra + config).pyrra diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index 786accc377..a546b4067e 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -108,7 +108,7 @@ "source": { "git": { "remote": "https://github.com/pyrra-dev/pyrra.git", - "subdir": "config/crd/bases" + "subdir": "jsonnet" } }, "version": "release-0.6", From 7b6c4d6c45fcfe0e46a7ea1efdafe1879a4bb354 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Fri, 15 Sep 2023 16:36:39 +0200 Subject: [PATCH 08/14] Enable Pyrra generic rules for Grafana by default --- .../components/pyrra.libsonnet | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/jsonnet/kube-prometheus/components/pyrra.libsonnet b/jsonnet/kube-prometheus/components/pyrra.libsonnet index 7ca367c735..00467cf6f7 100644 --- a/jsonnet/kube-prometheus/components/pyrra.libsonnet +++ b/jsonnet/kube-prometheus/components/pyrra.libsonnet @@ -22,4 +22,22 @@ function(params) // Safety check assert std.isObject(config.resources); - (pyrra + config).pyrra + (pyrra + config).pyrra { + // Enable generic rules for kube-prometheus by default + kubernetesDeployment+: { + spec+: { + template+: { + spec+: { + containers: [ + c { + args+: [ + '--generic-rules', + ], + } + for c in super.containers + ], + }, + }, + }, + }, + } From 4d1e3d3dcbf492c8cec635f2338bc069831e752a Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 16 Sep 2023 16:26:27 +0300 Subject: [PATCH 09/14] Update Pyrra dependency --- example.jsonnet | 2 +- jsonnet/kube-prometheus/jsonnetfile.json | 2 +- jsonnetfile.lock.json | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index f530474183..4b3a3c5a18 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -13,7 +13,7 @@ local kp = namespace: 'monitoring', }, kubernetesControlPlane+: { - kubeProxy:true, + kubeProxy: true, }, }, 
}; diff --git a/jsonnet/kube-prometheus/jsonnetfile.json b/jsonnet/kube-prometheus/jsonnetfile.json index a546b4067e..5f8b08225a 100644 --- a/jsonnet/kube-prometheus/jsonnetfile.json +++ b/jsonnet/kube-prometheus/jsonnetfile.json @@ -111,7 +111,7 @@ "subdir": "jsonnet" } }, - "version": "release-0.6", + "version": "main", "name": "pyrra" }, { diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 7cb3fbe78b..34d63a61bf 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -148,11 +148,11 @@ "source": { "git": { "remote": "https://github.com/pyrra-dev/pyrra.git", - "subdir": "config/crd/bases" + "subdir": "jsonnet" } }, - "version": "551856d42dff02ec38c5b0ea6a2d99c4cb127e82", - "sum": "bY/Pcrrbynguq8/HaI88cQ3B2hLv/xc+76QILY7IL+g=", + "version": "f638b929cfc2f55020cadfa70f67942bad865f7a", + "sum": "l00ZaFyXmEf6i+59sknEp8Vxsv7TaohubjnKSutztEg=", "name": "pyrra" }, { From 7be5262ece62d407590ad095847c64673ccd277e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 16 Sep 2023 16:34:49 +0300 Subject: [PATCH 10/14] Add changes to the manifests folder --- .../prometheusOperator-sloHTTPErrors.yaml | 2 +- ...prometheusOperator-sloReconcileErrors.yaml | 2 +- manifests/pyrra-apiDeployment.yaml | 6 ++-- manifests/pyrra-apiService.yaml | 2 +- manifests/pyrra-kubernetesClusterRole.yaml | 2 +- .../pyrra-kubernetesClusterRoleBinding.yaml | 2 +- manifests/pyrra-kubernetesDeployment.yaml | 6 ++-- manifests/pyrra-kubernetesService.yaml | 5 +++- manifests/pyrra-kubernetesServiceAccount.yaml | 2 +- ...ra-slo-apiserver-read-cluster-latency.yaml | 7 ++--- ...-slo-apiserver-read-namespace-latency.yaml | 7 ++--- ...a-slo-apiserver-read-resource-latency.yaml | 7 ++--- ...ra-slo-apiserver-read-response-errors.yaml | 7 ++--- ...a-slo-apiserver-write-response-errors.yaml | 7 ++--- .../pyrra-slo-coredns-response-errors.yaml | 18 ++++++++++++ .../pyrra-slo-kubelet-request-errors.yaml | 18 ++++++++++++ .../pyrra-slo-kubelet-runtime-errors.yaml | 18 ++++++++++++ 
...ra-slo-prometheus-notification-errors.yaml | 1 - ...a-slo-prometheus-operator-http-errors.yaml | 18 ++++++++++++ ...-prometheus-operator-reconcile-errors.yaml | 20 +++++++++++++ .../pyrra-slo-prometheus-query-errors.yaml | 1 - ...o-prometheus-rule-evaluation-failures.yaml | 1 - ...a-slo-prometheus-sd-kubernetes-errors.yaml | 1 - manifests/setup/crd.yaml | 29 +++++++++++++++---- 24 files changed, 146 insertions(+), 43 deletions(-) create mode 100644 manifests/pyrra-slo-coredns-response-errors.yaml create mode 100644 manifests/pyrra-slo-kubelet-request-errors.yaml create mode 100644 manifests/pyrra-slo-kubelet-runtime-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-operator-http-errors.yaml create mode 100644 manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml diff --git a/manifests/prometheusOperator-sloHTTPErrors.yaml b/manifests/prometheusOperator-sloHTTPErrors.yaml index b183af08ad..35441662d2 100644 --- a/manifests/prometheusOperator-sloHTTPErrors.yaml +++ b/manifests/prometheusOperator-sloHTTPErrors.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.66.0 + app.kubernetes.io/version: 0.67.1 prometheus: k8s pyrra.dev/component: prometheus-operator role: alert-rules diff --git a/manifests/prometheusOperator-sloReconcileErrors.yaml b/manifests/prometheusOperator-sloReconcileErrors.yaml index fd8506e4d3..2246e10c67 100644 --- a/manifests/prometheusOperator-sloReconcileErrors.yaml +++ b/manifests/prometheusOperator-sloReconcileErrors.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.66.0 + app.kubernetes.io/version: 0.67.1 prometheus: k8s pyrra.dev/component: prometheus-operator role: alert-rules diff --git a/manifests/pyrra-apiDeployment.yaml 
b/manifests/pyrra-apiDeployment.yaml index dab7f7b683..eed23a3fca 100644 --- a/manifests/pyrra-apiDeployment.yaml +++ b/manifests/pyrra-apiDeployment.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-api namespace: monitoring spec: @@ -25,14 +25,14 @@ spec: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 spec: containers: - args: - api - --api-url=http://pyrra-kubernetes.monitoring.svc.cluster.local:9444 - --prometheus-url=http://prometheus-k8s.monitoring.svc.cluster.local:9090 - image: ghcr.io/pyrra-dev/pyrra:v0.6.3 + image: ghcr.io/pyrra-dev/pyrra:v0.6.4 name: pyrra ports: - containerPort: 9099 diff --git a/manifests/pyrra-apiService.yaml b/manifests/pyrra-apiService.yaml index 4e731d90e1..248dc02354 100644 --- a/manifests/pyrra-apiService.yaml +++ b/manifests/pyrra-apiService.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-api namespace: monitoring spec: diff --git a/manifests/pyrra-kubernetesClusterRole.yaml b/manifests/pyrra-kubernetesClusterRole.yaml index 8f3cb9f8cd..571606a7b9 100644 --- a/manifests/pyrra-kubernetesClusterRole.yaml +++ b/manifests/pyrra-kubernetesClusterRole.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring rules: diff --git a/manifests/pyrra-kubernetesClusterRoleBinding.yaml b/manifests/pyrra-kubernetesClusterRoleBinding.yaml index 0102293d48..32945a87ad 100644 --- 
a/manifests/pyrra-kubernetesClusterRoleBinding.yaml +++ b/manifests/pyrra-kubernetesClusterRoleBinding.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring roleRef: diff --git a/manifests/pyrra-kubernetesDeployment.yaml b/manifests/pyrra-kubernetesDeployment.yaml index caaa30ff9c..caa6e28149 100644 --- a/manifests/pyrra-kubernetesDeployment.yaml +++ b/manifests/pyrra-kubernetesDeployment.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring spec: @@ -25,13 +25,13 @@ spec: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 spec: containers: - args: - kubernetes - --generic-rules - image: ghcr.io/pyrra-dev/pyrra:v0.6.3 + image: ghcr.io/pyrra-dev/pyrra:v0.6.4 name: pyrra ports: - containerPort: 9099 diff --git a/manifests/pyrra-kubernetesService.yaml b/manifests/pyrra-kubernetesService.yaml index fe7dac2f7f..06e3d3a8ae 100644 --- a/manifests/pyrra-kubernetesService.yaml +++ b/manifests/pyrra-kubernetesService.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring spec: @@ -13,6 +13,9 @@ spec: - name: http port: 9444 targetPort: 9444 + - name: webhooks + port: 9443 + targetPort: 9443 selector: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-kubernetesServiceAccount.yaml 
b/manifests/pyrra-kubernetesServiceAccount.yaml index ced6637c3f..e84bc6e9f5 100644 --- a/manifests/pyrra-kubernetesServiceAccount.yaml +++ b/manifests/pyrra-kubernetesServiceAccount.yaml @@ -5,6 +5,6 @@ metadata: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.6.3 + app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring diff --git a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml index bb7ff787dc..29c968b322 100644 --- a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml @@ -3,17 +3,16 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: apiserver role: alert-rules name: apiserver-read-cluster-latency - namespace: kube-system + namespace: monitoring spec: description: "" indicator: latency: success: - metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"} + metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"} total: - metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"cluster|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml index a0b1172f26..2cecd6cb34 100644 --- a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml @@ -3,17 +3,16 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: apiserver role: alert-rules name: 
apiserver-read-namespace-latency - namespace: kube-system + namespace: monitoring spec: description: "" indicator: latency: success: - metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="5"} + metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"} total: - metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"namespace|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml index bc38ee57b4..3d9a85b5fe 100644 --- a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml @@ -3,17 +3,16 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: apiserver role: alert-rules name: apiserver-read-resource-latency - namespace: kube-system + namespace: monitoring spec: description: "" indicator: latency: success: - metric: apiserver_request_slo_duration_seconds_bucket{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)",le="0.1"} + metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"} total: - metric: apiserver_request_slo_duration_seconds_count{job="apiserver",scope=~"resource|",verb=~"LIST|GET",subresource!~"/(healthz|livez|readyz)"} + metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"resource|",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-response-errors.yaml b/manifests/pyrra-slo-apiserver-read-response-errors.yaml index d9dc2f5fe6..067ca6c928 
100644 --- a/manifests/pyrra-slo-apiserver-read-response-errors.yaml +++ b/manifests/pyrra-slo-apiserver-read-response-errors.yaml @@ -3,17 +3,16 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: apiserver role: alert-rules name: apiserver-read-response-errors - namespace: kube-system + namespace: monitoring spec: description: "" indicator: ratio: errors: - metric: apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."} + metric: apiserver_request_total{component="apiserver",verb=~"LIST|GET",code=~"5.."} total: - metric: apiserver_request_total{job="apiserver",verb=~"LIST|GET"} + metric: apiserver_request_total{component="apiserver",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-write-response-errors.yaml b/manifests/pyrra-slo-apiserver-write-response-errors.yaml index e7c9774ece..c94985d599 100644 --- a/manifests/pyrra-slo-apiserver-write-response-errors.yaml +++ b/manifests/pyrra-slo-apiserver-write-response-errors.yaml @@ -3,17 +3,16 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: apiserver role: alert-rules name: apiserver-write-response-errors - namespace: kube-system + namespace: monitoring spec: description: "" indicator: ratio: errors: - metric: apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."} + metric: apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."} total: - metric: apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"} + metric: apiserver_request_total{component="apiserver",verb=~"POST|PUT|PATCH|DELETE"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-coredns-response-errors.yaml b/manifests/pyrra-slo-coredns-response-errors.yaml new file mode 100644 index 0000000000..346a7fcf28 --- /dev/null +++ b/manifests/pyrra-slo-coredns-response-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective 
+metadata: + labels: + prometheus: k8s + role: alert-rules + name: coredns-response-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: coredns_dns_responses_total{job="kube-dns",rcode="SERVFAIL"} + total: + metric: coredns_dns_responses_total{job="kube-dns"} + target: "99.99" + window: 2w diff --git a/manifests/pyrra-slo-kubelet-request-errors.yaml b/manifests/pyrra-slo-kubelet-request-errors.yaml new file mode 100644 index 0000000000..5696de98b1 --- /dev/null +++ b/manifests/pyrra-slo-kubelet-request-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: kubelet-request-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: rest_client_requests_total{job="kubelet",code=~"5.."} + total: + metric: rest_client_requests_total{job="kubelet"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-kubelet-runtime-errors.yaml b/manifests/pyrra-slo-kubelet-runtime-errors.yaml new file mode 100644 index 0000000000..a7a95f89bc --- /dev/null +++ b/manifests/pyrra-slo-kubelet-runtime-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: kubelet-runtime-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: kubelet_runtime_operations_errors_total{job="kubelet"} + total: + metric: kubelet_runtime_operations_total{job="kubelet"} + target: "99" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-notification-errors.yaml b/manifests/pyrra-slo-prometheus-notification-errors.yaml index 6e29e8314c..c9e01cfe4c 100644 --- a/manifests/pyrra-slo-prometheus-notification-errors.yaml +++ b/manifests/pyrra-slo-prometheus-notification-errors.yaml @@ -3,7 +3,6 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: 
prometheus role: alert-rules name: prometheus-notification-errors namespace: monitoring diff --git a/manifests/pyrra-slo-prometheus-operator-http-errors.yaml b/manifests/pyrra-slo-prometheus-operator-http-errors.yaml new file mode 100644 index 0000000000..217cd0013b --- /dev/null +++ b/manifests/pyrra-slo-prometheus-operator-http-errors.yaml @@ -0,0 +1,18 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-operator-http-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator",status_code=~"5.."} + total: + metric: prometheus_operator_kubernetes_client_http_requests_total{job="prometheus-operator"} + target: "99.5" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml b/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml new file mode 100644 index 0000000000..0c579bbdc8 --- /dev/null +++ b/manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml @@ -0,0 +1,20 @@ +apiVersion: pyrra.dev/v1alpha1 +kind: ServiceLevelObjective +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-operator-reconcile-errors + namespace: monitoring +spec: + description: "" + indicator: + ratio: + errors: + metric: prometheus_operator_reconcile_errors_total{job="prometheus-operator"} + grouping: + - controller + total: + metric: prometheus_operator_reconcile_operations_total{job="prometheus-operator"} + target: "95" + window: 2w diff --git a/manifests/pyrra-slo-prometheus-query-errors.yaml b/manifests/pyrra-slo-prometheus-query-errors.yaml index 3de4fbc592..99357ea44f 100644 --- a/manifests/pyrra-slo-prometheus-query-errors.yaml +++ b/manifests/pyrra-slo-prometheus-query-errors.yaml @@ -3,7 +3,6 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: prometheus role: 
alert-rules name: prometheus-query-errors namespace: monitoring diff --git a/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml index d28e7fe003..32c542b6ec 100644 --- a/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml +++ b/manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml @@ -3,7 +3,6 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: prometheus role: alert-rules name: prometheus-rule-evaluation-failures namespace: monitoring diff --git a/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml index a48781c58d..c85d5205f5 100644 --- a/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml +++ b/manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml @@ -3,7 +3,6 @@ kind: ServiceLevelObjective metadata: labels: prometheus: k8s - pyrra.dev/component: prometheus role: alert-rules name: prometheus-sd-kubernetes-errors namespace: monitoring diff --git a/manifests/setup/crd.yaml b/manifests/setup/crd.yaml index cfe5ad848b..847543716e 100644 --- a/manifests/setup/crd.yaml +++ b/manifests/setup/crd.yaml @@ -16,7 +16,20 @@ spec: singular: servicelevelobjective scope: Namespaced versions: - - name: v1alpha1 + - additionalPrinterColumns: + - jsonPath: .spec.window + name: Window + type: string + - jsonPath: .spec.target + name: Target + type: string + - jsonPath: .status.type + name: Type + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 schema: openAPIV3Schema: description: ServiceLevelObjective is the Schema for the ServiceLevelObjectives API. @@ -49,7 +62,7 @@ spec: description: ServiceLevelIndicator is the underlying data source that indicates how the service is doing. This will be a Prometheus metric with specific selectors for your service. 
properties: bool_gauge: - description: BoolGauge is the indicator that measures wheter a boolean gauge is successul. + description: BoolGauge is the indicator that measures whether a boolean gauge is successful. properties: grouping: description: Total is the metric that returns how many requests there are in total. @@ -59,11 +72,10 @@ spec: metric: type: string required: - - grouping - metric type: object latency: - description: Latency is the indicator that measures a certain percentage to be faster than. + description: Latency is the indicator that measures a certain percentage to be faster than the expected latency. properties: grouping: description: Grouping allows an SLO to be defined for many SLI at once, like HTTP handlers for example. @@ -100,8 +112,7 @@ spec: type: array latency: description: Latency the requests should be faster than. - format: int64 - type: integer + type: string total: description: Total is the metric that returns how many requests there are in total. properties: @@ -156,7 +167,13 @@ spec: type: object status: description: ServiceLevelObjectiveStatus defines the observed state of ServiceLevelObjective. 
+ properties: + type: + description: Type is the generated resource type, like PrometheusRule or ConfigMap + type: string type: object type: object served: true storage: true + subresources: + status: {} From 1f8620095ffaad9ecb7605f9b64cae7ecb366c0f Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 30 Sep 2023 12:53:52 +0200 Subject: [PATCH 11/14] Update Pyrra to the released v0.7.0 --- jsonnetfile.lock.json | 4 ++-- kustomization.yaml | 8 ++++++++ manifests/pyrra-apiDeployment.yaml | 4 +--- manifests/pyrra-apiService.yaml | 2 -- manifests/pyrra-apiServiceAccount.yaml | 9 +++++++++ manifests/pyrra-apiServiceMonitor.yaml | 19 +++++++++++++++++++ manifests/pyrra-kubernetesClusterRole.yaml | 1 - .../pyrra-kubernetesClusterRoleBinding.yaml | 1 - manifests/pyrra-kubernetesDeployment.yaml | 3 --- manifests/pyrra-kubernetesService.yaml | 5 +++-- manifests/pyrra-kubernetesServiceAccount.yaml | 1 - manifests/pyrra-kubernetesServiceMonitor.yaml | 19 +++++++++++++++++++ ...ra-slo-apiserver-read-cluster-latency.yaml | 4 ++-- ...-slo-apiserver-read-namespace-latency.yaml | 4 ++-- ...a-slo-apiserver-read-resource-latency.yaml | 4 ++-- 15 files changed, 67 insertions(+), 21 deletions(-) create mode 100644 manifests/pyrra-apiServiceAccount.yaml create mode 100644 manifests/pyrra-apiServiceMonitor.yaml create mode 100644 manifests/pyrra-kubernetesServiceMonitor.yaml diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 34d63a61bf..d90df0a1bd 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -151,8 +151,8 @@ "subdir": "jsonnet" } }, - "version": "f638b929cfc2f55020cadfa70f67942bad865f7a", - "sum": "l00ZaFyXmEf6i+59sknEp8Vxsv7TaohubjnKSutztEg=", + "version": "e6d76176d1adbd4712561a1e61caca470edd4002", + "sum": "egH5yDS/wTfqNLFm7tSWafIsEGjOuk8xDj2PFhCWX2A=", "name": "pyrra" }, { diff --git a/kustomization.yaml b/kustomization.yaml index e51683a9e5..9421247310 100644 --- a/kustomization.yaml +++ b/kustomization.yaml @@ -95,17 +95,25 @@ resources: - 
./manifests/prometheusOperator-sloReconcileErrors.yaml - ./manifests/pyrra-apiDeployment.yaml - ./manifests/pyrra-apiService.yaml +- ./manifests/pyrra-apiServiceAccount.yaml +- ./manifests/pyrra-apiServiceMonitor.yaml - ./manifests/pyrra-kubernetesClusterRole.yaml - ./manifests/pyrra-kubernetesClusterRoleBinding.yaml - ./manifests/pyrra-kubernetesDeployment.yaml - ./manifests/pyrra-kubernetesService.yaml - ./manifests/pyrra-kubernetesServiceAccount.yaml +- ./manifests/pyrra-kubernetesServiceMonitor.yaml - ./manifests/pyrra-slo-apiserver-read-cluster-latency.yaml - ./manifests/pyrra-slo-apiserver-read-namespace-latency.yaml - ./manifests/pyrra-slo-apiserver-read-resource-latency.yaml - ./manifests/pyrra-slo-apiserver-read-response-errors.yaml - ./manifests/pyrra-slo-apiserver-write-response-errors.yaml +- ./manifests/pyrra-slo-coredns-response-errors.yaml +- ./manifests/pyrra-slo-kubelet-request-errors.yaml +- ./manifests/pyrra-slo-kubelet-runtime-errors.yaml - ./manifests/pyrra-slo-prometheus-notification-errors.yaml +- ./manifests/pyrra-slo-prometheus-operator-http-errors.yaml +- ./manifests/pyrra-slo-prometheus-operator-reconcile-errors.yaml - ./manifests/pyrra-slo-prometheus-query-errors.yaml - ./manifests/pyrra-slo-prometheus-rule-evaluation-failures.yaml - ./manifests/pyrra-slo-prometheus-sd-kubernetes-errors.yaml diff --git a/manifests/pyrra-apiDeployment.yaml b/manifests/pyrra-apiDeployment.yaml index eed23a3fca..fad7cf027b 100644 --- a/manifests/pyrra-apiDeployment.yaml +++ b/manifests/pyrra-apiDeployment.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-api namespace: monitoring @@ -14,7 +13,6 @@ spec: matchLabels: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus strategy: rollingUpdate: maxSurge: 1 @@ -24,7 +22,6 @@ spec: labels: app.kubernetes.io/component: 
api app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 spec: containers: @@ -41,3 +38,4 @@ spec: readOnlyRootFilesystem: true nodeSelector: kubernetes.io/os: linux + serviceAccountName: pyrra-api diff --git a/manifests/pyrra-apiService.yaml b/manifests/pyrra-apiService.yaml index 248dc02354..74b5358332 100644 --- a/manifests/pyrra-apiService.yaml +++ b/manifests/pyrra-apiService.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-api namespace: monitoring @@ -16,4 +15,3 @@ spec: selector: app.kubernetes.io/component: api app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/pyrra-apiServiceAccount.yaml b/manifests/pyrra-apiServiceAccount.yaml new file mode 100644 index 0000000000..b37ae4a76a --- /dev/null +++ b/manifests/pyrra-apiServiceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.6.4 + name: pyrra-api + namespace: monitoring diff --git a/manifests/pyrra-apiServiceMonitor.yaml b/manifests/pyrra-apiServiceMonitor.yaml new file mode 100644 index 0000000000..bc501f3e23 --- /dev/null +++ b/manifests/pyrra-apiServiceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.6.4 + name: pyrra-api + namespace: monitoring +spec: + endpoints: + - port: http + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app.kubernetes.io/component: api + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-kubernetesClusterRole.yaml b/manifests/pyrra-kubernetesClusterRole.yaml index 571606a7b9..e0a1cbdfbd 100644 --- 
a/manifests/pyrra-kubernetesClusterRole.yaml +++ b/manifests/pyrra-kubernetesClusterRole.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring diff --git a/manifests/pyrra-kubernetesClusterRoleBinding.yaml b/manifests/pyrra-kubernetesClusterRoleBinding.yaml index 32945a87ad..2f6bb547dd 100644 --- a/manifests/pyrra-kubernetesClusterRoleBinding.yaml +++ b/manifests/pyrra-kubernetesClusterRoleBinding.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring diff --git a/manifests/pyrra-kubernetesDeployment.yaml b/manifests/pyrra-kubernetesDeployment.yaml index caa6e28149..6f54f74ded 100644 --- a/manifests/pyrra-kubernetesDeployment.yaml +++ b/manifests/pyrra-kubernetesDeployment.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring @@ -14,7 +13,6 @@ spec: matchLabels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus strategy: rollingUpdate: maxSurge: 1 @@ -24,7 +22,6 @@ spec: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 spec: containers: diff --git a/manifests/pyrra-kubernetesService.yaml b/manifests/pyrra-kubernetesService.yaml index 06e3d3a8ae..3dd330429b 100644 --- a/manifests/pyrra-kubernetesService.yaml +++ b/manifests/pyrra-kubernetesService.yaml @@ -4,12 +4,14 @@ metadata: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - 
app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring spec: ports: + - name: metrics + port: 8080 + targetPort: 8080 - name: http port: 9444 targetPort: 9444 @@ -19,4 +21,3 @@ spec: selector: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus diff --git a/manifests/pyrra-kubernetesServiceAccount.yaml b/manifests/pyrra-kubernetesServiceAccount.yaml index e84bc6e9f5..24b0aa650d 100644 --- a/manifests/pyrra-kubernetesServiceAccount.yaml +++ b/manifests/pyrra-kubernetesServiceAccount.yaml @@ -4,7 +4,6 @@ metadata: labels: app.kubernetes.io/component: kubernetes app.kubernetes.io/name: pyrra - app.kubernetes.io/part-of: kube-prometheus app.kubernetes.io/version: 0.6.4 name: pyrra-kubernetes namespace: monitoring diff --git a/manifests/pyrra-kubernetesServiceMonitor.yaml b/manifests/pyrra-kubernetesServiceMonitor.yaml new file mode 100644 index 0000000000..924f6743a2 --- /dev/null +++ b/manifests/pyrra-kubernetesServiceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra + app.kubernetes.io/version: 0.6.4 + name: pyrra-kubernetes + namespace: monitoring +spec: + endpoints: + - port: metrics + namespaceSelector: + matchNames: + - monitoring + selector: + matchLabels: + app.kubernetes.io/component: kubernetes + app.kubernetes.io/name: pyrra diff --git a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml index 29c968b322..ddc701d37a 100644 --- a/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-cluster-latency.yaml @@ -11,8 +11,8 @@ spec: indicator: latency: success: - metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"} + metric: 
apiserver_request_slo_duration_seconds_bucket{component="apiserver",scope=~"cluster|",verb=~"LIST|GET",le="5"} total: - metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"} + metric: apiserver_request_slo_duration_seconds_count{component="apiserver",scope=~"cluster|",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml index 2cecd6cb34..86e4c391ff 100644 --- a/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-namespace-latency.yaml @@ -11,8 +11,8 @@ spec: indicator: latency: success: - metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"} + metric: apiserver_request_slo_duration_seconds_bucket{component="apiserver",scope=~"namespace|",verb=~"LIST|GET",le="5"} total: - metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"} + metric: apiserver_request_slo_duration_seconds_count{component="apiserver",scope=~"namespace|",verb=~"LIST|GET"} target: "99" window: 2w diff --git a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml index 3d9a85b5fe..ec0d7598b3 100644 --- a/manifests/pyrra-slo-apiserver-read-resource-latency.yaml +++ b/manifests/pyrra-slo-apiserver-read-resource-latency.yaml @@ -11,8 +11,8 @@ spec: indicator: latency: success: - metric: apiserver_request_sli_duration_seconds_bucket{component="apiserver",scope=~"resource|",verb=~"LIST|GET",le="0.1"} + metric: apiserver_request_slo_duration_seconds_bucket{verb=~"LIST|GET",le="0.1"} total: - metric: apiserver_request_sli_duration_seconds_count{component="apiserver",scope=~"resource|",verb=~"LIST|GET"} + metric: apiserver_request_slo_duration_seconds_count{verb=~"LIST|GET"} target: "99" window: 2w From 
895f628598f3adc6c09a95c1f9d0a714f725c49e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 30 Sep 2023 14:56:15 +0200 Subject: [PATCH 12/14] Generate after rebasing --- manifests/prometheusOperator-sloHTTPErrors.yaml | 2 +- manifests/prometheusOperator-sloReconcileErrors.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/manifests/prometheusOperator-sloHTTPErrors.yaml b/manifests/prometheusOperator-sloHTTPErrors.yaml index 35441662d2..e2a8928a68 100644 --- a/manifests/prometheusOperator-sloHTTPErrors.yaml +++ b/manifests/prometheusOperator-sloHTTPErrors.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.67.1 + app.kubernetes.io/version: 0.68.0 prometheus: k8s pyrra.dev/component: prometheus-operator role: alert-rules diff --git a/manifests/prometheusOperator-sloReconcileErrors.yaml b/manifests/prometheusOperator-sloReconcileErrors.yaml index 2246e10c67..a0a42bb108 100644 --- a/manifests/prometheusOperator-sloReconcileErrors.yaml +++ b/manifests/prometheusOperator-sloReconcileErrors.yaml @@ -5,7 +5,7 @@ metadata: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator app.kubernetes.io/part-of: kube-prometheus - app.kubernetes.io/version: 0.67.1 + app.kubernetes.io/version: 0.68.0 prometheus: k8s pyrra.dev/component: prometheus-operator role: alert-rules From db1b8c6adbec1364ef31c3801fd832ed5cb7506e Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 30 Sep 2023 16:48:05 +0200 Subject: [PATCH 13/14] Remove accidental change --- example.jsonnet | 3 --- 1 file changed, 3 deletions(-) diff --git a/example.jsonnet b/example.jsonnet index 4b3a3c5a18..5f9f1eae98 100644 --- a/example.jsonnet +++ b/example.jsonnet @@ -12,9 +12,6 @@ local kp = common+: { namespace: 'monitoring', }, - kubernetesControlPlane+: { - kubeProxy: true, - }, }, }; From 
33447869a8992bd6e757c66a286410ea4875cd53 Mon Sep 17 00:00:00 2001 From: Matthias Loibl Date: Sat, 30 Sep 2023 17:03:17 +0200 Subject: [PATCH 14/14] Fix problems to unblock CI --- ...ng-prometheus-rules-and-grafana-dashboards.md | 16 ++++++++++++---- docs/customizing.md | 16 ++++++++++++---- jsonnet/kube-prometheus/addons/pyrra.libsonnet | 0 .../components/k8s-control-plane.libsonnet | 2 +- 4 files changed, 25 insertions(+), 9 deletions(-) delete mode 100644 jsonnet/kube-prometheus/addons/pyrra.libsonnet diff --git a/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md b/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md index cfab2e687f..0a61e0623e 100644 --- a/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md +++ b/docs/customizations/developing-prometheus-rules-and-grafana-dashboards.md @@ -31,7 +31,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -43,17 +42,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 'prometheusRule' && + name != 'sloHTTPErrors' && + name != 'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor 
} + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/docs/customizing.md b/docs/customizing.md index a051ed4437..3fb3038f27 100644 --- a/docs/customizing.md +++ b/docs/customizing.md @@ -51,7 +51,6 @@ local kp = // (import 'kube-prometheus/addons/static-etcd.libsonnet') + // (import 'kube-prometheus/addons/custom-metrics.libsonnet') + // (import 'kube-prometheus/addons/external-metrics.libsonnet') + - // (import 'kube-prometheus/addons/pyrra.libsonnet') + { values+:: { common+: { @@ -63,17 +62,26 @@ local kp = { 'setup/0namespace-namespace': kp.kubePrometheus.namespace } + { ['setup/prometheus-operator-' + name]: kp.prometheusOperator[name] - for name in std.filter((function(name) name != 'serviceMonitor' && name != 'prometheusRule'), std.objectFields(kp.prometheusOperator)) + for name in std.filter( + (function(name) + name != 'serviceMonitor' && + name != 
'prometheusRule' && + name != 'sloHTTPErrors' && + name != 'sloReconcileErrors'), + std.objectFields(kp.prometheusOperator) + ) } + -// { 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + +{ 'setup/pyrra-slo-CustomResourceDefinition': kp.pyrra.crd } + // serviceMonitor and prometheusRule are separated so that they can be created after the CRDs are ready { 'prometheus-operator-serviceMonitor': kp.prometheusOperator.serviceMonitor } + { 'prometheus-operator-prometheusRule': kp.prometheusOperator.prometheusRule } + +{ 'prometheus-operator-sloHTTPErrors': kp.prometheusOperator.sloHTTPErrors } + +{ 'prometheus-operator-sloReconcileErrors': kp.prometheusOperator.sloReconcileErrors } + { 'kube-prometheus-prometheusRule': kp.kubePrometheus.prometheusRule } + { ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + { ['blackbox-exporter-' + name]: kp.blackboxExporter[name] for name in std.objectFields(kp.blackboxExporter) } + { ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } + -// { ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + +{ ['pyrra-' + name]: kp.pyrra[name] for name in std.objectFields(kp.pyrra) if name != 'crd' } + { ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + { ['kubernetes-' + name]: kp.kubernetesControlPlane[name] for name in std.objectFields(kp.kubernetesControlPlane) } { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + diff --git a/jsonnet/kube-prometheus/addons/pyrra.libsonnet b/jsonnet/kube-prometheus/addons/pyrra.libsonnet deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet index 0ce7ef65f1..478ccd4807 100644 --- 
a/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet +++ b/jsonnet/kube-prometheus/components/k8s-control-plane.libsonnet @@ -56,7 +56,7 @@ local defaults = { window: '2w', }, requestErrors: { - target: '90', // kube-proxy makes very few requests + target: '90', // kube-proxy makes very few requests window: '2w', }, },