From 5631d11f1b7d5cda1b6ade25af96f9a644443373 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Sun, 31 Mar 2024 11:06:16 +0000 Subject: [PATCH 1/7] Add Monitoring Metrics with aioprometheus for ResourcePools --- operator/metrics/__init__.py | 4 ++ operator/metrics/app_metrics.py | 38 +++++++++++++++++ operator/metrics/metrics_manager.py | 18 ++++++++ operator/metrics/metrics_service.py | 22 ++++++++++ operator/metrics/resourcepool_metrics.py | 38 +++++++++++++++++ operator/operator.py | 45 ++++++++++++++++++++ operator/poolboy_k8s.py | 3 ++ operator/resourcepool.py | 54 ++++++++++++++++++++++++ requirements.txt | 1 + 9 files changed, 223 insertions(+) create mode 100644 operator/metrics/__init__.py create mode 100644 operator/metrics/app_metrics.py create mode 100644 operator/metrics/metrics_manager.py create mode 100644 operator/metrics/metrics_service.py create mode 100644 operator/metrics/resourcepool_metrics.py diff --git a/operator/metrics/__init__.py b/operator/metrics/__init__.py new file mode 100644 index 0000000..064f66d --- /dev/null +++ b/operator/metrics/__init__.py @@ -0,0 +1,4 @@ +from .app_metrics import AppMetrics # noqa F401 +from .metrics_manager import MetricsManager # noqa F401 +from .metrics_service import MetricsService # noqa F401 +from .resourcepool_metrics import ResourcePoolMetrics # noqa F401 \ No newline at end of file diff --git a/operator/metrics/app_metrics.py b/operator/metrics/app_metrics.py new file mode 100644 index 0000000..7f8ae01 --- /dev/null +++ b/operator/metrics/app_metrics.py @@ -0,0 +1,38 @@ +from __future__ import annotations +import time +from functools import wraps +from aioprometheus import Summary + + +class AppMetrics: + # Summary: Similar to a histogram, a summary samples observations + # (usually things like request durations and response sizes). + # While it also provides a total count of observations and a sum of all observed values, + # it calculates configurable quantiles over a sliding time window. + response_time_seconds = Summary("response_time_seconds", + "Response time in seconds", + {"method": "Method used for the request", + "resource_type": "Type of resource requested" + } + ) + + @staticmethod + def measure_execution_time(metric_name, **labels): + + def decorator(func): + @wraps(func) + async def wrapper(*args, **kwargs): + metric = getattr(AppMetrics, metric_name, None) + start_time = time.time() + result = await func(*args, **kwargs) + end_time = time.time() + duration = end_time - start_time + if metric and callable(getattr(metric, 'observe', None)): + metric.observe(labels=labels, value=duration) + else: + print(f"Metric {metric_name} not found or doesn't support observe()") + return result + + return wrapper + + return decorator diff --git a/operator/metrics/metrics_manager.py b/operator/metrics/metrics_manager.py new file mode 100644 index 0000000..6757aad --- /dev/null +++ b/operator/metrics/metrics_manager.py @@ -0,0 +1,18 @@ +from __future__ import annotations +from aioprometheus import REGISTRY, Counter, Gauge + +from .app_metrics import AppMetrics +from .resourcepool_metrics import ResourcePoolMetrics + + +class MetricsManager: + metrics_classes = [AppMetrics, ResourcePoolMetrics] + + @classmethod + def register(cls): + for metrics_class in cls.metrics_classes: + for attr_name in dir(metrics_class): + attr = getattr(metrics_class, attr_name) + if isinstance(attr, (Counter, Gauge)): + if attr.name not in REGISTRY.collectors: + REGISTRY.register(attr) diff --git a/operator/metrics/metrics_service.py b/operator/metrics/metrics_service.py new file mode 100644 index 0000000..2a89beb --- /dev/null +++ b/operator/metrics/metrics_service.py @@ -0,0 +1,22 @@ +from __future__ import annotations +import os +import logging +from aioprometheus.service import Service + + +logger = logging.getLogger(__name__) + + +class MetricsService: + service = Service() + + @classmethod + async def start(cls, addr="0.0.0.0", port=8000) -> None: + port = int(os.environ.get("METRICS_PORT", port)) + await cls.service.start(addr=addr, port=port, metrics_url="/metrics") + logger.info(f"Serving metrics on: {cls.service.metrics_url}") + + @classmethod + async def stop(cls) -> None: + logger.info("Stopping metrics service") + await cls.service.stop() diff --git a/operator/metrics/resourcepool_metrics.py b/operator/metrics/resourcepool_metrics.py new file mode 100644 index 0000000..a2491cb --- /dev/null +++ b/operator/metrics/resourcepool_metrics.py @@ -0,0 +1,38 @@ +from __future__ import annotations +from aioprometheus import Counter, Gauge + +from metrics import AppMetrics + + +class ResourcePoolMetrics(AppMetrics): + resource_pool_labels = {'name': "Resource pool name", + 'namespace': "Resource pool namespace" + } + + resource_pool_min_available = Gauge( + 'resource_pool_min_available', + 'Minimum number of available environments in each resource pool', + resource_pool_labels + ) + + resource_pool_available = Gauge( + 'resource_pool_available', + 'Number of available environments in each resource pool', + resource_pool_labels + ) + + resource_pool_used_total = Counter( + 'resource_pool_used_total', + 'Total number of environments used in each resource pool', + resource_pool_labels + ) + + resource_pool_state_labels = {'name': "Resource pool name", + 'namespace': "Resource pool namespace", + 'state': "State of the resource pool" + } + resource_pool_state = Gauge( + 'resource_pool_state', + 'State of each resource pool, including available and used resources', + resource_pool_state_labels + ) diff --git a/operator/operator.py b/operator/operator.py index 8ce280e..8de4f32 100755 --- a/operator/operator.py +++ b/operator/operator.py @@ -17,6 +17,9 @@ from resourceprovider import ResourceProvider from resourcewatcher import ResourceWatcher +from metrics import MetricsManager, MetricsService, AppMetrics + + @kopf.on.startup() async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, **_): @@ -41,6 +44,12 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** # Configure logging configure_kopf_logging() + # Start metrics service + await MetricsService.start() + + # Register metrics + MetricsManager.register() + # Preload for matching ResourceClaim templates await Poolboy.on_startup() await ResourceProvider.preload(logger=logger) @@ -51,6 +60,7 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** async def cleanup(logger: kopf.ObjectLogger, **_): await ResourceWatcher.stop_all() await Poolboy.on_cleanup() + await MetricsService.stop() @kopf.on.create( @@ -65,6 +75,11 @@ async def cleanup(logger: kopf.ObjectLogger, **_): Poolboy.operator_domain, Poolboy.operator_version, 'resourceclaims', id='resource_claim_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_create', + resource_type='resourceclaims' + ) async def resource_claim_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -94,6 +109,11 @@ async def resource_claim_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourceclaims', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_delete', + resource_type='resourceclaims' + ) async def resource_claim_delete( annotations: kopf.Annotations, labels: kopf.Labels, @@ -173,6 +193,11 @@ async def resource_claim_daemon( Poolboy.operator_domain, Poolboy.operator_version, 'resourcehandles', id='resource_handle_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_create', + resource_type='resourcehandles' + ) async def resource_handle_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -202,6 +227,11 @@ async def resource_handle_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourcehandles', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_delete', + resource_type='resourcehandles' +) async def resource_handle_delete( annotations: kopf.Annotations, labels: kopf.Labels, @@ -281,6 +311,11 @@ async def resource_handle_daemon( Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', id='resource_pool_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_event', + resource_type='resourcepools' + ) async def resource_pool_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -310,6 +345,11 @@ async def resource_pool_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', labels={Poolboy.ignore_label: kopf.ABSENT}, ) +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_delete', + resource_type='resourcepools' + ) async def resource_pool_delete( annotations: kopf.Annotations, labels: kopf.Labels, @@ -337,6 +377,11 @@ async def resource_pool_delete( @kopf.on.event(Poolboy.operator_domain, Poolboy.operator_version, 'resourceproviders') +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_event', + resource_type='resourceproviders' + ) async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_) -> None: definition = event['object'] if event['type'] == 'DELETED': diff --git a/operator/poolboy_k8s.py b/operator/poolboy_k8s.py index c82d119..69f9365 100644 --- a/operator/poolboy_k8s.py +++ b/operator/poolboy_k8s.py @@ -254,6 +254,9 @@ async def get_requester_from_namespace(namespace: str) -> tuple[Optional[Mapping if not user_name: return None, [] + if user_name == 'system:admin': + user_name = 'mamorim-redhat.com' + try: user = await Poolboy.custom_objects_api.get_cluster_custom_object( 'user.openshift.io', 'v1', 'users', user_name diff --git a/operator/resourcepool.py b/operator/resourcepool.py index e577680..e27f6f4 100644 --- a/operator/resourcepool.py +++ b/operator/resourcepool.py @@ -3,6 +3,8 @@ import kubernetes_asyncio import pytimeparse +from metrics import ResourcePoolMetrics + from datetime import timedelta from typing import List, Mapping, Optional, TypeVar @@ -119,6 +121,17 @@ def lifespan_unclaimed_timedelta(self): def metadata(self) -> Mapping: return self.meta + @property + def metrics_labels(self) -> Mapping: + return { + 'name': self.name, + 'namespace': self.namespace, + } + + @property + def metric_state_labels(self) -> Mapping: + return {'name': self.name, 'namespace': self.namespace, 'state': ''} + @property def min_available(self) -> int: return self.spec.get('minAvailable', 0) @@ -161,13 +174,54 @@ def refresh(self, self.status = status self.uid = uid + async def handle_metrics(self, logger: kopf.ObjectLogger, resource_handles): + logger.info("Handling metrics for resource pool") + resource_handle_deficit = self.min_available - len(resource_handles) + + ResourcePoolMetrics.resource_pool_min_available.set( + labels=self.metrics_labels, + value=self.min_available + ) + + ResourcePoolMetrics.resource_pool_available.set( + labels=self.metrics_labels, + value=len(resource_handles) + ) + + if resource_handle_deficit < 0: + ResourcePoolMetrics.resource_pool_used_total.inc( + labels=self.metrics_labels, + value=resource_handle_deficit + ) + + state_labels = self.metric_state_labels + state_labels['state'] = 'available' + ResourcePoolMetrics.resource_pool_state.set( + labels=state_labels, + value=len(resource_handles) + ) + + state_labels['state'] = 'used' + ResourcePoolMetrics.resource_pool_state.set( + labels=state_labels, + value=resource_handle_deficit + ) + async def handle_delete(self, logger: kopf.ObjectLogger): await resourcehandle.ResourceHandle.delete_unbound_handles_for_pool(logger=logger, resource_pool=self) + @ResourcePoolMetrics.measure_execution_time( + 'response_time_seconds', + method='manage', + resource_type='resourcepool' + ) async def manage(self, logger: kopf.ObjectLogger): async with self.lock: resource_handles = await resourcehandle.ResourceHandle.get_unbound_handles_for_pool(resource_pool=self, logger=logger) resource_handle_deficit = self.min_available - len(resource_handles) + + await self.handle_metrics(logger=logger, resource_handles=resource_handles) + if resource_handle_deficit <= 0: return for i in range(resource_handle_deficit): diff --git a/requirements.txt b/requirements.txt index a986470..511096f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,7 @@ jsonpointer==2.2 jsonschema==3.2.0 openapi-schema-validator==0.1.5 prometheus-client==0.11.0 +aioprometheus==23.12.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 pydantic==1.9.0 From 789e94fd28d8650c5f9ab8426e1a91a331600b73 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Sun, 31 Mar 2024 11:16:42 +0000 Subject: [PATCH 2/7] Update values and fix typo --- helm/templates/service_metrics.yaml | 18 ++++++++++++++++++ helm/templates/service_monitor.yaml | 18 ++++++++++++++++++ helm/values.yaml | 6 ++++++ operator/poolboy_k8s.py | 3 --- 4 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 helm/templates/service_metrics.yaml create mode 100644 helm/templates/service_monitor.yaml diff --git a/helm/templates/service_metrics.yaml b/helm/templates/service_metrics.yaml new file mode 100644 index 0000000..046043b --- /dev/null +++ b/helm/templates/service_metrics.yaml @@ -0,0 +1,18 @@ +{{- if .Values.deploy -}} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-metrics + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} +spec: + type: {{ .Values.serviceMetrics.type }} + {{- with .Values.serviceMetrics.ports }} + ports: + {{- toYaml . | nindent 2 }} + {{- end }} + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + sessionAffinity: None +{{- end -}} diff --git a/helm/templates/service_monitor.yaml b/helm/templates/service_monitor.yaml new file mode 100644 index 0000000..95ab11a --- /dev/null +++ b/helm/templates/service_monitor.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "poolboy.name" . }} + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + app.kubernetes.io/name: {{ include "poolboy.name" . }} + namespaceSelector: + matchNames: + - {{ include "poolboy.namespaceName" . }} + endpoints: + - port: {{ include "poolboy.name" . }} + interval: "30s" + path: /metrics diff --git a/helm/values.yaml b/helm/values.yaml index b215b30..33ecd49 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -59,6 +59,12 @@ service: port: 8000 protocol: TCP targetPort: 8000 + - name: metrics-service + port: 8000 + protocol: web + targetPort: 8000 + path: /metrics + resources: {} # We usually recommend not to specify default resources and to leave this as a conscious diff --git a/operator/poolboy_k8s.py b/operator/poolboy_k8s.py index 69f9365..c82d119 100644 --- a/operator/poolboy_k8s.py +++ b/operator/poolboy_k8s.py @@ -254,9 +254,6 @@ async def get_requester_from_namespace(namespace: str) -> tuple[Optional[Mapping if not user_name: return None, [] - if user_name == 'system:admin': - user_name = 'mamorim-redhat.com' - try: user = await Poolboy.custom_objects_api.get_cluster_custom_object( 'user.openshift.io', 'v1', 'users', user_name From a2d40c81cb3eb07f8b773b83fa0dc8c42701ed33 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Mon, 1 Apr 2024 01:26:50 +0000 Subject: [PATCH 3/7] Update ServiceMonitor template --- helm/templates/service_monitor.yaml | 14 +++++++------- helm/values.yaml | 18 ++++++++++++------ operator/operator.py | 5 +++++ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/helm/templates/service_monitor.yaml b/helm/templates/service_monitor.yaml index 95ab11a..914de9f 100644 --- a/helm/templates/service_monitor.yaml +++ b/helm/templates/service_monitor.yaml @@ -6,13 +6,13 @@ metadata: labels: {{- include "poolboy.labels" . | nindent 4 }} spec: + endpoints: + - interval: 30s + path: /metrics + port: {{ .Values.serviceMetrics.name}} + namespaceSelector: + matchNames: + - {{ include "poolboy.namespaceName" . }} selector: matchLabels: app.kubernetes.io/name: {{ include "poolboy.name" . }} - namespaceSelector: - matchNames: - - {{ include "poolboy.namespaceName" . }} - endpoints: - - port: {{ include "poolboy.name" . }} - interval: "30s" - path: /metrics diff --git a/helm/values.yaml b/helm/values.yaml index 33ecd49..8ab51ff 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -53,16 +53,22 @@ serviceAccount: service: type: ClusterIP - port: 8000 + port: 8080 + ports: + - name: health-check + port: 8080 + protocol: TCP + targetPort: 8080 + +serviceMetrics: + type: ClusterIP + port: 80 + name: metrics ports: - name: metrics - port: 8000 + port: 80 protocol: TCP targetPort: 8000 - - name: metrics-service - port: 8000 - protocol: web - targetPort: 8000 path: /metrics diff --git a/operator/operator.py b/operator/operator.py index 8de4f32..10b3056 100755 --- a/operator/operator.py +++ b/operator/operator.py @@ -22,6 +22,11 @@ @kopf.on.startup() +@AppMetrics.measure_execution_time( + 'response_time_seconds', + method='on_startup', + resource_type='poolboy' + ) async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, **_): # Store last handled configuration in status settings.persistence.diffbase_storage = kopf.StatusDiffBaseStorage(field='status.diffBase') From 4873c2f60a07afc735c58e4fe49cef49327db545 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Mon, 1 Apr 2024 10:11:15 +0000 Subject: [PATCH 4/7] Add values for service monitor --- helm/templates/service_metrics.yaml | 18 ------------------ helm/templates/service_monitor.yaml | 12 ++++++++---- helm/values.yaml | 24 ++++++++++++++---------- 3 files changed, 22 insertions(+), 32 deletions(-) delete mode 100644 helm/templates/service_metrics.yaml diff --git a/helm/templates/service_metrics.yaml b/helm/templates/service_metrics.yaml deleted file mode 100644 index 046043b..0000000 --- a/helm/templates/service_metrics.yaml +++ /dev/null @@ -1,18 +0,0 @@ -{{- if .Values.deploy -}} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "poolboy.name" . }}-metrics - namespace: {{ include "poolboy.namespaceName" . }} - labels: - {{- include "poolboy.labels" . | nindent 4 }} -spec: - type: {{ .Values.serviceMetrics.type }} - {{- with .Values.serviceMetrics.ports }} - ports: - {{- toYaml . | nindent 2 }} - {{- end }} - selector: - {{- include "poolboy.selectorLabels" . | nindent 4 }} - sessionAffinity: None -{{- end -}} diff --git a/helm/templates/service_monitor.yaml b/helm/templates/service_monitor.yaml index 914de9f..8914658 100644 --- a/helm/templates/service_monitor.yaml +++ b/helm/templates/service_monitor.yaml @@ -1,3 +1,4 @@ +{{- if .Values.serviceMonitor.create }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: @@ -7,12 +8,15 @@ metadata: {{- include "poolboy.labels" . | nindent 4 }} spec: endpoints: - - interval: 30s - path: /metrics - port: {{ .Values.serviceMetrics.name}} + - interval: {{ .Values.serviceMonitor.interval }} + path: {{ .Values.serviceMonitor.path }} + port: {{ .Values.serviceMonitor.portName }} + scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }} + jobLabel: {{ include "poolboy.name" . }} namespaceSelector: matchNames: - {{ include "poolboy.namespaceName" . }} selector: matchLabels: - app.kubernetes.io/name: {{ include "poolboy.name" . }} + {{- include "poolboy.selectorLabels" . | nindent 6 }} +{{- end }} diff --git a/helm/values.yaml b/helm/values.yaml index 8ab51ff..a4ce1f0 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -51,27 +51,31 @@ serviceAccount: # If not set and create is true, a name is generated using the fullname template name: +serviceMonitor: + # ServiceMonitor requires prometheus-operator installed in the cluster + create: true + # The port name of the service monitor to use. + portName: metrics + # The path to scrape metrics from + path: /metrics + # How often Prometheus should scrape + interval: 30s + # How long Prometheus should wait for a scrape to complete + scrapeTimeout: 15s + service: type: ClusterIP port: 8080 ports: - - name: health-check + - name: healthz port: 8080 protocol: TCP targetPort: 8080 - -serviceMetrics: - type: ClusterIP - port: 80 - name: metrics - ports: - name: metrics port: 80 protocol: TCP targetPort: 8000 - path: /metrics - - + resources: {} # We usually recommend not to specify default resources and to leave this as a conscious # choice for the user. This also increases chances charts run on environments with little From f6e22ca58a2f5dd1772d5f3db1424c4530c3c903 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Mon, 1 Apr 2024 12:36:45 +0000 Subject: [PATCH 5/7] Replace custom decorator by @timer aioprometheus decorator --- operator/operator.py | 77 +++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 40 deletions(-) diff --git a/operator/operator.py b/operator/operator.py index 10b3056..eedd2ba 100755 --- a/operator/operator.py +++ b/operator/operator.py @@ -3,6 +3,8 @@ import kopf import logging +from aioprometheus import timer + from copy import deepcopy from datetime import datetime, timedelta from typing import Any, Mapping, Optional @@ -20,13 +22,11 @@ from metrics import MetricsManager, MetricsService, AppMetrics - @kopf.on.startup() -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_startup', - resource_type='poolboy' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'startup', 'resource_type': 'poolboy_operator'} +) async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, **_): # Store last handled configuration in status settings.persistence.diffbase_storage = kopf.StatusDiffBaseStorage(field='status.diffBase') @@ -62,6 +62,10 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** @kopf.on.cleanup() +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'cleanup', 'resource_type': 'poolboy_operator'} +) async def cleanup(logger: kopf.ObjectLogger, **_): await ResourceWatcher.stop_all() await Poolboy.on_cleanup() @@ -80,11 +84,10 @@ async def cleanup(logger: kopf.ObjectLogger, **_): Poolboy.operator_domain, Poolboy.operator_version, 'resourceclaims', id='resource_claim_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_create', - resource_type='resourceclaims' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_claim_event', 'resource_type': 'resourceclaims'} +) async def resource_claim_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -114,11 +117,10 @@ async def resource_claim_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourceclaims', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_delete', - resource_type='resourceclaims' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_claim_delete', 'resource_type': 'resourceclaims'} +) async def resource_claim_delete( annotations: kopf.Annotations, labels: kopf.Labels, @@ -198,11 +200,10 @@ async def resource_claim_daemon( Poolboy.operator_domain, Poolboy.operator_version, 'resourcehandles', id='resource_handle_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_create', - resource_type='resourcehandles' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_handle_event', 'resource_type': 'resourcehandles'} +) async def resource_handle_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -232,10 +233,9 @@ async def resource_handle_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourcehandles', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_delete', - resource_type='resourcehandles' +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_handle_delete', 'resource_type': 'resourcehandles'} ) async def resource_handle_delete( annotations: kopf.Annotations, @@ -316,11 +316,10 @@ async def resource_handle_daemon( Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', id='resource_pool_update', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_event', - resource_type='resourcepools' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_pool_event', 'resource_type': 'resourcepools'} +) async def resource_pool_event( annotations: kopf.Annotations, labels: kopf.Labels, @@ -350,11 +349,10 @@ async def resource_pool_event( Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', labels={Poolboy.ignore_label: kopf.ABSENT}, ) -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_delete', - resource_type='resourcepools' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_pool_delete', 'resource_type': 'resourcepools'} +) async def resource_pool_delete( annotations: kopf.Annotations, labels: kopf.Labels, @@ -382,11 +380,10 @@ async def resource_pool_delete( @kopf.on.event(Poolboy.operator_domain, Poolboy.operator_version, 'resourceproviders') -@AppMetrics.measure_execution_time( - 'response_time_seconds', - method='on_event', - resource_type='resourceproviders' - ) +@timer( + AppMetrics.response_time_seconds, + labels={'method': 'resource_provider_event', 'resource_type': 'resourceproviders'} +) async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_) -> None: definition = event['object'] if event['type'] == 'DELETED': From 16c101a9c07debc099966debf1a49bf500aeae49 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Wed, 3 Apr 2024 07:21:51 +0000 Subject: [PATCH 6/7] Add try/exception updating metrics --- operator/resourcepool.py | 53 +++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/operator/resourcepool.py b/operator/resourcepool.py index e27f6f4..60ab840 100644 --- a/operator/resourcepool.py +++ b/operator/resourcepool.py @@ -178,34 +178,37 @@ async def handle_metrics(self, logger: kopf.ObjectLogger, resource_handles): logger.info("Handling metrics for resource pool") resource_handle_deficit = self.min_available - len(resource_handles) - ResourcePoolMetrics.resource_pool_min_available.set( - labels=self.metrics_labels, - value=self.min_available - ) - - ResourcePoolMetrics.resource_pool_available.set( - labels=self.metrics_labels, - value=len(resource_handles) - ) - - if resource_handle_deficit < 0: - ResourcePoolMetrics.resource_pool_used_total.inc( + try: + ResourcePoolMetrics.resource_pool_min_available.set( labels=self.metrics_labels, - value=resource_handle_deficit + value=self.min_available + ) + + ResourcePoolMetrics.resource_pool_available.set( + labels=self.metrics_labels, + value=len(resource_handles) ) - state_labels = self.metric_state_labels - state_labels['state'] = 'available' - ResourcePoolMetrics.resource_pool_state.set( - labels=state_labels, - value=len(resource_handles) - ) + if resource_handle_deficit < 0: + ResourcePoolMetrics.resource_pool_used_total.inc( + labels=self.metrics_labels, + ) - state_labels['state'] = 'used' - ResourcePoolMetrics.resource_pool_state.set( - labels=state_labels, - value=resource_handle_deficit - ) + state_labels = self.metric_state_labels + state_labels['state'] = 'available' + ResourcePoolMetrics.resource_pool_state.set( + labels=state_labels, + value=len(resource_handles) + ) + + state_labels['state'] = 'used' + ResourcePoolMetrics.resource_pool_state.set( + labels=state_labels, + value=resource_handle_deficit + ) + except Exception as e: + logger.error(f"Error handling metrics for resource pool: {e}") + return async def handle_delete(self, logger: kopf.ObjectLogger): await resourcehandle.ResourceHandle.delete_unbound_handles_for_pool(logger=logger, resource_pool=self) @@ -214,7 +217,7 @@ async def handle_delete(self, logger: kopf.ObjectLogger): 'response_time_seconds', method='manage', resource_type='resourcepool' - ) + ) async def manage(self, logger: kopf.ObjectLogger): async with self.lock: resource_handles = await resourcehandle.ResourceHandle.get_unbound_handles_for_pool(resource_pool=self, logger=logger) From a7b315cda2a87b1bb49a44205dff003b6cb89ccd Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Wed, 3 Apr 2024 09:10:53 +0000 Subject: [PATCH 7/7] Cleanup unused static method --- operator/metrics/app_metrics.py | 31 ++++++------------------------- operator/resourcepool.py | 6 +----- 2 files changed, 7 insertions(+), 30 deletions(-) diff --git a/operator/metrics/app_metrics.py b/operator/metrics/app_metrics.py index 7f8ae01..a567dd9 100644 --- a/operator/metrics/app_metrics.py +++ b/operator/metrics/app_metrics.py @@ -1,14 +1,10 @@ from __future__ import annotations import time from functools import wraps -from aioprometheus import Summary +from aioprometheus import Summary, Counter class AppMetrics: - # Summary: Similar to a histogram, a summary samples observations - # (usually things like request durations and response sizes). - # While it also provides a total count of observations and a sum of all observed values, - # it calculates configurable quantiles over a sliding time window. response_time_seconds = Summary("response_time_seconds", "Response time in seconds", {"method": "Method used for the request", @@ -16,23 +12,8 @@ class AppMetrics: } ) - @staticmethod - def measure_execution_time(metric_name, **labels): - - def decorator(func): - @wraps(func) - async def wrapper(*args, **kwargs): - metric = getattr(AppMetrics, metric_name, None) - start_time = time.time() - result = await func(*args, **kwargs) - end_time = time.time() - duration = end_time - start_time - if metric and callable(getattr(metric, 'observe', None)): - metric.observe(labels=labels, value=duration) - else: - print(f"Metric {metric_name} not found or doesn't support observe()") - return result - - return wrapper - - return decorator + request_handler_exceptions = Counter( + "request_handler_exceptions", + "Count of exceptions by handler function.", + {"handler": "Handler function name"} + ) diff --git a/operator/resourcepool.py b/operator/resourcepool.py index 60ab840..b2d6b8c 100644 --- a/operator/resourcepool.py +++ b/operator/resourcepool.py @@ -5,6 +5,7 @@ from metrics import ResourcePoolMetrics + from datetime import timedelta from typing import List, Mapping, Optional, TypeVar @@ -213,11 +214,6 @@ async def handle_metrics(self, logger: kopf.ObjectLogger, resource_handles): async def handle_delete(self, logger: kopf.ObjectLogger): await resourcehandle.ResourceHandle.delete_unbound_handles_for_pool(logger=logger, resource_pool=self) - @ResourcePoolMetrics.measure_execution_time( - 'response_time_seconds', - method='manage', - resource_type='resourcepool' - ) async def manage(self, logger: kopf.ObjectLogger): async with self.lock: resource_handles = await resourcehandle.ResourceHandle.get_unbound_handles_for_pool(resource_pool=self, logger=logger)