Skip to content

Commit

Permalink
Fix too many tasks being created for create contact points (#12)
Browse files Browse the repository at this point in the history
Fix too many tasks being created for create_contact_points_for_datasource task

Co-authored-by: Julia <[email protected]>
  • Loading branch information
mderynck and Ferril authored Jun 6, 2022
1 parent dbaffd8 commit 583835f
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from django.apps import apps
from rest_framework import status

from apps.alerts.tasks import create_contact_points_for_datasource
from apps.alerts.tasks import schedule_create_contact_points_for_datasource
from apps.grafana_plugin.helpers import GrafanaAPIClient

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -77,16 +77,15 @@ def create_contact_points(self) -> None:
# sync other datasource
for datasource in datasources:
if datasource["type"] == GrafanaAlertingSyncManager.ALERTING_DATASOURCE:
if self.create_contact_point(datasource) is None:
contact_point = self.create_contact_point(datasource)
if contact_point is None:
# Failed to create contact point due to getting wrong alerting config. It is expected behaviour.
# Add datasource to list and retry to create contact point for it async
datasources_to_create.append(datasource)

if datasources_to_create:
# create other contact points async
create_contact_points_for_datasource.apply_async(
(self.alert_receive_channel.pk, datasources_to_create),
)
schedule_create_contact_points_for_datasource(self.alert_receive_channel.pk, datasources_to_create)
else:
self.alert_receive_channel.is_finished_alerting_setup = True
self.alert_receive_channel.save(update_fields=["is_finished_alerting_setup"])
Expand Down
1 change: 1 addition & 0 deletions engine/apps/alerts/tasks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .call_ack_url import call_ack_url # noqa: F401
from .check_escalation_finished import check_escalation_finished_task # noqa: F401
from .create_contact_points_for_datasource import create_contact_points_for_datasource # noqa: F401
from .create_contact_points_for_datasource import schedule_create_contact_points_for_datasource # noqa: F401
from .custom_button_result import custom_button_result # noqa: F401
from .delete_alert_group import delete_alert_group # noqa: F401
from .distribute_alert import distribute_alert # noqa: F401
Expand Down
42 changes: 38 additions & 4 deletions engine/apps/alerts/tasks/create_contact_points_for_datasource.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
import logging

from celery.utils.log import get_task_logger
from django.apps import apps
from django.core.cache import cache
from rest_framework import status

from apps.grafana_plugin.helpers import GrafanaAPIClient
from common.custom_celery_tasks import shared_dedicated_queue_retry_task

logger = get_task_logger(__name__)
logger.setLevel(logging.DEBUG)


def get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id):
    """Return the cache key that tracks the scheduled contact-point creation task for this integration."""
    return "create_contact_points_for_datasource_{}".format(alert_receive_channel_id)


@shared_dedicated_queue_retry_task
def schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasource_list):
    """Schedule ``create_contact_points_for_datasource`` for the given integration.

    Records the scheduled task id in the cache so that the task itself can
    detect whether it is the most recently scheduled one and bail out
    otherwise (deduplicates repeated scheduling for the same integration).
    """
    START_DELAY_SECONDS = 3
    CACHE_TIMEOUT_SECONDS = 600
    cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id)
    # The task id is only known after apply_async; the start delay leaves
    # time for cache.set below to land before the task begins executing.
    scheduled_task = create_contact_points_for_datasource.apply_async(
        args=[alert_receive_channel_id, datasource_list],
        countdown=START_DELAY_SECONDS,
    )
    cache.set(cache_key, scheduled_task.id, timeout=CACHE_TIMEOUT_SECONDS)


@shared_dedicated_queue_retry_task(autoretry_for=(Exception,), retry_backoff=True, max_retries=10)
def create_contact_points_for_datasource(alert_receive_channel_id, datasource_list):
"""
Try to create contact points for other datasource.
Restart task for datasource, for which contact point was not created.
"""
cache_key = get_cache_key_create_contact_points_for_datasource(alert_receive_channel_id)
cached_task_id = cache.get(cache_key)
current_task_id = create_contact_points_for_datasource.request.id
if cached_task_id is not None and current_task_id != cached_task_id:
return

AlertReceiveChannel = apps.get_model("alerts", "AlertReceiveChannel")

Expand All @@ -21,24 +49,30 @@ def create_contact_points_for_datasource(alert_receive_channel_id, datasource_li
api_token=alert_receive_channel.organization.api_token,
)
# list of datasource for which contact point creation was failed
datasource_to_create = []
datasources_to_create = []
for datasource in datasource_list:
contact_point = None
config, response_info = client.get_alerting_config(datasource["id"])
if config is None:
if response_info.get("status_code") == status.HTTP_404_NOT_FOUND:
client.get_alertmanager_status_with_config(datasource["id"])
contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource)
elif response_info.get("status_code") == status.HTTP_400_BAD_REQUEST:
logger.warning(
f"Failed to create contact point for integration {alert_receive_channel_id}, "
f"datasource info: {datasource}; response: {response_info}"
)
continue
else:
contact_point = alert_receive_channel.grafana_alerting_sync_manager.create_contact_point(datasource)
if contact_point is None:
# Failed to create contact point due to getting wrong alerting config.
# Add datasource to list and retry to create contact point for it again
datasource_to_create.append(datasource)
datasources_to_create.append(datasource)

# if some contact points were not created, restart task for them
if datasource_to_create:
create_contact_points_for_datasource.apply_async((alert_receive_channel_id, datasource_to_create), countdown=5)
if datasources_to_create:
schedule_create_contact_points_for_datasource(alert_receive_channel_id, datasources_to_create)
else:
alert_receive_channel.is_finished_alerting_setup = True
alert_receive_channel.save(update_fields=["is_finished_alerting_setup"])

0 comments on commit 583835f

Please sign in to comment.