Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: improve Slack rate limiting logic when updating alert groups #5287

Open
wants to merge 24 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,7 +674,7 @@ def acknowledge_by_user_or_backsync(
organization_id = user.organization_id if user else self.channel.organization_id
logger.debug(f"Started acknowledge_by_user_or_backsync for alert_group {self.pk}")

# if incident was silenced or resolved, unsilence/unresolve it without starting escalation
# if alert group was silenced or resolved, unsilence/unresolve it without starting escalation
if self.silenced:
self.un_silence()
self.log_records.create(
Expand Down Expand Up @@ -1990,6 +1990,13 @@ def slack_channel_id(self) -> str | None:

@property
def slack_message(self) -> typing.Optional["SlackMessage"]:
"""
`slack_message` property returns the first `SlackMessage` for the `AlertGroup`. This corresponds to the
Slack message representing the main message in Slack (ie. not a message in a thread).

This should not be confused with `slack_messages`, which is a related manager that returns all `SlackMessage`
instances for the `AlertGroup`.
"""
try:
# prefetched_slack_messages could be set in apps.api.serializers.alert_group.AlertGroupListSerializer
return self.prefetched_slack_messages[0] if self.prefetched_slack_messages else None
Expand Down
34 changes: 18 additions & 16 deletions engine/apps/alerts/models/alert_receive_channel.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
metrics_remove_deleted_integration_from_cache,
metrics_update_integration_cache,
)
from apps.slack.constants import SLACK_RATE_LIMIT_DELAY, SLACK_RATE_LIMIT_TIMEOUT
from apps.slack.constants import SLACK_RATE_LIMIT_TIMEOUT
from apps.slack.tasks import post_slack_rate_limit_message
from apps.slack.utils import post_message_to_channel
from common.api_helpers.utils import create_engine_url
Expand All @@ -43,7 +43,7 @@

from apps.alerts.models import AlertGroup, ChannelFilter
from apps.labels.models import AlertReceiveChannelAssociatedLabel
from apps.user_management.models import Organization, Team
from apps.user_management.models import Organization, Team, User

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -391,7 +391,7 @@ def save(self, *args, **kwargs):

return super().save(*args, **kwargs)

def change_team(self, team_id, user):
def change_team(self, team_id: int, user: "User") -> None:
if team_id == self.team_id:
raise TeamCanNotBeChangedError("Integration is already in this team")

Expand All @@ -409,52 +409,54 @@ def grafana_alerting_sync_manager(self):
return GrafanaAlertingSyncManager(self)

@property
def is_alerting_integration(self):
def is_alerting_integration(self) -> bool:
return self.integration in {
AlertReceiveChannel.INTEGRATION_GRAFANA_ALERTING,
AlertReceiveChannel.INTEGRATION_LEGACY_GRAFANA_ALERTING,
}

@cached_property
def team_name(self):
def team_name(self) -> str:
return self.team.name if self.team else "No team"

@cached_property
def team_id_or_no_team(self):
def team_id_or_no_team(self) -> str:
return self.team_id if self.team else "no_team"

@cached_property
def emojized_verbal_name(self):
def emojized_verbal_name(self) -> str:
return emoji.emojize(self.verbal_name, language="alias")

@property
def new_incidents_web_link(self):
def new_incidents_web_link(self) -> str:
from apps.alerts.models import AlertGroup

return UIURLBuilder(self.organization).alert_groups(
f"?integration={self.public_primary_key}&status={AlertGroup.NEW}",
)

@property
def is_rate_limited_in_slack(self):
def is_rate_limited_in_slack(self) -> bool:
return (
self.rate_limited_in_slack_at is not None
and self.rate_limited_in_slack_at + SLACK_RATE_LIMIT_TIMEOUT > timezone.now()
)

def start_send_rate_limit_message_task(self, delay=SLACK_RATE_LIMIT_DELAY):
def start_send_rate_limit_message_task(self, error_message_verb: str, delay: int) -> None:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is currently only invoked in two spots, both of which have been updated (additionally, both of these spots that were invoking this function were already passing in a delay, hence why I removed the default of SLACK_RATE_LIMIT_DELAY)

task_id = celery_uuid()

self.rate_limit_message_task_id = task_id
self.rate_limited_in_slack_at = timezone.now()
self.save(update_fields=["rate_limit_message_task_id", "rate_limited_in_slack_at"])
post_slack_rate_limit_message.apply_async((self.pk,), countdown=delay, task_id=task_id)

post_slack_rate_limit_message.apply_async((self.pk, error_message_verb), countdown=delay, task_id=task_id)

@property
def alert_groups_count(self):
def alert_groups_count(self) -> int:
return self.alert_groups.count()

@property
def alerts_count(self):
def alerts_count(self) -> int:
from apps.alerts.models import Alert

return Alert.objects.filter(group__channel=self).count()
Expand Down Expand Up @@ -548,14 +550,14 @@ def integration_url(self) -> str | None:
return create_engine_url(f"integrations/v1/{slug}/{self.token}/")

@property
def inbound_email(self):
def inbound_email(self) -> typing.Optional[str]:
if self.integration != AlertReceiveChannel.INTEGRATION_INBOUND_EMAIL:
return None

return f"{self.token}@{live_settings.INBOUND_EMAIL_DOMAIN}"

@property
def default_channel_filter(self):
def default_channel_filter(self) -> typing.Optional["ChannelFilter"]:
return self.channel_filters.filter(is_default=True).first()

# Templating
Expand Down Expand Up @@ -590,7 +592,7 @@ def templates(self):
}

@property
def is_available_for_custom_templates(self):
def is_available_for_custom_templates(self) -> bool:
return True

# Maintenance
Expand Down
4 changes: 0 additions & 4 deletions engine/apps/alerts/tasks/compare_escalations.py

This file was deleted.

3 changes: 1 addition & 2 deletions engine/apps/alerts/tasks/escalate_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from common.custom_celery_tasks import shared_dedicated_queue_retry_task

from .compare_escalations import compare_escalations
from .task_logger import task_logger


Expand All @@ -29,7 +28,7 @@ def escalate_alert_group(alert_group_pk):
except IndexError:
return f"Alert group with pk {alert_group_pk} doesn't exist"

if not compare_escalations(escalate_alert_group.request.id, alert_group.active_escalation_id):
if escalate_alert_group.request.id != alert_group.active_escalation_id:
return "Active escalation ID mismatch. Duplication or non-active escalation triggered. Active: {}".format(
alert_group.active_escalation_id
)
Expand Down
3 changes: 1 addition & 2 deletions engine/apps/alerts/tasks/notify_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from apps.phone_notifications.phone_backend import PhoneBackend
from common.custom_celery_tasks import shared_dedicated_queue_retry_task

from .compare_escalations import compare_escalations
from .task_logger import task_logger

if typing.TYPE_CHECKING:
Expand Down Expand Up @@ -618,7 +617,7 @@ def send_bundled_notification(user_notification_bundle_id: int):
)
return

if not compare_escalations(send_bundled_notification.request.id, user_notification_bundle.notification_task_id):
if send_bundled_notification.request.id != user_notification_bundle.notification_task_id:
task_logger.info(
f"send_bundled_notification: notification_task_id mismatch. "
f"Duplication or non-active notification triggered. "
Expand Down
5 changes: 3 additions & 2 deletions engine/apps/alerts/tasks/unsilence.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from common.custom_celery_tasks import shared_dedicated_queue_retry_task

from .compare_escalations import compare_escalations
from .send_alert_group_signal import send_alert_group_signal
from .task_logger import task_logger

Expand All @@ -23,11 +22,13 @@ def unsilence_task(alert_group_pk):
except IndexError:
task_logger.info(f"unsilence_task. alert_group {alert_group_pk} doesn't exist")
return
if not compare_escalations(unsilence_task.request.id, alert_group.unsilence_task_uuid):

if unsilence_task.request.id != alert_group.unsilence_task_uuid:
task_logger.info(
f"unsilence_task. alert_group {alert_group.pk}.ID mismatch.Active: {alert_group.unsilence_task_uuid}"
)
return

if alert_group.status == AlertGroup.SILENCED and alert_group.is_root_alert_group:
initial_state = alert_group.state
task_logger.info(f"unsilence alert_group {alert_group_pk} and start escalation if needed")
Expand Down
4 changes: 2 additions & 2 deletions engine/apps/alerts/tests/test_alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,11 @@ def test_delete(
# Check that appropriate Slack API calls are made
assert mock_chat_delete.call_count == 2
assert mock_chat_delete.call_args_list[0] == call(
channel=resolution_note_1.slack_channel_id, ts=resolution_note_1.ts
channel=resolution_note_1.slack_channel.slack_id, ts=resolution_note_1.ts
)
assert mock_chat_delete.call_args_list[1] == call(channel=slack_message.channel.slack_id, ts=slack_message.slack_id)
mock_reactions_remove.assert_called_once_with(
channel=resolution_note_2.slack_channel_id, name="memo", timestamp=resolution_note_2.ts
channel=resolution_note_2.slack_channel.slack_id, name="memo", timestamp=resolution_note_2.ts
)


Expand Down
14 changes: 3 additions & 11 deletions engine/apps/alerts/tests/test_maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@ def maintenance_test_setup(


@pytest.mark.django_db
def test_start_maintenance_integration(
maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task
):
def test_start_maintenance_integration(maintenance_test_setup, make_alert_receive_channel):
organization, user = maintenance_test_setup

alert_receive_channel = make_alert_receive_channel(
Expand All @@ -37,9 +35,7 @@ def test_start_maintenance_integration(


@pytest.mark.django_db
def test_start_maintenance_integration_multiple_previous_instances(
maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task
):
def test_start_maintenance_integration_multiple_previous_instances(maintenance_test_setup, make_alert_receive_channel):
organization, user = maintenance_test_setup

alert_receive_channel = make_alert_receive_channel(
Expand All @@ -64,9 +60,7 @@ def test_start_maintenance_integration_multiple_previous_instances(


@pytest.mark.django_db
def test_maintenance_integration_will_not_start_twice(
maintenance_test_setup, make_alert_receive_channel, mock_start_disable_maintenance_task
):
def test_maintenance_integration_will_not_start_twice(maintenance_test_setup, make_alert_receive_channel):
organization, user = maintenance_test_setup

alert_receive_channel = make_alert_receive_channel(
Expand All @@ -91,7 +85,6 @@ def test_alert_attached_to_maintenance_incident_integration(
maintenance_test_setup,
make_alert_receive_channel,
make_alert_with_custom_create_method,
mock_start_disable_maintenance_task,
):
organization, user = maintenance_test_setup

Expand Down Expand Up @@ -122,7 +115,6 @@ def test_stop_maintenance(
maintenance_test_setup,
make_alert_receive_channel,
make_alert_with_custom_create_method,
mock_start_disable_maintenance_task,
):
organization, user = maintenance_test_setup
alert_receive_channel = make_alert_receive_channel(
Expand Down
94 changes: 60 additions & 34 deletions engine/apps/alerts/tests/test_notify_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,47 +530,73 @@ def test_send_bundle_notification(
alert_group_1 = make_alert_group(alert_receive_channel=alert_receive_channel)
alert_group_2 = make_alert_group(alert_receive_channel=alert_receive_channel)
alert_group_3 = make_alert_group(alert_receive_channel=alert_receive_channel)

task_id = "test_task_id"
notification_bundle = make_user_notification_bundle(
user, UserNotificationPolicy.NotificationChannel.SMS, notification_task_id="test_task_id", eta=timezone.now()
user, UserNotificationPolicy.NotificationChannel.SMS, notification_task_id=task_id, eta=timezone.now()
)

notification_bundle.append_notification(alert_group_1, notification_policy)
notification_bundle.append_notification(alert_group_2, notification_policy)
notification_bundle.append_notification(alert_group_3, notification_policy)

assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 3

alert_group_3.resolve()
with patch("apps.alerts.tasks.notify_user.compare_escalations", return_value=True):
# send notification for 2 active alert groups
send_bundled_notification(notification_bundle.id)
assert f"alert_group {alert_group_3.id} is not active, skip notification" in caplog.text
assert "perform bundled notification for alert groups with ids:" in caplog.text
# check bundle_uuid was set, notification for resolved alert group was deleted
assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0
assert notification_bundle.notifications.all().count() == 2
assert not notification_bundle.notifications.filter(alert_group=alert_group_3).exists()

# send notification for 1 active alert group
notification_bundle.notifications.update(bundle_uuid=None)
alert_group_2.resolve()
send_bundled_notification(notification_bundle.id)
assert f"alert_group {alert_group_2.id} is not active, skip notification" in caplog.text
assert (
f"there is only one alert group in bundled notification, perform regular notification. "
f"alert_group {alert_group_1.id}"
) in caplog.text
# check bundle_uuid was set
assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0
assert notification_bundle.notifications.all().count() == 1
# cleanup notifications
notification_bundle.notifications.all().delete()

# send notification for 0 active alert group
notification_bundle.append_notification(alert_group_1, notification_policy)
alert_group_1.resolve()
send_bundled_notification(notification_bundle.id)
assert f"alert_group {alert_group_1.id} is not active, skip notification" in caplog.text
assert f"no alert groups to notify about or notification is not allowed for user {user.id}" in caplog.text
# check all notifications were deleted
assert notification_bundle.notifications.all().count() == 0

# send notification for 2 active alert groups
send_bundled_notification.apply((notification_bundle.id,), task_id=task_id)

assert f"alert_group {alert_group_3.id} is not active, skip notification" in caplog.text
assert "perform bundled notification for alert groups with ids:" in caplog.text

# check bundle_uuid was set, notification for resolved alert group was deleted
assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0
assert notification_bundle.notifications.all().count() == 2
assert not notification_bundle.notifications.filter(alert_group=alert_group_3).exists()

# send notification for 1 active alert group
notification_bundle.notifications.update(bundle_uuid=None)

# since we're calling send_bundled_notification several times within this test, we need to reset task_id
# because it gets set to None after the first call
notification_bundle.notification_task_id = task_id
notification_bundle.save()

alert_group_2.resolve()

send_bundled_notification.apply((notification_bundle.id,), task_id=task_id)

assert f"alert_group {alert_group_2.id} is not active, skip notification" in caplog.text
assert (
f"there is only one alert group in bundled notification, perform regular notification. "
f"alert_group {alert_group_1.id}"
) in caplog.text

# check bundle_uuid was set
assert notification_bundle.notifications.filter(bundle_uuid__isnull=True).count() == 0
assert notification_bundle.notifications.all().count() == 1

# cleanup notifications
notification_bundle.notifications.all().delete()

# send notification for 0 active alert group
notification_bundle.append_notification(alert_group_1, notification_policy)

# since we're calling send_bundled_notification several times within this test, we need to reset task_id
# because it gets set to None after the first call
notification_bundle.notification_task_id = task_id
notification_bundle.save()

alert_group_1.resolve()

send_bundled_notification.apply((notification_bundle.id,), task_id=task_id)

assert f"alert_group {alert_group_1.id} is not active, skip notification" in caplog.text
assert f"no alert groups to notify about or notification is not allowed for user {user.id}" in caplog.text

# check all notifications were deleted
assert notification_bundle.notifications.all().count() == 0


@pytest.mark.django_db
Expand Down
Loading
Loading