fix(grouping): Fix unmerged seer grouphash integrity error #83081

Merged · 2 commits · Jan 9, 2025
migrations_lockfile.txt (1 addition, 1 deletion)

@@ -15,7 +15,7 @@ remote_subscriptions: 0003_drop_remote_subscription

replays: 0004_index_together

-sentry: 0807_remove_monitor_attachment_id_pt2
+sentry: 0808_change_grouphash_metadata_seer_matched_grouphash_deletion_config

social_auth: 0002_default_auto_field

src/sentry/migrations/0808_change_grouphash_metadata_seer_matched_grouphash_deletion_config.py (new file, 40 additions)

@@ -0,0 +1,40 @@
# Generated by Django 5.1.4 on 2025-01-08 22:51

import django.db.models.deletion
from django.db import migrations

import sentry.db.models.fields.foreignkey
from sentry.new_migrations.migrations import CheckedMigration


class Migration(CheckedMigration):
# This flag is used to mark that a migration shouldn't be automatically run in production.
# This should only be used for operations where it's safe to run the migration after your
# code has deployed. So this should not be used for most operations that alter the schema
# of a table.
# Here are some things that make sense to mark as post deployment:
# - Large data migrations. Typically we want these to be run manually so that they can be
# monitored and not block the deploy for a long period of time while they run.
# - Adding indexes to large tables. Since this can take a long time, we'd generally prefer to
# run this outside deployments so that we don't block them. Note that while adding an index
# is a schema change, it's completely safe to run the operation after the code has deployed.
# Once deployed, run these manually via: https://develop.sentry.dev/database-migrations/#migration-deployment

is_post_deployment = False

dependencies = [
("sentry", "0807_remove_monitor_attachment_id_pt2"),
]

operations = [
migrations.AlterField(
model_name="grouphashmetadata",
name="seer_matched_grouphash",
field=sentry.db.models.fields.foreignkey.FlexibleForeignKey(
null=True,
on_delete=django.db.models.deletion.SET_NULL,
related_name="seer_matchees",
to="sentry.grouphash",
),
),
]
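For context, a minimal sketch of what this on_delete change means at the ORM level. The models here are hypothetical stand-ins for illustration, not Sentry's actual classes:

# Hypothetical models illustrating DO_NOTHING vs. SET_NULL deletion behavior.
from django.db import models

class Parent(models.Model):
    pass

class Child(models.Model):
    # With on_delete=models.DO_NOTHING, deleting a Parent leaves this column
    # pointing at a row that no longer exists, and the database's foreign key
    # constraint raises an integrity error at delete time.
    # With on_delete=models.SET_NULL (as in this PR), Django issues an UPDATE
    # setting the column to NULL before the Parent row is deleted, so the
    # Child row survives with a cleanly severed link.
    parent = models.ForeignKey(
        Parent, null=True, on_delete=models.SET_NULL, related_name="children"
    )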
src/sentry/models/grouphashmetadata.py (1 addition, 1 deletion)

@@ -78,7 +78,7 @@ class GroupHashMetadata(Model):
seer_model = models.CharField(null=True)
# The `GroupHash` record representing the match Seer sent back as a match (if any)
seer_matched_grouphash = FlexibleForeignKey(
-    "sentry.GroupHash", related_name="seer_matchees", on_delete=models.DO_NOTHING, null=True
+    "sentry.GroupHash", related_name="seer_matchees", on_delete=models.SET_NULL, null=True
)
# The similarity between this hash's stacktrace and the parent (matched) hash's stacktrace
seer_match_distance = models.FloatField(null=True)

Review thread on the on_delete change:

Member:
lgtm, there shouldn't be too many GroupHashMetadata per GroupHash, correct?

Member (Author):
It's technically possible for it to be a large number, but most often it's not.

Also, I assume (?) that the null setting happens only when delete is called on the linked object, yes? If so, then in cases where there's been no manual unmerge (the 99.9% case), all of the GroupHashMetadata records pointing at the GroupHash instance will already have been deleted by the time we get to the null setting, as they'll belong to sibling GroupHash records of the GroupHash record in question. (See point 3 in the first list in the PR description.) So only in the 0.1% case will there even be any remaining GroupHashMetadata records on which to update the seer_matched_grouphash value.
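To make the reviewer's cost question concrete: for a SET_NULL relation, Django's deletion collector clears the referencing rows with a bulk UPDATE before issuing the DELETE. A simplified sketch of the equivalent ORM calls follows; the real collector batches these inside a transaction, and this is an illustration, not Sentry's actual deletion code:

# Roughly what deleting a GroupHash implies once the FK uses SET_NULL.
from django.db import transaction

from sentry.models.grouphash import GroupHash
from sentry.models.grouphashmetadata import GroupHashMetadata

def delete_grouphash_sketch(grouphash: GroupHash) -> None:
    with transaction.atomic():
        # One bulk UPDATE clears every GroupHashMetadata row that references
        # this grouphash, so the cost scales with the number of matchees,
        # not with total table size.
        GroupHashMetadata.objects.filter(seer_matched_grouphash=grouphash).update(
            seer_matched_grouphash=None
        )
        # With the references cleared, the DELETE satisfies the FK constraint.
        GroupHash.objects.filter(id=grouphash.id).delete()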
tests/sentry/deletions/test_grouphash.py (new file, 166 additions)

@@ -0,0 +1,166 @@
from dataclasses import asdict
from unittest.mock import patch

from sentry.conf.server import SEER_SIMILARITY_MODEL_VERSION
from sentry.deletions.tasks.groups import delete_groups
from sentry.models.group import Group
from sentry.models.grouphash import GroupHash
from sentry.models.grouphashmetadata import GroupHashMetadata
from sentry.seer.similarity.types import SeerSimilarIssueData
from sentry.tasks.unmerge import unmerge
from sentry.testutils.cases import TestCase
from sentry.testutils.helpers.features import apply_feature_flag_on_cls
from sentry.testutils.skips import requires_snuba

pytestmark = [requires_snuba]


@apply_feature_flag_on_cls("organizations:grouphash-metadata-creation")
class DeleteGroupHashTest(TestCase):
def test_deleting_group_deletes_grouphash_and_metadata(self):
event = self.store_event(data={"message": "Dogs are great!"}, project_id=self.project.id)
assert event.group
group_id = event.group.id

grouphash = GroupHash.objects.filter(group_id=group_id).first()
assert grouphash

grouphash_metadata = GroupHashMetadata.objects.filter(grouphash_id=grouphash.id).first()
assert grouphash_metadata

with self.tasks():
delete_groups(object_ids=[group_id])

assert not Group.objects.filter(id=group_id).exists()
assert not GroupHash.objects.filter(group_id=group_id).exists()
assert not GroupHashMetadata.objects.filter(grouphash_id=grouphash.id).exists()

def test_deleting_grouphash_matched_by_seer(self):
existing_event = self.store_event(
data={"message": "Dogs are great!"}, project_id=self.project.id
)
assert existing_event.group
existing_group_id = existing_event.group.id

existing_grouphash = GroupHash.objects.filter(group_id=existing_group_id).first()
assert existing_grouphash

seer_result_data = SeerSimilarIssueData(
parent_hash=existing_event.get_primary_hash(),
parent_group_id=existing_group_id,
stacktrace_distance=0.01,
should_group=True,
)

with (
patch("sentry.grouping.ingest.seer.should_call_seer_for_grouping", return_value=True),
patch(
"sentry.grouping.ingest.seer.get_seer_similar_issues",
return_value=(
{
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"results": [asdict(seer_result_data)],
},
existing_grouphash,
),
),
):
new_event = self.store_event(
data={"message": "Adopt, don't shop"}, project_id=self.project.id
)
new_grouphash = GroupHash.objects.filter(hash=new_event.get_primary_hash()).first()
assert new_grouphash and new_grouphash.metadata

assert new_grouphash != existing_grouphash
assert new_event.group_id == existing_group_id
assert new_grouphash.metadata.seer_matched_grouphash == existing_grouphash

with self.tasks():
delete_groups(object_ids=[existing_group_id])

assert not Group.objects.filter(id=existing_group_id).exists()
assert not GroupHash.objects.filter(group_id=existing_group_id).exists()
assert not GroupHashMetadata.objects.filter(grouphash_id=existing_grouphash.id).exists()
assert not GroupHashMetadata.objects.filter(grouphash_id=new_grouphash.id).exists()

def test_deleting_grouphash_matched_by_seer_after_unmerge(self):
"""
Ensure that `seer_matched_grouphash` references aren't left dangling (and causing integrity
errors) when the matched grouphash is deleted.
"""
existing_event = self.store_event(
data={"message": "Dogs are great!"}, project_id=self.project.id
)
assert existing_event.group

existing_grouphash = GroupHash.objects.filter(
hash=existing_event.get_primary_hash()
).first()
assert existing_grouphash

seer_result_data = SeerSimilarIssueData(
parent_hash=existing_event.get_primary_hash(),
parent_group_id=existing_event.group.id,
stacktrace_distance=0.01,
should_group=True,
)

with (
patch("sentry.grouping.ingest.seer.should_call_seer_for_grouping", return_value=True),
patch(
"sentry.grouping.ingest.seer.get_seer_similar_issues",
return_value=(
{
"similarity_model_version": SEER_SIMILARITY_MODEL_VERSION,
"results": [asdict(seer_result_data)],
},
existing_grouphash,
),
),
):
new_event = self.store_event(
data={"message": "Adopt, don't shop"}, project_id=self.project.id
)
new_grouphash = GroupHash.objects.filter(hash=new_event.get_primary_hash()).first()
assert new_grouphash and new_grouphash.metadata

assert new_grouphash != existing_grouphash
assert new_event.group_id == existing_event.group.id
assert new_grouphash.metadata.seer_matched_grouphash == existing_grouphash

with self.tasks():
unmerge.delay(
self.project.id,
existing_event.group.id,
None,
[new_grouphash.hash],
None,
)

# Pull the grouphashes from the DB again to check updated values
existing_grouphash = GroupHash.objects.filter(
hash=existing_event.get_primary_hash()
).first()
new_grouphash = GroupHash.objects.filter(hash=new_event.get_primary_hash()).first()
assert existing_grouphash and new_grouphash

# The grouphashes now point to different groups, but the `seer_matched_grouphash`
# link remains
assert existing_grouphash.group_id != new_grouphash.group_id
assert (
new_grouphash.metadata
and new_grouphash.metadata.seer_matched_grouphash == existing_grouphash
)

with self.tasks():
delete_groups(object_ids=[existing_event.group.id])

assert not Group.objects.filter(id=existing_event.group.id).exists()
assert not GroupHash.objects.filter(group_id=existing_event.group.id).exists()
assert not GroupHashMetadata.objects.filter(grouphash_id=existing_grouphash.id).exists()

# The unmerged grouphash and its metadata remain, but the `seer_matched_grouphash` link has
# been broken so as not to cause integrity errors
new_grouphash = GroupHash.objects.filter(hash=new_event.get_primary_hash()).first()
assert new_grouphash and new_grouphash.metadata
assert new_grouphash.metadata.seer_matched_grouphash is None