Skip to content

Commit

Permalink
Full-text search for commits for PerfCompare using SearchVector (#8533)
Browse files Browse the repository at this point in the history
* add searchvector to commit table and index using GIN

* add ordering by push time and set result limit to 200

* add test for new search functionality

* handle duplicate revisions from different projects

* resolve migration conflict

* resolve migration issues

* resolve merge conflicts

* add limit to comments field in search vector

* update index to use substr for comments
  • Loading branch information
Netacci authored Feb 28, 2025
1 parent da145c8 commit 3bbe246
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 5 deletions.
89 changes: 88 additions & 1 deletion tests/webapp/api/test_push_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from tests.conftest import IS_WINDOWS
from treeherder.etl.push import store_push_data
from treeherder.model.models import FailureClassification, JobNote, Push
from treeherder.model.models import Commit, FailureClassification, JobNote, Push
from treeherder.webapp.api import utils


Expand Down Expand Up @@ -379,6 +379,93 @@ def test_push_author_contains(client, test_repository):
assert results[0]["id"] == 3


def test_push_search(client, test_repository):
    """
    Test the search parameter for filtering by Commit fields: revision, author, comments.

    Three pushes are created, each with exactly one commit, and the ``search``
    query parameter of the ``push-list`` endpoint is exercised against the
    commit comments, a bug number, the commit author and the revision. An
    empty ``search`` value must not filter anything out.
    """
    now = datetime.datetime.now()
    push1 = Push.objects.create(
        repository=test_repository,
        revision="1234abcd",
        author="[email protected]",
        time=now,
    )
    push2 = Push.objects.create(
        repository=test_repository,
        revision="2234abcd",
        author="[email protected]",
        time=now + datetime.timedelta(seconds=1),
    )
    push3 = Push.objects.create(
        repository=test_repository,
        revision="3234abcd",
        author="[email protected]",
        time=now + datetime.timedelta(seconds=2),
    )

    # Add Commit objects linked to the Push objects
    Commit.objects.create(
        push=push1, revision="1234abcd", author="kaz <[email protected]>", comments="Initial commit"
    )
    Commit.objects.create(
        push=push2, revision="2234abcd", author="foo <[email protected]>", comments="Bug 12345567 - fix"
    )
    Commit.objects.create(
        push=push3,
        revision="3234abcd",
        author="quxzan <qux@bar>.com",
        comments="Bug 12345567 - Feature added",
    )

    url = reverse("push-list", kwargs={"project": test_repository.name})

    def search_ids(term):
        """Request ``?search=<term>`` and return the push ids in response order."""
        resp = client.get(f"{url}?search={term}")
        assert resp.status_code == 200
        return [result["id"] for result in resp.json()["results"]]

    # Expected ids are taken from the created pushes rather than hard-coded, so
    # the test does not depend on the database's id sequence starting at 1.

    # Test search by comments
    ids = search_ids("bug")
    assert len(ids) == 2
    assert set(ids) == {push3.id, push2.id}

    # Test search by bug number
    ids = search_ids("12345567")
    assert len(ids) == 2
    assert set(ids) == {push3.id, push2.id}

    # Test search by author
    assert search_ids("foo") == [push2.id]

    # Test search by revision
    assert search_ids("3234abcd") == [push3.id]

    # Test empty search input: no filtering is applied, all pushes come back
    ids = search_ids("")
    assert len(ids) == 3
    assert set(ids) == {push3.id, push2.id, push1.id}


def test_push_reviewbot(client, test_repository):
"""
test the reviewbot parameter
Expand Down
1 change: 1 addition & 0 deletions treeherder/config/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
INSTALLED_APPS = [
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.postgres.search",
# Disable Django's own staticfiles handling in favour of WhiteNoise, for
# greater consistency between gunicorn and `./manage.py runserver`.
"whitenoise.runserver_nostatic",
Expand Down
28 changes: 28 additions & 0 deletions treeherder/model/migrations/0038_commit_search_vector_idx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Generated by Django 5.1.5 on 2025-02-27 18:06

import django.contrib.postgres.indexes
import django.contrib.postgres.search
import django.db.models.functions.text
from django.db import migrations


class Migration(migrations.Migration):
    """Add the ``search_vector_idx`` GIN index to the ``commit`` model.

    The index is built on a full-text ``SearchVector`` expression over the
    ``revision`` and ``author`` fields plus a prefix of ``comments``, using the
    ``english`` text search configuration.
    """

    # Must run after the previous migration of the ``model`` app.
    dependencies = [
        ("model", "0037_bugjobmap_internal_bug_refs"),
    ]

    operations = [
        migrations.AddIndex(
            model_name="commit",
            index=django.contrib.postgres.indexes.GinIndex(
                # Expression index: the indexed value is the search vector itself,
                # not a stored column.
                django.contrib.postgres.search.SearchVector(
                    "revision",
                    "author",
                    # Only the first 100000 characters of ``comments`` are indexed.
                    # NOTE(review): presumably this keeps very long commit messages
                    # within PostgreSQL's tsvector size limit — confirm.
                    django.db.models.functions.text.Substr("comments", 1, 100000),
                    config="english",
                ),
                name="search_vector_idx",
            ),
        ),
    ]
10 changes: 9 additions & 1 deletion treeherder/model/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,14 @@
import newrelic.agent
from django.conf import settings
from django.contrib.auth.models import User
from django.contrib.postgres.search import TrigramSimilarity
from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVector, TrigramSimilarity
from django.core.cache import cache
from django.core.exceptions import ObjectDoesNotExist
from django.core.validators import MinLengthValidator
from django.db import models, transaction
from django.db.models import Count, Max, Min, Q, Subquery
from django.db.models.functions import Substr
from django.db.utils import ProgrammingError
from django.forms import model_to_dict
from django.utils import timezone
Expand Down Expand Up @@ -189,6 +191,12 @@ class Commit(models.Model):
class Meta:
    db_table = "commit"
    # A given revision may be stored only once per push.
    unique_together = ("push", "revision")
    indexes = [
        # GIN expression index over a full-text search vector built from
        # ``revision``, ``author`` and the first 100000 characters of
        # ``comments`` with the ``english`` configuration.
        # NOTE(review): queries presumably have to build the search vector
        # with this exact expression for the index to be usable — confirm
        # against the callers before changing either side.
        GinIndex(
            SearchVector("revision", "author", Substr("comments", 1, 100000), config="english"),
            name="search_vector_idx",
        ),
    ]

def __str__(self):
    """Return ``"<repository name> <revision>"`` for this commit."""
    repository_name = self.push.repository.name
    return " ".join((f"{repository_name}", f"{self.revision}"))
Expand Down
25 changes: 22 additions & 3 deletions treeherder/webapp/api/push.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@

import newrelic.agent
from cache_memoize import cache_memoize
from django.contrib.postgres.search import SearchQuery, SearchVector
from django.db.models.functions import Substr
from rest_framework import viewsets
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.status import HTTP_400_BAD_REQUEST, HTTP_404_NOT_FOUND

from treeherder.log_parser.failureline import get_group_results
from treeherder.model.models import Job, JobType, Push, Repository
from treeherder.model.models import Commit, Job, JobType, Push, Repository
from treeherder.push_health.builds import get_build_failures
from treeherder.push_health.compare import get_commit_history
from treeherder.push_health.linting import get_lint_failures
Expand Down Expand Up @@ -42,7 +44,6 @@ def list(self, request, project):

# This will contain some meta data about the request and results
meta = {}

# support ranges for date as well as revisions(changes) like old tbpl
for param in [
"fromchange",
Expand All @@ -60,7 +61,6 @@ def list(self, request, project):
all_repos = request.query_params.get("all_repos")

pushes = Push.objects.order_by("-time")

if not all_repos:
try:
repository = Repository.objects.get(name=project)
Expand All @@ -71,6 +71,25 @@ def list(self, request, project):

pushes = pushes.filter(repository=repository)

search_param = filter_params.get("search")
if search_param:
repository = Repository.objects.get(name=project)
filtered_commits = (
Commit.objects.annotate(
search=SearchVector(
"revision", "author", Substr("comments", 1, 100000), config="english"
)
)
.filter(
search=SearchQuery(search_param, config="english"),
push__repository=repository,
)
.values_list("push_id", flat=True)
# Get most recent results and limit result to 200
.order_by("-push__time")
.distinct()[:200]
)
pushes = pushes.filter(id__in=filtered_commits)
for param, value in meta.items():
if param == "fromchange":
revision_field = "revision__startswith" if len(value) < 40 else "revision"
Expand Down

0 comments on commit 3bbe246

Please sign in to comment.