Skip to content

Commit

Permalink
feat(ReferenceCitation): use resolved_case_name and resolved_case_name_short for search and resolution
Browse files Browse the repository at this point in the history

Fixes #199

- update ReferenceCitation and FullCaseCitation model metadata to accept resolved_case_name_short and resolved_case_name
- update helpers.filter_citations to save metadata of duplicated ReferenceCitations on the kept object
- update tests to show how the finding and resolution will work
  • Loading branch information
grossir committed Feb 1, 2025
1 parent abfc7f7 commit 9805ef2
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 51 deletions.
9 changes: 5 additions & 4 deletions eyecite/find.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def get_citations(

# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = _extract_reference_citations(citation, plain_text)
references = extract_reference_citations(citation, plain_text)
citations.extend(references)

# CASE 2: Token is an "Id." or "Ibid." reference.
Expand Down Expand Up @@ -124,8 +124,9 @@ def get_citations(
return citations


def _extract_reference_citations(
citation: FullCitation, plain_text: str
def extract_reference_citations(
citation: FullCitation,
plain_text: str,
) -> List[ReferenceCitation]:
"""Extract reference citations that follow a full citation
Expand Down Expand Up @@ -156,7 +157,7 @@ def is_valid_name(name: str) -> bool:

regexes = [
rf"(?P<{key}>{re.escape(value)})"
for key in ["plaintiff", "defendant"]
for key in ReferenceCitation.name_fields
if (value := getattr(citation.metadata, key, None))
and is_valid_name(value)
]
Expand Down
14 changes: 14 additions & 0 deletions eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
FullJournalCitation,
FullLawCitation,
ParagraphToken,
ReferenceCitation,
ResourceCitation,
StopWordToken,
Token,
Expand Down Expand Up @@ -336,6 +337,19 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
last_span = last_citation.span()
current_span = citation.span()

if current_span == last_span and isinstance(
last_citation, ReferenceCitation
):
# a single ReferenceCitation may be found via different
# names. Save the name metadata to account for collisions
for field in ReferenceCitation.name_fields:
if not getattr(last_citation.metadata, field):
setattr(
last_citation.metadata,
field,
getattr(citation.metadata, field),
)

if current_span[0] <= last_span[1]:
# Remove overlapping citations that can occur in edge cases
continue
Expand Down
12 changes: 12 additions & 0 deletions eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
extra: Optional[str] = None
# May be populated after citation resolution
resolved_case_name_short: Optional[str] = None
resolved_case_name: Optional[str] = None

def add_metadata(self, words: "Tokens"):
"""Extract metadata from text before and after citation."""
Expand Down Expand Up @@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata):
plaintiff: Optional[str] = None
defendant: Optional[str] = None
pin_cite: Optional[str] = None
resolved_case_name_short: Optional[str] = None
resolved_case_name: Optional[str] = None

name_fields = [
"plaintiff",
"defendant",
"resolved_case_name_short",
"resolved_case_name",
]


@dataclass(eq=False, unsafe_hash=False, repr=False)
Expand Down
48 changes: 27 additions & 21 deletions eyecite/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,29 +84,34 @@ def _filter_by_matching_antecedent(
return matches[0] if len(matches) == 1 else None


def _filter_by_matching_plaintiff_or_defendant(
def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
resolved_full_cites: ResolvedFullCites,
plaintiff: str,
defendant: str,
reference_citation: ReferenceCitation,
) -> Optional[ResourceType]:
"""Filter out any impossible reference citations"""
"""Filter out reference citations that point to more than 1 Resource"""
matches: List[ResourceType] = []
compare_keys = [
"defendant",
"plaintiff",
"resolved_case_name",
"resolved_case_name_short",
]

for full_citation, resource in resolved_full_cites:
if not isinstance(full_citation, FullCaseCitation):
continue
defendant_match = (
defendant
and full_citation.metadata.defendant
and defendant in full_citation.metadata.defendant
)
plaintiff_match = (
plaintiff
and full_citation.metadata.plaintiff
and plaintiff in full_citation.metadata.plaintiff
)
if defendant_match or plaintiff_match:
matches.append(resource)

for key in compare_keys:
reference_value = getattr(reference_citation.metadata, key)
full_case_value = getattr(full_citation.metadata, key)
if (
reference_value
and full_case_value
and reference_value in full_case_value
):
matches.append(resource)
break

# Remove duplicates and only accept if one candidate remains
matches = list(set(matches))
return matches[0] if len(matches) == 1 else None
Expand Down Expand Up @@ -216,18 +221,19 @@ def _resolve_reference_citation(
"""Resolve reference citations
Try to resolve reference citations by checking whether there is only one
full citation that appears with either the defendant or plaintiff
full citation that appears with either the defendant or plaintiff or
resolved_case_name_short or resolved_case_name
field of any of the previously resolved full citations.
"""
if (
not reference_citation.metadata.defendant
and not reference_citation.metadata.plaintiff
and not reference_citation.metadata.resolved_case_name_short
and not reference_citation.metadata.resolved_case_name
):
return None
return _filter_by_matching_plaintiff_or_defendant(
resolved_full_cites,
reference_citation.metadata.plaintiff,
reference_citation.metadata.defendant,
return _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
resolved_full_cites, reference_citation
)


Expand Down
29 changes: 29 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from unittest import TestCase

from eyecite import clean_text, get_citations
from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations

# by default tests use a cache for speed
# call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
Expand Down Expand Up @@ -858,3 +860,30 @@ def test_citation_fullspan(self):
self.assertEqual(
extracted.full_span(), (start_idx, len(sentence)), error_msg
)

def test_reference_extraction(self):
"""Can we extract a reference citation using resolved metadata?"""
texts = [
# In this case the reference citation found via the
# resolved_case_name is redundant: it was already found by the regular
# process. Can we deduplicate?
"""See, e.g., State v. Wingler, 135 A. 2d 468 (1957);
[State v. Wingler at 175, citing, Minnesota ex rel.]""",
# In this case the resolved_case_name actually helps to find the
# reference citation
"""See, e.g., State v. W1ngler, 135 A. 2d 468 (1957);
[State v. Wingler at 175, citing, Minnesota ex rel.]""",
]
for plain_text in texts:
citations = get_citations(plain_text)
citations[0].metadata.resolved_case_name = "State v. Wingler"
references = extract_reference_citations(citations[0], plain_text)
final_citations = filter_citations(citations + references)
self.assertEqual(
len(final_citations), 2, "There should only be 2 citations"
)
self.assertEqual(
len(references),
1,
"Only a reference citation should had been picked up",
)
81 changes: 55 additions & 26 deletions tests/test_ResolveTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from unittest import TestCase

from eyecite import get_citations
from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations
from eyecite.models import FullCitation, Resource
from eyecite.resolve import resolve_citations

Expand Down Expand Up @@ -32,42 +34,53 @@ def assertResolution(self, citations, expected_resolution_dict):
)

def checkReferenceResolution(
self, *expected_resolutions: tuple[list[list[int]], str]
self,
expected_indices: list[list[int]],
citation_text: str,
resolved_case_name_short: Optional[str] = None,
):
"""
Helper function to help test reference citations.
Args:
*expected_resolutions (tuple[list[int], str]):
A list of tuples where each tuple contains:
- A list of expected indices for the resolved citations.
- A string of citation text to process.
expected_indices: A list of expected indices for the resolved
citations.
citation_text: A string of citation text to process.
resolved_case_name_short: a case name for simulating post-resolution
metadata assignment to full case citations; this will also be
used as a flag to use a second round of reference extractions
Returns:
None
"""
for expected_indices, citation_text in expected_resolutions:
citations = get_citations(citation_text)
citations = get_citations(citation_text)
if resolved_case_name_short:
citations[0].metadata.resolved_case_name_short = (
resolved_case_name_short
)
new_references = extract_reference_citations(
citations[0], citation_text
)
citations = filter_citations(citations + new_references)

# Step 2: Build a helper dict to map corrected citations to indices
resolution_index_map = {
cite.corrected_citation(): idx
for idx, cite in enumerate(citations)
}
# Step 2: Build a helper dict to map corrected citations to indices
resolution_index_map = {
cite.corrected_citation(): idx
for idx, cite in enumerate(citations)
}

# Step 3: Resolve citations and format the resolution
resolved_citations = resolve_citations(citations)
formatted_resolution = format_resolution(resolved_citations)
# Step 3: Resolve citations and format the resolution
resolved_citations = resolve_citations(citations)
formatted_resolution = format_resolution(resolved_citations)

# Step 4: Map resolved citations to their indices
result = {
key: [resolution_index_map[value] for value in values]
for key, values in formatted_resolution.items()
}
# Step 4: Map resolved citations to their indices
result = {
key: [resolution_index_map[value] for value in values]
for key, values in formatted_resolution.items()
}

# Step 5: Compare the actual results with expected indices
actual_indices = list(result.values())
self.assertEqual(expected_indices, actual_indices)
# Step 5: Compare the actual results with expected indices
actual_indices = list(result.values())
self.assertEqual(expected_indices, actual_indices)

def checkResolution(
self, *expected_resolutions: Tuple[Optional[int], str]
Expand Down Expand Up @@ -337,7 +350,7 @@ def test_complex_resolution(self):
)

def test_reference_resolution(self):
self.checkReferenceResolution(
for test_tuple in (
([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"),
([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"),
(
Expand All @@ -352,4 +365,20 @@ def test_reference_resolution(self):
[[0, 2], [1]],
"Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that",
),
)
# Ok resolved_case_name and order, ReferenceCitation should be resolved
(
[[0, 1], [2]],
"State v. Dze 3 U.S. 22; something something. In Doe at 122, something more. In State v. Doe 4 U.S. 33",
"Doe",
),
# due to the reference matching more than 1 full citation, we don't
# resolve
(
[[0], [1]],
"State v. Smlth 3 U.S. 22; something something. In State v. Smith 4 U.S. 33. In Smith at 122, something more",
"Smith",
),
# ambiguous resolved_case_name, ReferenceCitation should not be
# resolved
):
self.checkReferenceResolution(*test_tuple)

0 comments on commit 9805ef2

Please sign in to comment.