diff --git a/eyecite/find.py b/eyecite/find.py index c3cc4c3..90985be 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -79,7 +79,7 @@ def get_citations( # Check for reference citations that follow a full citation # Using the plaintiff or defendant - references = _extract_reference_citations(citation, plain_text) + references = extract_reference_citations(citation, plain_text) citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. @@ -124,8 +124,9 @@ def get_citations( return citations -def _extract_reference_citations( - citation: FullCitation, plain_text: str +def extract_reference_citations( + citation: FullCitation, + plain_text: str, ) -> List[ReferenceCitation]: """Extract reference citations that follow a full citation @@ -156,7 +157,7 @@ def is_valid_name(name: str) -> bool: regexes = [ rf"(?P<{key}>{re.escape(value)})" - for key in ["plaintiff", "defendant"] + for key in ReferenceCitation.name_fields if (value := getattr(citation.metadata, key, None)) and is_valid_name(value) ] diff --git a/eyecite/helpers.py b/eyecite/helpers.py index 9ab88f5..82e63b5 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -12,6 +12,7 @@ FullJournalCitation, FullLawCitation, ParagraphToken, + ReferenceCitation, ResourceCitation, StopWordToken, Token, @@ -336,6 +337,19 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]: last_span = last_citation.span() current_span = citation.span() + if current_span == last_span and isinstance( + last_citation, ReferenceCitation + ): + # a single ReferenceCitation may be found via different + # names. 
Save the name metadata to account for collisions + for field in ReferenceCitation.name_fields: + if not getattr(last_citation.metadata, field): + setattr( + last_citation.metadata, + field, + getattr(citation.metadata, field), + ) + if current_span[0] <= last_span[1]: # Remove overlapping citations that can occur in edge cases continue diff --git a/eyecite/models.py b/eyecite/models.py index c0fc5e2..cf3db60 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata): plaintiff: Optional[str] = None defendant: Optional[str] = None extra: Optional[str] = None + # May be populated after citation resolution + resolved_case_name_short: Optional[str] = None + resolved_case_name: Optional[str] = None def add_metadata(self, words: "Tokens"): """Extract metadata from text before and after citation.""" @@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata): plaintiff: Optional[str] = None defendant: Optional[str] = None pin_cite: Optional[str] = None + resolved_case_name_short: Optional[str] = None + resolved_case_name: Optional[str] = None + + name_fields = [ + "plaintiff", + "defendant", + "resolved_case_name_short", + "resolved_case_name", + ] @dataclass(eq=False, unsafe_hash=False, repr=False) diff --git a/eyecite/resolve.py b/eyecite/resolve.py index 7f09ccf..2ce5051 100644 --- a/eyecite/resolve.py +++ b/eyecite/resolve.py @@ -84,29 +84,34 @@ def _filter_by_matching_antecedent( return matches[0] if len(matches) == 1 else None -def _filter_by_matching_plaintiff_or_defendant( +def _filter_by_matching_plaintiff_or_defendant_or_resolved_names( resolved_full_cites: ResolvedFullCites, - plaintiff: str, - defendant: str, + reference_citation: ReferenceCitation, ) -> Optional[ResourceType]: - """Filter out any impossible reference citations""" + """Filter out reference citations that point to more than 1 Resource""" matches: List[ResourceType] = [] + compare_keys = [ + "defendant", + "plaintiff", + "resolved_case_name", 
+ "resolved_case_name_short", + ] for full_citation, resource in resolved_full_cites: if not isinstance(full_citation, FullCaseCitation): continue - defendant_match = ( - defendant - and full_citation.metadata.defendant - and defendant in full_citation.metadata.defendant - ) - plaintiff_match = ( - plaintiff - and full_citation.metadata.plaintiff - and plaintiff in full_citation.metadata.plaintiff - ) - if defendant_match or plaintiff_match: - matches.append(resource) + + for key in compare_keys: + reference_value = getattr(reference_citation.metadata, key) + full_case_value = getattr(full_citation.metadata, key) + if ( + reference_value + and full_case_value + and reference_value in full_case_value + ): + matches.append(resource) + break + # Remove duplicates and only accept if one candidate remains matches = list(set(matches)) return matches[0] if len(matches) == 1 else None @@ -216,18 +221,19 @@ def _resolve_reference_citation( """Resolve reference citations Try to resolve reference citations by checking whether their is only one - full citation that appears with either the defendant or plaintiff + full citation that appears with either the defendant or plaintiff or + resolved_case_name_short or resolved_case_name field of any of the previously resolved full citations. 
""" if ( not reference_citation.metadata.defendant and not reference_citation.metadata.plaintiff + and not reference_citation.metadata.resolved_case_name_short + and not reference_citation.metadata.resolved_case_name ): return None - return _filter_by_matching_plaintiff_or_defendant( - resolved_full_cites, - reference_citation.metadata.plaintiff, - reference_citation.metadata.defendant, + return _filter_by_matching_plaintiff_or_defendant_or_resolved_names( + resolved_full_cites, reference_citation ) diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index 8f2da29..291c9ae 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -4,6 +4,8 @@ from unittest import TestCase from eyecite import clean_text, get_citations +from eyecite.find import extract_reference_citations +from eyecite.helpers import filter_citations # by default tests use a cache for speed # call tests with `EYECITE_CACHE_DIR= python ...` to disable cache @@ -858,3 +860,30 @@ def test_citation_fullspan(self): self.assertEqual( extracted.full_span(), (start_idx, len(sentence)), error_msg ) + + def test_reference_extraction(self): + """Can we extract a reference citation using resolved metadata?""" + texts = [ + # Here the reference citation found via resolved_case_name + # is redundant: the regular process has already found it. + # Can we deduplicate? + """See, e.g., State v. Wingler, 135 A. 2d 468 (1957); + [State v. Wingler at 175, citing, Minnesota ex rel.]""", + # In this case the resolved_case_name actually helps to find the + # reference citation + """See, e.g., State v. W1ngler, 135 A. 2d 468 (1957); + [State v. Wingler at 175, citing, Minnesota ex rel.]""", + ] + for plain_text in texts: + citations = get_citations(plain_text) + citations[0].metadata.resolved_case_name = "State v. 
Wingler" + references = extract_reference_citations(citations[0], plain_text) + final_citations = filter_citations(citations + references) + self.assertEqual( + len(final_citations), 2, "There should only be 2 citations" + ) + self.assertEqual( + len(references), + 1, + "Only a reference citation should had been picked up", + ) diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 6f64412..51f0b53 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -3,6 +3,8 @@ from unittest import TestCase from eyecite import get_citations +from eyecite.find import extract_reference_citations +from eyecite.helpers import filter_citations from eyecite.models import FullCitation, Resource from eyecite.resolve import resolve_citations @@ -32,42 +34,53 @@ def assertResolution(self, citations, expected_resolution_dict): ) def checkReferenceResolution( - self, *expected_resolutions: tuple[list[list[int]], str] + self, + expected_indices: list[list[int]], + citation_text: str, + resolved_case_name_short: Optional[str] = None, ): """ Helper function to help test reference citations. Args: - *expected_resolutions (tuple[list[int], str]): - A list of tuples where each tuple contains: - - A list of expected indices for the resolved citations. - - A string of citation text to process. - + expected_indices: A list of expected indices for the resolved + citations. + citation_text: A string of citation text to process. 
+ resolved_case_name_short: a case name for simulating post-resolution + metadata assignment to full case citations; this will also be + used as a flag to use a second round of reference extractions Returns: None """ - for expected_indices, citation_text in expected_resolutions: - citations = get_citations(citation_text) + citations = get_citations(citation_text) + if resolved_case_name_short: + citations[0].metadata.resolved_case_name_short = ( + resolved_case_name_short + ) + new_references = extract_reference_citations( + citations[0], citation_text + ) + citations = filter_citations(citations + new_references) - # Step 2: Build a helper dict to map corrected citations to indices - resolution_index_map = { - cite.corrected_citation(): idx - for idx, cite in enumerate(citations) - } + # Step 2: Build a helper dict to map corrected citations to indices + resolution_index_map = { + cite.corrected_citation(): idx + for idx, cite in enumerate(citations) + } - # Step 3: Resolve citations and format the resolution - resolved_citations = resolve_citations(citations) - formatted_resolution = format_resolution(resolved_citations) + # Step 3: Resolve citations and format the resolution + resolved_citations = resolve_citations(citations) + formatted_resolution = format_resolution(resolved_citations) - # Step 4: Map resolved citations to their indices - result = { - key: [resolution_index_map[value] for value in values] - for key, values in formatted_resolution.items() - } + # Step 4: Map resolved citations to their indices + result = { + key: [resolution_index_map[value] for value in values] + for key, values in formatted_resolution.items() + } - # Step 5: Compare the actual results with expected indices - actual_indices = list(result.values()) - self.assertEqual(expected_indices, actual_indices) + # Step 5: Compare the actual results with expected indices + actual_indices = list(result.values()) + self.assertEqual(expected_indices, actual_indices) def checkResolution( 
self, *expected_resolutions: Tuple[Optional[int], str] @@ -337,7 +350,7 @@ def test_complex_resolution(self): ) def test_reference_resolution(self): - self.checkReferenceResolution( + for test_tuple in ( ([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"), ([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"), ( @@ -352,4 +365,20 @@ def test_reference_resolution(self): [[0, 2], [1]], "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that", ), - ) + # Valid resolved_case_name and order: ReferenceCitation should resolve + ( + [[0, 1], [2]], + "State v. Dze 3 U.S. 22; something something. In Doe at 122, something more. In State v. Doe 4 U.S. 33", + "Doe", + ), + # Because the reference matches more than one full citation, we + # do not resolve it + ( + [[0], [1]], + "State v. Smlth 3 U.S. 22; something something. In State v. Smith 4 U.S. 33. In Smith at 122, something more", + "Smith", + ), + # ambiguous resolved_case_name, ReferenceCitation should not be + # resolved + ): + self.checkReferenceResolution(*test_tuple)