diff --git a/eyecite/find.py b/eyecite/find.py index 7c4d975..c3cc4c3 100644 --- a/eyecite/find.py +++ b/eyecite/find.py @@ -1,12 +1,15 @@ +import re from typing import List, Type, cast from eyecite.helpers import ( disambiguate_reporters, extract_pin_cite, + filter_citations, joke_cite, match_on_tokens, ) from eyecite.models import ( + CaseReferenceToken, CitationBase, CitationToken, FullCaseCitation, @@ -15,6 +18,7 @@ FullLawCitation, IdCitation, IdToken, + ReferenceCitation, ResourceCitation, SectionToken, ShortCaseCitation, @@ -54,7 +58,7 @@ def get_citations( return joke_cite words, citation_tokens = tokenizer.tokenize(plain_text) - citations = [] + citations: list[CitationBase] = [] for i, token in citation_tokens: citation: CitationBase @@ -70,6 +74,13 @@ def get_citations( citation = _extract_shortform_citation(words, i) else: citation = _extract_full_citation(words, i) + if citations and isinstance(citation, FullCitation): + citation.is_parallel_citation(citations[-1]) + + # Check for reference citations that follow a full citation + # Using the plaintiff or defendant + references = _extract_reference_citations(citation, plain_text) + citations.extend(references) # CASE 2: Token is an "Id." or "Ibid." reference. # In this case, the citation should simply be to the item cited @@ -99,6 +110,8 @@ def get_citations( citations.append(citation) + citations = filter_citations(citations) + # Remove citations with multiple reporter candidates where we couldn't # guess correct reporter if remove_ambiguous: @@ -107,10 +120,72 @@ def get_citations( # Returns a list of citations ordered in the sequence that they appear in # the document. The ordering of this list is important for reconstructing # the references of the ShortCaseCitation, SupraCitation, and - # IdCitation objects. + # IdCitation and ReferenceCitation objects. 
return citations +def _extract_reference_citations( + citation: FullCitation, plain_text: str +) -> List[ReferenceCitation]: + """Extract reference citations that follow a full citation + + :param citation: the full case citation found + :param plain_text: the text + :return: Pin cite reference citations + """ + if len(plain_text) <= citation.span()[-1]: + return [] + if not isinstance(citation, FullCaseCitation): + return [] + + def is_valid_name(name: str) -> bool: + """Validate name isn't a regex issue + + Excludes strings like Co., numbers or lower case strs + + :param name: The name to check + :return: True if usable, false if not + """ + return ( + isinstance(name, str) + and len(name) > 2 + and name[0].isupper() + and not name.endswith(".") + and not name.isdigit() + ) + + regexes = [ + rf"(?P<{key}>{re.escape(value)})" + for key in ["plaintiff", "defendant"] + if (value := getattr(citation.metadata, key, None)) + and is_valid_name(value) + ] + if not regexes: + return [] + pin_cite_re = ( + rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b" + ) + reference_citations = [] + remaining_text = plain_text[citation.span()[-1] :] + offset = citation.span()[-1] + for match in re.compile(pin_cite_re).finditer(remaining_text): + start, end = match.span() + matched_text = match.group(0) + reference = ReferenceCitation( + token=CaseReferenceToken( + data=matched_text, start=start + offset, end=end + offset + ), + span_start=start + offset, + span_end=end + offset, + full_span_start=start + offset, + full_span_end=end + offset, + index=0, + metadata=match.groupdict(), + ) + reference_citations.append(reference) + return reference_citations + + def _extract_full_citation( words: Tokens, index: int, diff --git a/eyecite/helpers.py b/eyecite/helpers.py index 34b9e58..9ab88f5 100644 --- a/eyecite/helpers.py +++ b/eyecite/helpers.py @@ -141,9 +141,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None: break if start_index: citation.full_span_start = 
citation.span()[0] - offset - citation.metadata.defendant = "".join( + defendant = "".join( str(w) for w in words[start_index : citation.index] ).strip(", ") + if defendant.strip(): + citation.metadata.defendant = defendant def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None: @@ -315,6 +317,32 @@ def disambiguate_reporters( ] +def filter_citations(citations: List[CitationBase]) -> List[CitationBase]: + """Filter and order citations, ensuring reference citations are in sequence + + This function resolves rare but possible overlaps between ref. citations + and short citations. It also orders all citations by their `citation.span`, + as reference citations may be extracted out of order. The final result is a + properly sorted list of citations as they appear in the text + + :param citations: List of citations + :return: Sorted and filtered citations + """ + filtered_citations: List[CitationBase] = [] + sorted_citations = sorted(citations, key=lambda citation: citation.span()) + for citation in sorted_citations: + if filtered_citations: + last_citation = filtered_citations[-1] + last_span = last_citation.span() + current_span = citation.span() + + if current_span[0] <= last_span[1]: + # Remove overlapping citations that can occur in edge cases + continue + filtered_citations.append(citation) + return filtered_citations + + joke_cite: List[CitationBase] = [ FullCaseCitation( CitationToken( diff --git a/eyecite/models.py b/eyecite/models.py index 69a7d9a..c0fc5e2 100644 --- a/eyecite/models.py +++ b/eyecite/models.py @@ -301,6 +301,26 @@ class FullCitation(ResourceCitation): """Abstract base class indicating that a citation fully identifies a resource.""" + def is_parallel_citation(self, preceding: CitationBase): + """Check if preceding citation is parallel + + Args: + preceding (): The previous citation found + + Returns: None + """ + is_parallel = ( + self.full_span_start == preceding.full_span_start + and self.full_span_end == 
preceding.full_span_end + and isinstance(preceding, FullCaseCitation) + ) + if is_parallel: + # if parallel get plaintiff/defendant data from + # the earlier citation, since it won't be on the + # parallel one. + self.metadata.defendant = preceding.metadata.defendant + self.metadata.plaintiff = preceding.metadata.plaintiff + @dataclass(eq=False, unsafe_hash=False, repr=False) class FullLawCitation(FullCitation): @@ -566,6 +586,26 @@ def formatted(self): return "".join(parts) +@dataclass(eq=False, unsafe_hash=False, repr=False) +class ReferenceCitation(CitationBase): + """A reference citation is a citation that refers to + a full case citation by name and pincite alone. + + Future versions will hopefully drop the pincite requirement + + Examples: + Roe at 240 + """ + + @dataclass(eq=True, unsafe_hash=True) + class Metadata(CitationBase.Metadata): + """Define fields on self.metadata.""" + + plaintiff: Optional[str] = None + defendant: Optional[str] = None + pin_cite: Optional[str] = None + + @dataclass(eq=False, unsafe_hash=False, repr=False) class UnknownCitation(CitationBase): """Convenience class which represents an unknown citation. 
A recognized @@ -679,6 +719,11 @@ class StopWordToken(Token): """Word matching one of the STOP_TOKENS.""" +@dataclass(eq=True, unsafe_hash=True) +class CaseReferenceToken(Token): + """Word matching plaintiff or defendant in a full case citation""" + + @dataclass class TokenExtractor: """Class for extracting all matches from a given string for the given diff --git a/eyecite/resolve.py b/eyecite/resolve.py index 5001cb9..7f09ccf 100644 --- a/eyecite/resolve.py +++ b/eyecite/resolve.py @@ -7,6 +7,7 @@ FullCaseCitation, FullCitation, IdCitation, + ReferenceCitation, Resource, ResourceType, ShortCaseCitation, @@ -83,6 +84,34 @@ def _filter_by_matching_antecedent( return matches[0] if len(matches) == 1 else None +def _filter_by_matching_plaintiff_or_defendant( + resolved_full_cites: ResolvedFullCites, + plaintiff: str, + defendant: str, +) -> Optional[ResourceType]: + """Filter out any impossible reference citations""" + matches: List[ResourceType] = [] + + for full_citation, resource in resolved_full_cites: + if not isinstance(full_citation, FullCaseCitation): + continue + defendant_match = ( + defendant + and full_citation.metadata.defendant + and defendant in full_citation.metadata.defendant + ) + plaintiff_match = ( + plaintiff + and full_citation.metadata.plaintiff + and plaintiff in full_citation.metadata.plaintiff + ) + if defendant_match or plaintiff_match: + matches.append(resource) + # Remove duplicates and only accept if one candidate remains + matches = list(set(matches)) + return matches[0] if len(matches) == 1 else None + + def _has_invalid_pin_cite( full_cite: FullCitation, id_cite: IdCitation ) -> bool: @@ -180,6 +209,28 @@ def _resolve_supra_citation( ) +def _resolve_reference_citation( + reference_citation: ReferenceCitation, + resolved_full_cites: ResolvedFullCites, +) -> Optional[ResourceType]: + """Resolve reference citations + + Try to resolve reference citations by checking whether there is only one + full citation that appears with either the 
defendant or plaintiff + field of any of the previously resolved full citations. + """ + if ( + not reference_citation.metadata.defendant + and not reference_citation.metadata.plaintiff + ): + return None + return _filter_by_matching_plaintiff_or_defendant( + resolved_full_cites, + reference_citation.metadata.plaintiff, + reference_citation.metadata.defendant, + ) + + def _resolve_id_citation( id_citation: IdCitation, last_resolution: ResourceType, @@ -214,6 +265,10 @@ def resolve_citations( [SupraCitation, ResolvedFullCites], Optional[ResourceType], ] = _resolve_supra_citation, + resolve_reference_citation: Callable[ + [ReferenceCitation, ResolvedFullCites], + Optional[ResourceType], + ] = _resolve_reference_citation, resolve_id_citation: Callable[ [IdCitation, ResourceType, Resolutions], Optional[ResourceType] ] = _resolve_id_citation, @@ -286,6 +341,11 @@ def resolve_citations( elif isinstance(citation, SupraCitation): resolution = resolve_supra_citation(citation, resolved_full_cites) + elif isinstance(citation, ReferenceCitation): + resolution = resolve_reference_citation( + citation, resolved_full_cites + ) + # If the citation is an id citation, try to resolve it elif isinstance(citation, IdCitation): resolution = resolve_id_citation( diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py index b1d4db2..9b58801 100644 --- a/eyecite/test_factories.py +++ b/eyecite/test_factories.py @@ -1,11 +1,13 @@ from eyecite.helpers import get_year from eyecite.models import ( + CaseReferenceToken, CitationToken, FullCaseCitation, FullJournalCitation, FullLawCitation, IdCitation, IdToken, + ReferenceCitation, SectionToken, ShortCaseCitation, SupraCitation, @@ -103,6 +105,13 @@ def id_citation(source_text=None, index=0, **kwargs): return IdCitation(IdToken(source_text, 0, 99), index, **kwargs) +def reference_citation(source_text=None, index=0, **kwargs): + """Convenience function for creating mock ReferenceCitation objects.""" + return ReferenceCitation( + 
CaseReferenceToken(source_text, 0, 99), index, **kwargs + ) + + def unknown_citation(source_text=None, index=0, **kwargs): """Convenience function for creating mock UnknownCitation objects.""" return UnknownCitation(SectionToken(source_text, 0, 99), index, **kwargs) diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py index 7ae3d71..e61c7af 100644 --- a/tests/test_AnnotateTest.py +++ b/tests/test_AnnotateTest.py @@ -47,6 +47,12 @@ def lower_annotator(before, text, after): "<0>1 U.S. 1. Foo v. Bar, <1>supra at 2.", [], ), + # Reference cite + ( + "Foo v. Bar 1 U.S. 1. In Foo at 2.", + "Foo v. Bar <0>1 U.S. 1. In <1>Foo at 2.", + [], + ), # whitespace and html -- no unbalanced tag check ( "foo 1 U.S. 1 bar", diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py index b65ac9a..09a2000 100644 --- a/tests/test_FindTest.py +++ b/tests/test_FindTest.py @@ -13,6 +13,7 @@ id_citation, journal_citation, law_citation, + reference_citation, supra_citation, unknown_citation, ) @@ -145,7 +146,7 @@ def test_find_citations(self): 'parenthetical': 'overruling foo'}), case_citation(page='2', reporter='S. Ct.', year=1982, metadata={'plaintiff': 'lissner', - 'defendant': 'test 1 U.S. 12, 347-348', + 'defendant': 'test', 'court': 'ca4', 'pin_cite': '358', 'parenthetical': 'overruling foo'}), @@ -453,6 +454,83 @@ def test_find_citations(self): [],), ('lorem 111 N. W. 12th St.', [],), + # Eyecite has issue with linebreaks when identifying defendants and + # previously could store defendant as only whitespace + ('\n rt. denied,\n \n \n 541 U.S. 1085 (2004);\n \n', + [case_citation( + page='1085', + volume="541", + reporter="U.S.", + year=2004, + metadata={'plaintiff': None, + 'defendant': None, + 'court': 'scotus'})], + {'clean': ['html', 'inline_whitespace']}), + # Test filtering overlapping citations - this finds four citations + # but should filter down to three + ("Miles v. Smith 1 Ga. 1; asdfasdf asd Something v. 
Else, 1 Miles 3; 1 Miles at 10", + [case_citation(page='1', + volume="1", + reporter="Ga.", + metadata={'plaintiff': 'Miles', + 'defendant': 'Smith'}), + case_citation(page='3', + volume="1", + reporter="Miles", + metadata={'plaintiff': 'Something', + 'defendant': 'Else'} + ), + case_citation(volume="1", page='10', reporter='Miles', + short=True, + metadata={'pin_cite': '10'})]), + ('General Casualty cites as compelling Amick v. Liberty Mut. Ins. Co., 455 A.2d 793 (R.I. 1983). In that case ... Stats, do. See Amick at 795', + [case_citation(page='793', + volume="455", + reporter="A.2d", + year=1983, + metadata={'plaintiff': 'Amick', + 'defendant': 'Liberty Mut. Ins. Co.', + 'court': 'ri' + }), + reference_citation('Amick at 795', metadata={'plaintiff': 'Amick', 'pin_cite': '795'})]), + # Test reference citation + ('Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that', + [case_citation(page='12', + metadata={'plaintiff': 'Foo', + 'defendant': 'Bar', + 'pin_cite': '347-348'}), + reference_citation('Foo at 62', metadata={'plaintiff': 'Foo', 'pin_cite': '62'})]), + # Test that reference citation must occur after full case citation + ('In Foo at 62 we see that, Foo v. Bar 1 U.S. 12, 347-348. something something,', + [case_citation(page='12', + metadata={'plaintiff': 'Foo', + 'defendant': 'Bar', + 'pin_cite': '347-348'})]), + # Test reference against defendant name + ('In re Foo 1 Mass. 12, 347-348. something something, in Foo at 62 we see that, ', + [case_citation(page='12', reporter="Mass.", volume="1", + metadata={'defendant': 'Foo', 'pin_cite': '347-348'}), + reference_citation('Foo at 62', + metadata={'defendant': 'Foo', + "pin_cite": "62"})]), + # Test reference citation that contains at + ('In re Foo 1 Mass. 12, 347-348. something something, in at we see that', + [case_citation(page='12', reporter="Mass.", volume="1", + metadata={'defendant': 'Foo', 'pin_cite': '347-348'})]), + # Test U.S. as plaintiff with reference citations + ('U.S. v. 
Boch Oldsmobile, Inc., 909 F.2d 657, 660 (1st Cir.1990); Piper Aircraft, 454 U.S. at 241', + [case_citation(page='657', reporter="F.2d", volume="909", + metadata={'plaintiff': 'U.S.', 'defendant': 'Boch Oldsmobile, Inc.', 'pin_cite': '660'}), + case_citation(volume="454", page='241', reporter_found='U.S.', short=True, + metadata={'antecedent_guess': 'Aircraft', 'court': "scotus", 'pin_cite': "241"})]), + # Test reference citation after an id citation + ('we said in Morton v. Mancari, 417 U. S. 535, 552 (1974) “Literally every piece ....”. “asisovereign tribal entities . . . .” Id. In Mancari at 665', + [case_citation(page='535', year=1974, volume="417", + reporter="U. S.", + metadata={'plaintiff': 'Morton', 'defendant': 'Mancari', "pin_cite": "552", "court": "scotus"}), + id_citation('Id.,', metadata={}), + reference_citation('Mancari', + metadata={'defendant': 'Mancari', "pin_cite": "665"})]), # Test Conn. Super. Ct. regex variation. ('Failed to recognize 1993 Conn. Super. Ct. 5243-P', [case_citation(volume='1993', reporter='Conn. Super. Ct.', diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py index 61c4898..6f64412 100644 --- a/tests/test_ResolveTest.py +++ b/tests/test_ResolveTest.py @@ -31,6 +31,44 @@ def assertResolution(self, citations, expected_resolution_dict): format_resolution(expected_resolution_dict), ) + def checkReferenceResolution( + self, *expected_resolutions: tuple[list[list[int]], str] + ): + """ + Helper function to help test reference citations. + + Args: + *expected_resolutions (tuple[list[int], str]): + A list of tuples where each tuple contains: + - A list of expected indices for the resolved citations. + - A string of citation text to process. 
+ + Returns: + None + """ + for expected_indices, citation_text in expected_resolutions: + citations = get_citations(citation_text) + + # Step 2: Build a helper dict to map corrected citations to indices + resolution_index_map = { + cite.corrected_citation(): idx + for idx, cite in enumerate(citations) + } + + # Step 3: Resolve citations and format the resolution + resolved_citations = resolve_citations(citations) + formatted_resolution = format_resolution(resolved_citations) + + # Step 4: Map resolved citations to their indices + result = { + key: [resolution_index_map[value] for value in values] + for key, values in formatted_resolution.items() + } + + # Step 5: Compare the actual results with expected indices + actual_indices = list(result.values()) + self.assertEqual(expected_indices, actual_indices) + def checkResolution( self, *expected_resolutions: Tuple[Optional[int], str] ): @@ -297,3 +335,21 @@ def test_complex_resolution(self): ), (2, "However, this should succeed, Lorem, 1 U.S., at 52."), ) + + def test_reference_resolution(self): + self.checkReferenceResolution( + ([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"), + ([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"), + ( + [[0, 1]], + "Foo v. Bar 1 U.S. 12, 347-348. something something, In Foo at 62 we see that", + ), + ( + [[0, 2], [1]], + "Foo v. Bar 1 U.S. 12, 347-348; 12 U.S. 1. someting; In Foo at 2 we see that", + ), + ( + [[0, 2], [1]], + "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that", + ), + )