Merge branch 'main' into 27-add-detection-of-ridgelys-notes-and-other-irregular-reporters
freelawproject · Jan 24, 2025 · 56dd76e · 56dd76e
2 parents ad40535 + d09473c
commit 56dd76e
Show file tree

Hide file tree

Showing 8 changed files with 361 additions and 4 deletions.
diff --git a/eyecite/find.py b/eyecite/find.py
@@ -1,12 +1,15 @@
+import re
 from typing import List, Type, cast
 
 from eyecite.helpers import (
     disambiguate_reporters,
     extract_pin_cite,
+    filter_citations,
     joke_cite,
     match_on_tokens,
 )
 from eyecite.models import (
+    CaseReferenceToken,
     CitationBase,
     CitationToken,
     FullCaseCitation,
@@ -15,6 +18,7 @@
     FullLawCitation,
     IdCitation,
     IdToken,
+    ReferenceCitation,
     ResourceCitation,
     SectionToken,
     ShortCaseCitation,
@@ -54,7 +58,7 @@ def get_citations(
         return joke_cite
 
     words, citation_tokens = tokenizer.tokenize(plain_text)
-    citations = []
+    citations: list[CitationBase] = []
 
     for i, token in citation_tokens:
         citation: CitationBase
@@ -70,6 +74,13 @@ def get_citations(
                 citation = _extract_shortform_citation(words, i)
             else:
                 citation = _extract_full_citation(words, i)
+                if citations and isinstance(citation, FullCitation):
+                    citation.is_parallel_citation(citations[-1])
+
+                # Check for reference citations that follow a full citation
+                # Using the plaintiff or defendant
+                references = _extract_reference_citations(citation, plain_text)
+                citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
         # In this case, the citation should simply be to the item cited
@@ -99,6 +110,8 @@ def get_citations(
 
         citations.append(citation)
 
+    citations = filter_citations(citations)
+
     # Remove citations with multiple reporter candidates where we couldn't
     # guess correct reporter
     if remove_ambiguous:
@@ -107,10 +120,72 @@ def get_citations(
     # Returns a list of citations ordered in the sequence that they appear in
     # the document. The ordering of this list is important for reconstructing
     # the references of the ShortCaseCitation, SupraCitation, and
-    # IdCitation objects.
+    # IdCitation and ReferenceCitation objects.
     return citations
 
 
+def _extract_reference_citations(
+    citation: FullCitation, plain_text: str
+) -> List[ReferenceCitation]:
+    """Find "<party name> at <page>" references after a full case citation
+
+    :param citation: the full citation whose plaintiff/defendant are searched
+    :param plain_text: the full text; only the part after the cite is scanned
+    :return: ReferenceCitations in text order, or [] when nothing qualifies
+    """
+    if len(plain_text) <= citation.span()[-1]:  # no text after the cite
+        return []
+    if not isinstance(citation, FullCaseCitation):
+        return []
+
+    def is_valid_name(name: str) -> bool:
+        """Check whether a party name may be used in the search regex
+
+        Rejects: under 3 chars, no leading capital, ends in "." or all digits
+
+        :param name: The name to check
+        :return: True if usable, False if not
+        """
+        return (
+            isinstance(name, str)
+            and len(name) > 2
+            and name[0].isupper()
+            and not name.endswith(".")
+            and not name.isdigit()
+        )
+
+    regexes = [  # one escaped, named group per usable party name
+        rf"(?P<{key}>{re.escape(value)})"
+        for key in ["plaintiff", "defendant"]
+        if (value := getattr(citation.metadata, key, None))
+        and is_valid_name(value)
+    ]
+    if not regexes:
+        return []
+    pin_cite_re = (  # party name, "at", then a 1-5 digit page number
+        rf"\b(?:{'|'.join(regexes)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
+    )
+    reference_citations = []
+    remaining_text = plain_text[citation.span()[-1] :]
+    offset = citation.span()[-1]  # shifts match spans back onto plain_text
+    for match in re.compile(pin_cite_re).finditer(remaining_text):
+        start, end = match.span()
+        matched_text = match.group(0)
+        reference = ReferenceCitation(
+            token=CaseReferenceToken(
+                data=matched_text, start=start + offset, end=end + offset
+            ),
+            span_start=start + offset,
+            span_end=end + offset,
+            full_span_start=start + offset,
+            full_span_end=end + offset,
+            index=0,  # NOTE(review): fixed value, not a token position
+            metadata=match.groupdict(),  # unmatched named groups are None
+        )
+        reference_citations.append(reference)
+    return reference_citations
+
+
 def _extract_full_citation(
     words: Tokens,
     index: int,

diff --git a/eyecite/helpers.py b/eyecite/helpers.py
@@ -141,9 +141,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None:
             break
     if start_index:
         citation.full_span_start = citation.span()[0] - offset
-        citation.metadata.defendant = "".join(
+        defendant = "".join(
             str(w) for w in words[start_index : citation.index]
         ).strip(", ")
+        if defendant.strip():
+            citation.metadata.defendant = defendant
 
 
 def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
@@ -315,6 +317,32 @@ def disambiguate_reporters(
     ]
 
 
+def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
+    """Sort citations by span and drop any that overlap an earlier one
+
+    Citations are ordered by `citation.span()`, since reference citations
+    may be extracted out of order. Walking that order, a citation is dropped
+    when its start is at or before the end of the last citation kept, so the
+    earlier-sorted citation wins (e.g. a reference vs. a short citation).
+
+    :param citations: List of citations
+    :return: Sorted and filtered citations
+    """
+    filtered_citations: List[CitationBase] = []
+    sorted_citations = sorted(citations, key=lambda citation: citation.span())
+    for citation in sorted_citations:
+        if filtered_citations:
+            last_citation = filtered_citations[-1]
+            last_span = last_citation.span()
+            current_span = citation.span()
+
+            if current_span[0] <= last_span[1]:
+                # Overlaps (or touches) the citation kept last, so skip it
+                continue
+        filtered_citations.append(citation)
+    return filtered_citations
+
+
 joke_cite: List[CitationBase] = [
     FullCaseCitation(
         CitationToken(

diff --git a/eyecite/models.py b/eyecite/models.py
@@ -301,6 +301,26 @@ class FullCitation(ResourceCitation):
     """Abstract base class indicating that a citation fully identifies a
     resource."""
 
+    def is_parallel_citation(self, preceding: CitationBase):
+        """Copy party names from a preceding parallel citation, if any
+
+        Args:
+            preceding: The previous citation found
+
+        Returns: None; self.metadata is updated in place on a match
+        """
+        is_parallel = (  # identical full span and a full case citation
+            self.full_span_start == preceding.full_span_start
+            and self.full_span_end == preceding.full_span_end
+            and isinstance(preceding, FullCaseCitation)
+        )
+        if is_parallel:
+            # if parallel get plaintiff/defendant data from
+            # the earlier citation, since it won't be on the
+            # parallel one.
+            self.metadata.defendant = preceding.metadata.defendant
+            self.metadata.plaintiff = preceding.metadata.plaintiff
+
 
 @dataclass(eq=False, unsafe_hash=False, repr=False)
 class FullLawCitation(FullCitation):
@@ -566,6 +586,26 @@ def formatted(self):
         return "".join(parts)
 
 
+@dataclass(eq=False, unsafe_hash=False, repr=False)
+class ReferenceCitation(CitationBase):
+    """A reference citation is a citation that refers to
+    a full case citation by name and pincite alone.
+
+    Future versions will hopefully drop the pincite requirement.
+
+    Examples:
+    Roe at 240
+    """
+
+    @dataclass(eq=True, unsafe_hash=True)
+    class Metadata(CitationBase.Metadata):
+        """Define fields on self.metadata."""
+
+        plaintiff: Optional[str] = None  # plaintiff name matched, if any
+        defendant: Optional[str] = None  # defendant name matched, if any
+        pin_cite: Optional[str] = None  # page after "at", kept as a string
+
+
 @dataclass(eq=False, unsafe_hash=False, repr=False)
 class UnknownCitation(CitationBase):
     """Convenience class which represents an unknown citation. A recognized
@@ -679,6 +719,11 @@ class StopWordToken(Token):
     """Word matching one of the STOP_TOKENS."""
 
 
+@dataclass(eq=True, unsafe_hash=True)
+class CaseReferenceToken(Token):
+    """Plaintiff or defendant name followed by a pin cite (Roe at 240)"""
+
+
 @dataclass
 class TokenExtractor:
     """Class for extracting all matches from a given string for the given

diff --git a/eyecite/resolve.py b/eyecite/resolve.py
@@ -7,6 +7,7 @@
     FullCaseCitation,
     FullCitation,
     IdCitation,
+    ReferenceCitation,
     Resource,
     ResourceType,
     ShortCaseCitation,
@@ -83,6 +84,34 @@ def _filter_by_matching_antecedent(
     return matches[0] if len(matches) == 1 else None
 
 
+def _filter_by_matching_plaintiff_or_defendant(
+    resolved_full_cites: ResolvedFullCites,
+    plaintiff: str,  # callers may pass None; a falsy name never matches
+    defendant: str,  # callers may pass None; a falsy name never matches
+) -> Optional[ResourceType]:
+    """Return the single resource with a matching party name, else None"""
+    matches: List[ResourceType] = []
+
+    for full_citation, resource in resolved_full_cites:
+        if not isinstance(full_citation, FullCaseCitation):
+            continue
+        defendant_match = (
+            defendant
+            and full_citation.metadata.defendant
+            and defendant in full_citation.metadata.defendant  # substring
+        )
+        plaintiff_match = (
+            plaintiff
+            and full_citation.metadata.plaintiff
+            and plaintiff in full_citation.metadata.plaintiff  # substring
+        )
+        if defendant_match or plaintiff_match:
+            matches.append(resource)
+    # Remove duplicates and only accept if exactly one candidate remains
+    matches = list(set(matches))
+    return matches[0] if len(matches) == 1 else None
+
+
 def _has_invalid_pin_cite(
     full_cite: FullCitation, id_cite: IdCitation
 ) -> bool:
@@ -180,6 +209,28 @@ def _resolve_supra_citation(
     )
 
 
+def _resolve_reference_citation(
+    reference_citation: ReferenceCitation,
+    resolved_full_cites: ResolvedFullCites,
+) -> Optional[ResourceType]:
+    """Resolve a reference citation to a previously resolved resource
+
+    Returns None when the reference has neither a plaintiff nor a defendant.
+    Otherwise resolves only if exactly one previously resolved resource has
+    a full case citation whose plaintiff or defendant contains that name.
+    """
+    if (
+        not reference_citation.metadata.defendant
+        and not reference_citation.metadata.plaintiff
+    ):
+        return None
+    return _filter_by_matching_plaintiff_or_defendant(
+        resolved_full_cites,
+        reference_citation.metadata.plaintiff,
+        reference_citation.metadata.defendant,
+    )
+
+
 def _resolve_id_citation(
     id_citation: IdCitation,
     last_resolution: ResourceType,
@@ -214,6 +265,10 @@ def resolve_citations(
         [SupraCitation, ResolvedFullCites],
         Optional[ResourceType],
     ] = _resolve_supra_citation,
+    resolve_reference_citation: Callable[
+        [ReferenceCitation, ResolvedFullCites],
+        Optional[ResourceType],
+    ] = _resolve_reference_citation,
     resolve_id_citation: Callable[
         [IdCitation, ResourceType, Resolutions], Optional[ResourceType]
     ] = _resolve_id_citation,
@@ -286,6 +341,11 @@ def resolve_citations(
         elif isinstance(citation, SupraCitation):
             resolution = resolve_supra_citation(citation, resolved_full_cites)
 
+        elif isinstance(citation, ReferenceCitation):
+            resolution = resolve_reference_citation(
+                citation, resolved_full_cites
+            )
+
         # If the citation is an id citation, try to resolve it
         elif isinstance(citation, IdCitation):
             resolution = resolve_id_citation(

diff --git a/eyecite/test_factories.py b/eyecite/test_factories.py
@@ -1,11 +1,13 @@
 from eyecite.helpers import get_year
 from eyecite.models import (
+    CaseReferenceToken,
     CitationToken,
     FullCaseCitation,
     FullJournalCitation,
     FullLawCitation,
     IdCitation,
     IdToken,
+    ReferenceCitation,
     SectionToken,
     ShortCaseCitation,
     SupraCitation,
@@ -103,6 +105,13 @@ def id_citation(source_text=None, index=0, **kwargs):
     return IdCitation(IdToken(source_text, 0, 99), index, **kwargs)
 
 
+def reference_citation(source_text=None, index=0, **kwargs):
+    """Convenience function for creating mock ReferenceCitation objects."""
+    return ReferenceCitation(  # 0, 99: presumably dummy token start/end
+        CaseReferenceToken(source_text, 0, 99), index, **kwargs
+    )
+
+
 def unknown_citation(source_text=None, index=0, **kwargs):
     """Convenience function for creating mock UnknownCitation objects."""
     return UnknownCitation(SectionToken(source_text, 0, 99), index, **kwargs)

diff --git a/tests/test_AnnotateTest.py b/tests/test_AnnotateTest.py
@@ -47,6 +47,12 @@ def lower_annotator(before, text, after):
                 "<0>1 U.S. 1</0>. Foo v. Bar, <1>supra at 2</1>.",
                 [],
             ),
+            # Reference cite
+            (
+                "Foo v. Bar 1 U.S. 1. In Foo at 2.",
+                "Foo v. Bar <0>1 U.S. 1</0>. In <1>Foo at 2</1>.",
+                [],
+            ),
             # whitespace and html -- no unbalanced tag check
             (
                 "<body>foo  <i>1   <b>U.S.</b></i>   1 bar</body>",