Merge branch 'main' into 174-parsing-error-for-citations-with-defenda…

…nt-thompson
freelawproject · Feb 7, 2025 · d0b4c4f · d0b4c4f
2 parents 17caf7b + 32ee756
commit d0b4c4f
Show file tree

Hide file tree

Showing 15 changed files with 857 additions and 19 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -16,14 +16,25 @@ Fixes:
 
 ## Current
 
-**2.6.4 - 2024-06-03**
+**2.6.5 - 2025-01-28**
+
+Features:
+
+- Add ReferenceCitation model and associated logic
 
 Fixes:
 
-- Bump eyecite to for InvalidError/hyperscan bug
+- Fix court string matching with whitespace
+- Fix court name issues
 
 ## Past
 
+**2.6.4 - 2024-06-03**
+
+Fixes:
+
+- Bump eyecite to for InvalidError/hyperscan bug
+
 **2.6.3 - 2024-04-09**
 
 Fixes:

diff --git a/README.rst b/README.rst
@@ -119,6 +119,45 @@ Extracting Citations
    that might refer to more than one reporter and can't be narrowed down by date.
 3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below).
 
+Resolving Reference Citations
+-----------------------------
+
+Eyecite now supports a two-step process for extracting and resolving reference citations. This feature improves handling of citations that reference previously mentioned cases without explicitly repeating the full case name or citation.
+
+Reference citations, such as “Theatre Enterprises at 552”, can be difficult to extract accurately because they lack a full case name. For example, a judge may first cite `Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954)` in full and later refer to the case only by a short name and a page number. To address this, Eyecite allows for an initial citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding.
+
+::
+
+    from eyecite import get_citations
+    from eyecite.find import extract_reference_citations
+    from eyecite.helpers import filter_citations
+
+    plain_text = (
+        "quoting Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954); "
+        "alterations in original). Thus, the District Court understood that allegations of "
+        "parallel business conduct, taken alone, do not state a claim under § 1; "
+        "plaintiffs must allege additional facts that “ten[d] to exclude independent "
+        "self-interested conduct as an As Theatre Enterprises at 552 held, parallel"
+    )
+
+::
+
+    from eyecite import get_citations
+    from eyecite.find import extract_reference_citations
+    from eyecite.helpers import filter_citations
+
+    # Step 1: Extract full citations
+    citations = get_citations(plain_text)
+
+    # Step 2: Resolve the case name from an external database or prior knowledge
+    citations[0].metadata.resolved_case_name_short = "Theatre Enterprises"
+
+    # Step 3: Extract reference citations using the resolved name
+    references = extract_reference_citations(citations[0], plain_text)
+
+    # Step 4: Filter and merge citations
+    new_citations = filter_citations(citations + references)
+
+Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time.
+
 
 Cleaning Input Text
 -------------------

diff --git a/TUTORIAL.ipynb b/TUTORIAL.ipynb
@@ -54,7 +54,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "opinion_url = 'https://www.courtlistener.com/api/rest/v3/opinions/1741/'\n",
+    "opinion_url = 'https://www.courtlistener.com/api/rest/v4/opinions/1741/'\n",
     "opinion_text = requests.get(opinion_url).json()['plain_text']"
    ]
   },
@@ -163,6 +163,17 @@
     "Next, we'll extract the citations using a custom tokenizer. Unlike the default tokenizer, here we'll use our hyperscan tokenizer for much faster extraction, which works by automatically pre-compiling and caching a regular expression database on first use. Because of this one-time pre-compilation stage, the first use of this tokenizer is slow:"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1384d75b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# install hyperscan if not already installed\n",
+    "# !pip install hyperscan"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 7,

diff --git a/eyecite/annotate.py b/eyecite/annotate.py
@@ -1,11 +1,18 @@
 from bisect import bisect_left, bisect_right
 from difflib import SequenceMatcher
 from functools import partial
+from logging import getLogger
 from typing import Any, Callable, Iterable, Optional, Tuple
 
 import fast_diff_match_patch
 
-from eyecite.utils import is_balanced_html, wrap_html_tags
+from eyecite.utils import (
+    is_balanced_html,
+    maybe_balance_style_tags,
+    wrap_html_tags,
+)
+
+logger = getLogger("eyecite")
 
 
 def annotate_citations(
@@ -59,6 +66,9 @@ def annotate_citations(
     Returns:
         The annotated text.
     """
+    if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
+        raise ValueError(f"Unknown option '{unbalanced_tags}")
+
     # set up offset_updater if we have to move annotations to source_text
     offset_updater = None
     if source_text and source_text != plain_text:
@@ -88,13 +98,20 @@ def annotate_citations(
         # handle HTML tags
         if unbalanced_tags == "unchecked":
             pass
-        elif unbalanced_tags in ("skip", "wrap"):
-            if not is_balanced_html(span_text):
-                if unbalanced_tags == "skip":
-                    continue
+        elif not is_balanced_html(span_text):
+            if unbalanced_tags == "wrap":
                 span_text = wrap_html_tags(span_text, after, before)
-        else:
-            raise ValueError(f"Unknown option '{unbalanced_tags}")
+            else:  # "skip" case
+                original_span_text = span_text
+                start, end, span_text = maybe_balance_style_tags(
+                    start, end, plain_text
+                )
+                if not is_balanced_html(span_text):
+                    logger.error(
+                        "Citation was not annotated due to unbalanced tags %s",
+                        original_span_text,
+                    )
+                    continue
 
         if annotator is not None:
             annotated_span = annotator(before, span_text, after)

diff --git a/eyecite/find.py b/eyecite/find.py
@@ -1,12 +1,15 @@
+import re
 from typing import List, Type, cast
 
 from eyecite.helpers import (
     disambiguate_reporters,
     extract_pin_cite,
+    filter_citations,
     joke_cite,
     match_on_tokens,
 )
 from eyecite.models import (
+    CaseReferenceToken,
     CitationBase,
     CitationToken,
     FullCaseCitation,
@@ -15,6 +18,7 @@
     FullLawCitation,
     IdCitation,
     IdToken,
+    ReferenceCitation,
     ResourceCitation,
     SectionToken,
     ShortCaseCitation,
@@ -25,6 +29,7 @@
 )
 from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
 from eyecite.tokenizers import Tokenizer, default_tokenizer
+from eyecite.utils import DISALLOWED_NAMES
 
 
 def get_citations(
@@ -54,7 +59,7 @@ def get_citations(
         return joke_cite
 
     words, citation_tokens = tokenizer.tokenize(plain_text)
-    citations = []
+    citations: list[CitationBase] = []
 
     for i, token in citation_tokens:
         citation: CitationBase
@@ -70,6 +75,13 @@ def get_citations(
                 citation = _extract_shortform_citation(words, i)
             else:
                 citation = _extract_full_citation(words, i)
+                if citations and isinstance(citation, FullCitation):
+                    citation.is_parallel_citation(citations[-1])
+
+                # Check for reference citations that follow a full citation
+                # Using the plaintiff or defendant
+                references = extract_reference_citations(citation, plain_text)
+                citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
         # In this case, the citation should simply be to the item cited
@@ -99,6 +111,8 @@ def get_citations(
 
         citations.append(citation)
 
+    citations = filter_citations(citations)
+
     # Remove citations with multiple reporter candidates where we couldn't
     # guess correct reporter
     if remove_ambiguous:
@@ -107,10 +121,74 @@ def get_citations(
     # Returns a list of citations ordered in the sequence that they appear in
     # the document. The ordering of this list is important for reconstructing
     # the references of the ShortCaseCitation, SupraCitation, and
-    # IdCitation objects.
+    # IdCitation and ReferenceCitation objects.
     return citations
 
 
+def extract_reference_citations(
+    citation: FullCitation,
+    plain_text: str,
+) -> List[ReferenceCitation]:
+    """Extract reference citations that follow a full citation
+
+    A reference citation, as matched here, is one of the full citation's
+    case-name metadata values followed by "at" (optionally "at ¶") and a
+    1-5 digit page number, e.g. "Theatre Enterprises at 552".
+
+    Only the text after the end of the full citation's span is searched.
+    The candidate names are read from ``citation.metadata`` using the
+    attribute names listed in ``ReferenceCitation.name_fields``; values
+    that are missing, empty, or rejected by ``is_valid_name`` are ignored.
+
+    An empty list is returned when no text follows the citation, when the
+    citation is not a ``FullCaseCitation``, or when no usable name exists.
+
+    Every returned citation uses the same offsets for its token, span and
+    full span, expressed relative to ``plain_text``. Its metadata is the
+    regex match's ``groupdict()``: the name group(s) plus ``pin_cite``,
+    with ``None`` for name groups that did not take part in the match.
+
+    :param citation: the full case citation found
+    :param plain_text: the text
+    :return: Pin cite reference citations
+    """
+    if len(plain_text) <= citation.span()[-1]:  # nothing follows the cite
+        return []
+    if not isinstance(citation, FullCaseCitation):
+        return []
+
+    def is_valid_name(name: str) -> bool:
+        """Check that a name is safe to use as a reference-citation anchor
+
+        Excludes strings like Co., numbers or lower case strs
+
+        A name is accepted only if it is a string longer than two
+        characters, starts with an upper case character, does not end
+        with a period, is not made up solely of digits, and its lower
+        cased form is not listed in DISALLOWED_NAMES.
+
+        :param name: The name to check
+        :return: True if usable, false if not
+        """
+        return (
+            isinstance(name, str)
+            and len(name) > 2
+            and name[0].isupper()
+            and not name.endswith(".")
+            and not name.isdigit()
+            and name.lower() not in DISALLOWED_NAMES
+        )
+
+    regexes = [
+        rf"(?P<{key}>{re.escape(value)})"  # one named group per usable name
+        for key in ReferenceCitation.name_fields
+        if (value := getattr(citation.metadata, key, None))
+        and is_valid_name(value)
+    ]
+    if not regexes:
+        return []
+    pin_cite_re = (
+        rf"\b(?:{'|'.join(regexes)})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
+    )
+    reference_citations = []
+    remaining_text = plain_text[citation.span()[-1] :]  # text after the cite
+    offset = citation.span()[-1]  # maps match offsets back onto plain_text
+    for match in re.compile(pin_cite_re).finditer(remaining_text):
+        start, end = match.span()
+        matched_text = match.group(0)
+        reference = ReferenceCitation(
+            token=CaseReferenceToken(
+                data=matched_text, start=start + offset, end=end + offset
+            ),
+            span_start=start + offset,
+            span_end=end + offset,
+            full_span_start=start + offset,
+            full_span_end=end + offset,
+            index=0,  # constant for every reference built here
+            metadata=match.groupdict(),  # name group(s) and pin_cite
+        )
+        reference_citations.append(reference)
+    return reference_citations
+
+
 def _extract_full_citation(
     words: Tokens,
     index: int,
@@ -170,14 +248,18 @@ def _extract_shortform_citation(
         strings_only=True,
         forward=False,
     )
+    offset = 0
     if m:
+        ante_start, ante_end = m.span()
+        offset = ante_end - ante_start
         antecedent_guess = m["antecedent"].strip()
 
     # Get pin_cite
     cite_token = cast(CitationToken, words[index])
     pin_cite, span_end, parenthetical = extract_pin_cite(
         words, index, prefix=cite_token.groups["page"]
     )
+    span_end = span_end if span_end else 0
 
     # make ShortCaseCitation
     citation = ShortCaseCitation(
@@ -186,6 +268,8 @@ def _extract_shortform_citation(
         exact_editions=cite_token.exact_editions,
         variation_editions=cite_token.variation_editions,
         span_end=span_end,
+        full_span_start=cite_token.start - offset,
+        full_span_end=max([span_end, cite_token.end]),
         metadata={
             "antecedent_guess": antecedent_guess,
             "pin_cite": pin_cite,