Skip to content

Commit

Permalink
Merge branch 'main' into 174-parsing-error-for-citations-with-defenda…
Browse files Browse the repository at this point in the history
…nt-thompson
  • Loading branch information
quevon24 authored Feb 7, 2025
2 parents 17caf7b + 32ee756 commit d0b4c4f
Show file tree
Hide file tree
Showing 15 changed files with 857 additions and 19 deletions.
15 changes: 13 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,25 @@ Fixes:

## Current

**2.6.4 - 2024-06-03**
**2.6.5 - 2025-01-28**

Features:

- Add ReferenceCitation model and associated logic

Fixes:

- Bump eyecite for InvalidError/hyperscan bug
- Fix court string matching with whitespace
- Fix court name issues

## Past

**2.6.4 - 2024-06-03**

Fixes:

- Bump eyecite for InvalidError/hyperscan bug

**2.6.3 - 2024-04-09**

Fixes:
Expand Down
39 changes: 39 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,45 @@ Extracting Citations
that might refer to more than one reporter and can't be narrowed down by date.
3. :code:`tokenizer` ==> Tokenizer, default :code:`eyecite.tokenizers.default_tokenizer`: An instance of a Tokenizer object (see "Tokenizers" below).

Resolving Reference Citations
-----------------------------

Eyecite now supports a two-step process for extracting and resolving reference citations. This feature improves handling of citations that reference previously mentioned cases without explicitly repeating the full case name or citation.

Reference citations, such as “Theatre Enterprises at 552”, can be difficult to extract accurately because they lack a full case name — for example, when a judge is citing back to `Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954)`. To address this, Eyecite allows for an initial citation extraction, followed by a secondary reference resolution step. If you have an external database (e.g., CourtListener) that provides resolved case names, you can use this feature to enhance citation finding.

from eyecite import get_citations
from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations

plain_text = (
"quoting Theatre Enterprises, Inc. v. Paramount Film Distributing Corp., 346 U. S. 537, 541 (1954); "
"alterations in original). Thus, the District Court understood that allegations of "
"parallel business conduct, taken alone, do not state a claim under § 1; "
"plaintiffs must allege additional facts that “ten[d] to exclude independent "
"self-interested conduct as an As Theatre Enterprises at 552 held, parallel"
)

::

from eyecite import get_citations
from eyecite.find import extract_reference_citations
from eyecite.helpers import filter_citations

# Step 1: Extract full citations
citations = get_citations(plain_text)

# Step 2: Resolve the case name from an external database or prior knowledge
citations[0].metadata.resolved_case_name_short = "Theatre Enterprises"

# Step 3: Extract reference citations using the resolved name
references = extract_reference_citations(citations[0], plain_text)

# Step 4: Filter and merge citations
new_citations = filter_citations(citations + references)

Keep in mind that this feature requires an external database or heuristic method to resolve the short case name before extracting reference citations a second time.


Cleaning Input Text
-------------------
Expand Down
13 changes: 12 additions & 1 deletion TUTORIAL.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
"metadata": {},
"outputs": [],
"source": [
"opinion_url = 'https://www.courtlistener.com/api/rest/v3/opinions/1741/'\n",
"opinion_url = 'https://www.courtlistener.com/api/rest/v4/opinions/1741/'\n",
"opinion_text = requests.get(opinion_url).json()['plain_text']"
]
},
Expand Down Expand Up @@ -163,6 +163,17 @@
"Next, we'll extract the citations using a custom tokenizer. Unlike the default tokenizer, here we'll use our hyperscan tokenizer for much faster extraction, which works by automatically pre-compiling and caching a regular expression database on first use. Because of this one-time pre-compilation stage, the first use of this tokenizer is slow:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1384d75b",
"metadata": {},
"outputs": [],
"source": [
"# install hyperscan if not already installed\n",
"# !pip install hyperscan"
]
},
{
"cell_type": "code",
"execution_count": 7,
Expand Down
31 changes: 24 additions & 7 deletions eyecite/annotate.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from bisect import bisect_left, bisect_right
from difflib import SequenceMatcher
from functools import partial
from logging import getLogger
from typing import Any, Callable, Iterable, Optional, Tuple

import fast_diff_match_patch

from eyecite.utils import is_balanced_html, wrap_html_tags
from eyecite.utils import (
is_balanced_html,
maybe_balance_style_tags,
wrap_html_tags,
)

logger = getLogger("eyecite")


def annotate_citations(
Expand Down Expand Up @@ -59,6 +66,9 @@ def annotate_citations(
Returns:
The annotated text.
"""
if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
raise ValueError(f"Unknown option '{unbalanced_tags}")

# set up offset_updater if we have to move annotations to source_text
offset_updater = None
if source_text and source_text != plain_text:
Expand Down Expand Up @@ -88,13 +98,20 @@ def annotate_citations(
# handle HTML tags
if unbalanced_tags == "unchecked":
pass
elif unbalanced_tags in ("skip", "wrap"):
if not is_balanced_html(span_text):
if unbalanced_tags == "skip":
continue
elif not is_balanced_html(span_text):
if unbalanced_tags == "wrap":
span_text = wrap_html_tags(span_text, after, before)
else:
raise ValueError(f"Unknown option '{unbalanced_tags}")
else: # "skip" case
original_span_text = span_text
start, end, span_text = maybe_balance_style_tags(
start, end, plain_text
)
if not is_balanced_html(span_text):
logger.error(
"Citation was not annotated due to unbalanced tags %s",
original_span_text,
)
continue

if annotator is not None:
annotated_span = annotator(before, span_text, after)
Expand Down
88 changes: 86 additions & 2 deletions eyecite/find.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import re
from typing import List, Type, cast

from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
filter_citations,
joke_cite,
match_on_tokens,
)
from eyecite.models import (
CaseReferenceToken,
CitationBase,
CitationToken,
FullCaseCitation,
Expand All @@ -15,6 +18,7 @@
FullLawCitation,
IdCitation,
IdToken,
ReferenceCitation,
ResourceCitation,
SectionToken,
ShortCaseCitation,
Expand All @@ -25,6 +29,7 @@
)
from eyecite.regexes import SHORT_CITE_ANTECEDENT_REGEX, SUPRA_ANTECEDENT_REGEX
from eyecite.tokenizers import Tokenizer, default_tokenizer
from eyecite.utils import DISALLOWED_NAMES


def get_citations(
Expand Down Expand Up @@ -54,7 +59,7 @@ def get_citations(
return joke_cite

words, citation_tokens = tokenizer.tokenize(plain_text)
citations = []
citations: list[CitationBase] = []

for i, token in citation_tokens:
citation: CitationBase
Expand All @@ -70,6 +75,13 @@ def get_citations(
citation = _extract_shortform_citation(words, i)
else:
citation = _extract_full_citation(words, i)
if citations and isinstance(citation, FullCitation):
citation.is_parallel_citation(citations[-1])

# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = extract_reference_citations(citation, plain_text)
citations.extend(references)

# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
Expand Down Expand Up @@ -99,6 +111,8 @@ def get_citations(

citations.append(citation)

citations = filter_citations(citations)

# Remove citations with multiple reporter candidates where we couldn't
# guess correct reporter
if remove_ambiguous:
Expand All @@ -107,10 +121,74 @@ def get_citations(
# Returns a list of citations ordered in the sequence that they appear in
# the document. The ordering of this list is important for reconstructing
# the references of the ShortCaseCitation, SupraCitation, and
# IdCitation objects.
# IdCitation and ReferenceCitation objects.
return citations


def extract_reference_citations(
    citation: FullCitation,
    plain_text: str,
) -> List[ReferenceCitation]:
    """Find "<name> at <page>" references located after a full citation

    Only the text after the end of the citation's span is searched. The
    names looked for are the values of the ``ReferenceCitation.name_fields``
    attributes on ``citation.metadata`` that pass a basic sanity check.

    :param citation: the full citation whose party names are searched for
    :param plain_text: the complete text the citation was extracted from
    :return: the reference citations found; an empty list when no text
        follows the citation, when the citation is not a FullCaseCitation,
        or when no metadata name is usable
    """
    text_start = citation.span()[-1]
    if len(plain_text) <= text_start:
        # Nothing follows the citation, so nothing can refer back to it
        return []
    if not isinstance(citation, FullCaseCitation):
        return []

    def is_valid_name(name: str) -> bool:
        """Decide whether a metadata value is safe to search for

        Rejects non-strings, values of two characters or fewer, values
        not starting with an upper case letter, values ending in a period
        (e.g. "Co."), purely numeric values and disallowed names.

        :param name: the candidate name
        :return: True when the name can be used, False otherwise
        """
        if not isinstance(name, str) or len(name) <= 2:
            return False
        if not name[0].isupper() or name.endswith("."):
            return False
        return not name.isdigit() and name.lower() not in DISALLOWED_NAMES

    # One named group per usable metadata field, so each match records
    # which field produced it
    name_patterns = []
    for field in ReferenceCitation.name_fields:
        value = getattr(citation.metadata, field, None)
        if value and is_valid_name(value):
            name_patterns.append(rf"(?P<{field}>{re.escape(value)})")
    if not name_patterns:
        return []

    alternatives = "|".join(name_patterns)
    pattern = (
        rf"\b(?:{alternatives})\s+at(\s¶)?\s+(?P<pin_cite>\d{{1,5}})\b"
    )

    found = []
    for hit in re.finditer(pattern, plain_text[text_start:]):
        # Match offsets are relative to the sliced text; shift them back
        # into plain_text coordinates
        begin = hit.start() + text_start
        finish = hit.end() + text_start
        found.append(
            ReferenceCitation(
                token=CaseReferenceToken(
                    data=hit.group(0), start=begin, end=finish
                ),
                span_start=begin,
                span_end=finish,
                full_span_start=begin,
                full_span_end=finish,
                index=0,
                metadata=hit.groupdict(),
            )
        )
    return found


def _extract_full_citation(
words: Tokens,
index: int,
Expand Down Expand Up @@ -170,14 +248,18 @@ def _extract_shortform_citation(
strings_only=True,
forward=False,
)
offset = 0
if m:
ante_start, ante_end = m.span()
offset = ante_end - ante_start
antecedent_guess = m["antecedent"].strip()

# Get pin_cite
cite_token = cast(CitationToken, words[index])
pin_cite, span_end, parenthetical = extract_pin_cite(
words, index, prefix=cite_token.groups["page"]
)
span_end = span_end if span_end else 0

# make ShortCaseCitation
citation = ShortCaseCitation(
Expand All @@ -186,6 +268,8 @@ def _extract_shortform_citation(
exact_editions=cite_token.exact_editions,
variation_editions=cite_token.variation_editions,
span_end=span_end,
full_span_start=cite_token.start - offset,
full_span_end=max([span_end, cite_token.end]),
metadata={
"antecedent_guess": antecedent_guess,
"pin_cite": pin_cite,
Expand Down
Loading

0 comments on commit d0b4c4f

Please sign in to comment.