Skip to content

Commit

Permalink
Merge branch 'main' into 27-add-detection-of-ridgelys-notes-and-other…
Browse files Browse the repository at this point in the history
…-irregular-reporters
  • Loading branch information
flooie authored Jan 24, 2025
2 parents ad40535 + d09473c commit 56dd76e
Show file tree
Hide file tree
Showing 8 changed files with 361 additions and 4 deletions.
79 changes: 77 additions & 2 deletions eyecite/find.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
import re
from typing import List, Type, cast

from eyecite.helpers import (
disambiguate_reporters,
extract_pin_cite,
filter_citations,
joke_cite,
match_on_tokens,
)
from eyecite.models import (
CaseReferenceToken,
CitationBase,
CitationToken,
FullCaseCitation,
Expand All @@ -15,6 +18,7 @@
FullLawCitation,
IdCitation,
IdToken,
ReferenceCitation,
ResourceCitation,
SectionToken,
ShortCaseCitation,
Expand Down Expand Up @@ -54,7 +58,7 @@ def get_citations(
return joke_cite

words, citation_tokens = tokenizer.tokenize(plain_text)
citations = []
citations: list[CitationBase] = []

for i, token in citation_tokens:
citation: CitationBase
Expand All @@ -70,6 +74,13 @@ def get_citations(
citation = _extract_shortform_citation(words, i)
else:
citation = _extract_full_citation(words, i)
if citations and isinstance(citation, FullCitation):
citation.is_parallel_citation(citations[-1])

# Check for reference citations that follow a full citation
# Using the plaintiff or defendant
references = _extract_reference_citations(citation, plain_text)
citations.extend(references)

# CASE 2: Token is an "Id." or "Ibid." reference.
# In this case, the citation should simply be to the item cited
Expand Down Expand Up @@ -99,6 +110,8 @@ def get_citations(

citations.append(citation)

citations = filter_citations(citations)

# Remove citations with multiple reporter candidates where we couldn't
# guess correct reporter
if remove_ambiguous:
Expand All @@ -107,10 +120,72 @@ def get_citations(
# Returns a list of citations ordered in the sequence that they appear in
# the document. The ordering of this list is important for reconstructing
# the references of the ShortCaseCitation, SupraCitation, and
# IdCitation objects.
# IdCitation and ReferenceCitation objects.
return citations


def _extract_reference_citations(
    citation: FullCitation, plain_text: str
) -> List[ReferenceCitation]:
    """Find "<party name> at <page>" references that follow a full citation

    The plaintiff and defendant names stored on ``citation.metadata`` are
    turned into a regular expression, and the text located after the end of
    the citation's span is scanned for either name followed by ``at`` and a
    pin cite of one to five digits.

    :param citation: the full citation that was just extracted
    :param plain_text: the complete text the citation was found in
    :return: one ReferenceCitation per match, in the order matched; an empty
        list when no text follows the citation, when the citation is not a
        FullCaseCitation, or when it has no usable party name
    """
    # Everything found here is reported relative to the whole document, so
    # remember where the searched slice begins.
    offset = citation.span()[-1]
    if len(plain_text) <= offset:
        return []
    if not isinstance(citation, FullCaseCitation):
        return []

    def is_valid_name(name: str) -> bool:
        """Reject party names that would make a poor search pattern

        Excludes strings like Co., numbers or lower case strs
        :param name: The name to check
        :return: True if usable, false if not
        """
        if not isinstance(name, str) or len(name) <= 2:
            return False
        if not name[0].isupper() or name.endswith("."):
            return False
        return not name.isdigit()

    # One named group per usable party name, so groupdict() reports which
    # party the reference matched.
    name_patterns = []
    for key in ["plaintiff", "defendant"]:
        value = getattr(citation.metadata, key, None)
        if value and is_valid_name(value):
            name_patterns.append(rf"(?P<{key}>{re.escape(value)})")
    if not name_patterns:
        return []

    pin_cite_re = re.compile(
        rf"\b(?:{'|'.join(name_patterns)})\s+at\s+(?P<pin_cite>\d{{1,5}})\b"
    )

    found = []
    for match in pin_cite_re.finditer(plain_text[offset:]):
        # Translate slice-relative positions back to document positions.
        begin = match.start() + offset
        finish = match.end() + offset
        token = CaseReferenceToken(
            data=match.group(0), start=begin, end=finish
        )
        found.append(
            ReferenceCitation(
                token=token,
                span_start=begin,
                span_end=finish,
                full_span_start=begin,
                full_span_end=finish,
                index=0,
                metadata=match.groupdict(),
            )
        )
    return found


def _extract_full_citation(
words: Tokens,
index: int,
Expand Down
30 changes: 29 additions & 1 deletion eyecite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,9 +141,11 @@ def add_defendant(citation: CaseCitation, words: Tokens) -> None:
break
if start_index:
citation.full_span_start = citation.span()[0] - offset
citation.metadata.defendant = "".join(
defendant = "".join(
str(w) for w in words[start_index : citation.index]
).strip(", ")
if defendant.strip():
citation.metadata.defendant = defendant


def add_law_metadata(citation: FullLawCitation, words: Tokens) -> None:
Expand Down Expand Up @@ -315,6 +317,32 @@ def disambiguate_reporters(
]


def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
    """Filter and order citations, ensuring reference citations are in sequence

    This function resolves rare but possible overlaps between ref. citations
    and short citations. It also orders all citations by their `citation.span`,
    as reference citations may be extracted out of order. The final result is a
    properly sorted list of citations as they appear in the text

    When two citations overlap, the one that sorts first (earliest start,
    then earliest end) is kept and the later one is discarded.

    :param citations: List of citations
    :return: Sorted and filtered citations
    """
    filtered_citations: List[CitationBase] = []
    for citation in sorted(citations, key=lambda cite: cite.span()):
        if filtered_citations:
            previous_end = filtered_citations[-1].span()[1]
            # Span ends are treated as exclusive (they originate from regex
            # match spans), so a citation that starts exactly where the
            # previous one ends merely touches it and must be kept. Only a
            # start strictly before the previous end is a real overlap.
            if citation.span()[0] < previous_end:
                continue
        filtered_citations.append(citation)
    return filtered_citations


joke_cite: List[CitationBase] = [
FullCaseCitation(
CitationToken(
Expand Down
45 changes: 45 additions & 0 deletions eyecite/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,26 @@ class FullCitation(ResourceCitation):
"""Abstract base class indicating that a citation fully identifies a
resource."""

def is_parallel_citation(self, preceding: CitationBase):
    """Copy party names from the preceding citation when it is parallel

    The preceding citation counts as parallel when it is a
    FullCaseCitation whose full span starts and ends at the same positions
    as this citation's full span.

    Args:
        preceding (): The previous citation found

    Returns: None
    """
    if not isinstance(preceding, FullCaseCitation):
        return
    same_start = self.full_span_start == preceding.full_span_start
    same_end = self.full_span_end == preceding.full_span_end
    if not (same_start and same_end):
        return
    # The plaintiff/defendant were captured on the earlier citation and
    # won't be on the parallel one, so carry them over.
    self.metadata.defendant = preceding.metadata.defendant
    self.metadata.plaintiff = preceding.metadata.plaintiff


@dataclass(eq=False, unsafe_hash=False, repr=False)
class FullLawCitation(FullCitation):
Expand Down Expand Up @@ -566,6 +586,26 @@ def formatted(self):
return "".join(parts)


@dataclass(eq=False, unsafe_hash=False, repr=False)
class ReferenceCitation(CitationBase):
    """A reference citation is a citation that refers to
    a full case citation by name and pincite alone.

    Future versions will hopefully drop the pincite requirement.

    Examples:
    Roe at 240
    """

    @dataclass(eq=True, unsafe_hash=True)
    class Metadata(CitationBase.Metadata):
        """Define fields on self.metadata."""

        # Party name used in the reference when it is the plaintiff
        plaintiff: Optional[str] = None
        # Party name used in the reference when it is the defendant
        defendant: Optional[str] = None
        # Page number that follows "at" in the reference, kept as text
        pin_cite: Optional[str] = None


@dataclass(eq=False, unsafe_hash=False, repr=False)
class UnknownCitation(CitationBase):
"""Convenience class which represents an unknown citation. A recognized
Expand Down Expand Up @@ -679,6 +719,11 @@ class StopWordToken(Token):
"""Word matching one of the STOP_TOKENS."""


@dataclass(eq=True, unsafe_hash=True)
class CaseReferenceToken(Token):
    """Token for text that refers back to a full case citation by the name
    of its plaintiff or defendant, e.g. ``Foo at 2``."""


@dataclass
class TokenExtractor:
"""Class for extracting all matches from a given string for the given
Expand Down
60 changes: 60 additions & 0 deletions eyecite/resolve.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
FullCaseCitation,
FullCitation,
IdCitation,
ReferenceCitation,
Resource,
ResourceType,
ShortCaseCitation,
Expand Down Expand Up @@ -83,6 +84,34 @@ def _filter_by_matching_antecedent(
return matches[0] if len(matches) == 1 else None


def _filter_by_matching_plaintiff_or_defendant(
    resolved_full_cites: ResolvedFullCites,
    plaintiff: str,
    defendant: str,
) -> Optional[ResourceType]:
    """Pick the single resource whose case names contain the given party

    Only FullCaseCitation entries are considered. An entry is a candidate
    when the given defendant is a substring of its defendant, or the given
    plaintiff is a substring of its plaintiff; empty or missing names never
    match.

    :param resolved_full_cites: (full citation, resource) pairs resolved so far
    :param plaintiff: plaintiff name taken from the reference citation
    :param defendant: defendant name taken from the reference citation
    :return: the resource if exactly one distinct candidate remains,
        otherwise None
    """
    # A set collapses several full citations that point at one resource.
    candidates: set[ResourceType] = set()

    for full_citation, resource in resolved_full_cites:
        if not isinstance(full_citation, FullCaseCitation):
            continue
        cited = full_citation.metadata
        defendant_hit = (
            defendant and cited.defendant and defendant in cited.defendant
        )
        plaintiff_hit = (
            plaintiff and cited.plaintiff and plaintiff in cited.plaintiff
        )
        if defendant_hit or plaintiff_hit:
            candidates.add(resource)

    # Anything other than exactly one candidate is ambiguous or unmatched.
    if len(candidates) != 1:
        return None
    return next(iter(candidates))


def _has_invalid_pin_cite(
full_cite: FullCitation, id_cite: IdCitation
) -> bool:
Expand Down Expand Up @@ -180,6 +209,28 @@ def _resolve_supra_citation(
)


def _resolve_reference_citation(
    reference_citation: ReferenceCitation,
    resolved_full_cites: ResolvedFullCites,
) -> Optional[ResourceType]:
    """Resolve reference citations

    Try to resolve a reference citation by checking whether there is only
    one resource among the previously resolved full citations whose
    plaintiff or defendant field contains the name used in the reference.

    :param reference_citation: the reference citation to resolve
    :param resolved_full_cites: (full citation, resource) pairs resolved so far
    :return: the matching resource, or None when the reference carries no
        party name or no single resource matches
    """
    names = reference_citation.metadata
    if names.defendant or names.plaintiff:
        return _filter_by_matching_plaintiff_or_defendant(
            resolved_full_cites,
            names.plaintiff,
            names.defendant,
        )
    # Without a party name there is nothing to match against.
    return None


def _resolve_id_citation(
id_citation: IdCitation,
last_resolution: ResourceType,
Expand Down Expand Up @@ -214,6 +265,10 @@ def resolve_citations(
[SupraCitation, ResolvedFullCites],
Optional[ResourceType],
] = _resolve_supra_citation,
resolve_reference_citation: Callable[
[ReferenceCitation, ResolvedFullCites],
Optional[ResourceType],
] = _resolve_reference_citation,
resolve_id_citation: Callable[
[IdCitation, ResourceType, Resolutions], Optional[ResourceType]
] = _resolve_id_citation,
Expand Down Expand Up @@ -286,6 +341,11 @@ def resolve_citations(
elif isinstance(citation, SupraCitation):
resolution = resolve_supra_citation(citation, resolved_full_cites)

elif isinstance(citation, ReferenceCitation):
resolution = resolve_reference_citation(
citation, resolved_full_cites
)

# If the citation is an id citation, try to resolve it
elif isinstance(citation, IdCitation):
resolution = resolve_id_citation(
Expand Down
9 changes: 9 additions & 0 deletions eyecite/test_factories.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from eyecite.helpers import get_year
from eyecite.models import (
CaseReferenceToken,
CitationToken,
FullCaseCitation,
FullJournalCitation,
FullLawCitation,
IdCitation,
IdToken,
ReferenceCitation,
SectionToken,
ShortCaseCitation,
SupraCitation,
Expand Down Expand Up @@ -103,6 +105,13 @@ def id_citation(source_text=None, index=0, **kwargs):
return IdCitation(IdToken(source_text, 0, 99), index, **kwargs)


def reference_citation(source_text=None, index=0, **kwargs):
    """Build a mock ReferenceCitation wrapping a CaseReferenceToken.

    The token always spans positions 0 to 99; extra keyword arguments are
    passed straight through to ReferenceCitation.
    """
    token = CaseReferenceToken(source_text, 0, 99)
    return ReferenceCitation(token, index, **kwargs)


def unknown_citation(source_text=None, index=0, **kwargs):
    """Build a mock UnknownCitation wrapping a SectionToken.

    The token always spans positions 0 to 99; extra keyword arguments are
    passed straight through to UnknownCitation.
    """
    token = SectionToken(source_text, 0, 99)
    return UnknownCitation(token, index, **kwargs)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_AnnotateTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,12 @@ def lower_annotator(before, text, after):
"<0>1 U.S. 1</0>. Foo v. Bar, <1>supra at 2</1>.",
[],
),
# Reference cite
(
"Foo v. Bar 1 U.S. 1. In Foo at 2.",
"Foo v. Bar <0>1 U.S. 1</0>. In <1>Foo at 2</1>.",
[],
),
# whitespace and html -- no unbalanced tag check
(
"<body>foo <i>1 <b>U.S.</b></i> 1 bar</body>",
Expand Down
Loading

0 comments on commit 56dd76e

Please sign in to comment.