Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bad parsing for citations with defendant similar to nominative reporter #190

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 62 additions & 6 deletions eyecite/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
# descending. Remove overlaps by returning only matches
# where the current start offset is greater than the previously
# returned end offset. Also return text between matches.
citation_tokens = []
citation_tokens = [] # type: ignore
all_tokens: Tokens = []
tokens = sorted(
self.extract_tokens(text), key=lambda m: (m.start, -m.end)
Expand All @@ -313,17 +313,44 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
if merged:
continue
if offset > token.start:
# skip overlaps
continue
# Overlap found
skip_token = True
# Check if previous token is a CitationToken and current token
# is a CitationToken
if (
isinstance(token, CitationToken)
and citation_tokens
and isinstance(citation_tokens[-1][1], CitationToken)
):
last_citation_token = citation_tokens[-1][1]
all_tokens_idx = all_tokens.index(last_citation_token)
# If previous CitationToken has a nominative reporter check
# overlap between tokens
if "volume_nominative" in last_citation_token.groups:
result, overlap = self.find_and_remove_overlap(
last_citation_token.data, token.data
)
if overlap:
# We make sure there is an overlap, replace bad
# CitationToken with correct text from overlap
all_tokens[all_tokens_idx] = result
# Remove bad CitationToken from citation_tokens
del citation_tokens[-1]
skip_token = False

if skip_token:
# Different overlap, skip it
continue

if offset < token.start:
# capture plain text before each match
# Capture plain text before each match
self.append_text(all_tokens, text[offset : token.start])
# capture match
# Capture match
citation_tokens.append((len(all_tokens), token))
all_tokens.append(token)
offset = token.end
last_token = token
# capture plain text after final match
# Capture plain text after final match
if offset < len(text):
self.append_text(all_tokens, text[offset:])
return all_tokens, citation_tokens
Expand Down Expand Up @@ -351,6 +378,35 @@ def append_text(tokens, text):
tokens.append(" ")
tokens.pop() # remove final extra space

@staticmethod
def find_and_remove_overlap(str1: str, str2: str) -> tuple[str, str]:
"""Find the overlap between two strings and removes the overlap form
the first string

:param str1: first string to compare and modify
:param str2: second string to find overlap with

:return: A tuple containing:
- The cleaned version of `str1` with the overlap removed
- The overlapping substring found between `str1` and `str2`
"""
overlap = ""
for i in range(len(str1)):
# Check for substring of str1 starting from index i in str2
if str2.startswith(str1[i:]):
overlap = str1[i:]
break

# Remove the overlap from the first string
result = str1.replace(overlap, "", 1)

# Remove trailing comma if it is the last character (ignoring spaces)
result = result.strip()
if result.endswith(","):
result = result[:-1].rstrip()

return result, overlap


@dataclass
class AhocorasickTokenizer(Tokenizer):
Expand Down
14 changes: 14 additions & 0 deletions tests/test_FindTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,20 @@ def test_find_citations(self):
metadata={'plaintiff': 'Commonwealth', 'defendant': 'Muniz',
'court': 'pa'})]),
('Foo v. Bar, 1 F.Supp. 1 (SC 1967)', [case_citation(volume='1', reporter='F.Supp.', year=1967, page='1', metadata={'plaintiff': 'Foo', 'defendant': 'Bar', 'court': 'sc'})]),
# Test with Thompson in case name (bad citation match due to the defendant being the same as a nominative reporter)
('Shapiro v. Thompson, 394 U. S. 618',
[case_citation(volume='394', reporter='U. S.', page='618',
metadata={'plaintiff': 'Shapiro',
'defendant': 'Thompson',
'court': 'scotus'}
)]),
# Test other nominative in case name
('Foo v. Cooke, 1 U. S. 1',
[case_citation(volume='1', reporter='U. S.', page='1',
metadata={'plaintiff': 'Foo',
'defendant': 'Cooke',
'court': 'scotus'}
)]),
)
# fmt: on
self.run_test_pairs(test_pairs, "Citation extraction")
Expand Down
Loading