diff --git a/eyecite/tokenizers.py b/eyecite/tokenizers.py
index ba9ea7d..6f4c49f 100644
--- a/eyecite/tokenizers.py
+++ b/eyecite/tokenizers.py
@@ -297,7 +297,7 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
         # descending. Remove overlaps by returning only matches
         # where the current start offset is greater than the previously
         # returned end offset. Also return text between matches.
-        citation_tokens = []
+        citation_tokens = []  # type: ignore
         all_tokens: Tokens = []
         tokens = sorted(
             self.extract_tokens(text), key=lambda m: (m.start, -m.end)
         )
@@ -313,17 +313,44 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
                 if merged:
                     continue
             if offset > token.start:
-                # skip overlaps
-                continue
+                # Overlap found
+                skip_token = True
+                # Check whether both the previous and the current token
+                # are CitationTokens
+                if (
+                    isinstance(token, CitationToken)
+                    and citation_tokens
+                    and isinstance(citation_tokens[-1][1], CitationToken)
+                ):
+                    last_citation_token = citation_tokens[-1][1]
+                    all_tokens_idx = all_tokens.index(last_citation_token)
+                    # If the previous CitationToken has a nominative
+                    # reporter, check for overlap between the two tokens
+                    if "volume_nominative" in last_citation_token.groups:
+                        result, overlap = self.find_and_remove_overlap(
+                            last_citation_token.data, token.data
+                        )
+                        if overlap:
+                            # Overlap confirmed: replace the bad
+                            # CitationToken with the cleaned-up text
+                            all_tokens[all_tokens_idx] = result
+                            # Remove the bad CitationToken from citation_tokens
+                            del citation_tokens[-1]
+                            skip_token = False
+
+                if skip_token:
+                    # Any other kind of overlap, skip it
+                    continue
+
             if offset < token.start:
-                # capture plain text before each match
+                # Capture plain text before each match
                 self.append_text(all_tokens, text[offset : token.start])
-            # capture match
+            # Capture match
             citation_tokens.append((len(all_tokens), token))
             all_tokens.append(token)
             offset = token.end
             last_token = token
-        # capture plain text after final match
+        # Capture plain text after final match
         if offset < len(text):
             self.append_text(all_tokens, text[offset:])
         return all_tokens, citation_tokens
@@ -351,6 +378,35 @@ def append_text(tokens, text):
                 tokens.append(" ")
             tokens.pop()  # remove final extra space
 
+    @staticmethod
+    def find_and_remove_overlap(str1: str, str2: str) -> tuple[str, str]:
+        """Find the overlap between two strings and remove the overlap from
+        the first string.
+
+        :param str1: first string to compare and modify
+        :param str2: second string to find overlap with
+
+        :return: A tuple containing:
+            - The cleaned version of `str1` with the overlap removed
+            - The overlapping substring found between `str1` and `str2`
+        """
+        overlap = ""
+        for i in range(len(str1)):
+            # Check whether str2 starts with the suffix of str1 at index i
+            if str2.startswith(str1[i:]):
+                overlap = str1[i:]
+                break
+
+        # Remove the overlap from the first string
+        result = str1.replace(overlap, "", 1)
+
+        # Remove a trailing comma if it is the last character (ignoring spaces)
+        result = result.strip()
+        if result.endswith(","):
+            result = result[:-1].rstrip()
+
+        return result, overlap
+
 
 @dataclass
 class AhocorasickTokenizer(Tokenizer):
diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
index cab670e..1277b4a 100644
--- a/tests/test_FindTest.py
+++ b/tests/test_FindTest.py
@@ -494,6 +494,20 @@ def test_find_citations(self):
                            metadata={'plaintiff': 'Commonwealth',
                                      'defendant': 'Muniz',
                                      'court': 'pa'})]),
            ('Foo v. Bar, 1 F.Supp. 1 (SC 1967)',
             [case_citation(volume='1', reporter='F.Supp.', year=1967,
                            page='1',
                            metadata={'plaintiff': 'Foo',
                                      'defendant': 'Bar',
                                      'court': 'sc'})]),
+            # Test with Thompson in case name (bad citation match: defendant is the same as a nominative reporter)
+            ('Shapiro v. Thompson, 394 U. S. 618',
+             [case_citation(volume='394', reporter='U. S.', page='618',
+                            metadata={'plaintiff': 'Shapiro',
+                                      'defendant': 'Thompson',
+                                      'court': 'scotus'}
+                            )]),
+            # Test another nominative reporter in the case name
+            ('Foo v. Cooke, 1 U. S. 1',
+             [case_citation(volume='1', reporter='U. S.', page='1',
+                            metadata={'plaintiff': 'Foo',
+                                      'defendant': 'Cooke',
+                                      'court': 'scotus'}
+                            )]),
         )
         # fmt: on
         self.run_test_pairs(test_pairs, "Citation extraction")
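
Note (not part of the diff): a minimal sketch of how the new Tokenizer.find_and_remove_overlap helper behaves once this patch is applied. The two input strings are illustrative stand-ins for the .data text of two overlapping CitationTokens; real values depend on which regexes matched.

    from eyecite.tokenizers import Tokenizer

    # Hypothetical matched text: an over-greedy nominative-reporter match
    # followed by the correct citation match that overlaps it.
    bad_text = "Thompson, 394 U. S. 618"
    good_text = "394 U. S. 618"

    result, overlap = Tokenizer.find_and_remove_overlap(bad_text, good_text)
    print(result)   # "Thompson"       (overlap removed, trailing comma stripped)
    print(overlap)  # "394 U. S. 618"  (the shared substring)

The cleaned text (result) is what tokenize() puts back into all_tokens in place of the bad CitationToken, while the later, correct CitationToken is kept.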