freelawproject · quevon24 · Jan 9, 2025 · Jan 9, 2025 · Jan 22, 2025 · Jan 23, 2025
diff --git a/eyecite/tokenizers.py b/eyecite/tokenizers.py
@@ -297,7 +297,7 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
         # descending. Remove overlaps by returning only matches
         # where the current start offset is greater than the previously
         # returned end offset. Also return text between matches.
-        citation_tokens = []
+        citation_tokens = []  # type: ignore
         all_tokens: Tokens = []
         tokens = sorted(
             self.extract_tokens(text), key=lambda m: (m.start, -m.end)
@@ -313,17 +313,44 @@ def tokenize(self, text: str) -> Tuple[Tokens, List[Tuple[int, Token]]]:
                 if merged:
                     continue
             if offset > token.start:
-                # skip overlaps
-                continue
+                # Overlap found
+                skip_token = True
+                # Check if previous token is a CitationToken and current token
+                # is a CitationToken
+                if (
+                    isinstance(token, CitationToken)
+                    and citation_tokens
+                    and isinstance(citation_tokens[-1][1], CitationToken)
+                ):
+                    last_citation_token = citation_tokens[-1][1]
+                    all_tokens_idx = all_tokens.index(last_citation_token)
+                    # If previous CitationToken has a nominative reporter check
+                    # overlap between tokens
+                    if "volume_nominative" in last_citation_token.groups:
+                        result, overlap = self.find_and_remove_overlap(
+                            last_citation_token.data, token.data
+                        )
+                        if overlap:
+                            # We make sure there is an overlap, replace bad
+                            # CitationToken with correct text from overlap
+                            all_tokens[all_tokens_idx] = result
+                            # Remove bad CitationToken from citation_tokens
+                            del citation_tokens[-1]
+                            skip_token = False
+
+                if skip_token:
+                    # Different overlap, skip it
+                    continue
+
             if offset < token.start:
-                # capture plain text before each match
+                # Capture plain text before each match
                 self.append_text(all_tokens, text[offset : token.start])
-            # capture match
+            # Capture match
             citation_tokens.append((len(all_tokens), token))
             all_tokens.append(token)
             offset = token.end
             last_token = token
-        # capture plain text after final match
+        # Capture plain text after final match
         if offset < len(text):
             self.append_text(all_tokens, text[offset:])
         return all_tokens, citation_tokens
@@ -351,6 +378,35 @@ def append_text(tokens, text):
                 tokens.append(" ")
         tokens.pop()  # remove final extra space
 
+    @staticmethod
+    def find_and_remove_overlap(str1: str, str2: str) -> tuple[str, str]:
+        """Find the longest suffix of `str1` that is also a prefix of `str2`
+        and remove that suffix from `str1`.
+
+        :param str1: first string to compare and modify
+        :param str2: second string to find overlap with
+
+        :return: A tuple containing:
+            - `str1` without the overlapping suffix, stripped of surrounding
+              whitespace and of one trailing comma
+            - The overlapping substring ("" when there is no overlap)
+        """
+        overlap = ""
+        for i in range(len(str1)):
+            # The first hit is the longest suffix of str1 that starts str2
+            if str2.startswith(str1[i:]):
+                overlap = str1[i:]
+                break
+
+        # Cut the overlap off the END of str1; str.replace() would remove the
+        # first occurrence instead, which may sit earlier in the string
+        result = str1[: len(str1) - len(overlap)].strip()
+
+        # Remove trailing comma if it is the last character (ignoring spaces)
+        if result.endswith(","):
+            result = result[:-1].rstrip()
+        return result, overlap
+
 
 @dataclass
 class AhocorasickTokenizer(Tokenizer):

diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
@@ -494,6 +494,20 @@ def test_find_citations(self):
                               metadata={'plaintiff': 'Commonwealth', 'defendant': 'Muniz',
                                         'court': 'pa'})]),
             ('Foo v. Bar,  1 F.Supp. 1 (SC 1967)', [case_citation(volume='1', reporter='F.Supp.', year=1967, page='1', metadata={'plaintiff': 'Foo', 'defendant': 'Bar', 'court': 'sc'})]),
+            # Test with Thompson in case name (bad citation match because the defendant is the same as a nominative reporter)
+            ('Shapiro v. Thompson, 394 U. S. 618',
+             [case_citation(volume='394', reporter='U. S.', page='618',
+                            metadata={'plaintiff': 'Shapiro',
+                                      'defendant': 'Thompson',
+                                      'court': 'scotus'}
+                            )]),
+            # Test other nominative in case name
+            ('Foo v. Cooke, 1 U. S. 1',
+             [case_citation(volume='1', reporter='U. S.', page='1',
+                            metadata={'plaintiff': 'Foo',
+                                      'defendant': 'Cooke',
+                                      'court': 'scotus'}
+                            )]),
         )
         # fmt: on
         self.run_test_pairs(test_pairs, "Citation extraction")