feat(ReferenceCitation): use resolved_case_name and resolved_case_name_short for search and resolution

Fixes #199
- updated ReferenceCitation and FullCaseCitation models metadata to admit resolved_case_name_short and resolved_case_name
- update helpers.filter_citations to save metadata of duplicated ReferenceCitations on the kept object
- update tests to show how the finding and resolution will work
freelawproject · Feb 1, 2025 · 9805ef2 · 9805ef2
1 parent abfc7f7
commit 9805ef2
Show file tree

Hide file tree

Showing 6 changed files with 142 additions and 51 deletions.
diff --git a/eyecite/find.py b/eyecite/find.py
@@ -79,7 +79,7 @@ def get_citations(
 
                 # Check for reference citations that follow a full citation
                 # Using the plaintiff or defendant
-                references = _extract_reference_citations(citation, plain_text)
+                references = extract_reference_citations(citation, plain_text)
                 citations.extend(references)
 
         # CASE 2: Token is an "Id." or "Ibid." reference.
@@ -124,8 +124,9 @@ def get_citations(
     return citations
 
 
-def _extract_reference_citations(
-    citation: FullCitation, plain_text: str
+def extract_reference_citations(
+    citation: FullCitation,
+    plain_text: str,
 ) -> List[ReferenceCitation]:
     """Extract reference citations that follow a full citation
 
@@ -156,7 +157,7 @@ def is_valid_name(name: str) -> bool:
 
     regexes = [
         rf"(?P<{key}>{re.escape(value)})"
-        for key in ["plaintiff", "defendant"]
+        for key in ReferenceCitation.name_fields
         if (value := getattr(citation.metadata, key, None))
         and is_valid_name(value)
     ]

diff --git a/eyecite/helpers.py b/eyecite/helpers.py
@@ -12,6 +12,7 @@
     FullJournalCitation,
     FullLawCitation,
     ParagraphToken,
+    ReferenceCitation,
     ResourceCitation,
     StopWordToken,
     Token,
@@ -336,6 +337,19 @@ def filter_citations(citations: List[CitationBase]) -> List[CitationBase]:
             last_span = last_citation.span()
             current_span = citation.span()
 
+            if current_span == last_span and isinstance(
+                last_citation, ReferenceCitation
+            ):
+                # a single ReferenceCitation may be found via different
+                # names. Save the name metadata to account for collisions
+                for field in ReferenceCitation.name_fields:
+                    if not getattr(last_citation.metadata, field):
+                        setattr(
+                            last_citation.metadata,
+                            field,
+                            getattr(citation.metadata, field),
+                        )
+
             if current_span[0] <= last_span[1]:
                 # Remove overlapping citations that can occur in edge cases
                 continue

diff --git a/eyecite/models.py b/eyecite/models.py
@@ -456,6 +456,9 @@ class Metadata(CaseCitation.Metadata):
         plaintiff: Optional[str] = None
         defendant: Optional[str] = None
         extra: Optional[str] = None
+        # May be populated after citation resolution
+        resolved_case_name_short: Optional[str] = None
+        resolved_case_name: Optional[str] = None
 
     def add_metadata(self, words: "Tokens"):
         """Extract metadata from text before and after citation."""
@@ -604,6 +607,15 @@ class Metadata(CitationBase.Metadata):
         plaintiff: Optional[str] = None
         defendant: Optional[str] = None
         pin_cite: Optional[str] = None
+        resolved_case_name_short: Optional[str] = None
+        resolved_case_name: Optional[str] = None
+
+    name_fields = [
+        "plaintiff",
+        "defendant",
+        "resolved_case_name_short",
+        "resolved_case_name",
+    ]
 
 
 @dataclass(eq=False, unsafe_hash=False, repr=False)

diff --git a/eyecite/resolve.py b/eyecite/resolve.py
@@ -84,29 +84,34 @@ def _filter_by_matching_antecedent(
     return matches[0] if len(matches) == 1 else None
 
 
-def _filter_by_matching_plaintiff_or_defendant(
+def _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
     resolved_full_cites: ResolvedFullCites,
-    plaintiff: str,
-    defendant: str,
+    reference_citation: ReferenceCitation,
 ) -> Optional[ResourceType]:
-    """Filter out any impossible reference citations"""
+    """Filter out reference citations that point to more than 1 Resource"""
     matches: List[ResourceType] = []
+    compare_keys = [
+        "defendant",
+        "plaintiff",
+        "resolved_case_name",
+        "resolved_case_name_short",
+    ]
 
     for full_citation, resource in resolved_full_cites:
         if not isinstance(full_citation, FullCaseCitation):
             continue
-        defendant_match = (
-            defendant
-            and full_citation.metadata.defendant
-            and defendant in full_citation.metadata.defendant
-        )
-        plaintiff_match = (
-            plaintiff
-            and full_citation.metadata.plaintiff
-            and plaintiff in full_citation.metadata.plaintiff
-        )
-        if defendant_match or plaintiff_match:
-            matches.append(resource)
+
+        for key in compare_keys:
+            reference_value = getattr(reference_citation.metadata, key)
+            full_case_value = getattr(full_citation.metadata, key)
+            if (
+                reference_value
+                and full_case_value
+                and reference_value in full_case_value
+            ):
+                matches.append(resource)
+                break
+
     # Remove duplicates and only accept if one candidate remains
     matches = list(set(matches))
     return matches[0] if len(matches) == 1 else None
@@ -216,18 +221,19 @@ def _resolve_reference_citation(
     """Resolve reference citations
 
     Try to resolve reference citations by checking whether their is only one
-    full citation that appears with either the defendant or plaintiff
+    full citation that appears with either the defendant or plaintiff or
+    resolved_case_name_short or resolved_case_name
     field of any of the previously resolved full citations.
     """
     if (
         not reference_citation.metadata.defendant
         and not reference_citation.metadata.plaintiff
+        and not reference_citation.metadata.resolved_case_name_short
+        and not reference_citation.metadata.resolved_case_name
     ):
         return None
-    return _filter_by_matching_plaintiff_or_defendant(
-        resolved_full_cites,
-        reference_citation.metadata.plaintiff,
-        reference_citation.metadata.defendant,
+    return _filter_by_matching_plaintiff_or_defendant_or_resolved_names(
+        resolved_full_cites, reference_citation
     )
 
 

diff --git a/tests/test_FindTest.py b/tests/test_FindTest.py
@@ -4,6 +4,8 @@
 from unittest import TestCase
 
 from eyecite import clean_text, get_citations
+from eyecite.find import extract_reference_citations
+from eyecite.helpers import filter_citations
 
 # by default tests use a cache for speed
 # call tests with `EYECITE_CACHE_DIR= python ...` to disable cache
@@ -858,3 +860,30 @@ def test_citation_fullspan(self):
             self.assertEqual(
                 extracted.full_span(), (start_idx, len(sentence)), error_msg
             )
+
+    def test_reference_extraction(self):
+        """Can we extract a reference citation using resolved metadata?"""
+        texts = [
+            # In this case the reference citation found via the
+            # resolved_case_name is redundant; it was already found by the
+            # regular process. Can we deduplicate?
+            """See, e.g., State v. Wingler, 135 A. 2d 468 (1957);
+            [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+            # In this case the resolved_case_name actually helps getting the
+            # reference citation
+            """See, e.g., State v. W1ngler, 135 A. 2d 468 (1957);
+            [State v. Wingler at 175, citing, Minnesota ex rel.]""",
+        ]
+        for plain_text in texts:
+            citations = get_citations(plain_text)
+            citations[0].metadata.resolved_case_name = "State v. Wingler"
+            references = extract_reference_citations(citations[0], plain_text)
+            final_citations = filter_citations(citations + references)
+            self.assertEqual(
+                len(final_citations), 2, "There should only be 2 citations"
+            )
+            self.assertEqual(
+                len(references),
+                1,
+                "Only a reference citation should had been picked up",
+            )
diff --git a/tests/test_ResolveTest.py b/tests/test_ResolveTest.py
@@ -3,6 +3,8 @@
 from unittest import TestCase
 
 from eyecite import get_citations
+from eyecite.find import extract_reference_citations
+from eyecite.helpers import filter_citations
 from eyecite.models import FullCitation, Resource
 from eyecite.resolve import resolve_citations
 
@@ -32,42 +34,53 @@ def assertResolution(self, citations, expected_resolution_dict):
         )
 
     def checkReferenceResolution(
-        self, *expected_resolutions: tuple[list[list[int]], str]
+        self,
+        expected_indices: list[list[int]],
+        citation_text: str,
+        resolved_case_name_short: Optional[str] = None,
     ):
         """
         Helper function to help test reference citations.
 
         Args:
-            *expected_resolutions (tuple[list[int], str]):
-                A list of tuples where each tuple contains:
-                - A list of expected indices for the resolved citations.
-                - A string of citation text to process.
-
+            expected_indices: A list of expected indices for the resolved
+                citations.
+            citation_text: A string of citation text to process.
+            resolved_case_name_short: a case name for simulating post-resolution
+                metadata assignment to full case citations; this will also be
+                used as a flag to use a second round of reference extractions
         Returns:
             None
         """
-        for expected_indices, citation_text in expected_resolutions:
-            citations = get_citations(citation_text)
+        citations = get_citations(citation_text)
+        if resolved_case_name_short:
+            citations[0].metadata.resolved_case_name_short = (
+                resolved_case_name_short
+            )
+            new_references = extract_reference_citations(
+                citations[0], citation_text
+            )
+            citations = filter_citations(citations + new_references)
 
-            # Step 2: Build a helper dict to map corrected citations to indices
-            resolution_index_map = {
-                cite.corrected_citation(): idx
-                for idx, cite in enumerate(citations)
-            }
+        # Step 2: Build a helper dict to map corrected citations to indices
+        resolution_index_map = {
+            cite.corrected_citation(): idx
+            for idx, cite in enumerate(citations)
+        }
 
-            # Step 3: Resolve citations and format the resolution
-            resolved_citations = resolve_citations(citations)
-            formatted_resolution = format_resolution(resolved_citations)
+        # Step 3: Resolve citations and format the resolution
+        resolved_citations = resolve_citations(citations)
+        formatted_resolution = format_resolution(resolved_citations)
 
-            # Step 4: Map resolved citations to their indices
-            result = {
-                key: [resolution_index_map[value] for value in values]
-                for key, values in formatted_resolution.items()
-            }
+        # Step 4: Map resolved citations to their indices
+        result = {
+            key: [resolution_index_map[value] for value in values]
+            for key, values in formatted_resolution.items()
+        }
 
-            # Step 5: Compare the actual results with expected indices
-            actual_indices = list(result.values())
-            self.assertEqual(expected_indices, actual_indices)
+        # Step 5: Compare the actual results with expected indices
+        actual_indices = list(result.values())
+        self.assertEqual(expected_indices, actual_indices)
 
     def checkResolution(
         self, *expected_resolutions: Tuple[Optional[int], str]
@@ -337,7 +350,7 @@ def test_complex_resolution(self):
         )
 
     def test_reference_resolution(self):
-        self.checkReferenceResolution(
+        for test_tuple in (
             ([[0, 1]], "Foo v. Bar, 1 U.S. 1 ... Foo at 2"),
             ([[0]], "Foo at 2. .... ; Foo v. Bar, 1 U.S. 1"),
             (
@@ -352,4 +365,20 @@ def test_reference_resolution(self):
                 [[0, 2], [1]],
                 "Foo v. Bar 1 U.S. 12, 347-348; In Smith, 12 U.S. 1 (1999) we saw something else. someting. In Foo at 2 we see that",
             ),
-        )
+            # Ok resolved_case_name and order, ReferenceCitation should be resolved
+            (
+                [[0, 1], [2]],
+                "State v. Dze 3 U.S. 22; something something. In Doe at 122, something more. In State v. Doe 4 U.S. 33",
+                "Doe",
+            ),
+            # due to the reference matching more than 1 full citation, we don't
+            # resolve
+            (
+                [[0], [1]],
+                "State v. Smlth 3 U.S. 22; something something. In State v. Smith 4 U.S. 33. In Smith at 122, something more",
+                "Smith",
+            ),
+            # ambiguous resolved_case_name, ReferenceCitation should not be
+            # resolved
+        ):
+            self.checkReferenceResolution(*test_tuple)