Skip to content

Commit

Permalink
Merge pull request #197 from freelawproject/196-include-html-style-ta…
Browse files Browse the repository at this point in the history
…gs-in-citation-span

fix(annotate_citations): try to include HTML style tags if not balanced
  • Loading branch information
flooie authored Feb 3, 2025
2 parents abfc7f7 + 4bfcc46 commit 90718a5
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 9 deletions.
31 changes: 24 additions & 7 deletions eyecite/annotate.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
from bisect import bisect_left, bisect_right
from difflib import SequenceMatcher
from functools import partial
from logging import getLogger
from typing import Any, Callable, Iterable, Optional, Tuple

import fast_diff_match_patch

from eyecite.utils import is_balanced_html, wrap_html_tags
from eyecite.utils import (
is_balanced_html,
maybe_balance_style_tags,
wrap_html_tags,
)

logger = getLogger("eyecite")


def annotate_citations(
Expand Down Expand Up @@ -59,6 +66,9 @@ def annotate_citations(
Returns:
The annotated text.
"""
if unbalanced_tags not in ["unchecked", "skip", "wrap"]:
raise ValueError(f"Unknown option '{unbalanced_tags}")

# set up offset_updater if we have to move annotations to source_text
offset_updater = None
if source_text and source_text != plain_text:
Expand Down Expand Up @@ -88,13 +98,20 @@ def annotate_citations(
# handle HTML tags
if unbalanced_tags == "unchecked":
pass
elif unbalanced_tags in ("skip", "wrap"):
if not is_balanced_html(span_text):
if unbalanced_tags == "skip":
continue
elif not is_balanced_html(span_text):
if unbalanced_tags == "wrap":
span_text = wrap_html_tags(span_text, after, before)
else:
raise ValueError(f"Unknown option '{unbalanced_tags}")
else: # "skip" case
original_span_text = span_text
start, end, span_text = maybe_balance_style_tags(
start, end, plain_text
)
if not is_balanced_html(span_text):
logger.error(
"Citation was not annotated due to unbalanced tags %s",
original_span_text,
)
continue

if annotator is not None:
annotated_span = annotator(before, span_text, after)
Expand Down
54 changes: 54 additions & 0 deletions eyecite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,3 +130,57 @@ def hash_sha256(dictionary: dict) -> int:

# Calculate the hash of the bytes, convert to an int, and return
return int.from_bytes(hashlib.sha256(json_bytes).digest(), byteorder="big")


def maybe_balance_style_tags(
    start: int, end: int, plain_text: str, tolerance: int = 5
) -> tuple[int, int, str]:
    """Try to include style tags at the edge of the span marked as invalid

    In some HTML sources the citations are styled with tags like <i> or <em>
    When the citation is found in a stripped-of-tags text, the span may
    leave out the opening or closing tag. When this happens and we try to
    annotate the HTML, it will render invalid HTML. This happens mostly with
    IdCitation, ReferenceCitation, etc.

    This function will try to find opening or closing tags immediately
    preceding or following the citation span. If it finds them, it will
    return the new start, end and span. If not, it will return the old ones

    Only the text directly adjacent to the span is inspected: the missing
    tag must touch the span, separated by at most `tolerance` whitespace
    characters. Plain string methods are used instead of a regex built from
    the span, so citation text containing regex metacharacters such as
    "(" or "." is always treated literally.

    :param start: the original start of the span
    :param end: the original end of the span
    :param plain_text: the text to annotate
    :param tolerance: tolerate at most this amount of whitespace between
        the span and the tag
    :return: a tuple (new start, new end, new span text)
    """
    span_text = plain_text[start:end]
    style_tags = ["i", "em", "b"]

    for tag in style_tags:
        opening_tag = f"<{tag}>"
        closing_tag = f"</{tag}>"
        has_opening = opening_tag in span_text
        has_closing = closing_tag in span_text

        if has_opening and not has_closing:
            # look for closing tag after the end; the window is clamped to
            # the end of the text so it never grows past the tolerance
            window_end = min(
                end + len(closing_tag) + tolerance, len(plain_text)
            )
            tail = plain_text[end:window_end]
            trimmed = tail.lstrip()
            if trimmed.startswith(closing_tag):
                # skipped whitespace + the tag itself
                end += len(tail) - len(trimmed) + len(closing_tag)

        if not has_opening and has_closing:
            # look for opening tag before the start; clamp at 0 so a span
            # near the beginning of the text does not produce a negative
            # (i.e. counted-from-the-end) slice index
            window_start = max(start - len(opening_tag) - tolerance, 0)
            head = plain_text[window_start:start]
            trimmed = head.rstrip()
            if trimmed.endswith(opening_tag):
                start = window_start + len(trimmed) - len(opening_tag)

    return start, end, plain_text[start:end]
100 changes: 98 additions & 2 deletions tests/test_AnnotateTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def straighten_quotes(text):
def lower_annotator(before, text, after):
    """Annotator callback that lower-cases the annotated text.

    The `before` and `after` markers are left as given; only `text`
    is lower-cased before the three pieces are concatenated.
    """
    lowered = text.lower()
    return before + lowered + after

self.maxDiff = None
test_pairs = (
# single cite
("1 U.S. 1", "<0>1 U.S. 1</0>", []),
Expand Down Expand Up @@ -59,10 +60,10 @@ def lower_annotator(before, text, after):
"<body>foo <i><0>1 <b>U.S.</b></i> 1</0> bar</body>",
["html", "inline_whitespace"],
),
# whitespace and html -- skip unbalanced tags
# whitespace and html -- unbalanced tags are repaired
(
"foo <i>1 U.S.</i> 1; 2 <i>U.S.</i> 2",
"foo <i>1 U.S.</i> 1; <1>2 <i>U.S.</i> 2</1>",
"foo <0><i>1 U.S.</i> 1</0>; <1>2 <i>U.S.</i> 2</1>",
["html", "inline_whitespace"],
{"unbalanced_tags": "skip"},
),
Expand Down Expand Up @@ -101,6 +102,94 @@ def lower_annotator(before, text, after):
[],
{"annotator": lower_annotator},
),
# solvable unbalanced <em> tag. Need the FullCaseCitation first
# so the ReferenceCitation can be found
# from https://www.courtlistener.com/api/rest/v4/opinions/8496639/
# source: Opinion.xml_harvard
(
" partially secured by a debtor’s principal residence was not "
"con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>"
"508 U.S. 324, 113 S.Ct. 2106, 124 L.Ed.2d 228 (1993). That "
"plan proposed to bifurcate the claim and... pay the unsecured"
"... only by a lien on the debtor’s principal residence.” "
"<em>Nobelman </em>at 332, 113 S.Ct. 2106. Section 1123(b)(5) "
"codifies the <em>Nobelman </em>decision in individual debtor "
"chapter 11 cases.",
" partially secured by a debtor’s principal residence was not"
" con-firmable. <em>Nobelman v. Am. Sav. Bank, </em>"
"<a href='something'>508 U.S. 324</a>, <a href='something'>"
"113 S.Ct. 2106</a>, <a href='something'>124 L.Ed.2d 228</a>"
" (1993). That plan proposed to bifurcate the claim and..."
" pay the unsecured... only by a lien on the debtor’s"
" principal residence.” <a href='something'><em>Nobelman </em>"
"at 332</a>, <a href='something'>113 S.Ct. 2106</a>. Section"
" 1123(b)(5) codifies the <em>Nobelman </em>decision in"
" individual debtor chapter 11 cases.",
["html", "all_whitespace"],
{"annotate_anchors": True, "unbalanced_tags": "skip"},
),
# solvable unbalanced <i> tag
# from https://www.courtlistener.com/api/rest/v4/opinions/2841253/
# source: Opinion.html
(
"he has not agreed so to submit.’” <i>Howsam v. Dean"
" Witter Reynolds, Inc.</i>, 537 U.S. 79, 83, 123 S. Ct."
" 588, 591 (2002) (combined mandamus and"
" interlocutory appeal) (citing <i>Howsam</i> at 84, 123"
" S. Ct. at 592)",
"he has not agreed so to submit.’” <i>Howsam v. Dean"
" Witter Reynolds, Inc.</i>, <a href='something'>537 U.S."
" 79</a>, 83, <a href='something'>123 S. Ct. 588</a>, 591"
" (2002) (combined mandamus and interlocutory appeal)"
" (citing <a href='something'><i>Howsam</i> at 84</a>, <a"
" href='something'>123 S. Ct. at 592</a>)",
["html", "all_whitespace"],
{"annotate_anchors": True, "unbalanced_tags": "skip"},
),
# The next 2 examples could be resolved if we increased the
# character tolerance or admitted the full case name instead of
# just one of the parties
(
# https://www.courtlistener.com/api/rest/v4/opinions/1535649/
# source: xml_harvard
"See also Styler v. Tall Oaks, Inc. (In re Hatch),"
" 93 B.R. 263, 267 (Bankr.D. Utah 1988),"
" <em> rev'd </em> 114 B.R. 747 (D.Utah 1989)."
"</p>... The court makes no"
" determination as to whe Fifth Amendment to the"
" constitution of the United States.” <em> Styler v."
" Tall Oaks, Inc. (In re Hatch), </em> at 748."
"</p>",
"See also Styler v. Tall Oaks, Inc. (In re Hatch),"
" <a href='something'>93 B.R. 263</a>, 267"
" (Bankr.D. Utah 1988), <em> rev'd </em> <a"
" href='something'>114 B.R. 747</a> (D.Utah 1989)."
"</p>... The court makes no"
" determination as to whe Fifth Amendment to the"
" constitution of the United States.” <em> Styler v."
" Tall Oaks, Inc. (In re Hatch), </em> at 748."
"</p>",
["html", "all_whitespace"],
{"annotate_anchors": True, "unbalanced_tags": "skip"},
),
(
# https://www.courtlistener.com/api/rest/v4/opinions/1985850/
# source: html_lawbox
"to act rationally. <i>See, e.g., </i><i>State v."
" Wingler,</i> 25 <i>N.J.</i> 161, 175, 135 <i>A.</i>2d"
" 468 (1957); <i>citing, ... have been applied.'"
" [<i>State v. Wingler</i> at 175, 135 <i>A.</i>2d"
" 468, <i>citing, </i><i>Minnesota ex rel.</i>",
"to act rationally. <i>See, e.g., </i><i>State v."
" Wingler,</i> <a href='something'>25 <i>N.J.</i>"
" 161</a>, 175, <a href='something'>135 <i>A.</i>2d"
" 468</a> (1957); <i>citing, ... have been applied.'"
" [<i>State v. Wingler</i> at 175, <a"
" href='something'>135 <i>A.</i>2d 468</a>, <i>citing,"
" </i><i>Minnesota ex rel.</i>",
["html", "all_whitespace"],
{"annotate_anchors": True, "unbalanced_tags": "skip"},
),
)
for source_text, expected, clean_steps, *annotate_kwargs in test_pairs:
annotate_kwargs = annotate_kwargs[0] if annotate_kwargs else {}
Expand All @@ -115,6 +204,13 @@ def lower_annotator(before, text, after):
(c.span(), f"<{i}>", f"</{i}>")
for i, c in enumerate(cites)
]

if annotate_kwargs.pop("annotate_anchors", False):
annotations = [
(c.span(), "<a href='something'>", "</a>")
for c in cites
]

annotated = annotate_citations(
plain_text,
annotations,
Expand Down

0 comments on commit 90718a5

Please sign in to comment.