-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathclef_ip_utils.py
124 lines (109 loc) · 3.8 KB
/
clef_ip_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from pathlib import Path
import re
import xml.etree.ElementTree as ET
# Inline formatting tags (the child set used for headings and <dt> below).
btext = "b i o u sub sup smallcaps".split()
# Inline formatting tags plus line breaks and list containers (the child set
# used for <p>, <dd>, <li> and <claim-text> below).
ptext = btext + "br dl ul ol sl".split()


def _inline_except(excluded, *extra):
    """Return the inline tags without *excluded*, followed by *extra*."""
    return [name for name in btext if name != excluded] + list(extra)


# Maps a tag name to the child tag names whose content is extracted when
# walking that tag. Every value is its own list object.
# NOTE(review): "sub" lists itself as an allowed child, while no other inline
# tag does -- confirm whether that asymmetry is intentional.
supported_children = {
    "abstract": ["abst-problem", "abst-solution", "heading", "p"],
    "abst-problem": ["p"],
    "abst-solution": ["p"],
    "heading": list(btext),
    "p": list(ptext),
    "claim": ["claim-text"],
    "claim-text": ptext + ["claim-text"],
    "b": _inline_except("b"),
    "i": _inline_except("i"),
    "o": _inline_except("o"),
    "u": _inline_except("u"),
    "sub": btext + ["sub2", "sup2"],
    "sup": _inline_except("sup", "sub2", "sup2"),
    "smallcaps": _inline_except("smallcaps", "sub2", "sup2"),
    "sub2": btext + ["sup2"],
    "sup2": btext + ["sub2"],
    "br": [],
    "dl": ["dt", "dd"],
    "ul": ["li"],
    "ol": ["li"],
    "sl": ["li"],
    "dt": list(btext),
    "dd": list(ptext),
    "li": list(ptext),
}
# Maps a tag name to whether its extracted content must be separated from the
# surrounding text by a space. Inline formatting tags map to False so that
# their content is glued directly to the neighbouring text; every other tag
# maps to True.
requires_whitespace = {
    **dict.fromkeys(
        (
            "abstract",
            "abst-problem",
            "abst-solution",
            "heading",
            "p",
            "claim",
            "claim-text",
        ),
        True,
    ),
    **dict.fromkeys(
        ("b", "i", "o", "u", "sub", "sup", "smallcaps", "sub2", "sup2"),
        False,
    ),
    **dict.fromkeys(
        ("br", "dl", "ul", "ol", "sl", "dt", "dd", "li"),
        True,
    ),
}
def _get_text_from_tag(tag: ET.Element):
    """Recursively flatten *tag* into a single whitespace-normalised string.

    Children listed in ``supported_children[tag.tag]`` are expanded
    recursively; any other child has its own content dropped, but the text
    following it (its tail) is kept. Children flagged in
    ``requires_whitespace`` are separated from the surrounding text by a
    space. Runs of whitespace in the result are collapsed to one space and
    the result is stripped.
    """
    text = tag.text or ""
    for node in tag:
        tail = node.tail

        if node.tag not in supported_children[tag.tag]:
            # Unsupported child: skip its content, keep only what follows it.
            if tail is not None:
                text += " " + tail
            continue

        inner = _get_text_from_tag(node)
        spaced = requires_whitespace[node.tag]

        # Only put a separator in front when something precedes the child.
        if spaced and text != "":
            text += " "
        text += inner

        if tail is not None:
            text += (" " + tail) if spaced else tail

    return re.sub(r"\s+", " ", text).strip()
def extract_content_from_patent(
    patent_file: Path,
    add_section_tags=False,
    abstract_only=False
):
    """Extract the English abstract and claims from a patent XML file.

    Args:
        patent_file: Path to the patent XML document. A file stem starting
            with "EP" is handled as an EPO patent; anything else is handled
            as a WO patent.
        add_section_tags: When true, prefix the abstract with "[abstract] "
            and every claim with "[claim] ".
        abstract_only: When true, skip claim extraction and return an empty
            claims list.

    Returns:
        A ``(abstract_text, claims_texts)`` tuple. ``abstract_text`` is built
        from an empty string when the document has no English abstract;
        ``claims_texts`` is a list with one string per claim.

    Raises:
        AssertionError: For a non-EP file that does not contain exactly one
            English ``<claim>`` element.
    """
    tree = ET.parse(patent_file)

    abstract_node = tree.find(".//abstract[@lang='EN']")
    if abstract_node is None:
        abstract_text = ""
    else:
        abstract_text = _get_text_from_tag(abstract_node)
    if add_section_tags:
        abstract_text = f"[abstract] {abstract_text}"

    if abstract_only:
        return abstract_text, []

    claim_nodes = tree.findall(".//claims[@lang='EN']/claim")
    if not patent_file.stem.startswith("EP"):
        # WO patent: a single <claim> tag holds all of the claims.
        assert len(claim_nodes) == 1
        merged_text = _get_text_from_tag(claim_nodes[0])
        # Start a new line before every "<number>. <non-digit>" marker
        # (e.g. " 1. T") so the flattened text can be split per claim.
        numbered_chunks = re.sub(
            r"\s+(\d{1,2}\.\s+[^\d])",
            lambda found: "\n" + found.group(1),
            merged_text
        ).split("\n")
        claims_texts = []
        for chunk in numbered_chunks:
            # Keep only chunks that begin with a claim number, and strip
            # that number from the stored text.
            if re.search(r"^\d{1,2}\.\s*", chunk):
                claims_texts.append(re.sub(r"^\d{1,2}\.\s*", "", chunk))
    else:
        # EPO patent: every claim has its own <claim> tag.
        claims_texts = list(map(_get_text_from_tag, claim_nodes))

    if add_section_tags:
        claims_texts = [f"[claim] {text}" for text in claims_texts]
    return abstract_text, claims_texts