# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
r"""Utilities to simplify the canonical NQ data.
The canonical NQ data contains the HTML of each Wikipedia page along with a
sequence of tokens on that page, each of which is indexed into the HTML.
Many users will not want to use the HTML at all, and this file provides
utilities to extract only the text into a new record of the form:
{
"example_id": 3902,
"document_url": "http://wikipedia.org/en/strings"
"question_text": "what is a string",
"document_text": "<P> A string is a list of characters in order . </P>",
"annotations": [{
"long_answer": { "start_token": 0, "end_token": 12 },
"short_answers": [{ "start_token": 5, "end_token": 8 }],
"yes_no_answer": "NONE",
}],
"long_answer_candidates": [
{"start_token": 0, "end_token": 12, "top_level": True}
]
}
which leads to a much smaller training set (4.4Gb instead of 41Gb).
In this representation, the [start, end) indices are into the blank separated
sequence of tokens. So, answer spans can be extracted using the following
snippet:
" ".join(example["document_text"].split(" ")[`start_token`:`end_token`]).
WARNING: Use `split(" ")` instead of `split()` to avoid complications from
characters such as `\u180e` which may or may not be recognized as a whitespace
character depending on your python version.
To avoid complications at test time, we do not provide a simplified version
of the development data, and there is no simplified version of the hidden test
set. If you rely on the simplified data, then you must call the
`simplify_nq_example` function below on every example that is passed in at test
time.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re


def get_nq_tokens(simplified_nq_example):
  """Returns list of blank separated tokens."""

  if "document_text" not in simplified_nq_example:
    raise ValueError("`get_nq_tokens` should be called on a simplified NQ "
                     "example that contains the `document_text` field.")

  return simplified_nq_example["document_text"].split(" ")
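

# A minimal convenience wrapper around the span-extraction snippet shown in
# the module docstring. This helper is an illustrative addition, not part of
# the original NQ utilities; the name `get_span_text` is hypothetical.
def get_span_text(simplified_nq_example, start_token, end_token):
  """Returns the text of a [`start_token`, `end_token`) span as a string."""
  return " ".join(get_nq_tokens(simplified_nq_example)[start_token:end_token])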


def simplify_nq_example(nq_example):
  r"""Returns dictionary with blank separated tokens in `document_text` field.

  Removes byte offsets from annotations, and removes `document_html` and
  `document_tokens` fields. All annotations in the output are represented as
  [start_token, end_token) offsets into the blank separated tokens in the
  `document_text` field.

  WARNING: Tokens are separated by a single blank character. Do not split on
  arbitrary whitespace since different implementations have different
  treatments of some unicode characters such as `\u180e`.

  Args:
    nq_example: Dictionary containing original NQ example fields.

  Returns:
    Dictionary containing `document_text` field, not containing
    `document_tokens` or `document_html`, and with all annotations represented
    as [`start_token`, `end_token`) offsets into the space separated sequence.
  """

  def _clean_token(token):
    """Returns token in which blanks are replaced with underscores.

    HTML table cell openers may contain blanks if they span multiple columns.
    There are also a very few unicode characters that are prepended with
    blanks.

    Args:
      token: Dictionary representation of token in original NQ format.

    Returns:
      String token.
    """
    return re.sub(u" ", "_", token["token"])

  text = " ".join([_clean_token(t) for t in nq_example["document_tokens"]])

  def _remove_html_byte_offsets(span):
    if "start_byte" in span:
      del span["start_byte"]

    if "end_byte" in span:
      del span["end_byte"]

    return span

  def _clean_annotation(annotation):
    annotation["long_answer"] = _remove_html_byte_offsets(
        annotation["long_answer"])
    annotation["short_answers"] = [
        _remove_html_byte_offsets(sa) for sa in annotation["short_answers"]
    ]
    return annotation

  simplified_nq_example = {
      "question_text": nq_example["question_text"],
      "example_id": nq_example["example_id"],
      "document_url": nq_example["document_url"],
      "document_text": text,
      "long_answer_candidates": [
          _remove_html_byte_offsets(c)
          for c in nq_example["long_answer_candidates"]
      ],
      "annotations": [_clean_annotation(a) for a in nq_example["annotations"]]
  }

  if len(get_nq_tokens(simplified_nq_example)) != len(
      nq_example["document_tokens"]):
    raise ValueError("Incorrect number of tokens.")

  return simplified_nq_example
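

# A minimal usage sketch (not part of the original module): stream canonical
# gzipped JSONL training data, simplify each record, and print the annotated
# long answers of the first example. The filename below is hypothetical;
# substitute the location of your own NQ download.
if __name__ == "__main__":
  import gzip
  import json

  with gzip.open("nq-train-00.jsonl.gz", "rt", encoding="utf-8") as f:
    for line in f:
      simplified = simplify_nq_example(json.loads(line))
      tokens = get_nq_tokens(simplified)
      for annotation in simplified["annotations"]:
        long_answer = annotation["long_answer"]
        # Annotations with no long answer use start_token == -1.
        if long_answer["start_token"] >= 0:
          print(" ".join(
              tokens[long_answer["start_token"]:long_answer["end_token"]]))
      break  # Only demonstrate on the first example.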