-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfind_outliers.py
66 lines (53 loc) · 2.05 KB
/
find_outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
Copyright (c) 2019 Emil Lynegaard
Distributed under the MIT software license, see the
accompanying LICENSE.md or https://opensource.org/licenses/MIT
Small script for finding outliers in a dataset.
Currently supports bin files from See et al. 2017,
and .tsv files of form <article>\t<summary>.
Tweak main function to find outliers according to the desired
definition of outlier.
"""
import sys
import fileinput
import struct
# pylint: disable=no-name-in-module
from tensorflow.core.example import example_pb2
def tsv_generator(file):
    """
    Yield (article, summary) pairs from a .tsv file created by preprocess.py.

    Every line is expected to have the form <article>\\t<summary>. Whitespace
    around the line is stripped. A line whose article or summary is empty
    (the tab sits at the very start or end of the line) is yielded with an
    empty string for that field.

    Args:
        file: path of the .tsv file (anything `fileinput.FileInput` accepts).

    Yields:
        (article, summary) tuples of str.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    # A private FileInput instance (rather than the module-global one behind
    # fileinput.input) lets several generators be active at the same time and
    # closes the file as soon as this generator finishes or is closed.
    with fileinput.FileInput(file) as reader:
        for line in reader:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                # strip() also removes a tab at either edge of the line, which
                # hides an empty article/summary; retry keeping those tabs.
                fields = line.rstrip("\r\n").split("\t")
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected <article>\\t<summary>, got %d field(s)"
                    % (reader.filename(), reader.lineno(), len(fields))
                )
            article, summary = fields
            yield (article.lstrip(), summary.rstrip())
def bin_generator(file):
    """
    Yield (article, summary) pairs from a .bin file.

    For using the files provided by See et al. 2017 found at:
    https://github.com/abisee/cnn-dailymail

    The file is read as a sequence of records, each consisting of a native
    "q" (8-byte) length header followed by that many bytes holding a
    serialized `example_pb2.Example`. The first value of the "article" and
    "abstract" features is decoded, and the "<s>" / "</s>" tags are removed
    from the summary.

    Args:
        file: path of the .bin file.

    Yields:
        (article, summary) tuples of str.

    Raises:
        struct.error: if the file ends in the middle of a record or a length
            header holds a negative value.
    """
    length_header = struct.Struct("q")
    with open(file, "rb") as reader:
        while True:
            len_bytes = reader.read(length_header.size)
            if not len_bytes:
                # Clean end of file: no further record follows.
                break
            if len(len_bytes) != length_header.size:
                raise struct.error("truncated length header in %s" % file)
            str_len = length_header.unpack(len_bytes)[0]
            if str_len < 0:
                # read() with a negative size would silently consume the
                # rest of the file, so reject it explicitly.
                raise struct.error(
                    "negative record length %d in %s" % (str_len, file)
                )
            example_str = reader.read(str_len)
            if len(example_str) != str_len:
                raise struct.error(
                    "truncated record in %s: expected %d bytes, got %d"
                    % (file, str_len, len(example_str))
                )
            example = example_pb2.Example.FromString(example_str)
            features = example.features.feature
            article = features["article"].bytes_list.value[0].decode()
            summary = features["abstract"].bytes_list.value[0].decode()
            summary = summary.replace("<s>", "").replace("</s>", "")
            yield (article, summary)
def main():
    """
    Collect the (article, summary) pairs whose article is shorter than its summary.

    The dataset path is taken from the first command-line argument and must
    end in ".bin" or ".tsv". Lengths are measured in whitespace-separated
    words. A pair is an outlier when article words / summary words < 1.0;
    a pair with an empty summary is therefore never an outlier.

    Returns:
        list of (article, summary) tuples, sorted by ascending
        article/summary word-count ratio.

    Raises:
        SystemExit: if the argument is missing or has an unsupported
            extension.
    """
    usage = "usage: %s <dataset.bin | dataset.tsv>" % sys.argv[0]
    if len(sys.argv) < 2:
        raise SystemExit(usage)
    file = sys.argv[1]
    # Explicit check instead of assert, which is stripped under `python -O`.
    if not file.endswith((".bin", ".tsv")):
        raise SystemExit(usage)
    generator = bin_generator(file) if file.endswith(".bin") else tsv_generator(file)

    scored = []
    for article, summary in generator:
        article_length = len(article.split())
        summary_length = len(summary.split())
        # Same test as article_length / summary_length < 1.0, but it cannot
        # divide by zero when the summary is empty.
        if article_length < summary_length:
            # summary_length >= 1 here, so the division is safe. Keeping the
            # ratio avoids re-splitting both texts while sorting.
            scored.append((article_length / summary_length, article, summary))

    # Sort on the ratio only so that ties keep their order from the input.
    scored.sort(key=lambda entry: entry[0])
    return [(article, summary) for _, article, summary in scored]


if __name__ == "__main__":
    main()