-
Notifications
You must be signed in to change notification settings - Fork 0
/
text.py
50 lines (44 loc) · 1.35 KB
/
text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import string
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read. The file is opened in text
            mode ('r') with the platform's default encoding.

    Returns:
        The complete contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises, which
    # a manual open()/close() pairing does not guarantee.
    with open(filename, 'r') as file:
        return file.read()
def load_descriptions(doc):
    """Group descriptions by image identifier.

    Each usable line of ``doc`` is split on whitespace. The first token is
    the image name and the remaining tokens, re-joined with single spaces,
    form one description. The identifier is the part of the image name that
    precedes its first ``.``.

    Args:
        doc: Text holding one "<image name> <description words...>" entry
            per line, with lines separated by '\\n'.

    Returns:
        A dict mapping each image identifier to the list of its
        descriptions, in the order they appear in ``doc``. A line that
        contains only an image name contributes an empty description.
    """
    mapping = {}
    for line in doc.split('\n'):
        tokens = line.split()
        # Lines shorter than two characters cannot hold an entry. Lines made
        # only of whitespace produce no tokens, and indexing tokens[0] on
        # them would raise IndexError, so they are skipped as well.
        if len(line) < 2 or not tokens:
            continue
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        mapping.setdefault(image_id, []).append(image_desc)
    return mapping
def clean_descriptions(descriptions):
    """Normalize every description string in place.

    Each description is split on whitespace; every word is lower-cased and
    stripped of the characters in ``string.punctuation``. Words that are
    then one character or shorter, or that are not purely alphabetic, are
    dropped. The surviving words are re-joined with single spaces.

    Args:
        descriptions: Dict mapping a key to a list of description strings.
            The lists are modified in place.

    Returns:
        None. The result is visible through ``descriptions`` itself.
    """
    table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            # Punctuation is removed before filtering so that tokens such as
            # "a." or "-" are judged by what remains of them.
            words = (word.lower().translate(table) for word in desc.split())
            # Replacing an element by index does not change the list length,
            # so it is safe while enumerating the same list.
            desc_list[i] = ' '.join(
                word for word in words if len(word) > 1 and word.isalpha()
            )
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all descriptions.

    Args:
        descriptions: Dict mapping a key to a list of description strings.

    Returns:
        A set containing every whitespace-separated word that occurs in any
        description. The set is empty when there are no descriptions.
    """
    all_desc = set()
    # Plain loops are used because update() is called only for its side
    # effect; a comprehension would build a throwaway list of None values.
    for desc_list in descriptions.values():
        for desc in desc_list:
            all_desc.update(desc.split())
    return all_desc
def save_descriptions(descriptions, filename):
    """Write all descriptions to a text file, one "<key> <description>" per line.

    Lines are separated by '\\n' with no trailing newline after the last
    one. Any existing file at ``filename`` is overwritten.

    Args:
        descriptions: Dict mapping a string key to a list of description
            strings.
        filename: Path of the file to write. The file is opened in text
            mode ('w') with the platform's default encoding.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = [
        key + ' ' + desc
        for key, desc_list in descriptions.items()
        for desc in desc_list
    ]
    # The context manager flushes and closes the handle even when write()
    # raises, which a manual open()/close() pairing does not guarantee.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))