-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbaseline.py
45 lines (36 loc) · 1.42 KB
/
baseline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# Baseline
import os
from collections import Counter
from nltk.tag import UnigramTagger
from tag_set_mappings import hausa_mapping
from utils import read_test_sentences, read_train_sentences, write_out
DIR = "projected"
TEST = "test_data/revised_test_data.tsv"
TRAIN = os.listdir(DIR)
OUT = "test_data/predictions"
def predict(sentence, model, unknown="<unk>"):
return [tag if tag is not None else unknown for _, tag in model.tag(tokens)]
def get_most_frequent_tag(sentences):
count_dict = Counter()
for sent in sentences:
for token, tag in sent:
count_dict.update(tag)
return count_dict.most_common(1)[0][0]
test_path = os.path.join(TEST)
test_data = read_test_sentences(test_path)
for train in TRAIN:
print(train)
train_path = os.path.join(DIR, train)
data = read_train_sentences(train_path)
most_frequent = hausa_mapping.mapping[get_most_frequent_tag(data)]
tagger = UnigramTagger(data)
predict_sentences = []
for sent in test_data:
tokens = [token for token, _, _ in sent]
predicted_tags = predict(tokens,tagger, most_frequent)
pred_sent = [(token, hau_tag, ud_tag, pred) for (token, hau_tag, ud_tag), pred in zip(sent, predicted_tags)]
predict_sentences.append(pred_sent)
result_file = "{}_baseline.tsv".format(train.split(".")[0])
out_path = os.path.join(OUT, result_file)
write_out(predict_sentences, out_path)
print(result_file)