-
Notifications
You must be signed in to change notification settings - Fork 5
/
conllu-subtree-tag-distribution.py
58 lines (47 loc) · 1.1 KB
/
conllu-subtree-tag-distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import sys
# Dependency relation label whose subtrees are analysed; it is the first
# command-line argument. Exit with a usage message instead of an IndexError
# traceback when it is missing.
if len(sys.argv) < 2:
    sys.exit('usage: ' + sys.argv[0] + ' LABEL < input.conllu')
LABEL = sys.argv[1]
# Algorithm:
#   for each sentence
#     for each node
#       if the node's dependency label is LABEL,
#       recurse down its subtree, counting the POS tags of its descendants
# Read CoNLL-U sentences from stdin. Each sentence becomes a dict that maps a
# head index to the list of its dependents, stored as (idx, deprel, pos).
trees = []
current_tree = {}
# Iterate the stream directly; readlines() would load the whole input first.
for line in sys.stdin:
    if line.strip() == '':
        # A blank line closes the current sentence.
        trees.append(current_tree)
        current_tree = {}
        continue
    elif line[0] == '#':
        # Sentence-level comment / metadata line.
        continue
    # Drop the line terminator so the last column never carries a newline.
    row = line.rstrip('\r\n').split('\t')
    # Multiword-token ranges ("1-2") and empty nodes ("1.1") have non-integer
    # IDs and "_" as HEAD. They are not part of the basic dependency tree, so
    # skip them rather than crash in int().
    if not (row[0].isdecimal() and row[6].isdecimal()):
        continue
    idx = int(row[0])
    pos = row[3]
    head = int(row[6])
    deprel = row[7]
    current_tree.setdefault(head, []).append((idx, deprel, pos))
# Keep the final sentence when the input does not end with a blank line.
if current_tree:
    trees.append(current_tree)
def dfs(freq, node, tree):
    """Count the POS tags of every descendant of ``node`` into ``freq``.

    ``tree`` maps a head index to a list of ``(idx, deprel, pos)`` tuples.
    The tag of ``node`` itself is not counted, only the tags of the nodes
    below it. One ``S:`` trace line is printed per visited dependent.

    ``freq`` is updated in place and the same dict is returned.
    """
    for dependent in tree.get(node, ()):
        dep_idx = dependent[0]
        tag = dependent[2]
        freq[tag] = freq.get(tag, 0) + 1
        print('S:', node, dependent, tree[node], '|', freq[tag])
        # Only descend when the dependent has dependents of its own.
        if dep_idx in tree:
            dfs(freq, dep_idx, tree)
    return freq
# Accumulate POS-tag counts over every subtree whose root is attached to its
# head with the relation LABEL, across all sentences.
freq = {}
for tree in trees:
    # Blank line separates the trace output of consecutive sentences.
    print()
    for head in sorted(tree):
        for child in tree[head]:
            if child[1] != LABEL:
                continue
            print('T:',head, child)
            dfs(freq, child[0], tree)

# Final report: one "TAG COUNT" line per POS tag seen.
for tag, count in freq.items():
    print(tag, count)