# sent_statistics.py
# -*- coding: utf-8 -*-
import math as m
from auxiliaries.dset_proc_aux import *
from kelly import get_kelly_info, get_svalex_info, get_svalex2_info, cefr_scale
from word_pic import get_mutual_info
from auxiliaries.dset_prep_aux import clean_value
from auxiliaries.match_aux import add_keyword_info
class SentStatistics(object):
"""
Collects statistical information about a Sentence instance
(see 'ling_unit.py'). Links to tag sets referred to in:
- morpho-syntactic: http://spraakbanken.gu.se/korp/markup/msdtags.html
- dependency relations: http://stp.lingfil.uu.se/~nivre/swedish_treebank/dep.html
Args:
sent (instance): a Sentence instance
parameters (dict): sentence search parameters passed as a dictionary
Attributes:
sent: see above
stats: the object that will be filled with the statistical information
modal_verb_list: a list of verbs usable as modal (auxiliary) verbs
params: see 'parameters' above
Note:
The word lists (Kelly, SVALex, SweLLex) are passed to the individual
'get_*_stats' methods rather than to the constructor. The instance
acts as a container for the statistical information of one sentence.
"""
def __init__(self, sent, parameters={}):
self.sent = sent
self.stats = {"voc_cefr": {"A1":0, "A2":0, "B1":0, "B2":0,
"C1":0, "C2":0, "?":0, "-":0},
"voc_cefr_svalex2": {"A1":0, "A2":0, "B1":0, "B2":0,
"C1":0, "?":0, "-":0}}
self.modal_verb_list = [u"kunna", u"måste", u"skola", u"vilja", u"böra", u"få"]
self.params = parameters
self.sent.level = "B1" # for a fixed level
def get_len_stats(self,t):
"""
Collects length-based statistics.
Arg:
t: a Token instance
"""
#Long words
if len(t.word) > 6:
put_feature_value(self.stats, "long_w", 1)
#Extra-long words
if len(t.word) > 13:
put_feature_value(self.stats, "xlong_w", 1)
#Total length of characters per token
put_feature_value_list(self.stats, "tok_len", len(t.word))
return self.stats
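# Illustrative example (assumed token, not from the source): for the
# word u"informationsteknologi" (21 characters), both "long_w" (> 6
# characters) and "xlong_w" (> 13 characters) are incremented, and 21
# is appended to the "tok_len" list.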
def get_kelly_stats(self,t,kelly_list):
"""
Collects information from the Kelly word list including:
the CEFR level for all categories and frequency only for
lexical categories (nouns, verbs, adjectives, adverbs).
The information is added to the 'stats' attribute.
Args:
t: a Token instance
kelly_list: the loaded Kelly word list (see kelly.py)
"""
if self.params: #for sent_match
v = get_kelly_info(kelly_list, t, self.params.get("target_cefr",""))
else:
v = get_kelly_info(kelly_list, t, self.sent.level)
#except AttributeError:
# v = get_kelly_info(self.kelly_list, t, self.text_level)
cefr = v[1]
self.stats["voc_cefr"][cefr] += 1.0
if t.pos in ["NN", "VB"] and (v[0] == "above"):
put_feature_value(self.stats, "diff_NNVB", 1.0)
if t.pos in ["NN", "JJ","VB", "AB"]:
freq_kelly = v[2]
if freq_kelly:
log_freq_kelly = m.log(freq_kelly) #following Coh-metrix
else:
log_freq_kelly = 0.0
put_feature_value_list(self.stats, "voc_freq_kelly", log_freq_kelly)
#token of a suitable CEFR level according to Kelly?
if v[0] == "above": #or v == "not in kelly" or v == "no lemma"?
put_feature_value_list(self.stats,"diff_voc", t.word)
return self.stats
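# Worked example of the Coh-Metrix-style log transform above (the
# frequency value is an assumption): a Kelly frequency of roughly 403
# occurrences per million yields m.log(403) ~= 6.0 in "voc_freq_kelly",
# while a zero or missing frequency is recorded as 0.0.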
def get_svalex_stats(self,t, svalex_list):
"""
Collects frequency information from the SVALex or SweLLex lists.
Args:
t: a Token instance
svalex_list: a pickled version of the SVALex or SweLLex list
"""
if t.pos in ["NN", "VB", "JJ", "AB"]:
svalex_info = get_svalex_info(svalex_list, t, self.params.get("target_cefr","any"))
svalex_fr,out_of_svalex = svalex_info[0], svalex_info[1]
put_feature_value_list(self.stats,"svalex_fr", svalex_fr)
if out_of_svalex:
put_feature_value_list(self.stats,"out_of_svalex", t.word)
return self.stats
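# Hypothetical outcome as an example: if a lexical token (noun, verb,
# adjective or adverb) is absent from the list, get_svalex_info flags
# it as out_of_svalex and its word form is appended to the
# "out_of_svalex" feature list; the frequency value itself always
# goes into "svalex_fr".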
def get_svalex2_stats(self, t, svalex2_list):
"""
Collects information from the version of the SVALex or SweLLex lists where
frequency distributions were mapped to CEFR levels.
Args:
t: a Token instance
svalex2_list: a pickled version of the SVALex or SweLLex list mapped to CEFR levels
"""
if self.params:
diff_info, level = get_svalex2_info(t, svalex2_list, self.params.get("target_cefr","B1"))
else:
diff_info, level = get_svalex2_info(t, svalex2_list, self.sent.level)
self.stats["voc_cefr_svalex2"][level] += 1.0
if t.pos in ["NN", "VB"] and (diff_info == "above"):
put_feature_value(self.stats, "diff_NNVB_svalex2", 1.0)
if diff_info == "above":
put_feature_value_list(self.stats,"diff_voc_svalex2", t.word)
return self.stats
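# Illustrative case (assumed levels, cf. cefr_scale in kelly.py): with
# a target CEFR of B1, a noun whose list level is C1 counts as "above"
# the target, so "diff_NNVB_svalex2" is incremented, the word form is
# appended to "diff_voc_svalex2", and the C1 counter in
# "voc_cefr_svalex2" goes up by one.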
def get_morpho_synt_stats(self,s,t,i):
"""
Gathers information based on part-of-speech and morpho-syntactic tags.
Args:
s: an instance of the Sentence class
t: an instance of the Token class
i: current index of token in the sentence
"""
put_feature_value_list(self.stats,"pos_unigr", t.pos)
#Verbs
if t.pos == "VB":
if not self.stats["finite"]:
if ("INF" not in t.msd) and ("SUP" not in t.msd) and ("PRF" not in t.msd):
# a modal verb without a VG dependent does not count as finite on its own
if t.lemma:
if t.lemma[0] in self.modal_verb_list: # få, ska etc. sometimes have non-modal uses
try:
ch_deprel = [tt.deprel for tt in self.stats["heads"][t.ref]]
if "VG" in ch_deprel:
put_feature_value_list(self.stats, "finite", 1.0)
except KeyError:
pass
else:
put_feature_value_list(self.stats, "finite", 1.0)
else:
put_feature_value_list(self.stats, "finite", 1.0)
if t.deprel not in ["VG", "SP"]: #SP e.g. är öppen
put_feature_value(self.stats, "main_verb", 1.0)
if t.lemma:
# check whether the next word is a verb group (VG) dependent (these verbs also have non-modal uses)
if t.lemma[0] in self.modal_verb_list:
try:
if s.nodes[i+1].deprel == "VG":
put_feature_value_list(self.stats, "modal_verb", t.word)
except IndexError:
for w in s.nodes[i:]:
if (w.pos == "VB" and w.deprel == "VG"
and w.depheadid == t.ref):
put_feature_value_list(self.stats, "modal_verb", t.word)
if "SFO" in t.msd:
if t.lemma[0][-1] == "s": # e.g. finns
put_feature_value_list(self.stats, "sverb", t.word)
else:
put_feature_value(self.stats, "passive", 1.0)
if t.msd[:6] == "PC.PRF":
put_feature_value(self.stats, "perf_pc", 1.0)
if t.msd[:6] == "PC.PRS":
put_feature_value(self.stats, "pres_pc", 1.0)
if "PRT" in t.msd:
put_feature_value(self.stats, "past_VB", 1.0)
elif "PRS" in t.msd:
put_feature_value(self.stats, "pres_VB", 1.0)
elif "SUP" in t.msd:
put_feature_value(self.stats, "sup_VB", 1.0)
if "IMP" in t.msd:
put_feature_value(self.stats, "imp_VB", 1.0)
if "KON" in t.msd:
put_feature_value(self.stats, "konj_VB", 1.0)
if t.word in ["han", "hon", "det", "den"]:
put_feature_value(self.stats, "PN_3SG", 1.0)
if t.pos == "NN" and ("NEU" in t.msd):
put_feature_value(self.stats, "neu_NN", 1.0)
# Relative structures (pronouns etc.)
if t.pos in ["HA", "HD", "HP", "HS"]:
if s.nodes[-1].word != "?": # to exclude interrogative use of those
# (but indirect questions not handled...)
put_feature_value(self.stats, "rel_str", 1.0)
return self.stats
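# Illustrative sentence (hypothetical parse): in "Han vill resa." the
# modal "vill" (lemma "vilja") only counts as finite because its
# dependent "resa" carries the VG relation; "vill" is also recorded
# under "modal_verb" and, having a non-VG deprel itself, increments
# "main_verb", while "resa" (deprel VG) does not.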
def deprel_stats(self,t,root_ref, verb_args):
"""
Collects syntactic information based on the length and the direction
of dependencies.
Args:
t: a Token instance
root_ref: position (index) of the ROOT element in the sentence
verb_args: a dictionary mapping each verb's position ('ref') to its arguments
"""
put_feature_value_list(self.stats,"deprel_unigr", t.deprel)
if t.pos == "VB":
verb_args[t.ref] = []
if t.depheadid is None or t.depheadid == '':
put_feature_value_list(self.stats, "dep_len", 0.0)
else:
dep_len = int(t.ref)-int(t.depheadid)
put_feature_value_list(self.stats, "dep_len", abs(dep_len))
if dep_len < 0:
put_feature_value(self.stats, "left_arc", 1.0)
if dep_len > 0:
put_feature_value(self.stats, "right_arc", 1.0)
#Collecting verbal arguments (all - restrict to pronouns and nouns?)
if t.depheadid in verb_args:
verb_args[t.depheadid].append(t)
self.stats["verb_args"] = verb_args
if root_ref:
if t.depheadid == root_ref:
root_dep_len = int(root_ref)-int(t.ref)
put_feature_value_list(self.stats,
"root_dep_len", abs(root_dep_len))
return self.stats
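# Worked example (hypothetical indices): a dependent at ref "2" with
# its head at ref "5" gives dep_len = 2 - 5 = -3, i.e. a left arc of
# absolute length 3 appended to "dep_len"; a positive difference would
# count as a right arc instead.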
def get_semantic_stats(self,t):
"""
Gets semantic information, based only on number of senses
for now, since no word-sense disambiguation is used.
Arg:
t: a Token instance
"""
# nr of senses per word and noun
put_feature_value_list(self.stats, "senses/w", len(t.saldo))
if t.pos == "NN":
put_feature_value_list(self.stats, "nn_senses/nn", len(t.saldo))
return self.stats
def get_sentmatch_stats(self, t, all_tokens, word_pictures):
"""
Collects sentence-match specific information: keyword counts,
non-alphabetic and non-lemmatized tokens, subjects, negations,
abbreviations, proper names, root tokens and mutual information.
Args:
t: a Token instance
all_tokens: all Token instances of the sentence
word_pictures: word picture (mutual information) data
"""
if t.word == self.params["query_w"]: # note: only suitable for wordform search
put_feature_value(self.stats, "keyword_count", 1.0)
if not t.word.isalpha():
put_feature_value_list(self.stats, "non_alpha", t.word)
elif t.word.isalpha() and not t.lemma:
put_feature_value_list(self.stats, "non_lemmatized", t.word)
if t.deprel in ["ES", "FS", "SS", "FP", "SP", "VS"]: # logical, dummy, other subjects + compl.s
self.stats["has_subject"] = True
if t.deprel == "NA": # negation adverbials
put_feature_value_list(self.stats, "neg_form", t.word)
if "AN" in t.msd:
put_feature_value_list(self.stats,"abbrev",t.word)
if "PM" in t.msd:
put_feature_value_list(self.stats,"proper_name",t.word)
if t.deprel == "ROOT":
put_feature_value_list(self.stats, "roots", t)
#add_keyword_info(t, self.stats, self.params)
mi,used_rel_lemma = get_mutual_info(t, all_tokens, self.stats, word_pictures)
put_feature_value_list(self.stats,"used_rel_lemmas",used_rel_lemma)
put_feature_value_list(self.stats,"MI",mi)
return self.stats
def get_stats_SWE(self, kelly_list, svalex_list, svalex2_list, word_pictures={}, use_deprel=True,use_ngrams=False):
"""
Gathers statistical information from different linguistic levels
for a sentence based on information specific to the Korp pipeline
tags for Swedish (as of June 2015) and the Swedish Kelly wordlist.
Args:
kelly_list: the Kelly word list
svalex_list: the SVALex/SweLLex list (normalized frequencies only)
svalex2_list: normalized frequencies mapped to CEFR levels
word_pictures: word picture (mutual information) data for sentence matching
use_deprel: whether to use dependency relation tags
use_ngrams: whether to collect ngrams (uni- and bigrams),
see 'get_ngrams()' in 'feat_aux.py'
"""
s = self.sent
root_ref = check_root(s)
tokens = []
verb_args = {} # maps each verb's 'ref' to a list of its argument Tokens
self.stats["finite"] = []
self.stats["heads"] = {}
self.stats["keyword"] = {}
self.stats["has_subject"] = 0
self.stats["used_rel_lemmas"] = []
# Collect the position ('ref') of the dependency head of each token
# +1 compared to regular indexes, string type
for tkn in s.nodes:
if tkn.deprel == "ROOT":
put_feature_value_list(self.stats["heads"], tkn.ref, tkn) #"ROOT"
else:
put_feature_value_list(self.stats["heads"], tkn.depheadid, tkn)
for i,t in enumerate(s.nodes):
mapped_token = map_Token_to_dict(t) #just a fix, see dset_proc_aux.py
tokens.append(mapped_token)
#get statistics from different linguistic levels
self.stats = self.get_len_stats(t)
self.stats = self.get_kelly_stats(t, kelly_list)
if svalex_list:
self.stats = self.get_svalex_stats(t, svalex_list)
if svalex2_list:
self.stats = self.get_svalex2_stats(t, svalex2_list)
self.stats = self.get_semantic_stats(t)
self.stats = self.get_morpho_synt_stats(s,t,i)
if self.params:
self.stats = self.get_sentmatch_stats(t,s.nodes,word_pictures)
#add lemma unigrams
lm_ngram = get_lemma_ngrams(s, t, i, "uni")
if lm_ngram:
put_feature_value_list(self.stats,"lemma_unigr", lm_ngram)
if use_ngrams: #bi- and trigrams
self.stats = get_ngrams(self.stats,s,t,i)
if use_deprel:
self.stats = self.deprel_stats(t,root_ref, verb_args)
#fix for JSON serialization issue with the Token class
self.stats["tokens"] = s.nodes # retain a copy of Token instances
s.nodes = tokens # change Token instances to 'dict'-s
return self.stats
def print_statistics(self):
print "Statisctics for:'" + self.sent.words + "'"
for k,v in self.stats.iteritems():
print k + ":\t", v
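# Minimal usage sketch (hypothetical setup: the Sentence instance and
# the loaded word lists are assumptions, see ling_unit.py and kelly.py):
#
#   ss = SentStatistics(sent, parameters={"target_cefr": "B1"})
#   stats = ss.get_stats_SWE(kelly_list, svalex_list, svalex2_list)
#   ss.print_statistics()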