-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwell_formedness.py
87 lines (79 loc) · 4.17 KB
/
well_formedness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""
Functions for assessing the well-formedness of a sentence.
"""
from __future__ import division
from auxiliaries.dset_proc_aux import *
def has_root(sent_match):
    """Check whether the sentence has a dependency root.

    When the sentence statistics hold no "roots" entry, the pair
    (True, "no dependency root") is passed to ``put_feature_value_list``
    under the "no_root" feature of ``sent_match.match``.
    """
    if "roots" in sent_match.stats:
        # A root was registered for this sentence: nothing to report.
        return
    put_feature_value_list(
        sent_match.match, "no_root", (True, "no dependency root"))
def check_sent_start(sent_match):
    """Check that the sentence begins with a capital letter.

    One leading quotation mark, dash or opening parenthesis is skipped, so
    the capitalisation test is applied to the token that follows it. A
    violation is passed to ``put_feature_value_list`` under the
    "sent_tokenization" feature with the message "no initial capital".

    Args:
        sent_match: object exposing ``stats["tokens"]`` (a sequence of
            tokens with a ``word`` attribute) and ``match`` (handed on to
            ``put_feature_value_list``).

    Returns:
        None. Nothing is reported for a sentence without tokens.
    """
    opening_marks = (u"”", '"', "'", "-", u"–", "(")
    tokens = sent_match.stats["tokens"]
    if not tokens:
        # Nothing to inspect; indexing an empty list used to raise IndexError.
        return

    # Index of the token expected to carry the initial capital.
    first = 1 if tokens[0].word in opening_marks else 0

    # A lone opening mark has no following word, which counts as "not
    # capitalised". Slicing (word[:1]) instead of indexing (word[0]) keeps an
    # empty word string from raising as well.
    starts_upper = first < len(tokens) and tokens[first].word[:1].isupper()
    if not starts_upper:
        put_feature_value_list(sent_match.match, "sent_tokenization", (True, "no initial capital"))
def check_sent_end(sent_match):
    """Check that the sentence ends with a major delimiter (".", "!" or "?").

    A closing quotation mark or parenthesis in final position is allowed as
    long as the token before it is a major delimiter. A final "." is also
    reported when the token before it carries "AN" among its dot-separated
    msd tags (an abbreviation, according to the original author's comment).
    Violations are passed to ``put_feature_value_list`` under the
    "sent_tokenization" feature with the message "ends with: '<word>'".

    Args:
        sent_match: object exposing ``stats["tokens"]`` (a sequence of
            tokens with ``word`` and ``msd`` attributes) and ``match``
            (handed on to ``put_feature_value_list``).

    Returns:
        None. Nothing is reported for a sentence without tokens.
    """
    closing_marks = (u"”", '"', "'", ")")
    delimiters = (".", "!", "?")

    tokens = sent_match.stats["tokens"]
    if not tokens:
        # Nothing to inspect; indexing an empty list used to raise IndexError.
        return

    last = tokens[-1]
    # Single-token sentences have no second-to-last token; tokens[-2] used
    # to raise IndexError for them.
    previous = tokens[-2] if len(tokens) > 1 else None

    if last.word in closing_marks:
        if previous is None:
            # Only a closing mark, hence no delimiter in front of it.
            offending = last.word
        elif previous.word not in delimiters:
            offending = previous.word
        else:
            return
    elif last.word not in delimiters:
        offending = last.word
    elif (last.word == "." and previous is not None
            and "AN" in previous.msd.split(".")):
        # Ends with a period, but the second-to-last word is an abbreviation.
        offending = previous.word
    else:
        return

    put_feature_value_list(sent_match.match, "sent_tokenization", (True, "ends with: '%s'" % offending))
def check_sent_tokenization(sent_match):
    """Check whether the sentence is correctly tokenized.

    Runs the sentence-start check first and the sentence-end check second,
    both on the same ``sent_match``.
    """
    for boundary_check in (check_sent_start, check_sent_end):
        boundary_check(sent_match)
def get_bad_lexica_percentage(sent_match, thresholds):
    """Check the share of non-alphabetical and non-lemmatized tokens.

    For each of the criteria "non_alpha" and "non_lemmatized", the number of
    tokens listed under that key in ``sent_match.stats`` is expressed as a
    percentage of the sentence's node count. When the percentage exceeds
    ``thresholds[criterion]``, the pair (percentage, message) is passed to
    ``put_feature_value_list`` under the criterion's name.

    Args:
        sent_match: object exposing ``stats`` (mapping with "tokens" and,
            optionally, the two criterion keys holding lists of strings),
            ``sent`` (with ``nodes`` directly or under ``sent.sentence``) and
            ``match`` (handed on to ``put_feature_value_list``).
        thresholds: mapping from criterion name to the maximum tolerated
            percentage; only consulted for criteria with at least one token.

    Returns:
        None.
    """
    stats = sent_match.stats
    for criterion in ("non_alpha", "non_lemmatized"):
        bad_tokens = stats.get(criterion, [])
        value = len(bad_tokens)
        if value == 0:
            continue

        # "(.)" was added manually as sentence-ending punctuation as a
        # sentence tokenization work-around in Korp annotation lab (present
        # only in the development data); its two parentheses are left out
        # of the token count.
        corr_value = 0
        if "".join([t.word for t in stats["tokens"]][-3:]) == "(.)":
            corr_value = 2

        try:
            nodes = sent_match.sent.nodes
        except AttributeError:
            # Some callers wrap the sentence one level deeper.
            nodes = sent_match.sent.sentence.nodes

        # The correction belongs to the denominator. The previous expression
        # ``value / len(nodes)-corr_value`` subtracted it from the ratio
        # instead, which produced a negative percentage.
        n_tokens = len(nodes) - corr_value
        if n_tokens <= 0:
            # No tokens left to relate to: a percentage is undefined here
            # (this used to raise ZeroDivisionError for an empty node list).
            continue

        # True division is expected here; under Python 2 it comes from the
        # module-level ``from __future__ import division``.
        percentage = value / n_tokens * 100
        if percentage > thresholds[criterion]:
            crit_str = criterion.replace("_", " ")
            # Number of tokens per category, followed by the tokens.
            message = "%d %s tokens: %s" % (value, crit_str, ", ".join(bad_tokens))
            put_feature_value_list(sent_match.match, criterion, (percentage, message))
def check_ellipsis(sent_match):
    """Check whether the sentence is elliptic, i.e. lacks a finite verb or
    a subject.

    A missing finite verb takes precedence: it is reported as
    (True, "no finite verb"); otherwise a missing subject is reported as
    (True, "no subject"). Both are passed to ``put_feature_value`` under the
    "elliptic" feature of ``sent_match.match``.

    Args:
        sent_match: object exposing ``stats`` (mapping with "finite",
            "has_subject" and, optionally, "imp_VB"), ``sent`` (indexable,
            its last item being a mapping with a "word" key) and ``match``.

    Returns:
        None.
    """
    stats = sent_match.stats

    # No subject is required when an imperative verb was found or when the
    # sentence ends with a question mark. The ``in`` test replaces
    # ``dict.has_key``, which does not exist in Python 3. The evaluation
    # order of the three conditions matches the original if/elif chain.
    has_subject = (
        "imp_VB" in stats
        or sent_match.sent[-1]["word"] == "?"
        or bool(stats["has_subject"])
    )

    if not stats["finite"]:
        put_feature_value(sent_match.match, "elliptic", (True, "no finite verb"))
    elif not has_subject:
        put_feature_value(sent_match.match, "elliptic", (True, "no subject"))