
Commit

force_disambiguate. fix #46 (#47)
reynoldsnlp authored Jul 22, 2020
1 parent 368de12 commit 9f272e7
Showing 17 changed files with 138 additions and 139 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -172,6 +172,7 @@ print(phonetic_doc1)
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| transliterate | `str` | The original text converted to Romanized Cyrillic (default=Scholarly) |
| force\_disambiguate | `None` | Fully disambiguate readings using methods **other than** the Constraint Grammar |
| cg3\_str | `str` | Analysis stream in the [VISL-CG3 format](https://visl.sdu.dk/cg3/single/#stream-vislcg) |
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |

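A minimal usage sketch for the new README row. It assumes `force_disambiguate` is called on a `Token` (as in the `udar/sentence.py` change below) and that calling it with no arguments uses a default `method`; that default is an assumption.

```python
import udar

# Sketch only: resolve whatever ambiguity the Constraint Grammar leaves behind.
doc = udar.Document('Мы удивились простоте системы.', disambiguate=True)
for tok in doc:
    tok.force_disambiguate()  # returns None; a `method` kwarg is also accepted (see sentence.py)
```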
4 changes: 4 additions & 0 deletions mypy.ini
@@ -0,0 +1,4 @@
[mypy]
warn_unused_configs = True
warn_unused_ignores = True
warn_unreachable = True
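
For illustration only (not part of the commit), this is the kind of code the two warnings above catch; the messages are paraphrased from mypy's output.

```python
def greet(name: str) -> str:
    return 'hello ' + name


x = greet('world')  # type: ignore  # warn_unused_ignores: unused "type: ignore" comment


def pick(flag: bool) -> int:
    if flag:
        return 1
    else:
        return 0
    return -1  # warn_unreachable: statement is unreachable
```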
4 changes: 2 additions & 2 deletions test/test_README.py
@@ -69,8 +69,8 @@ def test_all_properties_are_documented_in_tables():
'features', 'parse_cg3', 'parse_hfst',
'respace', 'stress_eval', 'stress_preds2tsv',
'tokenize'},
'Token': {'annotation', 'end_char', 'features', 'guess',
'guess_freq', 'guess_syllable',
'Token': {'annotation', 'end_char', 'features',
'guess_syllable',
'has_tag_in_most_likely_reading', 'head',
'is_L2_error', 'might_be_L2_error',
'phon_predictions', 'phonetic_transcriptions',
1 change: 0 additions & 1 deletion udar/__main__.py
@@ -9,7 +9,6 @@ def parse_input(input_str: str, args: argparse.Namespace) -> Document:
if args.input_type == 'c':
return Document.from_cg3(input_str)
elif args.input_type == 'f':
raise NotImplementedError # TODO
return Document.from_hfst(input_str)
elif args.input_type == 'p':
return Document(input_str, disambiguate=args.disambiguate)
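With the `NotImplementedError` removed, all three input types map onto the `Document` API. A hedged sketch; the top-level import path and the placeholder stream strings are assumptions.

```python
from udar import Document  # top-level import path assumed

cg3_stream = '...'   # placeholder: a VISL-CG3 analysis stream
hfst_stream = '...'  # placeholder: an XFST/HFST analysis stream

doc_c = Document.from_cg3(cg3_stream)    # args.input_type == 'c'
doc_f = Document.from_hfst(hfst_stream)  # args.input_type == 'f' (now implemented)
doc_p = Document('Мы удивились простоте системы.',
                 disambiguate=True)      # args.input_type == 'p'
```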
2 changes: 1 addition & 1 deletion udar/conversion/external2udar.py
@@ -63,7 +63,7 @@ def readable_sent(sentence):
desc='Reading corpus...', total=sent_count):
sent_id = sent['id']
doc = Document(sent.source.get_text(), annotation=sent_id)
u_toks = [t.orig for t in doc]
u_toks = [t.text for t in doc]
oc_toks = [t['text'] for t in sent.find_all('token')]

if u_toks != oc_toks:
20 changes: 13 additions & 7 deletions udar/document.py
@@ -4,6 +4,7 @@
from sys import stderr
from typing import Dict
from typing import Iterable
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple
@@ -16,6 +17,7 @@
from .fsts import get_analyzer
from .misc import get_stanza_sent_tokenizer
from .sentence import Sentence
from .tok import Token


__all__ = ['Document']
@@ -35,14 +37,14 @@
# return nltk_sent_tokenizer


def _str2Sentences(input_str, **kwargs):
def _str2Sentences(input_str, **kwargs) -> List[Sentence]:
stanza_sent = get_stanza_sent_tokenizer()
# TODO should the following 2 lines be solved in tokenizer's pmscript?
input_str = input_str.replace('#', ' ') # The `#` char is ignored by udar
input_str = re.sub(r'([^аэоуыяеёюи])[\u0300\u0301]', r'\1', input_str,
flags=re.I)
stanza_doc = stanza_sent(input_str)
return [Sentence(sent.text, **kwargs)
return [Sentence(sent.text, id=i, **kwargs)
for i, sent in enumerate(stanza_doc.sentences)]


@@ -83,7 +85,7 @@ def __init__(self, input_text: Union[str, Iterable[Sentence], 'Document'],
or hasattr(input_text, '__iter__'))
and isinstance(next(iter(input_text)), Sentence)):
self.text = ' '.join(sent.text for sent in input_text)
self.sentences = list(input_text)
self.sentences = list(input_text) # type: ignore
for sent in self.sentences:
sent.doc = self
elif isinstance(input_text, Document):
@@ -109,12 +111,16 @@ def __eq__(self, other):
and all(s == o
for s, o in zip(self.sentences, other.sentences)))

def __getitem__(self, i: Union[int, slice]):
warn('Indexing on a Document object is slow.', stacklevel=2)
def __getitem__(self, i: Union[int, slice]) -> Union[Token, List[Token]]:
"""Get *Token(s)* by index/slice."""
warn('Indexing on large Document objects can be slow. '
'It is more efficient to index Tokens within Document.sentences',
stacklevel=3)
# TODO optimize?
return list(self)[i]

def __iter__(self):
def __iter__(self) -> Iterator[Token]:
"""Return iterator over *Tokens*."""
return iter(chain(*self.sentences))

def __repr__(self):
@@ -161,7 +167,7 @@ def from_cg3(cls, input_stream: str, **kwargs):
r'# ANNOTATION: ([^\n]*)\n'
r'# TEXT: ([^\n]*)\n'
r'(.+?)', input_stream, flags=re.S)
if split_by_sentence is not None:
if split_by_sentence:
sentences = [Sentence.from_cg3(stream, id=id,
annotation=annotation,
orig_text=text, **kwargs)
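A sketch of the new indexing guidance, using only names shown in this file's diff (constructor, `__getitem__`, `sentences`, `Token.text`): `doc[i]` still works but rebuilds the full token list on each call, so per-sentence access is preferred.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы. Она работает хорошо.')

first_token = doc[0]         # works, but warns that it can be slow on large Documents
for sent in doc.sentences:   # preferred: iterate Tokens within each Sentence
    for tok in sent:
        print(tok.text)
```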
14 changes: 7 additions & 7 deletions udar/features/feature_extractor.py
@@ -82,16 +82,16 @@ def __call__(self, docs: Union[List[Document], Document], feat_names=None,
if ((hasattr(docs, '__iter__') or hasattr(docs, '__getitem__'))
and isinstance(next(iter(docs)), Document)):
for doc in docs:
doc.features = self._call_features(doc,
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
doc.features = self._call_features(doc, # type: ignore
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
output.append(doc.features)
elif isinstance(docs, Document):
docs.features = self._call_features(docs,
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
output.append(docs.features)
else:
raise TypeError('Expected Document or list of Documents; got '
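A hedged usage sketch of the `__call__` signature above. The class name `FeatureExtractor` and both import paths are assumptions, since only the method body appears in this diff.

```python
from udar import Document                   # top-level import path assumed
from udar.features import FeatureExtractor  # class name / import path assumed

extractor = FeatureExtractor()
doc1 = Document('Мы удивились простоте системы.')
doc2 = Document('Она работает хорошо.')

rows = extractor([doc1, doc2])  # each Document also gets its .features attribute set
row = extractor(doc1)           # a single Document is accepted as well
```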
8 changes: 4 additions & 4 deletions udar/features/lexical_complexity.py
@@ -26,7 +26,7 @@ def prcnt_words_over_n_sylls(n, doc: Document, lower=False, rmv_punc=True,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_words_over_{n}_sylls'
this_partial = partial(prcnt_words_over_n_sylls, n) # type: ignore
this_partial = partial(prcnt_words_over_n_sylls, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_words_over_n_sylls.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -50,7 +50,7 @@ def prcnt_content_words_over_n_sylls(n, doc: Document, lower=False,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_content_words_over_{n}_sylls'
this_partial = partial(prcnt_content_words_over_n_sylls, n) # type: ignore
this_partial = partial(prcnt_content_words_over_n_sylls, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_content_words_over_n_sylls.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -73,7 +73,7 @@ def prcnt_words_over_n_chars(n, doc: Document, lower=False, rmv_punc=True,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_words_over_{n}_chars'
this_partial = partial(prcnt_words_over_n_chars, n) # type: ignore
this_partial = partial(prcnt_words_over_n_chars, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_words_over_n_chars.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -97,7 +97,7 @@ def prcnt_content_words_over_n_chars(n, doc: Document, lower=False,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_content_words_over_{n}_chars'
this_partial = partial(prcnt_content_words_over_n_chars, n) # type: ignore
this_partial = partial(prcnt_content_words_over_n_chars, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_content_words_over_n_chars.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
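The loops above (and the ones in the other feature modules below) share one registration pattern: bind the leading `n` with `functools.partial`, assign the partial a `__name__` by hand (partial objects have none, hence the remaining `# type: ignore`), and store it in the `ALL` registry. A standalone sketch with illustrative names:

```python
from functools import partial

REGISTRY = {}  # stands in for the ALL dict used by the feature modules


def words_longer_than_n(n: int, words: list) -> int:
    """Count words longer than n characters."""
    return sum(1 for word in words if len(word) > n)


for n in range(1, 4):
    name = f'words_longer_than_{n}'
    bound = partial(words_longer_than_n, n)
    bound.__name__ = name  # type: ignore  # partial objects have no __name__ attribute
    REGISTRY[name] = bound

print(REGISTRY['words_longer_than_2'](['a', 'abc', 'abcd']))  # -> 2
```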
8 changes: 4 additions & 4 deletions udar/features/lexical_familiarity.py
@@ -27,10 +27,10 @@ def num_words_at_lexmin_level(level, doc: Document) -> int:
lexmin_dict = _get_lexmin_dict()
return len([1 for tok in doc
if any(lexmin_dict.get(lem) == level
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))])
for level in ['A1', 'A2', 'B1', 'B2']: # noqa: E305
name = f'num_words_at_lexmin_{level}'
this_partial = partial(num_words_at_lexmin_level, level) # type: ignore
this_partial = partial(num_words_at_lexmin_level, level)
this_partial.__name__ = name # type: ignore
doc = num_words_at_lexmin_level.__doc__.replace('LEVEL', level) # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -70,10 +70,10 @@ def num_words_at_kelly_level(level, doc: Document) -> int:
kelly_dict = _get_kelly_dict()
return len([1 for tok in doc
if any(kelly_dict.get(lem) == level
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))])
for level in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']: # noqa: E305
name = f'num_words_at_kelly_{level}'
this_partial = partial(num_words_at_kelly_level, level) # type: ignore
this_partial = partial(num_words_at_kelly_level, level)
this_partial.__name__ = name # type: ignore
doc = num_words_at_kelly_level.__doc__.replace('LEVEL', level) # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
2 changes: 1 addition & 1 deletion udar/features/lexical_variability.py
@@ -131,7 +131,7 @@ def type_token_ratio_Tag(tag: str, doc: Document, lower=True, rmv_punc=False,
return zero_div_val
for tag in tag_dict: # noqa: E305
name = f'type_token_ratio_{safe_tag_name(tag)}'
this_partial = partial(type_token_ratio_Tag, tag) # type: ignore
this_partial = partial(type_token_ratio_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = this_partial.func.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
4 changes: 2 additions & 2 deletions udar/features/priors.py
@@ -119,7 +119,7 @@ def _token_frequencies(doc: Document,
"""Make list of token frequencies."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
RNC_tok_freq_dict = _get_RNC_tok_freq_dict()
return [RNC_tok_freq_dict.get(tok.text, 0) for tok in toks] # type: ignore
return [RNC_tok_freq_dict.get(tok.text, 0) for tok in toks]


@add_to_ALL('_token_frequency_ranks', category='_prior')
@@ -129,7 +129,7 @@ def _token_frequency_ranks(doc: Document,
"""Make list of token frequency ranks."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
RNC_tok_freq_rank_dict = _get_RNC_tok_freq_rank_dict()
return [RNC_tok_freq_rank_dict.get(tok.text, 0) for tok in toks] # type: ignore # noqa: E501
return [RNC_tok_freq_rank_dict.get(tok.text, 0) for tok in toks]


@add_to_ALL('_dependency_lengths', category='_prior')
4 changes: 2 additions & 2 deletions udar/features/syntax.py
@@ -34,7 +34,7 @@ def avg_dependency_length_Tag(has_tag: str, doc: Document, rmv_punc=False,
return avg_dep_len
for tag in tag_dict: # noqa: E305
name = f'avg_dependency_length_{safe_tag_name(tag)}'
this_partial = partial(avg_dependency_length_Tag, tag) # type: ignore
this_partial = partial(avg_dependency_length_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = avg_dependency_length_Tag.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc, category='Syntax')
@@ -61,7 +61,7 @@ def max_dependency_length_Tag(has_tag: str, doc: Document, rmv_punc=False,
return max_dep_len
for tag in tag_dict: # noqa: E305
name = f'max_dependency_length_{safe_tag_name(tag)}'
this_partial = partial(max_dependency_length_Tag, tag) # type: ignore
this_partial = partial(max_dependency_length_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = max_dependency_length_Tag.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc, category='Syntax')
12 changes: 6 additions & 6 deletions udar/fsts.py
@@ -37,7 +37,7 @@ def __init__(self, fname: str):
class Analyzer(Udar):
"""HFST transducer that takes string and returns grammatical readings.
It is generally recommended to use :py:func:`get_generator` to obtain an
It is generally recommended to use :py:func:`get_analyzer` to obtain an
Analyzer object.
Example
@@ -108,12 +108,12 @@ def __call__(self, read: Union['Reading', str]) -> Optional[str]:
return None


analyzer_cache: Dict[str, Udar] = {}
generator_cache: Dict[str, Udar] = {}
g2p = None
analyzer_cache: Dict[str, Analyzer] = {}
generator_cache: Dict[str, Generator] = {}
g2p: 'libhfst.HfstTransducer' = None


def get_analyzer(**kwargs):
def get_analyzer(**kwargs) -> Analyzer:
global analyzer_cache
signature = ['='.join((key, str(val)))
for key, val in sorted(kwargs.items())]
Expand All @@ -125,7 +125,7 @@ def get_analyzer(**kwargs):
return analyzer_cache[flavor]


def get_generator(**kwargs):
def get_generator(**kwargs) -> Generator:
global generator_cache
signature = ['='.join((key, str(val)))
for key, val in sorted(kwargs.items())]
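The two getters cache transducers under a signature built from their keyword arguments, so repeated calls with the same kwargs return the same object. A short sketch (import path assumed):

```python
from udar.fsts import get_analyzer, get_generator  # import path assumed

analyzer = get_analyzer()          # built once, then cached under its kwargs signature
assert analyzer is get_analyzer()  # same kwargs -> same cached Analyzer

generator = get_generator()        # Generator objects are cached the same way
```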
8 changes: 4 additions & 4 deletions udar/reading.py
@@ -137,11 +137,11 @@ def generate(self,
The same arguments accepted by :py:meth:`Generator.__init__`.
(default: bundled generator)
"""
try:
return _generator(self.hfst_noL2_str()) # type: ignore
except TypeError:
if _generator is not None:
return _generator(self.hfst_noL2_str())
else:
_generator = get_generator(**kwargs)
return _generator(self.hfst_noL2_str()) # type: ignore
return _generator(self.hfst_noL2_str())

def replace_tag(self, orig_tag: Union[Tag, str], new_tag: Union[Tag, str],
which_subreading: Union[int, slice] = slice(None)):
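A short sketch of `Reading.generate()` after the change above: with no cached generator it falls back to `get_generator(**kwargs)`. The `Token.readings` attribute is taken from the `udar/sentence.py` diff below; the import path is assumed.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы.')
tok = doc[0]
surface = tok.readings[0].generate()  # uses the bundled generator by default
print(surface)
```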
16 changes: 11 additions & 5 deletions udar/sentence.py
@@ -450,7 +450,7 @@ def analyze(self, L2_errors: bool, _analyzer=None,
self._toks = []

def disambiguate(self, gram_path: Union[str, Path] = '',
traces: bool = True):
traces: bool = True, force: str = None):
"""Use Constraint Grammar to remove as many ambiguous readings as
possible.
@@ -463,6 +463,10 @@ def disambiguate(self, gram_path: Union[str, Path] = '',
Whether to keep track of readings that are *removed* by the
Constraint Grammar. Removed readings can be found in
:py:attr:`Token.removed_readings`. (default: True)
force
Use the given method to force removal of ambiguity left by the
Constraint Grammar. See :py:meth:`Token.most_likely_reading` for
the list of available methods. # TODO kwargs?
"""
if gram_path == '':
gram_path = f'{RSRC_PATH}disambiguator.cg3'
@@ -486,10 +490,12 @@ def disambiguate(self, gram_path: Union[str, Path] = '',
'\n\n'.join(f'{old} {triangle} {new}'
for old, new
in zip(self, new_tokens)))
for old, new in zip(self, new_tokens):
old.readings = new.readings
old.removed_readings += new.removed_readings
old.lemmas = new.lemmas
for old_tok, new_tok in zip(self, new_tokens):
if force is not None:
new_tok.force_disambiguate(method=force)
old_tok.readings = new_tok.readings
old_tok.removed_readings += new_tok.removed_readings
old_tok.lemmas = new_tok.lemmas
self._disambiguated = True

@staticmethod
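A hedged sketch of the new `force` parameter. The value `'random'` is a hypothetical method name, since the available `Token.most_likely_reading` methods are not listed in this diff; the import path is also assumed.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы.')
for sent in doc.sentences:
    sent.disambiguate(force='random')  # 'random' is a hypothetical method name
```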
[Remaining changed files not loaded in this view.]
