
Commit

force_disambiguate. fix #46 (#47)
reynoldsnlp authored Jul 22, 2020
1 parent 368de12 commit 9f272e7
Showing 17 changed files with 138 additions and 139 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -172,6 +172,7 @@ print(phonetic_doc1)
| most\_likely\_reading | `Reading` | "Most likely" reading (may be partially random selection) |
| most\_likely\_lemmas | `List[str]` | List of lemma(s) from the "most likely" reading |
| transliterate | `str` | The original text converted to Romanized Cyrillic (default=Scholarly) |
| force\_disambiguate | `None` | Fully disambiguate readings using methods **other than** the Constraint Grammar |
| cg3\_str | `str` | Analysis stream in the [VISL-CG3 format](https://visl.sdu.dk/cg3/single/#stream-vislcg) |
| hfst\_str | `str` | Analysis stream in the XFST/HFST format |

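A minimal usage sketch for the new README row. It assumes `force_disambiguate` is called on a `Token` (as in the `udar/sentence.py` change below) and that calling it with no arguments uses a default `method`; that default is an assumption.

```python
import udar

# Sketch only: resolve whatever ambiguity the Constraint Grammar leaves behind.
doc = udar.Document('Мы удивились простоте системы.', disambiguate=True)
for tok in doc:
    tok.force_disambiguate()  # returns None; a `method` kwarg is also accepted (see sentence.py)
```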
4 changes: 4 additions & 0 deletions mypy.ini
@@ -0,0 +1,4 @@
[mypy]
warn_unused_configs = True
warn_unused_ignores = True
warn_unreachable = True
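
For illustration only (not part of the commit), this is the kind of code the two warnings above catch; the messages are paraphrased from mypy's output.

```python
def greet(name: str) -> str:
    return 'hello ' + name


x = greet('world')  # type: ignore  # warn_unused_ignores: unused "type: ignore" comment


def pick(flag: bool) -> int:
    if flag:
        return 1
    else:
        return 0
    return -1  # warn_unreachable: statement is unreachable
```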
4 changes: 2 additions & 2 deletions test/test_README.py
@@ -69,8 +69,8 @@ def test_all_properties_are_documented_in_tables():
'features', 'parse_cg3', 'parse_hfst',
'respace', 'stress_eval', 'stress_preds2tsv',
'tokenize'},
'Token': {'annotation', 'end_char', 'features', 'guess',
'guess_freq', 'guess_syllable',
'Token': {'annotation', 'end_char', 'features',
'guess_syllable',
'has_tag_in_most_likely_reading', 'head',
'is_L2_error', 'might_be_L2_error',
'phon_predictions', 'phonetic_transcriptions',
1 change: 0 additions & 1 deletion udar/__main__.py
@@ -9,7 +9,6 @@ def parse_input(input_str: str, args: argparse.Namespace) -> Document:
if args.input_type == 'c':
return Document.from_cg3(input_str)
elif args.input_type == 'f':
raise NotImplementedError # TODO
return Document.from_hfst(input_str)
elif args.input_type == 'p':
return Document(input_str, disambiguate=args.disambiguate)
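With the `NotImplementedError` removed, all three input types map onto the `Document` API. A hedged sketch; the top-level import path and the placeholder stream strings are assumptions.

```python
from udar import Document  # top-level import path assumed

cg3_stream = '...'   # placeholder: a VISL-CG3 analysis stream
hfst_stream = '...'  # placeholder: an XFST/HFST analysis stream

doc_c = Document.from_cg3(cg3_stream)    # args.input_type == 'c'
doc_f = Document.from_hfst(hfst_stream)  # args.input_type == 'f' (now implemented)
doc_p = Document('Мы удивились простоте системы.',
                 disambiguate=True)      # args.input_type == 'p'
```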
2 changes: 1 addition & 1 deletion udar/conversion/external2udar.py
@@ -63,7 +63,7 @@ def readable_sent(sentence):
desc='Reading corpus...', total=sent_count):
sent_id = sent['id']
doc = Document(sent.source.get_text(), annotation=sent_id)
u_toks = [t.orig for t in doc]
u_toks = [t.text for t in doc]
oc_toks = [t['text'] for t in sent.find_all('token')]

if u_toks != oc_toks:
20 changes: 13 additions & 7 deletions udar/document.py
@@ -4,6 +4,7 @@
from sys import stderr
from typing import Dict
from typing import Iterable
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple
@@ -16,6 +17,7 @@
from .fsts import get_analyzer
from .misc import get_stanza_sent_tokenizer
from .sentence import Sentence
from .tok import Token


__all__ = ['Document']
@@ -35,14 +37,14 @@
# return nltk_sent_tokenizer


def _str2Sentences(input_str, **kwargs):
def _str2Sentences(input_str, **kwargs) -> List[Sentence]:
stanza_sent = get_stanza_sent_tokenizer()
# TODO should the following 2 lines be solved in tokenizer's pmscript?
input_str = input_str.replace('#', ' ') # The `#` char is ignored by udar
input_str = re.sub(r'([^аэоуыяеёюи])[\u0300\u0301]', r'\1', input_str,
flags=re.I)
stanza_doc = stanza_sent(input_str)
return [Sentence(sent.text, **kwargs)
return [Sentence(sent.text, id=i, **kwargs)
for i, sent in enumerate(stanza_doc.sentences)]


@@ -83,7 +85,7 @@ def __init__(self, input_text: Union[str, Iterable[Sentence], 'Document'],
or hasattr(input_text, '__iter__'))
and isinstance(next(iter(input_text)), Sentence)):
self.text = ' '.join(sent.text for sent in input_text)
self.sentences = list(input_text)
self.sentences = list(input_text) # type: ignore
for sent in self.sentences:
sent.doc = self
elif isinstance(input_text, Document):
@@ -109,12 +111,16 @@ def __eq__(self, other):
and all(s == o
for s, o in zip(self.sentences, other.sentences)))

def __getitem__(self, i: Union[int, slice]):
warn('Indexing on a Document object is slow.', stacklevel=2)
def __getitem__(self, i: Union[int, slice]) -> Union[Token, List[Token]]:
"""Get *Token(s)* by index/slice."""
warn('Indexing on large Document objects can be slow. '
'It is more efficient to index Tokens within Document.sentences',
stacklevel=3)
# TODO optimize?
return list(self)[i]

def __iter__(self):
def __iter__(self) -> Iterator[Token]:
"""Return iterator over *Tokens*."""
return iter(chain(*self.sentences))

def __repr__(self):
@@ -161,7 +167,7 @@ def from_cg3(cls, input_stream: str, **kwargs):
r'# ANNOTATION: ([^\n]*)\n'
r'# TEXT: ([^\n]*)\n'
r'(.+?)', input_stream, flags=re.S)
if split_by_sentence is not None:
if split_by_sentence:
sentences = [Sentence.from_cg3(stream, id=id,
annotation=annotation,
orig_text=text, **kwargs)
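A sketch of the new indexing guidance, using only names shown in this file's diff (constructor, `__getitem__`, `sentences`, `Token.text`): `doc[i]` still works but rebuilds the full token list on each call, so per-sentence access is preferred.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы. Она работает хорошо.')

first_token = doc[0]         # works, but warns that it can be slow on large Documents
for sent in doc.sentences:   # preferred: iterate Tokens within each Sentence
    for tok in sent:
        print(tok.text)
```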
14 changes: 7 additions & 7 deletions udar/features/feature_extractor.py
@@ -82,16 +82,16 @@ def __call__(self, docs: Union[List[Document], Document], feat_names=None,
if ((hasattr(docs, '__iter__') or hasattr(docs, '__getitem__'))
and isinstance(next(iter(docs)), Document)):
for doc in docs:
doc.features = self._call_features(doc,
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
doc.features = self._call_features(doc, # type: ignore
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
output.append(doc.features)
elif isinstance(docs, Document):
docs.features = self._call_features(docs,
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
feat_names=feat_names,
tuple_constructor=tuple_constructor, # noqa: E501
**kwargs)
output.append(docs.features)
else:
raise TypeError('Expected Document or list of Documents; got '
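A hedged usage sketch of the `__call__` signature above. The class name `FeatureExtractor` and both import paths are assumptions, since only the method body appears in this diff.

```python
from udar import Document                   # top-level import path assumed
from udar.features import FeatureExtractor  # class name / import path assumed

extractor = FeatureExtractor()
doc1 = Document('Мы удивились простоте системы.')
doc2 = Document('Она работает хорошо.')

rows = extractor([doc1, doc2])  # each Document also gets its .features attribute set
row = extractor(doc1)           # a single Document is accepted as well
```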
8 changes: 4 additions & 4 deletions udar/features/lexical_complexity.py
@@ -26,7 +26,7 @@ def prcnt_words_over_n_sylls(n, doc: Document, lower=False, rmv_punc=True,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_words_over_{n}_sylls'
this_partial = partial(prcnt_words_over_n_sylls, n) # type: ignore
this_partial = partial(prcnt_words_over_n_sylls, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_words_over_n_sylls.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -50,7 +50,7 @@ def prcnt_content_words_over_n_sylls(n, doc: Document, lower=False,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_content_words_over_{n}_sylls'
this_partial = partial(prcnt_content_words_over_n_sylls, n) # type: ignore
this_partial = partial(prcnt_content_words_over_n_sylls, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_content_words_over_n_sylls.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -73,7 +73,7 @@ def prcnt_words_over_n_chars(n, doc: Document, lower=False, rmv_punc=True,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_words_over_{n}_chars'
this_partial = partial(prcnt_words_over_n_chars, n) # type: ignore
this_partial = partial(prcnt_words_over_n_chars, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_words_over_n_chars.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -97,7 +97,7 @@ def prcnt_content_words_over_n_chars(n, doc: Document, lower=False,
return zero_div_val
for n in range(1, MAX_SYLL): # noqa: E305
name = f'prcnt_content_words_over_{n}_chars'
this_partial = partial(prcnt_content_words_over_n_chars, n) # type: ignore
this_partial = partial(prcnt_content_words_over_n_chars, n)
this_partial.__name__ = name # type: ignore
doc = prcnt_content_words_over_n_chars.__doc__.replace(' n ', f' {n} ') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
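The loops above (and the ones in the other feature modules below) share one registration pattern: bind the leading `n` with `functools.partial`, assign the partial a `__name__` by hand (partial objects have none, hence the remaining `# type: ignore`), and store it in the `ALL` registry. A standalone sketch with illustrative names:

```python
from functools import partial

REGISTRY = {}  # stands in for the ALL dict used by the feature modules


def words_longer_than_n(n: int, words: list) -> int:
    """Count words longer than n characters."""
    return sum(1 for word in words if len(word) > n)


for n in range(1, 4):
    name = f'words_longer_than_{n}'
    bound = partial(words_longer_than_n, n)
    bound.__name__ = name  # type: ignore  # partial objects have no __name__ attribute
    REGISTRY[name] = bound

print(REGISTRY['words_longer_than_2'](['a', 'abc', 'abcd']))  # -> 2
```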
8 changes: 4 additions & 4 deletions udar/features/lexical_familiarity.py
@@ -27,10 +27,10 @@ def num_words_at_lexmin_level(level, doc: Document) -> int:
lexmin_dict = _get_lexmin_dict()
return len([1 for tok in doc
if any(lexmin_dict.get(lem) == level
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))])
for level in ['A1', 'A2', 'B1', 'B2']: # noqa: E305
name = f'num_words_at_lexmin_{level}'
this_partial = partial(num_words_at_lexmin_level, level) # type: ignore
this_partial = partial(num_words_at_lexmin_level, level)
this_partial.__name__ = name # type: ignore
doc = num_words_at_lexmin_level.__doc__.replace('LEVEL', level) # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
@@ -70,10 +70,10 @@ def num_words_at_kelly_level(level, doc: Document) -> int:
kelly_dict = _get_kelly_dict()
return len([1 for tok in doc
if any(kelly_dict.get(lem) == level
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))]) # type: ignore # noqa: E501
for lem in tok.most_likely_lemmas(method=MOST_LIKELY))])
for level in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']: # noqa: E305
name = f'num_words_at_kelly_{level}'
this_partial = partial(num_words_at_kelly_level, level) # type: ignore
this_partial = partial(num_words_at_kelly_level, level)
this_partial.__name__ = name # type: ignore
doc = num_words_at_kelly_level.__doc__.replace('LEVEL', level) # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
2 changes: 1 addition & 1 deletion udar/features/lexical_variability.py
@@ -131,7 +131,7 @@ def type_token_ratio_Tag(tag: str, doc: Document, lower=True, rmv_punc=False,
return zero_div_val
for tag in tag_dict: # noqa: E305
name = f'type_token_ratio_{safe_tag_name(tag)}'
this_partial = partial(type_token_ratio_Tag, tag) # type: ignore
this_partial = partial(type_token_ratio_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = this_partial.func.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc,
4 changes: 2 additions & 2 deletions udar/features/priors.py
@@ -119,7 +119,7 @@ def _token_frequencies(doc: Document,
"""Make list of token frequencies."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
RNC_tok_freq_dict = _get_RNC_tok_freq_dict()
return [RNC_tok_freq_dict.get(tok.text, 0) for tok in toks] # type: ignore
return [RNC_tok_freq_dict.get(tok.text, 0) for tok in toks]


@add_to_ALL('_token_frequency_ranks', category='_prior')
@@ -129,7 +129,7 @@ def _token_frequency_ranks(doc: Document,
"""Make list of token frequency ranks."""
toks = ALL['_filter_toks'](doc, has_tag=has_tag, rmv_punc=rmv_punc)
RNC_tok_freq_rank_dict = _get_RNC_tok_freq_rank_dict()
return [RNC_tok_freq_rank_dict.get(tok.text, 0) for tok in toks] # type: ignore # noqa: E501
return [RNC_tok_freq_rank_dict.get(tok.text, 0) for tok in toks]


@add_to_ALL('_dependency_lengths', category='_prior')
4 changes: 2 additions & 2 deletions udar/features/syntax.py
@@ -34,7 +34,7 @@ def avg_dependency_length_Tag(has_tag: str, doc: Document, rmv_punc=False,
return avg_dep_len
for tag in tag_dict: # noqa: E305
name = f'avg_dependency_length_{safe_tag_name(tag)}'
this_partial = partial(avg_dependency_length_Tag, tag) # type: ignore
this_partial = partial(avg_dependency_length_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = avg_dependency_length_Tag.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc, category='Syntax')
@@ -61,7 +61,7 @@ def max_dependency_length_Tag(has_tag: str, doc: Document, rmv_punc=False,
return max_dep_len
for tag in tag_dict: # noqa: E305
name = f'max_dependency_length_{safe_tag_name(tag)}'
this_partial = partial(max_dependency_length_Tag, tag) # type: ignore
this_partial = partial(max_dependency_length_Tag, tag)
this_partial.__name__ = name # type: ignore
doc = max_dependency_length_Tag.__doc__.replace('a given', f'the `{tag}`') # type: ignore # noqa: E501
ALL[name] = Feature(name, this_partial, doc=doc, category='Syntax')
12 changes: 6 additions & 6 deletions udar/fsts.py
@@ -37,7 +37,7 @@ def __init__(self, fname: str):
class Analyzer(Udar):
"""HFST transducer that takes string and returns grammatical readings.
It is generally recommended to use :py:func:`get_generator` to obtain an
It is generally recommended to use :py:func:`get_analyzer` to obtain an
Analyzer object.
Example
@@ -108,12 +108,12 @@ def __call__(self, read: Union['Reading', str]) -> Optional[str]:
return None


analyzer_cache: Dict[str, Udar] = {}
generator_cache: Dict[str, Udar] = {}
g2p = None
analyzer_cache: Dict[str, Analyzer] = {}
generator_cache: Dict[str, Generator] = {}
g2p: 'libhfst.HfstTransducer' = None


def get_analyzer(**kwargs):
def get_analyzer(**kwargs) -> Analyzer:
global analyzer_cache
signature = ['='.join((key, str(val)))
for key, val in sorted(kwargs.items())]
Expand All @@ -125,7 +125,7 @@ def get_analyzer(**kwargs):
return analyzer_cache[flavor]


def get_generator(**kwargs):
def get_generator(**kwargs) -> Generator:
global generator_cache
signature = ['='.join((key, str(val)))
for key, val in sorted(kwargs.items())]
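The two getters cache transducers under a signature built from their keyword arguments, so repeated calls with the same kwargs return the same object. A short sketch (import path assumed):

```python
from udar.fsts import get_analyzer, get_generator  # import path assumed

analyzer = get_analyzer()          # built once, then cached under its kwargs signature
assert analyzer is get_analyzer()  # same kwargs -> same cached Analyzer

generator = get_generator()        # Generator objects are cached the same way
```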
8 changes: 4 additions & 4 deletions udar/reading.py
@@ -137,11 +137,11 @@ def generate(self,
The same arguments accepted by :py:meth:`Generator.__init__`.
(default: bundled generator)
"""
try:
return _generator(self.hfst_noL2_str()) # type: ignore
except TypeError:
if _generator is not None:
return _generator(self.hfst_noL2_str())
else:
_generator = get_generator(**kwargs)
return _generator(self.hfst_noL2_str()) # type: ignore
return _generator(self.hfst_noL2_str())

def replace_tag(self, orig_tag: Union[Tag, str], new_tag: Union[Tag, str],
which_subreading: Union[int, slice] = slice(None)):
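A short sketch of `Reading.generate()` after the change above: with no cached generator it falls back to `get_generator(**kwargs)`. The `Token.readings` attribute is taken from the `udar/sentence.py` diff below; the import path is assumed.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы.')
tok = doc[0]
surface = tok.readings[0].generate()  # uses the bundled generator by default
print(surface)
```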
16 changes: 11 additions & 5 deletions udar/sentence.py
@@ -450,7 +450,7 @@ def analyze(self, L2_errors: bool, _analyzer=None,
self._toks = []

def disambiguate(self, gram_path: Union[str, Path] = '',
traces: bool = True):
traces: bool = True, force: str = None):
"""Use Constraint Grammar to remove as many ambiguous readings as
possible.
@@ -463,6 +463,10 @@ def disambiguate(self, gram_path: Union[str, Path] = '',
Whether to keep track of readings that are *removed* by the
Constraint Grammar. Removed readings can be found in
:py:attr:`Token.removed_readings`. (default: True)
force
Use the given method to force removal of ambiguity left by the
Constraint Grammar. See :py:meth:`Token.most_likely_reading` for
the list of available methods. # TODO kwargs?
"""
if gram_path == '':
gram_path = f'{RSRC_PATH}disambiguator.cg3'
@@ -486,10 +490,12 @@ def disambiguate(self, gram_path: Union[str, Path] = '',
'\n\n'.join(f'{old} {triangle} {new}'
for old, new
in zip(self, new_tokens)))
for old, new in zip(self, new_tokens):
old.readings = new.readings
old.removed_readings += new.removed_readings
old.lemmas = new.lemmas
for old_tok, new_tok in zip(self, new_tokens):
if force is not None:
new_tok.force_disambiguate(method=force)
old_tok.readings = new_tok.readings
old_tok.removed_readings += new_tok.removed_readings
old_tok.lemmas = new_tok.lemmas
self._disambiguated = True

@staticmethod
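A hedged sketch of the new `force` parameter. The value `'random'` is a hypothetical method name, since the available `Token.most_likely_reading` methods are not listed in this diff; the import path is also assumed.

```python
from udar import Document  # top-level import path assumed

doc = Document('Мы удивились простоте системы.')
for sent in doc.sentences:
    sent.disambiguate(force='random')  # 'random' is a hypothetical method name
```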
[Remaining changed files not loaded in this view.]
