diff --git a/.gitignore b/.gitignore
index df9ce6c..324ee54 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 natas.egg-info
 build/*
 dist/*
-natas/__pycache__/*
\ No newline at end of file
+natas/__pycache__/*
+*.DS_Store
diff --git a/README.md b/README.md
index 5a399a3..bc7bae2 100644
--- a/README.md
+++ b/README.md
@@ -24,11 +24,12 @@ For a list of non-modern spelling variants, the tool can produce an ordered list
     natas.normalize_words(["seacreat", "wiþe"])
     >> [['secret', 'secrete'], ['with', 'withe', 'wide', 'white', 'way']]
 
-Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True.
+Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False.
 
 - *n_best* sets the number of candidates the NMT will output
 - *dictionary* sets a custom dictionary to be used to filter the NMT output (see more in the next section)
 - *all_candidates*, if False, the method will return only the topmost normalization candidate (this will improve the speed of the method)
 - *correct_spelling_cache*, used only when checking if a candidate word is correctly spelled. Set this to False if you are testing with multiple *dictionaries*.
+- *return_scores*, if True, returns the model's prediction scores, for example [['secret', -1.0969021320343018], ['secrete', -4.121032238006592]]
 
 ## OCR post correction
diff --git a/natas/__init__.py b/natas/__init__.py
index 25dd435..6609e2d 100644
--- a/natas/__init__.py
+++ b/natas/__init__.py
@@ -8,10 +8,12 @@ class W2VException(Exception):
 def normalize_words(words, n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False):
     return _normalize(words, "normalization.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache, return_scores=return_scores)
 
-def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True):
+def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True, return_scores=False):
     if hybrid is True and hybrid_w2v_model is None:
         raise W2VException("W2V model not specified")
-    norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache)
+    if hybrid and return_scores:
+        raise Exception("hybrid mode does not support scores")
+    norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache,return_scores=return_scores)
     if hybrid:
         for i, l in enumerate(norms):
             if len(l) == 0:
diff --git a/setup.py b/setup.py
index a026a58..b0e2a7d 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
     # Versions should comply with PEP440. For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.1.0',
+    version='1.2.0',
 
     description='Python library for processing historical English',
     long_description=long_description,
diff --git a/test.py b/test.py
index 8d74896..82cbfd8 100644
--- a/test.py
+++ b/test.py
@@ -6,9 +6,10 @@
 #print(natas.is_correctly_spelled("cat"))
 print(natas.normalize_words(["seacreat", "wiþe"], n_best=5, return_scores=True))
+print(natas.ocr_correct_words(["paft", "friendlhip"], return_scores=True))
+print(natas.normalize_words(["seacreat", "wiþe"], n_best=5))
 print(natas.ocr_correct_words(["paft", "friendlhip"]))
-
 #print(natas.is_correctly_spelled("ca7"))
 #model = Word2Vec.load("/Users/mikahama/Downloads/models/model_fi_1820-1917.w2v")
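Usage note (not part of the patch): a minimal sketch of how the new return_scores flag is expected to be called, based on the README and test.py hunks above; the score values shown are the illustrative ones from the README example and will vary between model versions.

    import natas

    # With return_scores=True each candidate is paired with its prediction score,
    # e.g. [['secret', -1.0969021320343018], ['secrete', -4.121032238006592]]
    print(natas.normalize_words(["seacreat", "wiþe"], n_best=5, return_scores=True))
    print(natas.ocr_correct_words(["paft", "friendlhip"], return_scores=True))

    # Per the new check in natas/__init__.py, combining hybrid=True with
    # return_scores=True raises an Exception, so scores are only available
    # in the plain (non-hybrid) OCR correction mode.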