Skip to content

Commit

Permalink
preparing for a new release
Browse files Browse the repository at this point in the history
  • Loading branch information
Mika Hämäläinen committed Oct 4, 2021
1 parent 8a61a10 commit 9024f70
Show file tree
Hide file tree
Showing 5 changed files with 11 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
natas.egg-info
build/*
dist/*
natas/__pycache__/*
natas/__pycache__/*
*.DS_Store
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,12 @@ For a list of non-modern spelling variants, the tool can produce an ordered list
natas.normalize_words(["seacreat", "wiþe"])
>> [['secret', 'secrete'], ['with', 'withe', 'wide', 'white', 'way']]

Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True.
Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False.
- *n_best* sets the number of candidates the NMT will output
- *dictionary* sets a custom dictionary to be used to filter the NMT output (see more in the next section)
- *all_candidates*, if False, the method will return only the topmost normalization candidate (this will improve the speed of the method)
- *correct_spelling_cache*, used only when checking if a candidate word is correctly spelled. Set this to False if you are testing with multiple *dictionaries*.
- *return_scores*, if True, returns the model's prediction scores, for example [['secret', -1.0969021320343018], ['secrete', -4.121032238006592]]

## OCR post correction

Expand Down
6 changes: 4 additions & 2 deletions natas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@ class W2VException(Exception):
def normalize_words(words, n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False):
    """Normalize non-modern spelling variants using the "normalization.pt" model.

    Thin wrapper: every argument is forwarded unchanged to ``_normalize``.

    Keyword arguments (as described in the project README):
        n_best: number of candidates the NMT will output.
        dictionary: custom dictionary used to filter the NMT output.
        all_candidates: if False, only the topmost normalization candidate
            is returned.
        correct_spelling_cache: used only when checking whether a candidate
            word is correctly spelled.
        return_scores: if True, the model's prediction scores are returned.
    """
    # Collect the pass-through options once so the delegation stays readable.
    options = dict(
        n_best=n_best,
        dictionary=dictionary,
        all_candidates=all_candidates,
        correct_spelling_cache=correct_spelling_cache,
        return_scores=return_scores,
    )
    return _normalize(words, "normalization.pt", **options)

def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True):
def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True, return_scores=False):
if hybrid is True and hybrid_w2v_model is None:
raise W2VException("W2V model not specified")
norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache)
if hybrid and return_scores:
raise Exception("hybrid mode does not support scores")
norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache,return_scores=return_scores)
if hybrid:
for i, l in enumerate(norms):
if len(l) == 0:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
# Versions should comply with PEP440. For a discussion on single-sourcing
# the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
version='1.1.0',
version='1.2.0',

description='Python library for processing historical English',
long_description=long_description,
Expand Down
3 changes: 2 additions & 1 deletion test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

#print(natas.is_correctly_spelled("cat"))
print(natas.normalize_words(["seacreat", "wiþe"], n_best=5, return_scores=True))
print(natas.ocr_correct_words(["paft", "friendlhip"], return_scores=True))
print(natas.normalize_words(["seacreat", "wiþe"], n_best=5))
print(natas.ocr_correct_words(["paft", "friendlhip"]))


#print(natas.is_correctly_spelled("ca7"))

#model = Word2Vec.load("/Users/mikahama/Downloads/models/model_fi_1820-1917.w2v")
Expand Down

0 comments on commit 9024f70

Please sign in to comment.