preparing for a new release

mikahama · Oct 4, 2021 · 9024f70 · 9024f70
1 parent 8a61a10
commit 9024f70
Show file tree

Hide file tree

Showing 5 changed files with 11 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 natas.egg-info
 build/*
 dist/*
-natas/__pycache__/*
+natas/__pycache__/*
+*.DS_Store
diff --git a/README.md b/README.md
@@ -24,11 +24,12 @@ For a list of non-modern spelling variants, the tool can produce an ordered list
     natas.normalize_words(["seacreat", "wiþe"])
     >> [['secret', 'secrete'], ['with', 'withe', 'wide', 'white', 'way']]
 
-Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True. 
+Possible keyword arguments are n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False. 
 - *n_best* sets the number of candidates the NMT will output
 - *dictionary* sets a custom dictionary to be used to filter the NMT output (see more in the next section)
 - *all_candidates*, if False, the method will return only the topmost normalization candidate (this will improve the speed of the method)
 - *correct_spelling_cache*, used only when checking if a candidate word is correctly spelled. Set this to False if you are testing with multiple *dictionaries*.
+- *return_scores*, if True, each candidate is returned together with the model's prediction score, for example [['secret', -1.0969021320343018], ['secrete', -4.121032238006592]]
 
 ## OCR post correction
 

diff --git a/natas/__init__.py b/natas/__init__.py
@@ -8,10 +8,12 @@ class W2VException(Exception):
 def normalize_words(words, n_best=10, dictionary=None, all_candidates=True, correct_spelling_cache=True, return_scores=False):
 	return _normalize(words, "normalization.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache, return_scores=return_scores)
 
-def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True):
+def ocr_correct_words(words, n_best=10, dictionary=None, all_candidates=True, hybrid=False, hybrid_w2v_model=None,correct_spelling_cache=True, return_scores=False):
 	if hybrid is True and hybrid_w2v_model is None:
 		raise W2VException("W2V model not specified")
-	norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache)
+	if hybrid and return_scores:
+		raise Exception("hybrid mode does not support scores")
+	norms = _normalize(words, "ocr.pt", n_best=n_best, dictionary=dictionary, all_candidates=all_candidates,correct_spelling_cache=correct_spelling_cache,return_scores=return_scores)
 	if hybrid:
 		for i, l in enumerate(norms):
 			if len(l) == 0:

diff --git a/setup.py b/setup.py
@@ -23,7 +23,7 @@
     # Versions should comply with PEP440.  For a discussion on single-sourcing
     # the version across setup.py and the project code, see
     # https://packaging.python.org/en/latest/single_source_version.html
-    version='1.1.0',
+    version='1.2.0',
 
     description='Python library for processing historical English',
     long_description=long_description,

diff --git a/test.py b/test.py
@@ -6,9 +6,10 @@
 
 #print(natas.is_correctly_spelled("cat"))
 print(natas.normalize_words(["seacreat", "wiþe"], n_best=5, return_scores=True))
+print(natas.ocr_correct_words(["paft", "friendlhip"], return_scores=True))
+print(natas.normalize_words(["seacreat", "wiþe"], n_best=5))
 print(natas.ocr_correct_words(["paft", "friendlhip"]))
 
-
 #print(natas.is_correctly_spelled("ca7"))
 
 #model = Word2Vec.load("/Users/mikahama/Downloads/models/model_fi_1820-1917.w2v")