Skip to content

Commit

Permalink
progress bar and version bump
Browse files (browse the repository at this point in the history)
  • Loading branch information
rsennrich committed Dec 8, 2021
1 parent 823c880 commit 7bae758
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 2 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
CHANGELOG
---------

v0.3.8:
- multiprocessing support (get_vocab and apply_bpe)
- progress bar for learn_bpe
- seed parameter for deterministic BPE dropout
- ignore some unicode line separators which would crash subword-nmt

v0.3.7:
- BPE dropout (Provilkov et al., 2019)
- more efficient glossaries (https://github.com/rsennrich/subword-nmt/pull/69)
Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_suite():

setup(
name='subword_nmt',
version='0.3.7',
version='0.3.8',
description='Unsupervised Word Segmentation for Neural Machine Translation and Text Generation',
long_description=(codecs.open("README.md", encoding='utf-8').read() +
"\n\n" + codecs.open("CHANGELOG.md", encoding='utf-8').read()),
Expand All @@ -28,6 +28,8 @@ def test_suite():
'Programming Language :: Python :: 2',
'Programming Language :: Python :: 3',
],
install_requires=['mock',
'tqdm'],
packages=find_packages(),
entry_points={
'console_scripts': ['subword-nmt=subword_nmt.subword_nmt:main'],
Expand Down
8 changes: 7 additions & 1 deletion subword_nmt/learn_bpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
from multiprocessing import Pool, cpu_count
from collections import defaultdict, Counter

# Use the real tqdm progress bar when it can be imported; otherwise define a
# stand-in with the same call shape so callers can wrap iterables either way.
try:
    from tqdm import tqdm
except ImportError:
    def tqdm(iterator, *args, **kwargs):
        """Return ``iterator`` unchanged, ignoring all other arguments.

        Fallback used when the ``tqdm`` package cannot be imported: loops
        wrapped in ``tqdm(...)`` still run, just without a progress bar.
        """
        return iterator

# hack for python2/3 compatibility
from io import open
argparse.open = open
Expand Down Expand Up @@ -294,7 +300,7 @@ def learn_bpe(infile, outfile, num_symbols, min_frequency=2, verbose=False, is_d

# threshold is inspired by Zipfian assumption, but should only affect speed
threshold = max(stats.values()) / 10
for i in range(num_symbols):
for i in tqdm(range(num_symbols)):
if stats:
most_frequent = max(stats, key=lambda x: (stats[x], x))

Expand Down

0 comments on commit 7bae758

Please sign in to comment.