Sharded data iterator. (#241)
* Sharded data iterator.

* Added remaining sockeye/*.py files to typechecked files (#242)

* Tests to see we get the right number of batches.

* Improved log message about vocabs a little bit

* Factored validation iter creation into separate function

* Covering prepare data in the system tests.

* Writing a data version.
tdomhan authored and fhieber committed Dec 18, 2017
1 parent 204b214 commit 0ed81fd
Showing 23 changed files with 2,354 additions and 806 deletions.
20 changes: 18 additions & 2 deletions CHANGELOG.md
@@ -10,6 +10,17 @@ Note that Sockeye has checks in place to not translate with an old model that wa

Each version section may have subsections for: _Added_, _Changed_, _Removed_, _Deprecated_, and _Fixed_.

## [1.15.6]
### Added
- New CLI `sockeye.prepare_data` for preprocessing the training data only once before training,
potentially splitting large datasets into shards. At training time only one shard is loaded into memory at a time,
limiting the maximum memory usage.

### Changed
- Instead of using the ```--source``` and ```--target``` arguments, ```sockeye.train``` now accepts a
```--prepared-data``` argument pointing to the folder containing the preprocessed and sharded data. Using the raw
training data is still possible and now consumes less memory.

## [1.15.5]
### Added
- Optionally apply query, key and value projections to the source and target hidden vectors in the CNN model
@@ -33,8 +44,13 @@ Each version section may have subsections for: _Added_, _Changed_, _Removed

## [1.15.0]
### Added
- Added support for Swish-1 (SiLU) activation to transformer models ([Ramachandran et al. 2017: Searching for Activation Functions](https://arxiv.org/pdf/1710.05941.pdf), [Elfwing et al. 2017: Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning](https://arxiv.org/pdf/1702.03118.pdf)). Use `--transformer-activation-type swish1`.
- Added support for GELU activation to transformer models ([Hendrycks and Gimpel 2016: Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units](https://arxiv.org/pdf/1606.08415.pdf)). Use `--transformer-activation-type gelu`.
- Added support for Swish-1 (SiLU) activation to transformer models
([Ramachandran et al. 2017: Searching for Activation Functions](https://arxiv.org/pdf/1710.05941.pdf),
[Elfwing et al. 2017: Sigmoid-Weighted Linear Units for Neural Network Function Approximation
in Reinforcement Learning](https://arxiv.org/pdf/1702.03118.pdf)). Use `--transformer-activation-type swish1`.
- Added support for GELU activation to transformer models ([Hendrycks and Gimpel 2016: Bridging Nonlinearities and
Stochastic Regularizers with Gaussian Error Linear Units](https://arxiv.org/pdf/1606.08415.pdf)).
Use `--transformer-activation-type gelu`.

## [1.14.3]
### Changed
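Taken together, the 1.15.6 entries describe a two-step workflow. Below is a minimal sketch of how it might be invoked, using only flags that appear elsewhere in this diff; the corpus file names (train.src, train.trg, dev.src, dev.trg) and output folders are placeholders:

```bash
# Step 1 (new in 1.15.6): preprocess and shard the training data once.
python -m sockeye.prepare_data \
    --source train.src --target train.trg \
    --num-samples-per-shard 1000000 --min-num-shards 1 \
    --output prepared_data

# Step 2: train from the prepared directory instead of --source/--target;
# only one shard is held in memory at a time.
python -m sockeye.train \
    --prepared-data prepared_data \
    --validation-source dev.src --validation-target dev.trg \
    --output model
```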
9 changes: 5 additions & 4 deletions setup.py
@@ -110,15 +110,16 @@ def get_requirements(filename):

entry_points={
'console_scripts': [
'sockeye-train = sockeye.train:main',
'sockeye-translate = sockeye.translate:main',
'sockeye-average = sockeye.average:main',
'sockeye-embeddings = sockeye.embeddings:main',
'sockeye-evaluate = sockeye.evaluate:main',
'sockeye-vocab = sockeye.vocab:main',
'sockeye-extract-parameters = sockeye.extract_parameters:main',
'sockeye-lexicon = sockeye.lexicon:main',
'sockeye-extract = sockeye.extract_parameters:main',
'sockeye-init-embed = sockeye.init_embedding:main',
'sockeye-prepare-data = sockeye.prepare_data:main',
'sockeye-train = sockeye.train:main',
'sockeye-translate = sockeye.translate:main',
'sockeye-vocab = sockeye.vocab:main'
],
},

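With the new console-script entry point, an installed Sockeye should expose the preparation step directly on the command line; assuming a regular pip installation, these two invocations are equivalent (file names again placeholders):

```bash
sockeye-prepare-data --source train.src --target train.trg --output prepared_data
# ...is the same as:
python -m sockeye.prepare_data --source train.src --target train.trg --output prepared_data
```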
227 changes: 148 additions & 79 deletions sockeye/arguments.py
@@ -41,6 +41,22 @@ def check_regular_file(value_to_check):
return check_regular_file


def regular_folder() -> Callable:
"""
Returns a method that can be used in argument parsing to check the argument is a directory.
:return: A method that can be used as a type in argparse.
"""

def check_regular_directory(value_to_check):
value_to_check = str(value_to_check)
if not os.path.isdir(value_to_check):
raise argparse.ArgumentTypeError("must be a directory.")
return value_to_check

return check_regular_directory


def int_greater_or_equal(threshold: int) -> Callable:
"""
Returns a method that can be used in argument parsing to check that the argument is greater or equal to `threshold`.
@@ -235,62 +251,117 @@ def add_logging_args(params):
help='Suppress console logging.')


def add_io_args(params):
data_params = params.add_argument_group("Data & I/O")
def add_training_data_args(params, required=False):
params.add_argument(C.TRAINING_ARG_SOURCE, '-s',
required=required,
type=regular_file(),
help='Source side of parallel training data.')
params.add_argument(C.TRAINING_ARG_TARGET, '-t',
required=required,
type=regular_file(),
help='Target side of parallel training data.')


def add_validation_data_params(params):
params.add_argument('--validation-source', '-vs',
required=True,
type=regular_file(),
help='Source side of validation data.')
params.add_argument('--validation-target', '-vt',
required=True,
type=regular_file(),
help='Target side of validation data.')


def add_prepared_data_args(params):
params.add_argument(C.TRAINING_ARG_PREPARED_DATA, '-d',
type=regular_folder(),
help='Prepared training data directory created through python -m sockeye.prepare_data.')


def add_monitoring_args(params):
params.add_argument('--use-tensorboard',
action='store_true',
help='Track metrics through tensorboard. Requires installed tensorboard.')

params.add_argument('--monitor-pattern',
default=None,
type=str,
help="Pattern to match outputs/weights/gradients to monitor. '.*' monitors everything. "
"Default: %(default)s.")

params.add_argument('--monitor-stat-func',
default=C.STAT_FUNC_DEFAULT,
choices=list(C.MONITOR_STAT_FUNCS.keys()),
help="Statistics function to run on monitored outputs/weights/gradients. "
"Default: %(default)s.")


def add_training_output_args(params):
params.add_argument('--output', '-o',
required=True,
help='Folder where model & training results are written to.')
params.add_argument('--overwrite-output',
action='store_true',
help='Delete all contents of the model directory if it already exists.')


def add_training_io_args(params):
params = params.add_argument_group("Data & I/O")

# Unfortunately we must mark --source/--target as not required, since we accept either these parameters
# or --prepared-data, a constraint that cannot easily be encoded in argparse.
add_training_data_args(params, required=False)
add_prepared_data_args(params)
add_validation_data_params(params)
add_bucketing_args(params)
add_vocab_args(params)
add_training_output_args(params)
add_monitoring_args(params)

data_params.add_argument('--source', '-s',
required=True,
type=regular_file(),
help='Source side of parallel training data.')
data_params.add_argument('--target', '-t',
required=True,
type=regular_file(),
help='Target side of parallel training data.')
data_params.add_argument('--limit',
default=None,
type=int,
help="Maximum number of training sequences to read. Default: %(default)s.")

data_params.add_argument('--validation-source', '-vs',
required=True,
type=regular_file(),
help='Source side of validation data.')
data_params.add_argument('--validation-target', '-vt',
required=True,
type=regular_file(),
help='Target side of validation data.')

data_params.add_argument('--output', '-o',
required=True,
help='Folder where model & training results are written to.')
data_params.add_argument('--overwrite-output',
action='store_true',
help='Delete all contents of the model directory if it already exists.')

data_params.add_argument('--source-vocab',
required=False,
default=None,
help='Existing source vocabulary (JSON)')
data_params.add_argument('--target-vocab',
required=False,
default=None,
help='Existing target vocabulary (JSON)')

data_params.add_argument('--use-tensorboard',
action='store_true',
help='Track metrics through tensorboard. Requires installed tensorboard.')

data_params.add_argument('--monitor-pattern',
default=None,
type=str,
help="Pattern to match outputs/weights/gradients to monitor. '.*' monitors everything. "
"Default: %(default)s.")
def add_bucketing_args(params):
params.add_argument('--no-bucketing',
action='store_true',
help='Disable bucketing: always unroll the graph to --max-seq-len. Default: %(default)s.')

data_params.add_argument('--monitor-stat-func',
default=C.STAT_FUNC_DEFAULT,
choices=list(C.MONITOR_STAT_FUNCS.keys()),
help="Statistics function to run on monitored outputs/weights/gradients. "
"Default: %(default)s.")
params.add_argument('--bucket-width',
type=int_greater_or_equal(1),
default=10,
help='Width of buckets in tokens. Default: %(default)s.')

params.add_argument('--max-seq-len',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(100, 100),
help='Maximum sequence length in tokens. Note that the target side will be extended by '
'the <BOS> (beginning of sentence) token, increasing the effective target length. '
'Use "x:x" to specify separate values for src&tgt. Default: %(default)s.')


def add_prepare_data_cli_args(params):
params = params.add_argument_group("Data preparation.")
add_training_data_args(params, required=True)
add_vocab_args(params)
add_bucketing_args(params)

params.add_argument('--num-samples-per-shard',
type=int_greater_or_equal(1),
default=1000000,
help='The approximate number of samples per shard. Default: %(default)s.')

params.add_argument('--min-num-shards',
default=1,
type=int_greater_or_equal(1),
help='The minimum number of shards to use, even if they would not '
'reach the desired number of samples per shard. Default: %(default)s.')

params.add_argument('--seed',
type=int,
default=13,
help='Random seed that makes shard assignments deterministic. Default: %(default)s.')

params.add_argument('--output', '-o',
required=True,
help='Folder where the prepared and possibly sharded data is written to.')


def add_device_args(params):
@@ -319,16 +390,29 @@ def add_device_args(params):
'write permissions.')


def add_vocab_args(model_params):
model_params.add_argument('--num-words',
type=multiple_values(num_values=2, greater_or_equal=0),
default=(50000, 50000),
help='Maximum vocabulary size. Use "x:x" to specify separate values for src&tgt. '
'Default: %(default)s.')
model_params.add_argument('--word-min-count',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(1, 1),
help='Minimum frequency of words to be included in vocabularies. Default: %(default)s.')
def add_vocab_args(params):
params.add_argument('--source-vocab',
required=False,
default=None,
help='Existing source vocabulary (JSON).')
params.add_argument('--target-vocab',
required=False,
default=None,
help='Existing target vocabulary (JSON).')
params.add_argument(C.VOCAB_ARG_SHARED_VOCAB,
action='store_true',
default=False,
help='Share source and target vocabulary. '
'Will be automatically turned on when using weight tying. Default: %(default)s.')
params.add_argument('--num-words',
type=multiple_values(num_values=2, greater_or_equal=0),
default=(50000, 50000),
help='Maximum vocabulary size. Use "x:x" to specify separate values for src&tgt. '
'Default: %(default)s.')
params.add_argument('--word-min-count',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(1, 1),
help='Minimum frequency of words to be included in vocabularies. Default: %(default)s.')


def add_model_parameters(params):
@@ -344,8 +428,6 @@ def add_model_parameters(params):
help="Allow misssing parameters when initializing model parameters from file. "
"Default: %(default)s.")

add_vocab_args(model_params)

model_params.add_argument('--encoder',
choices=C.ENCODERS,
default=C.RNN_NAME,
@@ -539,12 +621,6 @@ def add_model_parameters(params):
help='The type of weight tying. source embeddings=src, target embeddings=trg, '
'target softmax weight matrix=softmax. Default: %(default)s.')

model_params.add_argument('--max-seq-len',
type=multiple_values(num_values=2, greater_or_equal=1),
default=(100, 100),
help='Maximum sequence length in tokens. '
'Use "x:x" to specify separate values for src&tgt. Default: %(default)s.')

model_params.add_argument('--layer-normalization', action="store_true",
help="Adds layer normalization before non-linear activations. "
"This includes MLP attention, RNN decoder state initialization, "
@@ -576,13 +652,6 @@ def add_training_args(params):
type=str,
default='replicate',
help=argparse.SUPPRESS)
train_params.add_argument('--no-bucketing',
action='store_true',
help='Disable bucketing: always unroll to the max_len.')
train_params.add_argument('--bucket-width',
type=int_greater_or_equal(1),
default=10,
help='Width of buckets in tokens. Default: %(default)s.')

train_params.add_argument('--loss',
default=C.CROSS_ENTROPY,
@@ -824,7 +893,7 @@ def add_training_args(params):


def add_train_cli_args(params):
add_io_args(params)
add_training_io_args(params)
add_model_parameters(params)
add_training_args(params)
add_device_args(params)
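As a sketch of how the factored helpers compose, the data-preparation CLI can be assembled onto a plain ArgumentParser. This assumes an installed sockeye package; the corpus files are tiny placeholders created only so that the regular_file() check passes:

```python
import argparse
import pathlib

from sockeye import arguments

# regular_file() rejects paths that are not existing files, so create
# two tiny placeholder corpora for this illustration.
pathlib.Path("train.src").write_text("ein beispiel\n")
pathlib.Path("train.trg").write_text("an example\n")

# add_prepare_data_cli_args() attaches the "Data preparation." argument
# group defined above (training data, vocab, bucketing, and sharding args).
parser = argparse.ArgumentParser(description="Prepare and shard training data.")
arguments.add_prepare_data_cli_args(parser)

args = parser.parse_args([
    "--source", "train.src",
    "--target", "train.trg",
    "--min-num-shards", "2",
    "--output", "prepared_data",
])
print(args.min_num_shards)  # -> 2
```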
15 changes: 15 additions & 0 deletions sockeye/constants.py
@@ -222,6 +222,12 @@
"keep_last_params"]

# Other argument constants
TRAINING_ARG_SOURCE = "--source"
TRAINING_ARG_TARGET = "--target"
TRAINING_ARG_PREPARED_DATA = "--prepared-data"

VOCAB_ARG_SHARED_VOCAB = "--shared-vocab"

INFERENCE_ARG_INPUT_LONG = "--input"
INFERENCE_ARG_INPUT_SHORT = "-i"
INFERENCE_ARG_OUTPUT_LONG = "--output"
@@ -316,3 +322,12 @@

LARGE_POSITIVE_VALUE = 99999999.
LARGE_NEGATIVE_VALUE = -LARGE_POSITIVE_VALUE

# data sharding
SHARD_NAME = "shard.%05d"
SHARD_SOURCE = SHARD_NAME + ".source"
SHARD_TARGET = SHARD_NAME + ".target"
DATA_CONFIG = "data.config"
PREPARED_DATA_VERSION_FILE = "data.version"
PREPARED_DATA_VERSION = 1
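The shard file-name constants are plain printf-style templates; a quick sketch of the names they produce (the indices are illustrative):

```python
# Mirrors the constants above: "%05d" zero-pads the shard index to five digits.
SHARD_NAME = "shard.%05d"
SHARD_SOURCE = SHARD_NAME + ".source"
SHARD_TARGET = SHARD_NAME + ".target"

print(SHARD_SOURCE % 0)   # shard.00000.source
print(SHARD_TARGET % 12)  # shard.00012.target
```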

2 changes: 1 addition & 1 deletion sockeye/convolution.py
@@ -35,7 +35,7 @@ def __init__(self,
kernel_width: int,
num_hidden: int,
act_type: str = C.GLU,
weight_normalization: bool = False):
weight_normalization: bool = False) -> None:
super().__init__()
self.kernel_width = kernel_width
self.num_hidden = num_hidden
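The added `-> None` relates to the commit message's item about putting the remaining sockeye/*.py files under type checking: with mypy's --disallow-untyped-defs (typical for such allow-lists), every function must be fully annotated, and the only valid return annotation for `__init__` is `None`. A minimal, hypothetical illustration of the pattern:

```python
class Layer:
    # Hypothetical stand-in for the change in sockeye/convolution.py: under
    # mypy --disallow-untyped-defs, __init__ must be annotated "-> None",
    # even though its return value is never used.
    def __init__(self, kernel_width: int, num_hidden: int) -> None:
        self.kernel_width = kernel_width
        self.num_hidden = num_hidden
```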
(Diff truncated: the remaining 18 changed files are not shown.)
