From 8b2e51bbbe644d391e5d4d11ad35047b5ac8b326 Mon Sep 17 00:00:00 2001
From: IgnatovFedor
Date: Tue, 2 Jul 2019 14:07:26 +0300
Subject: [PATCH 01/18] feat: add /poller api endpoint for stand monitoring (#898)

* feat: added /poller endpoint for stand monitoring

* feat: removed /poller endpoint and added GET method to model endpoint

* Update deeppavlov/utils/server/server.py

Co-Authored-By: Aleksei Lymar

* fix: removed GET method from model endpoint. Added /poller endpoint with GET method for monitoring

* feat: added option to send custom parameters by poller

This option has been added to monitor models that accept something besides
text. The poller can send these parameters in the data. By default we are
using POST requests with an empty container to feed the model with text data.
A ValueError was added to avoid a double slash in the poller endpoint path.

* fix: changed filter regex

* fix: changed error message for attempt to use '/' endpoint

Co-Authored-By: litinsky

* fix: changed filter in PollerFilter class and added docstring
---
 deeppavlov/utils/server/server.py | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/deeppavlov/utils/server/server.py b/deeppavlov/utils/server/server.py
index 57b0c0faef..5bfc4a4f70 100644
--- a/deeppavlov/utils/server/server.py
+++ b/deeppavlov/utils/server/server.py
@@ -12,8 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import re
 import ssl
-from logging import getLogger
+from logging import getLogger, Filter
 from pathlib import Path
 from typing import List, Tuple
 
@@ -31,7 +32,20 @@
 
 SERVER_CONFIG_FILENAME = 'server_config.json'
 
+
+class PollerFilter(Filter):
+    """
+    PollerFilter class is used to filter out log records of POST requests to
+    /poller endpoints.
+    """
+    pat = re.compile(r'POST\s/\S*/poller\s')
+    def filter(self, record):
+        return not PollerFilter.pat.search(record.getMessage())
+
+
 log = getLogger(__name__)
+werklog = getLogger('werkzeug')
+werklog.addFilter(PollerFilter())
 app = Flask(__name__)
 Swagger(app)
 
@@ -100,6 +114,19 @@ def interact(model: Chainer, params_names: List[str]) -> Tuple[Response, int]:
     return jsonify(result), 200
 
 
+def test_interact(model: Chainer, params_names: List[str]) -> Tuple[Response, int]:
+    data = request.get_json()
+    if not data:
+        model_args = [["Test string."] for _ in params_names]
+    else:
+        model_args = [data.get(param_name) for param_name in params_names]
+    try:
+        _ = model(*model_args)
+        return Response('["Test passed"]\n'), 200
+    except Exception:
+        return Response('["Test failed"]\n'), 400
+
+
 def start_model_server(model_config, https=False, ssl_key=None, ssl_cert=None, port=None):
     server_config_path = get_settings_path() / SERVER_CONFIG_FILENAME
     server_params = get_server_params(server_config_path, model_config)
@@ -109,6 +136,12 @@ def start_model_server(model_config, https=False, ssl_key=None, ssl_cert=None, p
     model_endpoint = server_params['model_endpoint']
     model_args_names = server_params['model_args_names']
 
+    if model_endpoint == '/':
+        e = ValueError('"/" endpoint is reserved, please provide a correct endpoint in the model_endpoint '
+                       'param in the server configuration file')
+        log.error(e)
+        raise e
+
     https = https or server_params['https']
 
     if https:
@@ -159,4 +192,8 @@ def index():
     def answer():
         return interact(model, model_args_names)
 
+    @app.route(model_endpoint+'/poller', methods=['POST'])
+    def polling():
+        return test_interact(model, model_args_names)
+
     app.run(host=host, port=port, threaded=False, ssl_context=ssl_context)

From 055980124979257b3136f10633eb16598c279890 Mon Sep 17 00:00:00 2001
From: cclauss
Date: Wed, 3 Jul 2019 09:35:04 +0200
Subject: [PATCH 02/18] Use ==/!= to compare str, bytes, and int literals

Identity is not the same thing as equality in Python.

$ __python__
```
>>> mode = "tes"
>>> mode += "t"
>>> mode == "test"
True
>>> mode is "test"
False
```

[flake8](http://flake8.pycqa.org) testing of https://github.com/deepmipt/DeepPavlov on Python 3.7.1

$ __flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics__
```
./deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py:92:20: F632 use ==/!= to compare str, bytes, and int literals
                if mode is not "test":
                   ^
./deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py:97:20: F632 use ==/!= to compare str, bytes, and int literals
                if mode is not "test":
                   ^
2     F632 use ==/!= to compare str, bytes, and int literals
2
```
---
 deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py b/deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py
index 3a3740ea27..398a12a369 100644
--- a/deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py
+++ b/deeppavlov/dataset_readers/ubuntu_dstc7_mt_reader.py
@@ -89,12 +89,12 @@ def _create_dialog_iter(self, filename, mode="train"):
                     utterances.append(msg['utterance'])
 
                 true_response = ""  # true response sentence
-                if mode is not "test":
+                if mode != "test":
                     true_response = dialog['options-for-correct-answers'][0]['utterance']
 
                 fake_responses = []  # rest (wrong) responses
                 target_id = ""
-                if mode is not "test":
+                if mode != "test":
                     correct_answer = dialog['options-for-correct-answers'][0]
                     target_id = correct_answer['candidate-id']
                 for i, utterance in enumerate(dialog['options-for-next']):

From 5ca21e6d215b4dc429d18073b5c88bc51cd879dd Mon Sep 17 00:00:00 2001
From: cclauss
Date: Wed, 3 Jul 2019 23:32:04 +0200
Subject: [PATCH 03/18] tests: Add flake8 --select=E9,F63,F7,F82 to Jenkinsfile (#913)

* setup.py: Add flake8 to extras_require={'test':}

* Jenkinsfile: Add flake8 . --select=E9,F63,F7,F82

On the flake8 test selection, this PR does _not_ focus on "_style violations_"
(the majority of flake8 error codes that [__python/black__](https://github.com/python/black)
can autocorrect). Instead these tests are focused on runtime safety and correctness:

* E9 tests are about Python syntax errors usually raised because flake8 can not
  build an Abstract Syntax Tree (AST). Often these issues are a sign of unused
  code or code that has not been ported to Python 3. These would be compile-time
  errors in a compiled language but in a dynamic language like Python they result
  in the script halting/crashing on the user.
* F63 tests are usually about the confusion between identity and equality in
  Python. Use ==/!= to compare str, bytes, and int literals is the classic case.
  These are areas where __a == b__ is True but __a is b__ is False (or vice versa).
* F7 tests logic errors and syntax errors in type hints
* F82 tests are almost always _undefined names_ which are usually a sign of a
  typo, missing imports, or code that has not been ported to Python 3. These also
  would be compile-time errors in a compiled language but in Python a
  __NameError__ is raised which will halt/crash the script on the user.

* Update Jenkinsfile
---
 Jenkinsfile | 1 +
 setup.py    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Jenkinsfile b/Jenkinsfile
index 0cdd8aab3b..1b0ba05c5f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -21,6 +21,7 @@ node('gpu') {
         stage('Tests') {
             sh """
                 . 
.venv-$BUILD_NUMBER/bin/activate
+                flake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics
                 pytest -v --disable-warnings
                 cd docs
                 make clean
diff --git a/setup.py b/setup.py
index 87a55dbba4..d7a13015d8 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,7 @@ def readme():
         include_package_data=True,
         extras_require={
             'tests': [
+                'flake8',
                 'pytest',
                 'pexpect'],
             'docs': [

From 08c32c63f790a4deae6fa440f3a8b6c392c65368 Mon Sep 17 00:00:00 2001
From: acriptis
Date: Tue, 9 Jul 2019 10:35:45 +0300
Subject: [PATCH 04/18] refactor: refactor morphotagger to inherit KerasModel instead of KerasWrapper (#918)

* morphotagger refactored by collapsing KerasWrapper, CharacterTagger and MorphoTagger into one class

* fix typo in registry

* make morphotagger property-less to avoid failing TFMetaclass call before initialization

* clean imports

* delete redundant KerasWrapper

* Update morpho_tagger.py

mode attribute defaults to infer

Co-Authored-By: Aleksei Lymar

* Update morpho_tagger.py

load_path to extend supported formats

Co-Authored-By: Aleksei Lymar

* Update morpho_tagger.py

to include Path dependency

Co-Authored-By: Aleksei Lymar

* add License

* Update morpho_tagger.py

save_path format

Co-Authored-By: Aleksei Lymar

* remove excessive methods symbols_number_() and tags_number_() in favor of direct calls

* make imports precise

* remove duplicated keras.backend import

* optimize call of morphotagger, remove redundant graph.as_default call

* docs: update MorphoTagger docs after refactoring.

* docs: update MorphoTagger docs after refactoring.
---
 deeppavlov/core/common/registry.json  |   2 +-
 deeppavlov/core/models/keras_model.py |  80 -----------
 .../{network.py => morpho_tagger.py}  | 131 ++++++++++--------
 docs/apiref/core/models.rst           |   2 -
 docs/apiref/models/morpho_tagger.rst  |   8 +-
 docs/components/morphotagger.rst      |   6 +-
 6 files changed, 82 insertions(+), 147 deletions(-)
 rename deeppavlov/models/morpho_tagger/{network.py => morpho_tagger.py} (80%)

diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json
index b3c5910cde..e329e9bb75 100644
--- a/deeppavlov/core/common/registry.json
+++ b/deeppavlov/core/common/registry.json
@@ -71,7 +71,7 @@
     "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker",
     "lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:LowercasePreprocessor",
     "mask": "deeppavlov.models.preprocessors.mask:Mask",
-    "morpho_tagger": "deeppavlov.models.morpho_tagger.network:MorphoTagger",
+    "morpho_tagger": "deeppavlov.models.morpho_tagger.morpho_tagger:MorphoTagger",
     "morphotagger_dataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator",
     "morphotagger_dataset_reader": "deeppavlov.dataset_readers.morphotagging_dataset_reader:MorphotaggerDatasetReader",
     "morphotagger_multidataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerMultiDatasetIterator",
diff --git a/deeppavlov/core/models/keras_model.py b/deeppavlov/core/models/keras_model.py
index ef8d01598d..5cf941159d 100644
--- a/deeppavlov/core/models/keras_model.py
+++ b/deeppavlov/core/models/keras_model.py
@@ -96,86 +96,6 @@ def process_event(self, event_name: str, data: dict) -> None:
         return
 
 
-class KerasWrapper(KerasModel):
-    """A wrapper over external Keras models. It is used, for example,
-    to wrap :class:`~deeppavlov.models.morpho_tagger.network.CharacterTagger`. 
- A subclass of :class:`~deeppavlov.core.models.keras_model.KerasModel` - - Attributes: - cls: the class to be wrapped - save_path: the path where model is saved - load_path: the path from where model is loaded - mode: usage mode - **kwargs: a dictionary containing model parameters specified in the main part - of json config that corresponds to the model - """ - def __init__(self, cls: type, save_path: Optional[str] = None, - load_path: Optional[str] = None, mode: str = None, - **kwargs) -> None: - # Calls parent constructor. Results in creation of save_folder if it doesn't exist - super().__init__(save_path=save_path, load_path=load_path, mode=mode) - - # Dicts are mutable! To prevent changes in config dict outside this class - # we use deepcopy - opt = deepcopy(kwargs) - - # Finds all input parameters of the network __init__ to pass them into network later - network_parameter_names = list(inspect.signature(cls.__init__).parameters) - # Fills all provided parameters from opt (opt is a dictionary formed from the model - # json config file, except the "name" field) - network_parameters = {par: opt[par] for par in network_parameter_names if par in opt} - self._net = cls(**network_parameters) - - # Finds all parameters for network train to pass them into train method later - train_parameters_names = list(inspect.signature(self._net.train_on_batch).parameters) - - # Fills all provided parameters from opt - train_parameters = {par: opt[par] for par in train_parameters_names if par in opt} - self.train_parameters = train_parameters - self.opt = opt - - # Tries to load the model from model `load_path`, if it is available - self.load() - - def load(self) -> None: - """Checks existence of the model file, loads the model if the file exists""" - - # Checks presence of the model files - if self.load_path.exists(): - path = str(self.load_path.resolve()) - log.info('[loading model from {}]'.format(path)) - self._net.load(path) - - def save(self) -> None: - """Saves model to the save_path, provided in config. The directory is - already created by super().__init__, which is called in __init__ of this class""" - path = str(self.save_path.absolute()) - log.info('[saving model to {}]'.format(path)) - self._net.save(path) - - def train_on_batch(self, *args) -> None: - """Trains the model on a single batch. - - Args: - *args: the list of network inputs. - Last element of `args` is the batch of targets, - all previous elements are training data batches - """ - *data, labels = args - self._net.train_on_batch(data, labels) - - def __call__(self, *x_batch, **kwargs) -> Union[List, np.ndarray]: - """ - Predicts answers on batch elements. - - Args: - instance: a batch to predict answers on - """ - with self.graph.as_default(): - K.set_session(self.sess) - return self._net.predict_on_batch(x_batch, **kwargs) - - class LRScheduledKerasModel(LRScheduledModel, KerasModel): """ KerasModel enhanced with optimizer, learning rate and momentum diff --git a/deeppavlov/models/morpho_tagger/network.py b/deeppavlov/models/morpho_tagger/morpho_tagger.py similarity index 80% rename from deeppavlov/models/morpho_tagger/network.py rename to deeppavlov/models/morpho_tagger/morpho_tagger.py index 1838831d10..ba95216ef0 100644 --- a/deeppavlov/models/morpho_tagger/network.py +++ b/deeppavlov/models/morpho_tagger/morpho_tagger.py @@ -13,30 +13,38 @@ # limitations under the License. 
from logging import getLogger -from typing import List, Union, Tuple, Iterable +from pathlib import Path +from typing import List, Optional, Union, Tuple +import numpy as np import keras.layers as kl import keras.optimizers as ko import keras.regularizers as kreg +import keras.backend as kb from keras import Model from deeppavlov.core.common.registry import register from deeppavlov.core.data.vocab import DefaultVocabulary -from deeppavlov.core.models.keras_model import KerasWrapper +from deeppavlov.core.models.keras_model import KerasModel from .cells import Highway -from .common_tagger import * +from .common_tagger import to_one_hot log = getLogger(__name__) MAX_WORD_LENGTH = 30 -class CharacterTagger: +@register("morpho_tagger") +class MorphoTagger(KerasModel): """A class for character-based neural morphological tagger Parameters: symbols: character vocabulary tags: morphological tags vocabulary + save_path: the path where model is saved + load_path: the path from where model is loaded + mode: usage mode + word_rnn: the type of character-level network (only `cnn` implemented) char_embeddings_size: the size of character embeddings char_conv_layers: the number of convolutional layers on character level @@ -64,10 +72,15 @@ class CharacterTagger: word_dropout: the ratio of dropout before word level (it is applied to word embeddings) regularizer: l2 regularization parameter verbose: the level of verbosity + + A subclass of :class:`~deeppavlov.core.models.keras_model.KerasModel` """ def __init__(self, symbols: DefaultVocabulary, tags: DefaultVocabulary, + save_path: Optional[Union[str, Path]] = None, + load_path: Optional[Union[str, Path]] = None, + mode: str = 'infer', word_rnn: str = "cnn", char_embeddings_size: int = 16, char_conv_layers: int = 1, @@ -84,7 +97,9 @@ def __init__(self, word_lstm_units: Union[int, List[int]] = 128, word_dropout: float = 0.0, regularizer: float = None, - verbose: int = 1): + verbose: int = 1, **kwargs): + # Calls parent constructor. Results in creation of save_folder if it doesn't exist + super().__init__(save_path=save_path, load_path=load_path, mode=mode) self.symbols = symbols self.tags = tags self.word_rnn = word_rnn @@ -107,6 +122,29 @@ def __init__(self, self._initialize() self.build() + # Tries to load the model from model `load_path`, if it is available + self.load() + + def load(self) -> None: + """ + Checks existence of the model file, loads the model if the file exists + Loads model weights from a file + """ + + # Checks presence of the model files + if self.load_path.exists(): + path = str(self.load_path.resolve()) + log.info('[loading model from {}]'.format(path)) + self.model_.load_weights(path) + + def save(self) -> None: + """ + Saves model weights to the save_path, provided in config. 
The directory is + already created by super().__init__, which is called in __init__ of this class""" + path = str(self.save_path.absolute()) + log.info('[saving model to {}]'.format(path)) + self.model_.save_weights(path) + def _initialize(self): if isinstance(self.char_window_size, int): self.char_window_size = [self.char_window_size] @@ -123,19 +161,7 @@ def _initialize(self): if self.regularizer is not None: self.regularizer = kreg.l2(self.regularizer) if self.verbose > 0: - log.info("{} symbols, {} tags in CharacterTagger".format(self.symbols_number_, self.tags_number_)) - - @property - def symbols_number_(self) -> int: - """Character vocabulary size - """ - return len(self.symbols) - - @property - def tags_number_(self) -> int: - """Tag vocabulary size - """ - return len(self.tags) + log.info("{} symbols, {} tags in CharacterTagger".format(len(self.symbols), len(self.tags))) def build(self): """Builds the network using Keras. @@ -162,8 +188,8 @@ def build(self): def _build_word_cnn(self, inputs): """Builds word-level network """ - inputs = kl.Lambda(kb.one_hot, arguments={"num_classes": self.symbols_number_}, - output_shape=lambda x: tuple(x) + (self.symbols_number_,))(inputs) + inputs = kl.Lambda(kb.one_hot, arguments={"num_classes": len(self.symbols)}, + output_shape=lambda x: tuple(x) + (len(self.symbols),))(inputs) char_embeddings = kl.Dense(self.char_embeddings_size, use_bias=False)(inputs) conv_outputs = [] self.char_output_dim_ = 0 @@ -213,7 +239,7 @@ def _build_basic_network(self, word_outputs): kl.LSTM(self.word_lstm_units[-1], return_sequences=True, dropout=self.lstm_dropout))(lstm_outputs) pre_outputs = kl.TimeDistributed( - kl.Dense(self.tags_number_, activation="softmax", + kl.Dense(len(self.tags), activation="softmax", activity_regularizer=self.regularizer), name="p")(lstm_outputs) return pre_outputs, lstm_outputs @@ -231,26 +257,32 @@ def _transform_batch(self, data, labels=None, transform_to_one_hot=True): else: return X - def train_on_batch(self, data: List[Iterable], labels: Iterable[list]) -> None: - """Trains model on a single batch + def train_on_batch(self, *args) -> None: + """Trains the model on a single batch. Args: - data: a batch of word sequences - labels: a batch of correct tag sequences - Returns: - the trained model + *args: the list of network inputs. + Last element of `args` is the batch of targets, + all previous elements are training data batches """ + # data: List[Iterable], labels: Iterable[list] + # Args: + # data: a batch of word sequences + # labels: a batch of correct tag sequences + *data, labels = args X, Y = self._transform_batch(data, labels) self.model_.train_on_batch(X, Y) - def predict_on_batch(self, data: Union[list, tuple], + def predict_on_batch(self, data: Union[List[np.ndarray], Tuple[np.ndarray]], return_indexes: bool = False) -> List[List[str]]: """ Makes predictions on a single batch Args: - data: a batch of word sequences together with additional inputs - return_indexes: whether to return tag indexes in vocabulary or tags themselves + data: model inputs for a single batch, data[0] contains input character encodings + and is the only element of data for mist models. Subsequent elements of data + include the output of additional vectorizers, e.g., dictionary-based one. 
+ return_indexes: whether to return tag indexes in vocabulary or the tags themselves Returns: a batch of label sequences @@ -265,6 +297,18 @@ def predict_on_batch(self, data: Union[list, tuple], answer[i] = elem if return_indexes else self.tags.idxs2toks(elem) return answer + def __call__(self, *x_batch, **kwargs) -> Union[List, np.ndarray]: + """ + Predicts answers on batch elements. + + Args: + x_batch: a batch to predict answers on. It can be either a single array + for basic model or a sequence of arrays for a complex one ( + :config:`configuration file ` + or its lemmatized version). + """ + return self.predict_on_batch(x_batch, **kwargs) + def _make_sent_vector(self, sent: List, bucket_length: int =None) -> np.ndarray: """Transforms a sentence to Numpy array, which will be the network input. @@ -302,30 +346,3 @@ def _make_tags_vector(self, tags, bucket_length=None) -> np.ndarray: for i, tag in enumerate(tags): answer[i] = self.tags.tok2idx(tag) return answer - - def save(self, outfile) -> None: - """Saves model weights to a file - - Args: - outfile: file with model weights (other model components should be given in config) - """ - self.model_.save_weights(outfile) - - def load(self, infile) -> None: - """Loads model weights from a file - - Args: - infile: file to load model weights from - """ - self.model_.load_weights(infile) - - -@register("morpho_tagger") -class MorphoTagger(KerasWrapper): - """ - A wrapper over :class:`CharacterTagger`. - It is inherited from :class:`~deeppavlov.core.keras_model.KerasWrapper`. - It accepts initialization parameters of :class:`CharacterTagger` - """ - def __init__(self, *args, **kwargs) -> None: - super().__init__(CharacterTagger, *args, **kwargs) \ No newline at end of file diff --git a/docs/apiref/core/models.rst b/docs/apiref/core/models.rst index aeb3e1a22d..ee9d59a537 100644 --- a/docs/apiref/core/models.rst +++ b/docs/apiref/core/models.rst @@ -16,6 +16,4 @@ Abstract model classes and interfaces. .. autoclass:: deeppavlov.core.models.keras_model.KerasModel -.. autoclass:: deeppavlov.core.models.keras_model.KerasWrapper - .. autoclass:: deeppavlov.core.models.lr_scheduled_model.LRScheduledModel diff --git a/docs/apiref/models/morpho_tagger.rst b/docs/apiref/models/morpho_tagger.rst index 539bcace19..8e73a7a9ce 100644 --- a/docs/apiref/models/morpho_tagger.rst +++ b/docs/apiref/models/morpho_tagger.rst @@ -1,12 +1,12 @@ deeppavlov.models.morpho_tagger =============================== -.. autoclass:: deeppavlov.models.morpho_tagger.network.MorphoTagger +.. autoclass:: deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger + :members: -.. autofunction:: deeppavlov.models.morpho_tagger.common.predict_with_model + .. automethod:: __call__ -.. autoclass:: deeppavlov.models.morpho_tagger.network.CharacterTagger - :members: +.. autofunction:: deeppavlov.models.morpho_tagger.common.predict_with_model .. autoclass:: deeppavlov.models.morpho_tagger.lemmatizer.UDPymorphyLemmatizer :members: diff --git a/docs/components/morphotagger.rst b/docs/components/morphotagger.rst index 7ddf46d7ec..83973b4af5 100644 --- a/docs/components/morphotagger.rst +++ b/docs/components/morphotagger.rst @@ -576,7 +576,7 @@ are listed in a separate distributed with the library. This part of the config l } The next part performs the tagging itself. Together with general parameters it describes -the input parameters of :class:`~deeppavlov.models.morpho_tagger.network.CharacterTagger`) class. 
+the input parameters of :class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger`) class. :: @@ -605,13 +605,13 @@ When an additional vectorizer is used, the first line is changed to Config includes general parameters of :class:`~deeppavlov.core.models.component.Component` class, described in the :doc:`config_description ` and specific -:class:`~deeppavlov.models.morpho_tagger.network.CharacterTagger` +:class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger` parameters. The latter include - ``tags`` - tag vocabulary. ``#tag_vocab`` refers to an already defined model with ``"id" = "tag_vocab"``. - ``symbols`` - character vocabulary. ``#char_vocab`` refers to an already defined model with ``"id" = "char_vocab"``. -and other specific parameters of the network, available in :class:`~deeppavlov.models.morpho_tagger.network.CharacterTagger` documentation. +and other specific parameters of the network, available in :class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger` documentation. The ``"train"`` section of ``"chainer"`` contains training parameters, such as number of epochs, batch_size and logging frequency, see general readme for more details. From 4ad20913ec41874556bd493f88509f437273161d Mon Sep 17 00:00:00 2001 From: romanov Date: Tue, 9 Jul 2019 17:31:25 +0300 Subject: [PATCH 05/18] feat: add Snips dataset reader and iterators --- .../configs/classifiers/intents_snips.json | 21 +---- .../classifiers/intents_snips_big.json | 21 +---- .../classifiers/intents_snips_sklearn.json | 21 +---- .../intents_snips_tfidf_weighted.json | 21 +---- .../evolution/evolve_intents_snips.json | 19 +--- deeppavlov/core/common/registry.json | 3 + .../snips_intents_iterator.py | 38 ++++++++ .../dataset_iterators/snips_ner_iterator.py | 48 ++++++++++ deeppavlov/dataset_readers/snips_reader.py | 93 +++++++++++++++++++ .../classifiers/intents_snips_bigru.json | 19 +--- .../classifiers/intents_snips_bilstm.json | 19 +--- .../intents_snips_bilstm_bilstm.json | 19 +--- .../classifiers/intents_snips_bilstm_cnn.json | 19 +--- .../intents_snips_bilstm_proj_layer.json | 19 +--- ...tents_snips_bilstm_self_add_attention.json | 19 +--- ...ents_snips_bilstm_self_mult_attention.json | 19 +--- .../classifiers/intents_snips_cnn_bilstm.json | 19 +--- 17 files changed, 225 insertions(+), 212 deletions(-) create mode 100644 deeppavlov/dataset_iterators/snips_intents_iterator.py create mode 100644 deeppavlov/dataset_iterators/snips_ner_iterator.py create mode 100644 deeppavlov/dataset_readers/snips_reader.py diff --git a/deeppavlov/configs/classifiers/intents_snips.json b/deeppavlov/configs/classifiers/intents_snips.json index 9fe4593b79..b64349b16f 100644 --- a/deeppavlov/configs/classifiers/intents_snips.json +++ b/deeppavlov/configs/classifiers/intents_snips.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -145,11 +136,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, - { +{ "url": 
"http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" }, diff --git a/deeppavlov/configs/classifiers/intents_snips_big.json b/deeppavlov/configs/classifiers/intents_snips_big.json index f9c728f02b..64e4363572 100644 --- a/deeppavlov/configs/classifiers/intents_snips_big.json +++ b/deeppavlov/configs/classifiers/intents_snips_big.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -145,11 +136,7 @@ "server_utils": "KerasIntentModel" }, "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, - { +{ "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" }, diff --git a/deeppavlov/configs/classifiers/intents_snips_sklearn.json b/deeppavlov/configs/classifiers/intents_snips_sklearn.json index 37e898ff8b..78ddf5c8f1 100644 --- a/deeppavlov/configs/classifiers/intents_snips_sklearn.json +++ b/deeppavlov/configs/classifiers/intents_snips_sklearn.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -193,11 +184,7 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, - { +{ "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v9.tar.gz", "subdir": "{MODELS_PATH}/classifiers" }, diff --git a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json b/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json index f0f263ba00..8e85b9f4d6 100644 --- a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json +++ b/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -189,11 +180,7 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, - { +{ "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v10.tar.gz", "subdir": "{MODELS_PATH}/classifiers" }, diff --git 
a/deeppavlov/configs/evolution/evolve_intents_snips.json b/deeppavlov/configs/evolution/evolve_intents_snips.json index 6db0269721..29ccf85b9f 100644 --- a/deeppavlov/configs/evolution/evolve_intents_snips.json +++ b/deeppavlov/configs/evolution/evolve_intents_snips.json @@ -1,28 +1,19 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", + "class_name": "snips_intents_iterator", "seed": { "evolve_range": [ 50, 500 ], "discrete": true - }, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + } }, "chainer": { "in": [ @@ -208,10 +199,6 @@ "server_utils": "KerasIntentModel" }, "download": [ - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index e329e9bb75..831c175835 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -117,6 +117,9 @@ "sklearn_component": "deeppavlov.models.sklearn.sklearn_component:SklearnComponent", "slotfill_raw": "deeppavlov.models.slotfill.slotfill_raw:SlotFillingComponent", "smn_nn": "deeppavlov.models.ranking.sequential_matching_network:SMNNetwork", + "snips_intents_iterator": "deeppavlov.dataset_iterators.snips_intents_iterator:SnipsIntentIterator", + "snips_ner_iterator": "deeppavlov.dataset_iterators.snips_ner_iterator:SnipsNerIterator", + "snips_reader": "deeppavlov.dataset_readers.snips_reader:SnipsReader", "spelling_error_model": "deeppavlov.models.spelling_correction.brillmoore.error_model:ErrorModel", "spelling_levenshtein": "deeppavlov.models.spelling_correction.levenshtein.searcher_component:LevenshteinSearcherComponent", "split_tokenizer": "deeppavlov.models.tokenizers.split_tokenizer:SplitTokenizer", diff --git a/deeppavlov/dataset_iterators/snips_intents_iterator.py b/deeppavlov/dataset_iterators/snips_intents_iterator.py new file mode 100644 index 0000000000..c96a1be0d1 --- /dev/null +++ b/deeppavlov/dataset_iterators/snips_intents_iterator.py @@ -0,0 +1,38 @@ +# Copyright 2019 Alexey Romanov +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from typing import List, Any
+
+from overrides import overrides
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
+
+
+@register('snips_intents_iterator')
+class SnipsIntentIterator(DataLearningIterator):
+    @overrides
+    def split(self, *args, **kwargs):
+        self.train = self._split(self.train)
+        self.valid = self._split(self.valid)
+        self.test = self._split(self.test)
+
+    @staticmethod
+    def _split(queries: List[Any]):
+        result = []
+        for query in queries:
+            text = ''.join(part['text'] for part in query['data'])
+            intent = query['intent']
+            result.append((text, [intent]))
+        return result
diff --git a/deeppavlov/dataset_iterators/snips_ner_iterator.py b/deeppavlov/dataset_iterators/snips_ner_iterator.py
new file mode 100644
index 0000000000..9c25016a7e
--- /dev/null
+++ b/deeppavlov/dataset_iterators/snips_ner_iterator.py
@@ -0,0 +1,50 @@
+# Copyright 2019 Alexey Romanov
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Any
+
+import nltk
+from overrides import overrides
+
+from deeppavlov.core.common.registry import register
+from deeppavlov.core.data.data_learning_iterator import DataLearningIterator
+
+
+@register('snips_ner_iterator')
+class SnipsNerIterator(DataLearningIterator):
+    @overrides
+    def split(self, *args, **kwargs):
+        self.train = self._split(self.train)
+        self.valid = self._split(self.valid)
+        self.test = self._split(self.test)
+
+    @staticmethod
+    def _split(queries: List[Any]):
+        result = []
+        for query in queries:
+            query = query['data']
+            words = []
+            slots = []
+            for part in query:
+                part_words = nltk.tokenize.wordpunct_tokenize(part['text'])
+                entity = part.get('entity', None)
+                if entity:
+                    slots.append('B-' + entity)
+                    slots += ['I-' + entity] * (len(part_words) - 1)
+                else:
+                    slots += ['O'] * len(part_words)
+                words += part_words
+
+            result.append((words, slots))
+        return result
diff --git a/deeppavlov/dataset_readers/snips_reader.py b/deeppavlov/dataset_readers/snips_reader.py
new file mode 100644
index 0000000000..28ec70b56c
--- /dev/null
+++ b/deeppavlov/dataset_readers/snips_reader.py
@@ -0,0 +1,93 @@
+# Copyright 2019 Alexey Romanov
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License. 
+ +import json +from logging import getLogger +from typing import List, Dict, Any, Optional +from pathlib import Path + +from overrides import overrides + +from deeppavlov.core.common.registry import register +from deeppavlov.core.data.dataset_reader import DatasetReader +from deeppavlov.core.data.utils import download_decompress, mark_done, is_done + +log = getLogger(__name__) + + +@register('snips_reader') +class SnipsReader(DatasetReader): + """The class to download and read Snips NLU Benchmark dataset (custom intents section). + + See https://github.com/snipsco/nlu-benchmark. + """ + + # noinspection PyAttributeOutsideInit + @overrides + def read(self, data_path: str, queries_per_intent: Optional[int] = None, test_validate_split: float = 0.5, + *args, **kwargs) -> \ + Dict[str, List[Dict[str, Any]]]: + """ + Each query in the output has the following form: + { 'intent': intent_name, + 'data': [ { 'text': text, ('entity': slot_name)? } ] + } + + Args: + data_path: A path to a folder with dataset files. + queries_per_intent: Number of queries to load for each intent. None to load all. + If the requested number is greater than available in file, all queries are returned. + test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips + is split into training and validation sets without a separate test set). + """ + data_path = Path(data_path) + intents = ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', + 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'] + + if not is_done(data_path): + url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz' + log.info('[downloading data from {} to {}]'.format(url, data_path)) + download_decompress(url, data_path) + mark_done(data_path) + + use_full_file = queries_per_intent is None or queries_per_intent > 70 + training_data = [] + validation_data = [] + test_data = [] + + for intent in intents: + intent_path = data_path / intent + train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json" + validate_file_name = f"validate_{intent}.json" + + train_queries = self._load_file(intent_path / train_file_name, intent, queries_per_intent) + validate_queries = self._load_file(intent_path / validate_file_name, intent, queries_per_intent) + num_test_queries = round(len(validate_queries) * test_validate_split) + + training_data.extend(train_queries) + validation_data.extend(validate_queries[num_test_queries:]) + test_data.extend(validate_queries[:num_test_queries]) + + return {'train': training_data, 'valid': validation_data, 'test': test_data} + + @staticmethod + def _load_file(path: Path, intent: str, num_queries: Optional[int]): + with path.open(encoding='latin_1') as f: + data = json.load(f) + + # restrict number of queries + queries = data[intent][:num_queries] + for query in queries: + query['intent'] = intent + return queries diff --git a/tests/test_configs/classifiers/intents_snips_bigru.json b/tests/test_configs/classifiers/intents_snips_bigru.json index 3e74fe71d6..244d54b617 100644 --- a/tests/test_configs/classifiers/intents_snips_bigru.json +++ b/tests/test_configs/classifiers/intents_snips_bigru.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + 
"class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -145,10 +136,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm.json b/tests/test_configs/classifiers/intents_snips_bilstm.json index eb8bd3e984..5371d84b17 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -145,10 +136,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json b/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json index a8c4748ff1..7af8d58c42 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_bilstm.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -146,10 +137,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json b/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json index b96efc592a..f146ad4779 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_cnn.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -152,10 +143,6 @@ "url": 
"http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm_proj_layer.json b/tests/test_configs/classifiers/intents_snips_bilstm_proj_layer.json index 940ecb191b..18489bc2fd 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm_proj_layer.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_proj_layer.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -147,10 +138,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json b/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json index fac8d9c101..588020a7da 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_self_add_attention.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -148,10 +139,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json b/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json index 358731a333..3ec8802f80 100644 --- a/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json +++ b/tests/test_configs/classifiers/intents_snips_bilstm_self_mult_attention.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ 
@@ -148,10 +139,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" diff --git a/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json b/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json index 68a5737833..2b62a146a0 100644 --- a/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json +++ b/tests/test_configs/classifiers/intents_snips_cnn_bilstm.json @@ -1,22 +1,13 @@ { "dataset_reader": { - "class_name": "basic_classification_reader", + "class_name": "snips_reader", "x": "text", "y": "intents", "data_path": "{DOWNLOADS_PATH}/snips" }, "dataset_iterator": { - "class_name": "basic_classification_iterator", - "seed": 42, - "field_to_split": "train", - "split_fields": [ - "train", - "valid" - ], - "split_proportions": [ - 0.9, - 0.1 - ] + "class_name": "snips_intents_iterator", + "seed": 42 }, "chainer": { "in": [ @@ -152,10 +143,6 @@ "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", "subdir": "{MODELS_PATH}" }, - { - "url": "http://files.deeppavlov.ai/datasets/snips_intents/train.csv", - "subdir": "{DOWNLOADS_PATH}/snips" - }, { "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/dstc2_fastText_model.bin", "subdir": "{DOWNLOADS_PATH}/embeddings" From 2fc1d96994b5c479dfcf8bdc424a75cdb563cb33 Mon Sep 17 00:00:00 2001 From: romanov Date: Wed, 10 Jul 2019 10:54:01 +0300 Subject: [PATCH 06/18] refactor: unify split implementations in dataset iterators --- .../core/data/data_learning_iterator.py | 11 +++++++--- .../dataset_iterators/dialog_iterator.py | 16 ++++----------- .../document_bert_ner_iterator.py | 15 +------------- .../dataset_iterators/dstc2_ner_iterator.py | 20 ++++--------------- .../kvret_dialog_iterator.py | 10 ++-------- .../morphotagger_iterator.py | 2 +- .../snips_intents_iterator.py | 10 ++-------- .../dataset_iterators/snips_ner_iterator.py | 10 ++-------- .../dataset_iterators/squad_iterator.py | 8 ++------ 9 files changed, 26 insertions(+), 76 deletions(-) diff --git a/deeppavlov/core/data/data_learning_iterator.py b/deeppavlov/core/data/data_learning_iterator.py index ff4085b852..d2ee6af42c 100644 --- a/deeppavlov/core/data/data_learning_iterator.py +++ b/deeppavlov/core/data/data_learning_iterator.py @@ -32,17 +32,22 @@ class DataLearningIterator: random: instance of ``Random`` initialized with a seed """ def split(self, *args, **kwargs): + """ Manipulate self.train, self.valid, and self.test into their final form. """ pass + def preprocess(self, data: List[Tuple[Any, Any]], *args, **kwargs) -> List[Tuple[Any, Any]]: + """ Transform the data for a specific data type (e.g. ``'train'``). 
""" + return data + def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, shuffle: bool = True, *args, **kwargs) -> None: self.shuffle = shuffle self.random = Random(seed) - self.train = data.get('train', []) - self.valid = data.get('valid', []) - self.test = data.get('test', []) + self.train = self.preprocess(data.get('train', []), *args, **kwargs) + self.valid = self.preprocess(data.get('valid', []), *args, **kwargs) + self.test = self.preprocess(data.get('test', []), *args, **kwargs) self.split(*args, **kwargs) self.data = { 'train': self.train, diff --git a/deeppavlov/dataset_iterators/dialog_iterator.py b/deeppavlov/dataset_iterators/dialog_iterator.py index 5729f00a50..447c47c578 100644 --- a/deeppavlov/dataset_iterators/dialog_iterator.py +++ b/deeppavlov/dataset_iterators/dialog_iterator.py @@ -32,8 +32,8 @@ class DialogDatasetIterator(DataLearningIterator): test: list of dialogs used for testing (tuples ``(context, response)``) """ - @staticmethod - def _dialogs(data): + @overrides + def preprocess(self, data, *args, **kwargs): dialogs = [] prev_resp_act = None for x, y in data: @@ -47,12 +47,6 @@ def _dialogs(data): dialogs[-1][1].append(y) return dialogs - @overrides - def split(self, *args, **kwargs): - self.train = self._dialogs(self.train) - self.valid = self._dialogs(self.valid) - self.test = self._dialogs(self.test) - @register('dialog_db_result_iterator') class DialogDBResultDatasetIterator(DataLearningIterator): @@ -76,7 +70,5 @@ def _db_result(data): return x['db_result'] @overrides - def split(self, *args, **kwargs): - self.train = [(r, "") for r in filter(None, map(self._db_result, self.train))] - self.valid = [(r, "") for r in filter(None, map(self._db_result, self.valid))] - self.test = [(r, "") for r in filter(None, map(self._db_result, self.test))] + def preprocess(self, data, *args, **kwargs): + return [(r, "") for r in filter(None, map(self._db_result, data))] diff --git a/deeppavlov/dataset_iterators/document_bert_ner_iterator.py b/deeppavlov/dataset_iterators/document_bert_ner_iterator.py index 3d0f392ad8..58fb0cacb3 100644 --- a/deeppavlov/dataset_iterators/document_bert_ner_iterator.py +++ b/deeppavlov/dataset_iterators/document_bert_ner_iterator.py @@ -59,23 +59,10 @@ def __init__(self, self.max_seq_length = max_seq_length or float('inf') self.one_sample_per_doc = one_sample_per_doc self.left_context_rate = left_context_rate - self.shuffle = shuffle - vocab_file = str(expand_path(bert_tokenizer_vocab_file)) self.tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case) - self.random = Random(seed) - - self.train = data.get('train', []) - self.valid = data.get('valid', []) - self.test = data.get('test', []) - self.split(*args, **kwargs) - self.data = { - 'train': self.train, - 'valid': self.valid, - 'test': self.test, - 'all': self.train + self.test + self.valid - } + super().__init__(data, seed, shuffle, *args, **kwargs) def gen_batches(self, batch_size: int, data_type: str = 'train', shuffle: bool = None) -> Iterator[Tuple[tuple, tuple]]: diff --git a/deeppavlov/dataset_iterators/dstc2_ner_iterator.py b/deeppavlov/dataset_iterators/dstc2_ner_iterator.py index 2854f18325..10c07ab3ad 100644 --- a/deeppavlov/dataset_iterators/dstc2_ner_iterator.py +++ b/deeppavlov/dataset_iterators/dstc2_ner_iterator.py @@ -14,8 +14,7 @@ import json import logging -from random import Random -from typing import List, Tuple +from typing import List, Tuple, Dict from deeppavlov.core.commands.utils import expand_path from 
deeppavlov.core.common.registry import register @@ -37,26 +36,15 @@ class Dstc2NerDatasetIterator(DataLearningIterator): seed: value for random seed shuffle: whether to shuffle the data """ - def __init__(self, data: List[Tuple], dataset_path: str, seed: int = None, shuffle: bool = False): - self.shuffle = shuffle - self.random = Random(seed) + def __init__(self, data: Dict[str, List[Tuple]], dataset_path: str, seed: int = None, shuffle: bool = False): # TODO: include slot vals to dstc2.tar.gz dataset_path = expand_path(dataset_path) / 'slot_vals.json' self._build_slot_vals(dataset_path) with open(dataset_path, encoding='utf8') as f: self._slot_vals = json.load(f) - for data_type in ['train', 'test', 'valid']: - bio_markup_data = self._preprocess(data.get(data_type, [])) - setattr(self, data_type, bio_markup_data) - self.data = { - 'train': self.train, - 'valid': self.valid, - 'test': self.test, - 'all': self.train + self.test + self.valid - } - self.shuffle = shuffle + super().__init__(data, seed, shuffle) - def _preprocess(self, data_part): + def preprocess(self, data_part, *args, **kwargs): processed_data_part = list() processed_texts = dict() for sample in data_part: diff --git a/deeppavlov/dataset_iterators/kvret_dialog_iterator.py b/deeppavlov/dataset_iterators/kvret_dialog_iterator.py index 2943719948..dcc3f8e772 100644 --- a/deeppavlov/dataset_iterators/kvret_dialog_iterator.py +++ b/deeppavlov/dataset_iterators/kvret_dialog_iterator.py @@ -55,8 +55,8 @@ def _dialogs(data): dialogs[-1][1][1].append(task) return dialogs - @staticmethod - def _utterances(data): + @overrides + def preprocess(self, data, *args, **kwargs): utters = [] history = [] for x, y in data: @@ -74,9 +74,3 @@ def _utterances(data): y_tuple = (y['text'], y['task']['intent']) utters.append((x_tuple, y_tuple)) return utters - - @overrides - def split(self, *args, **kwargs): - self.train = self._utterances(self.train) - self.valid = self._utterances(self.valid) - self.test = self._utterances(self.test) diff --git a/deeppavlov/dataset_iterators/morphotagger_iterator.py b/deeppavlov/dataset_iterators/morphotagger_iterator.py index 515dad152c..1b7ac0413c 100644 --- a/deeppavlov/dataset_iterators/morphotagger_iterator.py +++ b/deeppavlov/dataset_iterators/morphotagger_iterator.py @@ -67,7 +67,7 @@ def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], seed: int = None, self.min_train_fraction = min_train_fraction super().__init__(data, seed, shuffle) - def split(self) -> None: + def split(self, *args, **kwargs) -> None: """ Splits the `train` part to `train` and `valid`, if no `valid` part is specified. 
Moves deficient data from `valid` to `train` if both parts are given, diff --git a/deeppavlov/dataset_iterators/snips_intents_iterator.py b/deeppavlov/dataset_iterators/snips_intents_iterator.py index c96a1be0d1..4f90455336 100644 --- a/deeppavlov/dataset_iterators/snips_intents_iterator.py +++ b/deeppavlov/dataset_iterators/snips_intents_iterator.py @@ -23,15 +23,9 @@ @register('snips_intents_iterator') class SnipsIntentIterator(DataLearningIterator): @overrides - def split(self, *args, **kwargs): - self.train = self._split(self.train) - self.valid = self._split(self.valid) - self.test = self._split(self.test) - - @staticmethod - def _split(queries: List[Any]): + def preprocess(self, data, *args, **kwargs): result = [] - for query in queries: + for query in data: text = ''.join(part['text'] for part in query['data']) intent = query['intent'] result.append((text, [intent])) diff --git a/deeppavlov/dataset_iterators/snips_ner_iterator.py b/deeppavlov/dataset_iterators/snips_ner_iterator.py index 9c25016a7e..2186ebbaa9 100644 --- a/deeppavlov/dataset_iterators/snips_ner_iterator.py +++ b/deeppavlov/dataset_iterators/snips_ner_iterator.py @@ -22,15 +22,9 @@ @register('snips_ner_iterator') class SnipsNerIterator(DataLearningIterator): @overrides - def split(self, *args, **kwargs): - self.train = self._split(self.train) - self.valid = self._split(self.valid) - self.test = self._split(self.test) - - @staticmethod - def _split(queries: List[Any]): + def preprocess(self, data, *args, **kwargs): result = [] - for query in queries: + for query in data: query = query['data'] words = [] slots = [] diff --git a/deeppavlov/dataset_iterators/squad_iterator.py b/deeppavlov/dataset_iterators/squad_iterator.py index 84c82d9d00..518c6b6aa7 100644 --- a/deeppavlov/dataset_iterators/squad_iterator.py +++ b/deeppavlov/dataset_iterators/squad_iterator.py @@ -37,12 +37,8 @@ class SquadIterator(DataLearningIterator): """ - def split(self, *args, **kwargs) -> None: - for dt in ['train', 'valid', 'test']: - setattr(self, dt, SquadIterator._extract_cqas(getattr(self, dt))) - - @staticmethod - def _extract_cqas(data: Dict[str, Any]) -> List[Tuple[Tuple[str, str], Tuple[List[str], List[int]]]]: + def preprocess(self, data: Dict[str, Any], *args, **kwargs) -> \ + List[Tuple[Tuple[str, str], Tuple[List[str], List[int]]]]: """Extracts context, question, answer, answer_start from SQuAD data Args: From 06bd745c037422cb1cbd2b3af98849541cb569ba Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Wed, 17 Jul 2019 14:25:50 +0300 Subject: [PATCH 07/18] fix: initialize validation set in MorphoTaggerDatasetIterator (#921) --- deeppavlov/dataset_iterators/morphotagger_iterator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/dataset_iterators/morphotagger_iterator.py b/deeppavlov/dataset_iterators/morphotagger_iterator.py index 1b7ac0413c..dca0daa990 100644 --- a/deeppavlov/dataset_iterators/morphotagger_iterator.py +++ b/deeppavlov/dataset_iterators/morphotagger_iterator.py @@ -77,7 +77,7 @@ def split(self, *args, **kwargs) -> None: if self.shuffle: random.shuffle(self.train) L = int(len(self.train) * (1.0 - self.validation_split)) - self.train, self.valid = self.train[:L], self.valid[L:] + self.train, self.valid = self.train[:L], self.train[L:] elif self.min_train_fraction > 0.0: train_length = len(self.train) valid_length = len(self.valid) From 2155632a4bd53959a2dd1725b035f46e55e104e5 Mon Sep 17 00:00:00 2001 From: Alexander Dmitrievskiy Date: Wed, 24 Jul 2019 11:40:55 +0300 Subject: [PATCH 08/18] 
feat: remove DefaultVocabulary in favor of SimpleVocabulary (#924) * refactor morpho_tagger to be compatible with SimpleVocabulary * add idxs2toks method to SimpleVocabulary for compatibility with morphoragger * fix bug with simple vocab usage in seq2seq bot * add documentation to SimpleVocabulary * rename and document LowercasePreprocessor into CharSplittingLowercasePreprocessor and update affected configs * rename ids in morphotagger configs * rename output variable from char_splitting_lowercase_preprocessor * update MorphoTagger for SimpleVocabulary usage instead of DefaultVocaulary * delete DefaultVocabulary * update documentation in MorphoTagger * prettify docs with Usage of CharSplittingLowercasePreprocessor * update docs for params in SimpleVocabulary * docs: complete and fix @acriptis updates in LowercasePreprocessor -> CharSplittingLowercasePreprocessor modification. --- .../morpho_tagger/UD2.0/morpho_ar.json | 16 +- .../morpho_tagger/UD2.0/morpho_cs.json | 16 +- .../morpho_tagger/UD2.0/morpho_de.json | 16 +- .../morpho_tagger/UD2.0/morpho_en.json | 16 +- .../morpho_tagger/UD2.0/morpho_es_ancora.json | 16 +- .../morpho_tagger/UD2.0/morpho_fr.json | 16 +- .../morpho_tagger/UD2.0/morpho_hi.json | 16 +- .../morpho_tagger/UD2.0/morpho_hu.json | 16 +- .../morpho_tagger/UD2.0/morpho_it.json | 16 +- .../UD2.0/morpho_ru_syntagrus.json | 16 +- .../UD2.0/morpho_ru_syntagrus_pymorphy.json | 16 +- ...orpho_ru_syntagrus_pymorphy_lemmatize.json | 16 +- .../morpho_tagger/UD2.0/morpho_tr.json | 16 +- .../configs/seq2seq_go_bot/bot_kvret.json | 4 +- .../seq2seq_go_bot/bot_kvret_train.json | 4 +- deeppavlov/core/common/registry.json | 2 +- deeppavlov/core/data/simple_vocab.py | 16 +- deeppavlov/core/data/vocab.py | 216 ------------------ deeppavlov/models/go_bot/network.py | 2 +- .../models/morpho_tagger/morpho_tagger.py | 16 +- .../models/preprocessors/capitalization.py | 20 +- docs/apiref/core/data.rst | 2 - docs/apiref/models/preprocessors.rst | 2 +- docs/components/morphotagger.rst | 6 +- 24 files changed, 136 insertions(+), 362 deletions(-) delete mode 100644 deeppavlov/core/data/vocab.py diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json index bdd4b8f458..5a6dedf6ec 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ar.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ar/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json index ec83e44c59..0171da3de9 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json +++ 
b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_cs.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/cs/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json index cf7923039a..c6e304f164 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_de.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/de/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json index 89b40074d3..441f6b27df 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_en.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/en/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json index acdf3768d9..1bb78b7bbb 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json +++ 
b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_es_ancora.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/es_ancora/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json index 3b583e9868..d48241e913 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_fr.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/fr/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json index 578ba82714..120b18f75d 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hi.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hi/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json index 09b7c058f7..623e7548fd 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json +++ 
b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_hu.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/hu/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json index 6705aca6d6..dfcd799d05 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_it.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/it/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json index 09dd43c6ce..1aaa8ca70b 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,23 +55,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict" }, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json index bbd247a627..b4f6da6f6a 100644 
--- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy.json @@ -30,22 +30,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -56,17 +55,16 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict" }, @@ -85,7 +83,7 @@ }, { "in": [ - "x_processed", + "x_chars_lowered_marked", "x_possible_tags" ], "in_y": [ diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json index 28f8b268e6..34a8c55aec 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_ru_syntagrus_pymorphy_lemmatize.json @@ -31,22 +31,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -57,17 +56,16 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/ru_syntagrus/char.dict" }, @@ -86,7 +84,7 @@ }, { "in": [ - "x_processed", + "x_chars_lowered_marked", "x_possible_tags" ], "in_y": [ diff --git a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json index 33041d7012..b4e96569cc 100644 --- a/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json +++ b/deeppavlov/configs/morpho_tagger/UD2.0/morpho_tr.json @@ -31,22 +31,21 @@ ] }, { - "id": "lowercase_preprocessor", - "class_name": "lowercase_preprocessor", + "id": "char_splitting_lowercase_preprocessor", + "class_name": "char_splitting_lowercase_preprocessor", "in": [ "x_tokens" ], "out": [ - "x_processed" + "x_chars_lowered_marked" ] }, { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": [ "y" ], - "level": "token", "special_tokens": [ "PAD", "BEGIN", @@ -57,23 +56,22 @@ }, { "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": [ - "x_processed" + "x_chars_lowered_marked" ], "special_tokens": [ "PAD", "BEGIN", "END" ], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/char.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tr/char.dict" 
}, { "in": [ - "x_processed" + "x_chars_lowered_marked" ], "in_y": [ "y" diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json index a5a4b724d8..c7f2b469cf 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret.json @@ -38,7 +38,7 @@ { "id": "src_token_vocab", "class_name": "simple_vocab", - "default_token": "", + "unk_token": "", "special_tokens": ["", ""], "save_path": "{MODELS_PATH}/vocabs/kvret_src_tokens.dict", "load_path": "{MODELS_PATH}/vocabs/kvret_src_tokens.dict" @@ -46,7 +46,7 @@ { "id": "tgt_token_vocab", "class_name": "simple_vocab", - "default_token": "", + "unk_token": "", "special_tokens": ["", "", ""], "save_path": "{MODELS_PATH}/vocabs/kvret_tgt_tokens.dict", "load_path": "{MODELS_PATH}/vocabs/kvret_tgt_tokens.dict" diff --git a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_train.json b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_train.json index 7d2ce923a1..1febea3fbb 100644 --- a/deeppavlov/configs/seq2seq_go_bot/bot_kvret_train.json +++ b/deeppavlov/configs/seq2seq_go_bot/bot_kvret_train.json @@ -56,7 +56,7 @@ "fit_on": ["x_tokens", "y_tokens"], "class_name": "simple_vocab", "min_freq": 2, - "default_token": "", + "unk_token": "", "special_tokens": ["", ""], "save_path": "{MODELS_PATH}/vocabs/kvret_src_tokens.dict", "load_path": "{MODELS_PATH}/vocabs/kvret_src_tokens.dict" @@ -65,7 +65,7 @@ "id": "tgt_token_vocab", "fit_on": ["y_without_entities_tokens"], "class_name": "simple_vocab", - "default_token": "", + "unk_token": "", "special_tokens": ["", "", ""], "save_path": "{MODELS_PATH}/vocabs/kvret_tgt_tokens.dict", "load_path": "{MODELS_PATH}/vocabs/kvret_tgt_tokens.dict" diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 831c175835..079010b520 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -69,7 +69,7 @@ "lemmatized_output_prettifier": "deeppavlov.models.morpho_tagger.common:LemmatizedOutputPrettifier", "line_reader": "deeppavlov.dataset_readers.line_reader:LineReader", "logit_ranker": "deeppavlov.models.doc_retrieval.logit_ranker:LogitRanker", - "lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:LowercasePreprocessor", + "char_splitting_lowercase_preprocessor": "deeppavlov.models.preprocessors.capitalization:CharSplittingLowercasePreprocessor", "mask": "deeppavlov.models.preprocessors.mask:Mask", "morpho_tagger": "deeppavlov.models.morpho_tagger.morpho_tagger:MorphoTagger", "morphotagger_dataset": "deeppavlov.dataset_iterators.morphotagger_iterator:MorphoTaggerDatasetIterator", diff --git a/deeppavlov/core/data/simple_vocab.py b/deeppavlov/core/data/simple_vocab.py index 1d150e1019..7ccd78ab02 100644 --- a/deeppavlov/core/data/simple_vocab.py +++ b/deeppavlov/core/data/simple_vocab.py @@ -29,7 +29,17 @@ @register('simple_vocab') class SimpleVocabulary(Estimator): - """Implements simple vocabulary.""" + """Implements simple vocabulary. + + Parameters: + special_tokens: tuple of tokens that shouldn't be counted. + max_tokens: upper bound for number of tokens in the vocabulary. + min_freq: minimal count of a token (except special tokens). + pad_with_zeros: if True, then batch of elements will be padded with zeros up to length of + the longest element in batch. + unk_token: label assigned to unknown tokens. + freq_drop_load: if True, then frequencies of tokens are set to min_freq on the model load. 
+ """ def __init__(self, special_tokens: Tuple[str, ...] = tuple(), max_tokens: int = 2**30, @@ -51,7 +61,6 @@ def __init__(self, self.load() def fit(self, *args): - # return None self.reset() tokens = chain(*args) # filter(None, <>) -- to filter empty tokens @@ -162,3 +171,6 @@ def reset(self): self._t2i = defaultdict(lambda: unk_index) self._i2t = [] self.count = 0 + + def idxs2toks(self, idxs): + return [self[idx] for idx in idxs] diff --git a/deeppavlov/core/data/vocab.py b/deeppavlov/core/data/vocab.py deleted file mode 100644 index f13f69d98f..0000000000 --- a/deeppavlov/core/data/vocab.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2017 Neural Networks and Deep Learning lab, MIPT -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import itertools -from collections import Counter, defaultdict -from logging import getLogger -from pathlib import Path -from typing import List, Callable - -import numpy as np - -from deeppavlov.core.common.errors import ConfigError -from deeppavlov.core.common.registry import register -from deeppavlov.core.models.estimator import Estimator - -log = getLogger(__name__) - - -@register('default_vocab') -class DefaultVocabulary(Estimator): - """ - Implements vocabulary of tokens, chars or other structeres. - - Parameters: - level: level of operation can be tokens (``'token'``) or chars (``'char'``). - special_tokens: tuple of tokens that shouldn't be counted. - default_token: label assigned to unknown tokens. - tokenizer: callable used to get tokens out of string. - min_freq: minimal count of a token (except special tokens). 
- """ - - def __init__(self, - save_path: str, - load_path: str, - level: str = 'token', - special_tokens: List[str] = [], - default_token: str = None, - tokenizer: Callable = None, - min_freq: int = 0, - **kwargs) -> None: - - super().__init__(load_path=load_path, save_path=save_path, **kwargs) - - self.special_tokens = special_tokens - self.default_token = default_token - self.min_freq = min_freq - self.preprocess_fn = self._build_preprocess_fn(level, tokenizer) - - # TODO check via decorator - self.reset() - if self.load_path: - self.load() - - @staticmethod - def _build_preprocess_fn(level, tokenizer=None): - def iter_level(utter): - if isinstance(utter, list) and utter and isinstance(utter[0], dict): - tokens = (u['text'] for u in utter) - elif isinstance(utter, dict): - tokens = [utter['text']] - elif isinstance(utter, list) and (not utter or isinstance(utter[0], str) or isinstance(utter[0], tuple)): - tokens = utter - else: - tokens = [utter] - - if tokenizer is not None: - tokens = tokenizer([' '.join(tokens)])[0] - tokens = filter(None, tokens) - - if level == 'token': - yield from tokens - elif level == 'char': - for token in tokens: - yield from token - else: - raise ValueError("level argument is either equal to `token`" - " or to `char`") - - def preprocess_fn(data): - for d in data: - yield from iter_level(d) - - return preprocess_fn - - def __getitem__(self, key): - if isinstance(key, (int, np.integer)): - return self._i2t[key] - elif isinstance(key, str): - return self._t2i[key] - else: - raise NotImplementedError("not implemented for type `{}`".format(type(key))) - - def __contains__(self, item): - return item in self._t2i - - def __len__(self): - return len(self._t2i) - - def keys(self): - return (k for k, v in self.freqs.most_common() if k in self._t2i) - - def values(self): - return (v for k, v in self.freqs.most_common() if k in self._t2i) - - def items(self): - return ((k, v) for k, v in self.freqs.most_common() if k in self._t2i) - - def reset(self): - # default index is the position of default_token - if self.default_token is not None: - default_ind = self.special_tokens.index(self.default_token) - else: - default_ind = 0 - self._t2i = defaultdict(lambda: default_ind) - self._i2t = dict() - self.freqs = Counter() - - for i, token in enumerate(self.special_tokens): - self._t2i[token] = i - self._i2t[i] = token - self.freqs[token] += 0 - - def fit(self, *args): - self.reset() - self._train( - tokens=filter(None, itertools.chain.from_iterable( - map(self.preprocess_fn, zip(*args)))), - counts=None, - update=True - ) - - def _train(self, tokens, counts=None, update=True): - counts = counts or itertools.repeat(1) - if not update: - self.reset() - - for token, cnt in zip(tokens, counts): - self.freqs[token] += cnt - - index = len(self._t2i) - for token, count in self.freqs.items(): - if token not in self._t2i and count >= self.min_freq: - self._t2i[token] = index - self._i2t[index] = token - index += 1 - return - - def __call__(self, samples, **kwargs): - return [self[s] for s in samples] - - def save(self): - log.info("[saving vocabulary to {}]".format(self.save_path)) - - with self.save_path.open('wt', encoding="utf8") as f: - for n in range(len(self._t2i)): - token = self._i2t[n] - cnt = self.freqs[token] - f.write('{}\t{:d}\n'.format(token, cnt)) - - # @check_path_exists() - def load(self): - if self.load_path: - if self.load_path.is_file(): - log.info("[loading vocabulary from {}]".format(self.load_path)) - tokens, counts = [], [] - for ln in self.load_path.open('r', 
encoding="utf8"): - token, cnt = ln.split('\t', 1) - tokens.append(token) - counts.append(int(cnt)) - self._train(tokens=tokens, counts=counts, update=True) - elif isinstance(self.load_path, Path): - if not self.load_path.parent.is_dir(): - raise ConfigError("Provided `load_path` for {} doesn't exist!".format( - self.__class__.__name__)) - else: - raise ConfigError("`load_path` for {} is not provided!".format(self)) - - def idx2tok(self, idx): - return self._i2t[idx] - - def idxs2toks(self, idxs, filter_paddings=False): - toks = [] - for idx in idxs: - # if not filter_paddings or idx != self.tok2idx(''): - toks.append(self._i2t[idx]) - return toks - - def tok2idx(self, tok): - return self._t2i[tok] - - def toks2idxs(self, toks): - return [self._t2i[tok] for tok in toks] - - def batch_toks2batch_idxs(self, b_toks): - max_len = max(len(toks) for toks in b_toks) - # Create array filled with paddings - # batch = np.ones([len(b_toks), max_len]) * self.tok2idx('') - batch = np.zeros([len(b_toks), max_len]) - for n, tokens in enumerate(b_toks): - idxs = self.toks2idxs(tokens) - batch[n, :len(idxs)] = idxs - return batch - - def batch_idxs2batch_toks(self, b_idxs, filter_paddings=False): - return [self.idxs2toks(idxs, filter_paddings) for idxs in b_idxs] diff --git a/deeppavlov/models/go_bot/network.py b/deeppavlov/models/go_bot/network.py index e4e41e0dec..a5e713fb47 100644 --- a/deeppavlov/models/go_bot/network.py +++ b/deeppavlov/models/go_bot/network.py @@ -83,7 +83,7 @@ class GoalOrientedBot(LRScheduledTFModel): for response generation. template_type: type of used response templates in string format. word_vocab: vocabulary of input word tokens - (:class:`~deeppavlov.core.data.vocab.DefaultVocabulary` recommended). + (:class:`~deeppavlov.core.data.simple_vocab.SimpleVocabulary` recommended). bow_embedder: instance of one-hot word encoder :class:`~deeppavlov.models.embedders.bow_embedder.BoWEmbedder`. 
embedder: one of embedders from diff --git a/deeppavlov/models/morpho_tagger/morpho_tagger.py b/deeppavlov/models/morpho_tagger/morpho_tagger.py index ba95216ef0..b4a1dd0f7a 100644 --- a/deeppavlov/models/morpho_tagger/morpho_tagger.py +++ b/deeppavlov/models/morpho_tagger/morpho_tagger.py @@ -24,7 +24,7 @@ from keras import Model from deeppavlov.core.common.registry import register -from deeppavlov.core.data.vocab import DefaultVocabulary +from deeppavlov.core.data.simple_vocab import SimpleVocabulary from deeppavlov.core.models.keras_model import KerasModel from .cells import Highway from .common_tagger import to_one_hot @@ -76,8 +76,8 @@ class MorphoTagger(KerasModel): A subclass of :class:`~deeppavlov.core.models.keras_model.KerasModel` """ def __init__(self, - symbols: DefaultVocabulary, - tags: DefaultVocabulary, + symbols: SimpleVocabulary, + tags: SimpleVocabulary, save_path: Optional[Union[str, Path]] = None, load_path: Optional[Union[str, Path]] = None, mode: str = 'infer', @@ -323,12 +323,12 @@ def _make_sent_vector(self, sent: List, bucket_length: int =None) -> np.ndarray: bucket_length = bucket_length or len(sent) answer = np.zeros(shape=(bucket_length, MAX_WORD_LENGTH+2), dtype=np.int32) for i, word in enumerate(sent): - answer[i, 0] = self.tags.tok2idx("BEGIN") + answer[i, 0] = self.tags["BEGIN"] m = min(len(word), MAX_WORD_LENGTH) for j, x in enumerate(word[-m:]): - answer[i, j+1] = self.symbols.tok2idx(x) - answer[i, m+1] = self.tags.tok2idx("END") - answer[i, m+2:] = self.tags.tok2idx("PAD") + answer[i, j+1] = self.symbols[x] + answer[i, m+1] = self.tags["END"] + answer[i, m+2:] = self.tags["PAD"] return answer def _make_tags_vector(self, tags, bucket_length=None) -> np.ndarray: @@ -344,5 +344,5 @@ def _make_tags_vector(self, tags, bucket_length=None) -> np.ndarray: bucket_length = bucket_length or len(tags) answer = np.zeros(shape=(bucket_length,), dtype=np.int32) for i, tag in enumerate(tags): - answer[i] = self.tags.tok2idx(tag) + answer[i] = self.tags[tag] return answer diff --git a/deeppavlov/models/preprocessors/capitalization.py b/deeppavlov/models/preprocessors/capitalization.py index 7cb2a257c2..59fe43b74c 100644 --- a/deeppavlov/models/preprocessors/capitalization.py +++ b/deeppavlov/models/preprocessors/capitalization.py @@ -74,8 +74,10 @@ def __call__(self, tokens_batch, **kwargs): def process_word(word: str, to_lower: bool = False, append_case: Optional[str] = None) -> Tuple[str]: - """Converts word to a tuple of symbols, optionally converts it to lowercase - and adds capitalization label. + """The method implements the following operations: + 1. converts word to a tuple of symbols (character splitting), + 2. optionally converts it to lowercase and + 3. adds capitalization label. Args: word: input word @@ -84,7 +86,13 @@ def process_word(word: str, to_lower: bool = False, ('' for first capital and '' for all caps) Returns: - a preprocessed word + a preprocessed word. + + Example: + >>> process_word(word="Zaman", to_lower=True, append_case="first") + ('', 'z', 'a', 'm', 'a', 'n') + >>> process_word(word="MSU", to_lower=True, append_case="last") + ('m', 's', 'u', '') """ if all(x.isupper() for x in word) and len(word) > 1: uppercase = "" @@ -108,8 +116,8 @@ def process_word(word: str, to_lower: bool = False, return tuple(answer) -@register('lowercase_preprocessor') -class LowercasePreprocessor(Component): +@register('char_splitting_lowercase_preprocessor') +class CharSplittingLowercasePreprocessor(Component): """A callable wrapper over :func:`process_word`. 
Takes as input a batch of tokenized sentences and returns a batch of preprocessed sentences. @@ -126,4 +134,4 @@ def __call__(self, tokens_batch: List[List[str]], **kwargs) -> List[List[Tuple[s # elem = NLTKMosesTokenizer()([elem])[0] # # elem = [x for x in re.split("(\w+|[,.])", elem) if x.strip() != ""] answer.append([process_word(x, self.to_lower, self.append_case) for x in elem]) - return answer \ No newline at end of file + return answer diff --git a/docs/apiref/core/data.rst b/docs/apiref/core/data.rst index 2acefad6be..81567f9400 100644 --- a/docs/apiref/core/data.rst +++ b/docs/apiref/core/data.rst @@ -10,6 +10,4 @@ DatasetReader, Vocab, DataLearningIterator and DataFittingIterator classes. .. autoclass:: deeppavlov.core.data.sqlite_database.Sqlite3Database -.. autoclass:: deeppavlov.core.data.vocab.DefaultVocabulary - .. autoclass:: deeppavlov.core.data.simple_vocab.SimpleVocabulary diff --git a/docs/apiref/models/preprocessors.rst b/docs/apiref/models/preprocessors.rst index 72c6dabf10..f35c31e904 100644 --- a/docs/apiref/models/preprocessors.rst +++ b/docs/apiref/models/preprocessors.rst @@ -7,7 +7,7 @@ deeppavlov.models.preprocessors .. autofunction:: deeppavlov.models.preprocessors.capitalization.process_word -.. autoclass:: deeppavlov.models.preprocessors.capitalization.LowercasePreprocessor +.. autoclass:: deeppavlov.models.preprocessors.capitalization.CharSplittingLowercasePreprocessor .. autoclass:: deeppavlov.models.preprocessors.char_splitter.CharSplitter diff --git a/docs/components/morphotagger.rst b/docs/components/morphotagger.rst index 83973b4af5..8fd4a9f5d1 100644 --- a/docs/components/morphotagger.rst +++ b/docs/components/morphotagger.rst @@ -512,9 +512,8 @@ model should predict to tag indexes. { "id": "tag_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "fit_on": ["y"], - "level": "token", "special_tokens": ["PAD", "BEGIN", "END"], "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tag_en.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/tag_en.dict" @@ -527,11 +526,10 @@ symbols which occur at least ``min_freq`` times in the training set are kept. 
{ "id": "char_vocab", - "class_name": "default_vocab", + "class_name": "simple_vocab", "min_freq": 3, "fit_on": ["x_processed"], "special_tokens": ["PAD", "BEGIN", "END"], - "level": "char", "save_path": "{MODELS_PATH}/morpho_tagger/UD2.0/char_en.dict", "load_path": "{MODELS_PATH}/morpho_tagger/UD2.0/char_en.dict" }, From 73f8a1523604652916f9dcc0d7955b6061bc6239 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Wed, 24 Jul 2019 11:41:13 +0300 Subject: [PATCH 09/18] tests: increase the allowed error for testing TFLayers (#941) --- tests/test_tf_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tf_layers.py b/tests/test_tf_layers.py index aed4a2e207..e67ff61c8d 100644 --- a/tests/test_tf_layers.py +++ b/tests/test_tf_layers.py @@ -147,7 +147,7 @@ def load(self, path): class TestTFLayers: - allowed_error_lvl = 0.01 + allowed_error_lvl = 0.01 * 2 ** 0.5 @staticmethod def equal_values(a, b, round=5): From 72185f7892b45864913b1078e9f23ddd130424b7 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Wed, 24 Jul 2019 11:42:31 +0300 Subject: [PATCH 10/18] feat: update requirements and become compatible with python 3.7 (#861) * feat: update required packages versions * feat: downgrade scikit-learn to not break tests * chore: move updated kenlm dependency to a separate requirements file * fix: add spelling requirements to the kbqa_rus config * fix: remove empty line in kenlm.txt * docs: update required python version in README.md * feat: update tensorflow version * feat: require newer lxml version==4.3.4 * feat: last sklearn version * chore: config for sklearn classifier and new model * fix: new model files * fix: new model files * fix: new model files * fix: new model files * tests: update jenkinsfile to run tests on cuda-10.0 in python3.7 * feat: update core dependencies * tests: fix commands order in Jenkinsfile * tests: cd back to tests after building docs * chore: update version to 0.5.0 * docs: add breaking changes description for 0.5.0 --- Jenkinsfile | 15 +++-- README.md | 7 ++- deeppavlov/__init__.py | 2 +- .../classifiers/intents_snips_sklearn.json | 55 +++++-------------- .../intents_snips_tfidf_weighted.json | 29 +++++----- .../configs/faq/tfidf_logreg_autofaq.json | 12 ++-- .../configs/faq/tfidf_logreg_en_faq.json | 14 ++--- deeppavlov/configs/kbqa/kbqa_rus.json | 3 +- .../tfidf_logreg_autofaq_psearch.json | 12 ++-- .../brillmoore_kartaslov_ru.json | 3 +- .../brillmoore_kartaslov_ru_custom_vocab.json | 3 +- .../brillmoore_wikitypos_en.json | 3 +- .../levenshtein_corrector_ru.json | 3 +- deeppavlov/requirements/fasttext.txt | 2 +- deeppavlov/requirements/gensim.txt | 2 +- deeppavlov/requirements/kenlm.txt | 1 + deeppavlov/requirements/spelling.txt | 3 +- deeppavlov/requirements/tf-gpu.txt | 2 +- deeppavlov/requirements/tf-hub.txt | 2 +- deeppavlov/requirements/tf.txt | 2 +- requirements.txt | 32 +++++------ 21 files changed, 97 insertions(+), 110 deletions(-) create mode 100644 deeppavlov/requirements/kenlm.txt diff --git a/Jenkinsfile b/Jenkinsfile index 1b0ba05c5f..3feaf0f959 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,4 +1,4 @@ -node('gpu') { +node('cuda-module') { timestamps { try { stage('Clean') { @@ -9,9 +9,9 @@ node('gpu') { } stage('Setup') { env.TFHUB_CACHE_DIR="tfhub_cache" - env.LD_LIBRARY_PATH="/usr/local/cuda-9.0/lib64" + env.LD_LIBRARY_PATH="/usr/local/cuda-10.0/lib64" sh """ - virtualenv --python=python3 '.venv-$BUILD_NUMBER' + virtualenv --python=python3.7 '.venv-$BUILD_NUMBER' . 
'.venv-$BUILD_NUMBER/bin/activate' pip install .[tests,docs] pip install -r deeppavlov/requirements/tf-gpu.txt @@ -20,12 +20,17 @@ node('gpu') { } stage('Tests') { sh """ + . /etc/profile + module add cuda/10.0 . .venv-$BUILD_NUMBER/bin/activate - flake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics - pytest -v --disable-warnings + cd docs make clean make html + cd .. + + flake8 `python -c 'import deeppavlov; print(deeppavlov.__path__[0])'` --count --select=E9,F63,F7,F82 --show-source --statistics + pytest -v --disable-warnings """ currentBuild.result = 'SUCCESS' } diff --git a/README.md b/README.md index eaa1aa8bf2..9655a48380 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE) -![Python 3.6](https://img.shields.io/badge/python-3.6-green.svg) +![Python 3.6, 3.7](https://img.shields.io/badge/python-3.6%20%7C%203.7-green.svg) [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov) DeepPavlov is an open-source conversational AI library built on [TensorFlow](https://www.tensorflow.org/) and [Keras](https://keras.io/). It is designed for @@ -142,6 +142,11 @@ You can also specify batch size with `-b` or `--batch-size` parameter. # Breaking changes +**Breaking changes in version 0.5.0** +- dependencies have to be reinstalled for most pipeline configurations +- models depending on `tensorflow` require `CUDA 10.0` to run on GPU instead of `CUDA 9.0` +- scikit-learn models have to be redownloaded or retrained + **Breaking changes in version 0.4.0!** - default target variable name for [neural evolution](https://docs.deeppavlov.ai/en/0.4.0/intro/hypersearch.html#parameters-evolution-for-deeppavlov-models) was changed from `MODELS_PATH` to `MODEL_PATH`. diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py index e9e6d4e7d4..b40b8f7e35 100644 --- a/deeppavlov/__init__.py +++ b/deeppavlov/__init__.py @@ -37,7 +37,7 @@ def evaluate_model(config: [str, Path, dict], download: bool = False, recursive: except ImportError: 'Assuming that requirements are not yet installed' -__version__ = '0.4.0' +__version__ = '0.5.0' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' 
__keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] diff --git a/deeppavlov/configs/classifiers/intents_snips_sklearn.json b/deeppavlov/configs/classifiers/intents_snips_sklearn.json index 78ddf5c8f1..61012f7958 100644 --- a/deeppavlov/configs/classifiers/intents_snips_sklearn.json +++ b/deeppavlov/configs/classifiers/intents_snips_sklearn.json @@ -23,8 +23,8 @@ "fit_on": [ "y" ], - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/classes.dict", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/classes.dict", + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, @@ -41,8 +41,8 @@ ], "id": "tfidf_vec", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/tfidf.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/tfidf.pkl", + "save_path": "{MODEL_PATH}/tfidf.pkl", + "load_path": "{MODEL_PATH}/tfidf.pkl", "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", "infer_method": "transform", "lowercase": true, @@ -67,8 +67,8 @@ ], "id": "selector", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/selectkbest.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/selectkbest.pkl", + "save_path": "{MODEL_PATH}/selectkbest.pkl", + "load_path": "{MODEL_PATH}/selectkbest.pkl", "model_class": "sklearn.feature_selection:SelectKBest", "infer_method": "transform", "score_func": "sklearn.feature_selection:chi2", @@ -86,28 +86,12 @@ ], "id": "pca", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/pca.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/pca.pkl", + "save_path": "{MODEL_PATH}/pca.pkl", + "load_path": "{MODEL_PATH}/pca.pkl", "model_class": "sklearn.decomposition:PCA", "infer_method": "transform", "n_components": 300 }, - { - "in": "x", - "out": "x_tok", - "id": "my_tokenizer", - "class_name": "nltk_tokenizer", - "tokenizer": "wordpunct_tokenize" - }, - { - "in": "x_tok", - "out": "x_emb", - "id": "my_embedder", - "class_name": "fasttext", - "load_path": "{DOWNLOADS_PATH}/embeddings/wiki.en.bin", - "dim": 300, - "mean": true - }, { "class_name": "one_hotter", "id": "onehotter", @@ -118,21 +102,19 @@ }, { "in": [ - "x_pca", - "x_emb" + "x_pca" ], "out": [ "y_pred_onehot" ], "fit_on": [ "x_pca", - "x_emb", "y_onehot" ], "class_name": "sklearn_component", "main": true, - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/model.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v9/model.pkl", + "save_path": "{MODEL_PATH}/model.pkl", + "load_path": "{MODEL_PATH}/model.pkl", "model_class": "sklearn.neighbors:KNeighborsClassifier", "infer_method": "predict", "ensure_list_output": true @@ -158,7 +140,7 @@ "metrics": [ "accuracy" ], - "show_examples": true, + "show_examples": false, "evaluation_targets": [ "train", "valid" @@ -169,7 +151,8 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v11" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", @@ -181,16 +164,8 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "subdir": "{MODELS_PATH}" - }, -{ - "url": 
"http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v9.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v11.tar.gz", "subdir": "{MODELS_PATH}/classifiers" - }, - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/embeddings/wiki.en.bin", - "subdir": "{DOWNLOADS_PATH}/embeddings" } ] } diff --git a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json b/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json index 8e85b9f4d6..b767a26a20 100644 --- a/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json +++ b/deeppavlov/configs/classifiers/intents_snips_tfidf_weighted.json @@ -23,8 +23,8 @@ "fit_on": [ "y" ], - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/classes.dict", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/classes.dict", + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", "in": "y", "out": "y_ids" }, @@ -41,8 +41,8 @@ ], "id": "my_tfidf_vectorizer", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/tfidf.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/tfidf.pkl", + "save_path": "{MODEL_PATH}/tfidf.pkl", + "load_path": "{MODEL_PATH}/tfidf.pkl", "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", "infer_method": "transform", "lowercase": true, @@ -61,8 +61,8 @@ ], "id": "my_selector", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/selectkbest.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/selectkbest.pkl", + "save_path": "{MODEL_PATH}/selectkbest.pkl", + "load_path": "{MODEL_PATH}/selectkbest.pkl", "model_class": "sklearn.feature_selection:SelectKBest", "infer_method": "transform", "score_func": "sklearn.feature_selection:chi2", @@ -80,8 +80,8 @@ ], "id": "my_pca", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/pca.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/pca.pkl", + "save_path": "{MODEL_PATH}/pca.pkl", + "load_path": "{MODEL_PATH}/pca.pkl", "model_class": "sklearn.decomposition:PCA", "infer_method": "transform", "n_components": 300 @@ -133,8 +133,8 @@ ], "class_name": "sklearn_component", "main": true, - "save_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/model.pkl", - "load_path": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v10/model.pkl", + "save_path": "{MODEL_PATH}/model.pkl", + "load_path": "{MODEL_PATH}/model.pkl", "model_class": "sklearn.linear_model:LogisticRegression", "infer_method": "predict", "ensure_list_output": true @@ -165,7 +165,8 @@ "variables": { "ROOT_PATH": "~/.deeppavlov", "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models" + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/intents_snips_sklearn_v12" }, "requirements": [ "{DEEPPAVLOV_PATH}/requirements/tf.txt", @@ -177,11 +178,7 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/vocabs.tar.gz", - "subdir": "{MODELS_PATH}" - }, -{ - "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v10.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/intents_snips_sklearn_v12.tar.gz", "subdir": "{MODELS_PATH}/classifiers" }, { diff --git a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json 
b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json index 4594085028..9e2516fceb 100644 --- a/deeppavlov/configs/faq/tfidf_logreg_autofaq.json +++ b/deeppavlov/configs/faq/tfidf_logreg_autofaq.json @@ -37,8 +37,8 @@ ], "id": "tfidf_vec", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v1.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v1.pkl", + "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", + "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", "infer_method": "transform" }, @@ -64,8 +64,8 @@ ], "class_name": "sklearn_component", "main": true, - "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v1.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v1.pkl", + "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v2.pkl", + "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v2.pkl", "model_class": "sklearn.linear_model:LogisticRegression", "infer_method": "predict_proba", "C": 1000, @@ -100,11 +100,11 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v1.pkl", + "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v2.pkl", "subdir": "{MODELS_PATH}/faq" }, { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki_v1.pkl", + "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki_v2.pkl", "subdir": "{MODELS_PATH}/vectorizer" }, { diff --git a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json index 6fe1cf9d32..04ab1f34d4 100644 --- a/deeppavlov/configs/faq/tfidf_logreg_en_faq.json +++ b/deeppavlov/configs/faq/tfidf_logreg_en_faq.json @@ -36,8 +36,8 @@ ], "id": "tfidf_vec", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/tfidf.pkl", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/tfidf.pkl", + "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/tfidf.pkl", + "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/tfidf.pkl", "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", "infer_method": "transform" }, @@ -47,8 +47,8 @@ "fit_on": [ "y" ], - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/en_mipt_answers.dict", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/en_mipt_answers.dict", + "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/en_mipt_answers.dict", + "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/en_mipt_answers.dict", "in": "y", "out": "y_ids" }, @@ -63,8 +63,8 @@ ], "class_name": "sklearn_component", "main": true, - "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/logreg.pkl", - "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v3/logreg.pkl", + "save_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/logreg.pkl", + "load_path": "{MODELS_PATH}/faq/mipt/en_mipt_faq_v4/logreg.pkl", "model_class": "sklearn.linear_model:LogisticRegression", "infer_method": "predict_proba", "C": 1000, @@ -103,7 +103,7 @@ ], "download": [ { - "url": "http://files.deeppavlov.ai/faq/mipt/en_mipt_faq_v3.tar.gz", + "url": "http://files.deeppavlov.ai/faq/mipt/en_mipt_faq_v4.tar.gz", "subdir": "{MODELS_PATH}/faq/mipt" } ] diff --git a/deeppavlov/configs/kbqa/kbqa_rus.json b/deeppavlov/configs/kbqa/kbqa_rus.json index 49f3fe5512..f9fb98c023 100644 --- a/deeppavlov/configs/kbqa/kbqa_rus.json +++ b/deeppavlov/configs/kbqa/kbqa_rus.json @@ -226,7 +226,8 @@ }, "requirements": [ 
"{DEEPPAVLOV_PATH}/requirements/tf.txt", - "{DEEPPAVLOV_PATH}/requirements/fasttext.txt" + "{DEEPPAVLOV_PATH}/requirements/fasttext.txt", + "{DEEPPAVLOV_PATH}/requirements/spelling.txt" ], "download": [ { diff --git a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json b/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json index a2e97b9efa..f793916514 100644 --- a/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json +++ b/deeppavlov/configs/paramsearch/tfidf_logreg_autofaq_psearch.json @@ -35,8 +35,8 @@ ], "id": "tfidf_vec", "class_name": "sklearn_component", - "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", - "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki.pkl", + "save_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", + "load_path": "{MODELS_PATH}/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", "model_class": "sklearn.feature_extraction.text:TfidfVectorizer", "infer_method": "transform" }, @@ -53,8 +53,8 @@ ], "class_name": "sklearn_component", "main": true, - "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier.pkl", - "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier.pkl", + "save_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v3.pkl", + "load_path": "{MODELS_PATH}/faq/tfidf_logreg_classifier_v3.pkl", "model_class": "sklearn.linear_model:LogisticRegression", "infer_method": "predict", "C": { @@ -94,11 +94,11 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier.pkl", + "url": "http://files.deeppavlov.ai/faq/school/tfidf_logreg_classifier_v3.pkl", "subdir": "{MODELS_PATH}/faq" }, { - "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki.pkl", + "url": "http://files.deeppavlov.ai/vectorizer/tfidf_vectorizer_ruwiki_v3.pkl", "subdir": "{MODELS_PATH}/vectorizer" } ] diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json index 41cc799698..2ef47fac0a 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru.json @@ -69,7 +69,8 @@ "MODELS_PATH": "{ROOT_PATH}/models" }, "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/spelling.txt" + "{DEEPPAVLOV_PATH}/requirements/spelling.txt", + "{DEEPPAVLOV_PATH}/requirements/kenlm.txt" ], "labels": { "telegram_utils": "ErrorModel", diff --git a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json index 4e3d883112..c70a4238de 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_kartaslov_ru_custom_vocab.json @@ -71,7 +71,8 @@ "MODELS_PATH": "{ROOT_PATH}/models" }, "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/spelling.txt" + "{DEEPPAVLOV_PATH}/requirements/spelling.txt", + "{DEEPPAVLOV_PATH}/requirements/kenlm.txt" ], "labels": { "telegram_utils": "ErrorModel", diff --git a/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json b/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json index 22fd5ed8bc..d25fb248a2 100644 --- a/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json +++ b/deeppavlov/configs/spelling_correction/brillmoore_wikitypos_en.json @@ -66,7 +66,8 @@ "MODELS_PATH": "{ROOT_PATH}/models" }, "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/spelling.txt" 
+ "{DEEPPAVLOV_PATH}/requirements/spelling.txt", + "{DEEPPAVLOV_PATH}/requirements/kenlm.txt" ], "labels": { "telegram_utils": "ErrorModel", diff --git a/deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json b/deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json index 132e45b4e2..ca85a8c6c3 100644 --- a/deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json +++ b/deeppavlov/configs/spelling_correction/levenshtein_corrector_ru.json @@ -47,7 +47,8 @@ "MODELS_PATH": "{ROOT_PATH}/models" }, "requirements": [ - "{DEEPPAVLOV_PATH}/requirements/spelling.txt" + "{DEEPPAVLOV_PATH}/requirements/spelling.txt", + "{DEEPPAVLOV_PATH}/requirements/kenlm.txt" ], "labels": { "telegram_utils": "ErrorModel", diff --git a/deeppavlov/requirements/fasttext.txt b/deeppavlov/requirements/fasttext.txt index 5f60bf5190..896ed329ed 100644 --- a/deeppavlov/requirements/fasttext.txt +++ b/deeppavlov/requirements/fasttext.txt @@ -1,2 +1,2 @@ pybind11==2.2.3 -git+https://github.com/deepmipt/fastText.git#egg=fastText==0.8.22 +git+https://github.com/deepmipt/fastText.git#egg=fastText==0.8.22 \ No newline at end of file diff --git a/deeppavlov/requirements/gensim.txt b/deeppavlov/requirements/gensim.txt index ce61965790..89bc241aef 100644 --- a/deeppavlov/requirements/gensim.txt +++ b/deeppavlov/requirements/gensim.txt @@ -1 +1 @@ -gensim==2.3.0 \ No newline at end of file +gensim==3.7.3 \ No newline at end of file diff --git a/deeppavlov/requirements/kenlm.txt b/deeppavlov/requirements/kenlm.txt new file mode 100644 index 0000000000..c5f77257bf --- /dev/null +++ b/deeppavlov/requirements/kenlm.txt @@ -0,0 +1 @@ +git+https://github.com/kpu/kenlm.git@2ad7cb56924cd3c6811c604973f592cb5ef604eb#egg=kenlm \ No newline at end of file diff --git a/deeppavlov/requirements/spelling.txt b/deeppavlov/requirements/spelling.txt index bc6605c003..45fc7e17e6 100644 --- a/deeppavlov/requirements/spelling.txt +++ b/deeppavlov/requirements/spelling.txt @@ -1,4 +1,3 @@ -lxml==4.1.1 +lxml==4.3.4 python-Levenshtein==0.12.0 -git+https://github.com/kpu/kenlm.git@328cc2995202e84d29e3773203d29cdd6cc07132#egg=kenlm sortedcontainers==2.0.2 \ No newline at end of file diff --git a/deeppavlov/requirements/tf-gpu.txt b/deeppavlov/requirements/tf-gpu.txt index b96321afd9..bfafa1601a 100644 --- a/deeppavlov/requirements/tf-gpu.txt +++ b/deeppavlov/requirements/tf-gpu.txt @@ -1 +1 @@ -tensorflow-gpu==1.10.0 +tensorflow-gpu==1.14.0 \ No newline at end of file diff --git a/deeppavlov/requirements/tf-hub.txt b/deeppavlov/requirements/tf-hub.txt index 59e76b67f1..b9e22c0609 100644 --- a/deeppavlov/requirements/tf-hub.txt +++ b/deeppavlov/requirements/tf-hub.txt @@ -1 +1 @@ -tensorflow-hub==0.1.1 +tensorflow-hub==0.1.1 \ No newline at end of file diff --git a/deeppavlov/requirements/tf.txt b/deeppavlov/requirements/tf.txt index 9ec1d5fe03..4af7a4546f 100644 --- a/deeppavlov/requirements/tf.txt +++ b/deeppavlov/requirements/tf.txt @@ -1 +1 @@ -tensorflow==1.10.0 +tensorflow==1.14.0 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d2e7deb46c..268a61795a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,20 +1,20 @@ -Cython==0.28.5 +Cython==0.29.12 overrides==1.9 -numpy==1.14.5 -pandas==0.23.1 +numpy==1.16.4 +pandas==0.24.2 nltk==3.2.5 -tqdm==4.23.4 -scipy==1.1.0 -h5py==2.8.0 -keras==2.2.0 -scikit-learn==0.19.1 -fuzzywuzzy==0.16.0 +tqdm==4.32.2 +scipy==1.3.0 +h5py==2.9.0 +keras==2.2.4 +scikit-learn==0.21.2 +fuzzywuzzy==0.17.0 pymorphy2==0.8 pymorphy2-dicts-ru -requests==2.19.1 
-pytelegrambotapi==3.5.2 -flask==1.0.2 -flasgger==0.9.1 -flask_cors==3.0.6 -rusenttokenize==0.0.4 -pyopenssl==18.0.0 \ No newline at end of file +requests==2.22.0 +pytelegrambotapi==3.6.6 +flask==1.1.1 +flasgger==0.9.2 +flask_cors==3.0.8 +rusenttokenize==0.0.5 +pyopenssl==19.0.0 \ No newline at end of file From 818f34254114c5dd795f8f83e8fd3aed16a42837 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Thu, 25 Jul 2019 11:47:48 +0300 Subject: [PATCH 11/18] fix: parsin path for evolution correctly (#932) * fix: new method to parse only path using given config * fix: parse path using given config * chore: moved getting variables to separate function * fix: rename function --- deeppavlov/core/commands/utils.py | 24 ++++++++++++++++++++++-- deeppavlov/evolve.py | 6 +++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/deeppavlov/core/commands/utils.py b/deeppavlov/core/commands/utils.py index ceb396c80d..995bbf5988 100644 --- a/deeppavlov/core/commands/utils.py +++ b/deeppavlov/core/commands/utils.py @@ -36,8 +36,8 @@ def _parse_config_property(item: _T, variables: Dict[str, Union[str, Path, float return item -def parse_config(config: Union[str, Path, dict]) -> dict: - """Read config's variables and apply their values to all its properties""" +def _get_variables_from_config(config: Union[str, Path, dict]): + """Read config's variables""" if isinstance(config, (str, Path)): config = read_json(find_config(config)) @@ -55,6 +55,17 @@ def parse_config(config: Union[str, Path, dict]) -> dict: value = value.format(**variables) variables[name] = value variables_exact[f'{{{name}}}'] = value + + return variables, variables_exact + + +def parse_config(config: Union[str, Path, dict]) -> dict: + """Apply variables' values to all its properties""" + if isinstance(config, (str, Path)): + config = read_json(find_config(config)) + + variables, variables_exact = _get_variables_from_config(config) + return _parse_config_property(config, variables, variables_exact) @@ -67,3 +78,12 @@ def import_packages(packages: list) -> None: """Import packages from list to execute their code.""" for package in packages: __import__(package) + + +def parse_value_with_config(value: Union[str, Path], config: Union[str, Path, dict]) -> Path: + """Fill the variables in `value` with variables values from `config`. + `value` should be a string. 
If `value` is a string of only variable, `value` will be replaced with + variable's value from config (the variable's value could be anything then).""" + variables, variables_exact = _get_variables_from_config(config) + + return _parse_config_property(str(value), variables, variables_exact) diff --git a/deeppavlov/evolve.py b/deeppavlov/evolve.py index c21696645b..0867c4eb5d 100644 --- a/deeppavlov/evolve.py +++ b/deeppavlov/evolve.py @@ -24,7 +24,7 @@ import pandas as pd -from deeppavlov.core.commands.utils import expand_path, parse_config +from deeppavlov.core.commands.utils import expand_path, parse_config, parse_value_with_config from deeppavlov.core.common.errors import ConfigError from deeppavlov.core.common.file import read_json, save_json, find_config from deeppavlov.models.evolution.evolution_param_generator import ParamsEvolution @@ -107,8 +107,8 @@ def main(): evolve_metric = considered_metrics[0] # Create table variable for gathering results - abs_path_to_main_models = expand_path(str(evolution.models_path).format( - **evolution.basic_config['metadata']['variables'])) + abs_path_to_main_models = expand_path(parse_value_with_config(evolution.models_path, + evolution.basic_config)) abs_path_to_main_models.mkdir(parents=True, exist_ok=True) result_file = abs_path_to_main_models / "result_table.tsv" From fd961f0ad0f1526d2db7ee38fda08ca2a431eb81 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Thu, 25 Jul 2019 11:48:42 +0300 Subject: [PATCH 12/18] feat: add an English conversational BERT model (#936) * feat: insults model with conv bert * chore: metrics for new model * feat: docs on conversational bert * feat: link for file downloading * feat: test on conv bert * chore: conv bert scores * fix: bert paths * fix: links in table --- .../classifiers/insults_kaggle_conv_bert.json | 161 ++++++++++++++++++ docs/components/bert.rst | 11 ++ docs/components/classifiers.rst | 2 + docs/intro/features.rst | 2 + tests/test_quick_start.py | 1 + 5 files changed, 177 insertions(+) create mode 100644 deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json diff --git a/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json b/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json new file mode 100644 index 0000000000..a19bfa114e --- /dev/null +++ b/deeppavlov/configs/classifiers/insults_kaggle_conv_bert.json @@ -0,0 +1,161 @@ +{ + "dataset_reader": { + "class_name": "basic_classification_reader", + "x": "Comment", + "y": "Class", + "data_path": "{DOWNLOADS_PATH}/insults_data" + }, + "dataset_iterator": { + "class_name": "basic_classification_iterator", + "seed": 42 + }, + "chainer": { + "in": [ + "x" + ], + "in_y": [ + "y" + ], + "pipe": [ + { + "class_name": "bert_preprocessor", + "vocab_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/vocab.txt", + "do_lower_case": false, + "max_seq_length": 64, + "in": [ + "x" + ], + "out": [ + "bert_features" + ] + }, + { + "id": "classes_vocab", + "class_name": "simple_vocab", + "fit_on": [ + "y" + ], + "save_path": "{MODEL_PATH}/classes.dict", + "load_path": "{MODEL_PATH}/classes.dict", + "in": [ + "y" + ], + "out": [ + "y_ids" + ] + }, + { + "in": [ + "y_ids" + ], + "out": [ + "y_onehot" + ], + "class_name": "one_hotter", + "depth": "#classes_vocab.len", + "single_vector": true + }, + { + "class_name": "bert_classifier", + "n_classes": "#classes_vocab.len", + "return_probas": true, + "one_hot_labels": true, + "bert_config_file": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_config.json", 
+ "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/conversational_cased_L-12_H-768_A-12/bert_model.ckpt", + "save_path": "{MODEL_PATH}/model", + "load_path": "{MODEL_PATH}/model", + "keep_prob": 0.5, + "learning_rate": 1e-05, + "learning_rate_drop_patience": 5, + "learning_rate_drop_div": 2.0, + "in": [ + "bert_features" + ], + "in_y": [ + "y_onehot" + ], + "out": [ + "y_pred_probas" + ] + }, + { + "in": [ + "y_pred_probas" + ], + "out": [ + "y_pred_ids" + ], + "class_name": "proba2labels", + "max_proba": true + }, + { + "in": [ + "y_pred_ids" + ], + "out": [ + "y_pred_labels" + ], + "ref": "classes_vocab" + } + ], + "out": [ + "y_pred_labels" + ] + }, + "train": { + "epochs": 100, + "batch_size": 64, + "metrics": [ + { + "name": "roc_auc", + "inputs": [ + "y_onehot", + "y_pred_probas" + ] + }, + "sets_accuracy", + "f1_macro" + ], + "validation_patience": 5, + "val_every_n_epochs": 1, + "log_every_n_epochs": 1, + "show_examples": false, + "evaluation_targets": [ + "train", + "valid", + "test" + ], + "class_name": "nn_trainer", + "tensorboard_log_dir": "{MODEL_PATH}/" + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "MODEL_PATH": "{MODELS_PATH}/classifiers/insults_kaggle_v4" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/tf.txt", + "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" + ], + "labels": { + "telegram_utils": "IntentModel", + "server_utils": "KerasIntentModel" + }, + "download": [ + { + "url": "http://files.deeppavlov.ai/datasets/insults_data.tar.gz", + "subdir": "{DOWNLOADS_PATH}" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/conversational_cased_L-12_H-768_A-12.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + }, + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/classifiers/insults_kaggle_v4.tar.gz", + "subdir": "{MODELS_PATH}/classifiers" + } + ] + } +} diff --git a/docs/components/bert.rst b/docs/components/bert.rst index 6080a8395b..b31b1b876b 100644 --- a/docs/components/bert.rst +++ b/docs/components/bert.rst @@ -23,6 +23,13 @@ We have trained BERT-base model for other languages: RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT training was done in the same manner as RuBERT. +We have trained BERT-base model for conversational language style: +- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__ + +Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [3]_, OpenSubtitles [4]_, Debates [5]_, Blogs [6]_, Facebook News Comments. +We used this training data to build vocabulary of English subtokens and took +English cased version of BERT-base as initialization for English Conversational BERT. + Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and ranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov. @@ -84,3 +91,7 @@ Statistics are available :doc:`here `. .. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213. .. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. 
arXiv preprint arXiv:1809.01682. +.. [3] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. +.. [4] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016) +.. [5] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. +.. [6] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. \ No newline at end of file diff --git a/docs/components/classifiers.rst b/docs/components/classifiers.rst index fec9723b85..0a4c770b62 100644 --- a/docs/components/classifiers.rst +++ b/docs/components/classifiers.rst @@ -295,6 +295,8 @@ Therefore, this model is available only for interaction. | Insult detection | `Insults`_ | | :config:`Reddit emb ` | ROC-AUC | 0.9263 | 0.8556 | 6.2 Gb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ | | | | :config:`English BERT ` | | 0.9255 | 0.8612 | 1200 Mb | ++ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ +| | | | :config:`English Conversational BERT ` | | 0.9389 | 0.8941 | 1200 Mb | +------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ | 5 topics | `AG News`_ | | :config:`Wiki emb ` | Accuracy | 0.8922 | 0.9059 | 8.5 Gb | +------------------+--------------------+------+-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 0b0babd695..409c6c57da 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -79,6 +79,8 @@ Several pre-trained models are available and presented in Table below. 
| Insult detection | `Insults`_ | | :config:`Reddit emb ` | ROC-AUC | 0.9263 | 0.8556 | 6.2 Gb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ | | | | :config:`English BERT ` | | 0.9255 | 0.8612 | 1200 Mb | ++ + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ +| | | | :config:`English Conversational BERT ` | | 0.9389 | 0.8941 | 1200 Mb | +------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ | 5 topics | `AG News`_ | | :config:`Wiki emb ` | Accuracy | 0.8922 | 0.9059 | 8.5 Gb | +------------------+--------------------+------+-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ diff --git a/tests/test_quick_start.py b/tests/test_quick_start.py index 5645ac2e74..9c36880035 100644 --- a/tests/test_quick_start.py +++ b/tests/test_quick_start.py @@ -86,6 +86,7 @@ ("classifiers/paraphraser_bert.json", "classifiers", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/paraphraser_rubert.json", "classifiers", ('IP', 'TI')): [TWO_ARGUMENTS_INFER_CHECK], ("classifiers/insults_kaggle_bert.json", "classifiers", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], + ("classifiers/insults_kaggle_conv_bert.json", "classifiers", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/rusentiment_bert.json", "classifiers", ('IP',)): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/intents_dstc2_bert.json", "classifiers", ('IP', 'TI')): [ONE_ARGUMENT_INFER_CHECK], ("classifiers/intents_dstc2.json", "classifiers", ALL_MODES): [ONE_ARGUMENT_INFER_CHECK], From 955751dae23700356c3f7dcf0dc9e24c185dce63 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Thu, 25 Jul 2019 11:48:59 +0300 Subject: [PATCH 13/18] fix: actually raise ConfigError when loading an inconsistent Keras model (#944) resolves #943 --- deeppavlov/models/classifiers/keras_classification_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/models/classifiers/keras_classification_model.py b/deeppavlov/models/classifiers/keras_classification_model.py index f085d99b04..f2b0c6a2a9 100644 --- a/deeppavlov/models/classifiers/keras_classification_model.py +++ b/deeppavlov/models/classifiers/keras_classification_model.py @@ -284,7 +284,7 @@ def _load(self, model_name: str) -> None: try: model.load_weights(str(weights_path)) except ValueError: - ConfigError("Some non-changable parameters of neural network differ from given pre-trained model") + raise ConfigError("Some non-changable parameters of neural network differ from given pre-trained model") self.model = model From 49bdb1d3ccdf84acf3a4c8341dae284d93c656da Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Thu, 25 Jul 2019 11:49:14 +0300 Subject: [PATCH 14/18] fix: change error message for proba2labels (#946) --- deeppavlov/models/classifiers/proba2labels.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deeppavlov/models/classifiers/proba2labels.py b/deeppavlov/models/classifiers/proba2labels.py index f11b5eb022..5cb2dfe659 100644 --- a/deeppavlov/models/classifiers/proba2labels.py +++ b/deeppavlov/models/classifiers/proba2labels.py @@ -27,7 +27,7 @@ @register('proba2labels') class Proba2Labels(Component): """ - Class implements probability to labels processing using 
two different ways: \ + Class implements probability to labels processing using the following ways: \ choosing one or top_n indices with maximal probability or choosing any number of indices \ which probabilities to belong with are higher than given confident threshold @@ -74,5 +74,6 @@ def __call__(self, data: Union[np.ndarray, List[List[float]], List[List[int]]], elif self.top_n: return [np.argsort(d)[::-1][:self.top_n] for d in data] else: - raise ConfigError("Proba2Labels requires one of two arguments: bool `max_proba` or " - "float `confident_threshold` for multi-label classification") + raise ConfigError("Proba2Labels requires one of three arguments: bool `max_proba` or " + "float `confident_threshold` for multi-label classification or" + "integer `top_n` for choosing several labels with the highest probabilities") From a5d085657f3fd653c6660a17259098334a2773c2 Mon Sep 17 00:00:00 2001 From: Dilyara Baymurzina Date: Fri, 26 Jul 2019 12:03:44 +0300 Subject: [PATCH 15/18] fix: change params in config intents_sample (#949) --- deeppavlov/configs/classifiers/intents_sample_csv.json | 6 +++--- deeppavlov/configs/classifiers/intents_sample_json.json | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/deeppavlov/configs/classifiers/intents_sample_csv.json b/deeppavlov/configs/classifiers/intents_sample_csv.json index ff6a85ba64..9b15809ae4 100644 --- a/deeppavlov/configs/classifiers/intents_sample_csv.json +++ b/deeppavlov/configs/classifiers/intents_sample_csv.json @@ -83,9 +83,9 @@ "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ - 3, - 5, - 7 + 1, + 2, + 3 ], "filters_cnn": 256, "optimizer": "Adam", diff --git a/deeppavlov/configs/classifiers/intents_sample_json.json b/deeppavlov/configs/classifiers/intents_sample_json.json index 8fd9de8105..e8d8034591 100644 --- a/deeppavlov/configs/classifiers/intents_sample_json.json +++ b/deeppavlov/configs/classifiers/intents_sample_json.json @@ -78,9 +78,9 @@ "embedding_size": "#my_embedder.dim", "n_classes": "#classes_vocab.len", "kernel_sizes_cnn": [ - 3, - 5, - 7 + 1, + 2, + 3 ], "filters_cnn": 256, "optimizer": "Adam", From c3ab96d5ffc72cb04c6afd8a7d6e4fdd8cb24775 Mon Sep 17 00:00:00 2001 From: Mary Trofimova Date: Mon, 29 Jul 2019 12:29:23 +0300 Subject: [PATCH 16/18] docs: restructure documentation and add a proper QuickStart (#947) * feat: update required packages versions * feat: downgrade scikit-learn to not break tests * chore: move updated kenlm dependency to a separate requirements file * fix: add spelling requirements to the kbqa_rus config * fix: remove empty line in kenlm.txt * docs: update required python version in README.md * feat: update tensorflow version * feat: require newer lxml version==4.3.4 * feat: last sklearn version * chore: config for sklearn classifier and new model * fix: new model files * fix: new model files * fix: new model files * fix: new model files * tests: update jenkinsfile to run tests on cuda-10.0 in python3.7 * feat: update core dependencies * tests: fix commands order in Jenkinsfile * tests: cd back to tests after building docs * chore: update version to 0.5.0 * docs: add breaking changes description for 0.5.0 * docs: add python3.7 support info * docs: environment via venv module * docs: shorten installation section * docs: mv logo up and intro rewrite * docs: logo resize * docs: rm hello bot from readme * feat: add python37 to docs * feat: new structure of docs * docs: simplify rest api doc * docs: rewrite integration doc * docs: minor 
fixes, component -> model * docs: rename Component -> Model everywhere * docs: mv data processors to dev guide * docs: add contribution guide * docs: restructure main doc tree * docs: add quick links * docs: make links italic font * docs: refactor docker hub doc * docs: add docker installation doc * docs: models & skills to lower level of hyerarchy * docs: add todos * refactor: change dir hyerarchy * docs: contributor guide minor fixes * fix: unit tests are not really unit * docs: show subsections of modesl in the main index * docs: no features in readme * docs: add table of contexts for features * docs: refactor and fix BERT models descriptions * docs: add custom BERT init usage guide * Apply suggestions from code review Co-Authored-By: Aleksei Lymar * docs: resolve @yoptar's suggestions * docs: @dilya's suggestions * fix: custom bert tasks mentions * docs: add link to config description * docs: google style != pep8 * docs: minor additions and fixes to quick start * docs: add BERT to the embeddings section * docs: fix readme links * docs: add quick start to readme * docs: fix readme * docs: installation before quickstart * docs: apply suggestions from code review --- README.md | 276 ++++++++++-------- docs/apiref/skills/default_skill.rst | 2 +- docs/components/data_processors.rst | 75 ----- docs/devguides/contribution_guide.rst | 111 +++++++ .../devguides/{extending.rst => registry.rst} | 4 +- docs/devguides/yandex_alice.rst | 53 ---- docs/{intro => features}/hypersearch.rst | 0 docs/{components => features/models}/bert.rst | 30 +- .../models}/classifiers.rst | 4 +- docs/features/models/index.rst | 19 ++ docs/{components => features/models}/kbqa.rst | 8 +- .../models}/morphotagger.rst | 6 +- docs/{components => features/models}/ner.rst | 2 +- .../models}/neural_ranking.rst | 4 +- .../models}/popularity_ranking.rst | 0 .../models}/slot_filling.rst | 8 +- .../models}/spelling_correction.rst | 0 .../{components => features/models}/squad.rst | 0 .../models}/tfidf_ranking.rst | 0 .../features.rst => features/overview.rst} | 184 ++++++------ .../pretrained_vectors.rst | 34 ++- docs/{ => features}/skills/aiml_skill.rst | 0 docs/{ => features}/skills/ecommerce.rst | 0 docs/{ => features}/skills/faq.rst | 0 docs/{ => features}/skills/go_bot.rst | 2 +- docs/features/skills/index.rst | 15 + docs/{ => features}/skills/odqa.rst | 0 .../skills/pattern_matching.rst | 2 - docs/{ => features}/skills/seq2seq_go_bot.rst | 2 +- docs/index.rst | 60 ++-- .../amazon_alexa.rst | 20 +- docs/{devguides => integrations}/aws_ec2.rst | 4 +- .../ms_bot.rst} | 20 +- docs/{devguides => integrations}/rest_api.rst | 65 +++-- docs/{devguides => integrations}/settings.rst | 0 docs/integrations/telegram.rst | 66 +++++ docs/integrations/yandex_alice.rst | 62 ++++ ...nfig_description.rst => configuration.rst} | 155 +++++++++- docs/intro/hello_bot.ipynb | 119 -------- docs/intro/installation.rst | 34 ++- docs/intro/overview.rst | 18 +- docs/intro/quick_start.rst | 130 +++++++++ 42 files changed, 1007 insertions(+), 587 deletions(-) delete mode 100644 docs/components/data_processors.rst create mode 100644 docs/devguides/contribution_guide.rst rename docs/devguides/{extending.rst => registry.rst} (97%) delete mode 100644 docs/devguides/yandex_alice.rst rename docs/{intro => features}/hypersearch.rst (100%) rename docs/{components => features/models}/bert.rst (86%) rename docs/{components => features/models}/classifiers.rst (99%) create mode 100644 docs/features/models/index.rst rename docs/{components => features/models}/kbqa.rst 
(89%) rename docs/{components => features/models}/morphotagger.rst (99%) rename docs/{components => features/models}/ner.rst (99%) rename docs/{components => features/models}/neural_ranking.rst (98%) rename docs/{components => features/models}/popularity_ranking.rst (100%) rename docs/{components => features/models}/slot_filling.rst (95%) rename docs/{components => features/models}/spelling_correction.rst (100%) rename docs/{components => features/models}/squad.rst (100%) rename docs/{components => features/models}/tfidf_ranking.rst (100%) rename docs/{intro/features.rst => features/overview.rst} (96%) rename docs/{intro => features}/pretrained_vectors.rst (81%) rename docs/{ => features}/skills/aiml_skill.rst (100%) rename docs/{ => features}/skills/ecommerce.rst (100%) rename docs/{ => features}/skills/faq.rst (100%) rename docs/{ => features}/skills/go_bot.rst (99%) create mode 100644 docs/features/skills/index.rst rename docs/{ => features}/skills/odqa.rst (100%) rename docs/{ => features}/skills/pattern_matching.rst (82%) rename docs/{ => features}/skills/seq2seq_go_bot.rst (99%) rename docs/{devguides => integrations}/amazon_alexa.rst (93%) rename docs/{devguides => integrations}/aws_ec2.rst (97%) rename docs/{devguides/ms_bot_integration.rst => integrations/ms_bot.rst} (85%) rename docs/{devguides => integrations}/rest_api.rst (67%) rename docs/{devguides => integrations}/settings.rst (100%) create mode 100644 docs/integrations/telegram.rst create mode 100644 docs/integrations/yandex_alice.rst rename docs/intro/{config_description.rst => configuration.rst} (56%) delete mode 100644 docs/intro/hello_bot.ipynb create mode 100644 docs/intro/quick_start.rst diff --git a/README.md b/README.md index 9655a48380..e7f3e9a6ca 100644 --- a/README.md +++ b/README.md @@ -1,146 +1,215 @@ [![License Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/deepmipt/DeepPavlov/blob/master/LICENSE) ![Python 3.6, 3.7](https://img.shields.io/badge/python-3.6%20%7C%203.7-green.svg) [![Downloads](https://pepy.tech/badge/deeppavlov)](https://pepy.tech/project/deeppavlov) + -DeepPavlov is an open-source conversational AI library built on [TensorFlow](https://www.tensorflow.org/) and [Keras](https://keras.io/). It is designed for - * development of production ready chat-bots and complex conversational systems, - * NLP and dialog systems research. +DeepPavlov is an open-source conversational AI library built on [TensorFlow](https://www.tensorflow.org/) and [Keras](https://keras.io/). -# Demo +DeepPavlov is designed for +* development of production ready chat-bots and complex conversational systems, +* research in the area of NLP and, particularly, of dialog systems. 
-Demo of selected features is available at [demo.ipavlov.ai](https://demo.ipavlov.ai/) +## Quick Links -# Hello Bot in DeepPavlov +* Demo [*demo.ipavlov.ai*](https://demo.ipavlov.ai/) +* Documentation [*docs.deeppavlov.ai*](http://docs.deeppavlov.ai/) + * Model List [*docs:features/*](http://docs.deeppavlov.ai/en/master/features/overview.html) + * Contribution Guide [*docs:contribution_guide/*](http://docs.deeppavlov.ai/en/master/devguides/contribution_guide.html) +* Issues [*github/issues/*](https://github.com/deepmipt/DeepPavlov/issues) +* Forum [*forum.ipavlov.ai*](https://forum.ipavlov.ai/) +* Blogs [*ipavlov.ai/#rec108281800*](http://ipavlov.ai/#rec108281800) +* Tutorials [*examples/*](https://github.com/deepmipt/DeepPavlov/tree/master/examples) +* Docker Hub [*hub.docker.com/u/deeppavlov/*](https://hub.docker.com/u/deeppavlov/) + * Docker Images Documentation [*docs:docker-images/*](http://docs.deeppavlov.ai/en/master/intro/installation.html#docker-images) -Import key components to build HelloBot. -```python -from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill -from deeppavlov.agents.default_agent.default_agent import DefaultAgent -from deeppavlov.agents.processors.highest_confidence_selector import HighestConfidenceSelector -``` - -Create skills as pre-defined responses for a user's input containing specific keywords or matching regexps. Every skill returns response and confidence. -```python -hello = PatternMatchingSkill(responses=['Hello world!'], patterns=["hi", "hello", "good day"]) -bye = PatternMatchingSkill(['Goodbye world!', 'See you around'], patterns=["bye", "chao", "see you"]) -fallback = PatternMatchingSkill(["I don't understand, sorry", 'I can say "Hello world!"']) -``` - -Agent executes skills and then takes response from the skill with the highest confidence. -```python -HelloBot = DefaultAgent([hello, bye, fallback], skills_selector=HighestConfidenceSelector()) -``` - -Give the floor to the HelloBot! 
-```python -print(HelloBot(['Hello!', 'Boo...', 'Bye.'])) -``` - -[Jupyter notebook with HelloBot example.](https://colab.research.google.com/github/deepmipt/DeepPavlov/blob/master/docs/intro/hello_bot.ipynb) - - -# Features +**Models** -**Components** +[Named Entity Recognition](http://docs.deeppavlov.ai/en/master/features/models/ner.html) | [Slot filling](http://docs.deeppavlov.ai/en/master/features/models/slot_filling.html) -[Named Entity Recognition](http://docs.deeppavlov.ai/en/latest/components/ner.html) | [Slot filling](http://docs.deeppavlov.ai/en/latest/components/slot_filling.html) +[Intent/Sentence Classification](http://docs.deeppavlov.ai/en/master/features/models/classifiers.html) | [Question Answering over Text (SQuAD)](http://docs.deeppavlov.ai/en/master/features/models/squad.html) -[Intent/Sentence Classification](http://docs.deeppavlov.ai/en/latest/components/classifiers.html) | [Question Answering over Text (SQuAD)](http://docs.deeppavlov.ai/en/latest/components/squad.html) +[Sentence Similarity/Ranking](http://docs.deeppavlov.ai/en/master/features/models/neural_ranking.html) | [TF-IDF Ranking](http://docs.deeppavlov.ai/en/master/features/models/tfidf_ranking.html) -[Sentence Similarity/Ranking](http://docs.deeppavlov.ai/en/latest/components/neural_ranking.html) | [TF-IDF Ranking](http://docs.deeppavlov.ai/en/latest/components/tfidf_ranking.html) - -[Morphological tagging](http://docs.deeppavlov.ai/en/latest/components/morphotagger.html) | [Automatic Spelling Correction](http://docs.deeppavlov.ai/en/latest/components/spelling_correction.html) - -[ELMo training and fine-tuning](http://docs.deeppavlov.ai/en/latest/apiref/models/elmo.html) +[Morphological tagging](http://docs.deeppavlov.ai/en/master/features/models/morphotagger.html) | [Automatic Spelling Correction](http://docs.deeppavlov.ai/en/master/features/models/spelling_correction.html) +[ELMo training and fine-tuning](http://docs.deeppavlov.ai/en/master/apiref/models/elmo.html) **Skills** -[Goal(Task)-oriented Bot](http://docs.deeppavlov.ai/en/latest/skills/go_bot.html) | [Seq2seq Goal-Oriented bot](http://docs.deeppavlov.ai/en/latest/skills/seq2seq_go_bot.html) +[Goal(Task)-oriented Bot](http://docs.deeppavlov.ai/en/master/features/skills/go_bot.html) | [Seq2seq Goal-Oriented bot](http://docs.deeppavlov.ai/en/master/features/skills/seq2seq_go_bot.html) -[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/latest/skills/odqa.html) | [eCommerce Bot](http://docs.deeppavlov.ai/en/master/skills/ecommerce.html) +[Open Domain Questions Answering](http://docs.deeppavlov.ai/en/master/features/skills/odqa.html) | [eCommerce Bot](http://docs.deeppavlov.ai/en/master/features/skills/ecommerce.html) -[Frequently Asked Questions Answering](http://docs.deeppavlov.ai/en/latest/skills/faq.html) | [Pattern Matching](http://docs.deeppavlov.ai/en/latest/skills/pattern_matching.html) +[Frequently Asked Questions Answering](http://docs.deeppavlov.ai/en/master/features/skills/faq.html) | [Pattern Matching](http://docs.deeppavlov.ai/en/master/features/skills/pattern_matching.html) **Embeddings** -[ELMo embeddings for the Russian language](http://docs.deeppavlov.ai/en/latest/apiref/models/embedders.html#deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder) +[BERT embeddings for the Russian, Polish, Bulgarian, Czech, and informal English](http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html#bert) -[FastText embeddings for the Russian language](http://docs.deeppavlov.ai/en/latest/intro/pretrained_vectors.html) 
+[ELMo embeddings for the Russian language](http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html#elmo) + +[FastText embeddings for the Russian language](http://docs.deeppavlov.ai/en/master/features/pretrained_vectors.html#fasttext) **Auto ML** -[Tuning Models with Evolutionary Algorithm](http://docs.deeppavlov.ai/en/latest/intro/hypersearch.html) +[Tuning Models with Evolutionary Algorithm](http://docs.deeppavlov.ai/en/master/features/hypersearch.html) -# Installation +## Installation -0. Currently we support `Linux` and `Windows` platforms and `Python 3.6` +0. We support `Linux` and `Windows` platforms, `Python 3.6` and `Python 3.7` * **`Python 3.5` is not supported!** - * **`Windows` platform requires `Git` for Windows (for example, [git](https://git-scm.com/download/win)), `Visual Studio 2015/2017` with `C++` build tools installed!** + * **installation for `Windows` requires `Git`(for example, [git](https://git-scm.com/download/win)) and `Visual Studio 2015/2017` with `C++` build tools installed!** -1. Create a virtual environment with `Python 3.6`: - ``` - virtualenv env - ``` -2. Activate the environment: +1. Create and activate a virtual environment: * `Linux` ``` + python -m venv env source ./env/bin/activate ``` * `Windows` ``` + python -m venv env .\env\Scripts\activate.bat ``` -3. Install the package inside this virtual environment: +2. Install the package inside the environment: ``` pip install deeppavlov ``` -# Quick start +## QuickStart + +There is a bunch of great pre-trained NLP models in DeepPavlov. Each model is +determined by its config file. + +List of models is available on +[the doc page](http://docs.deeppavlov.ai/en/master/features/overview.html) in +the `deeppavlov.configs` (Python): -To use our pre-trained models, you should first install their requirements: +```python + from deeppavlov import configs ``` -python -m deeppavlov install + +When you're decided on the model (+ config file), there are two ways to train, +evaluate and infer it: + +* via [Command line interface (CLI)](#command-line-interface-cli) and +* via [Python](#python). + +Before making choice of an interface, install model's package requirements +(CLI): + +```bash + python -m deeppavlov install ``` - -Then download the models and data for them: + +* where `` is path to the chosen model's config file (e.g. + `deeppavlov/configs/ner/slotfill_dstc2.json`) or just name without + *.json* extension (e.g. `slotfill_dstc2`) + + +### Command line interface (CLI) + +To get predictions from a model interactively through CLI, run + +```bash + python -m deeppavlov interact [-d] ``` -python -m deeppavlov download + +* `-d` downloads required data -- pretrained model files and embeddings + (optional). + +You can train it in the same simple way: + +```bash + python -m deeppavlov train [-d] ``` -or you can use additional key `-d` to automatically download all required models and data with any command like `interact`, `riseapi`, etc. -Then you can interact with the models or train them with the following command: +Dataset will be downloaded regardless of whether there was `-d` flag or not. + +To train on your own data you need to modify dataset reader path in the +[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config). +The data format is specified in the corresponding model doc page. 
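For illustration only, one way to follow the paragraph above without editing the JSON by hand is to rewrite the reader path programmatically and save a new config for the CLI to train. The config chosen and the `./my_data/` folder below are placeholders, and your files must still follow the data format described on the model's doc page.

```python
 # Sketch only: point a config's dataset_reader at local data, then train it via the CLI.
 from deeppavlov import configs
 from deeppavlov.core.common.file import read_json, save_json

 config = read_json(configs.classifiers.insults_kaggle_conv_bert)  # any config with a file-based reader
 config['dataset_reader']['data_path'] = './my_data/'              # hypothetical folder with your dataset
 save_json(config, 'my_config.json')
 # then: python -m deeppavlov train my_config.json
```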
+ +There are even more actions you can perform with configs: +```bash + python -m deeppavlov [-d] ``` -python -m deeppavlov [-d] + +* `` can be + * `download` to download model's data (same as `-d`), + * `train` to train the model on the data specified in the config file, + * `evaluate` to calculate metrics on the same dataset, + * `interact` to interact via CLI, + * `riseapi` to run a REST API server (see + [doc](http://docs.deeppavlov.ai/en/master/integrations/rest_api.html)), + * `interactbot` to run as a Telegram bot (see + [doc](http://docs.deeppavlov.ai/en/master/integrations/telegram.html)), + * `interactmsbot` to run a Miscrosoft Bot Framework server (see + [doc](http://docs.deeppavlov.ai/en/master/integrations/ms_bot.html)), + * `predict` to get prediction for samples from *stdin* or from + ** if `-f ` is specified. +* `` specifies path (or name) of model's config file +* `-d` downloads required data + + +### Python + +To get predictions from a model interactively through Python, run + +```python + from deeppavlov import build_model + + model = build_model(, download=True) + + # get predictions for 'input_text1', 'input_text2' + model(['input_text1', 'input_text2']) ``` -* `` can be `train`, `predict`, `interact`, `interactbot`, `interactmsbot` or `riseapi` -* `` should be a path to an NLP pipeline json config (e.g. `deeppavlov/configs/ner/slotfill_dstc2.json`) -or a name without the `.json` extension of one of the config files [provided](deeppavlov/configs) in this repository (e.g. `slotfill_dstc2`) +* where `download=True` downloads required data from web -- pretrained model + files and embeddings (optional), +* `` is path to the chosen model's config file (e.g. + `"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"`) or + `deeppavlov.configs` attribute (e.g. + `deeppavlov.configs.ner.ner_ontonotes_bert_mult` without quotation marks). -For the `interactbot` mode you should specify Telegram bot token in `-t` parameter or in `TELEGRAM_TOKEN` environment variable. -Also you should use `--no-default-skill` optional flag if your component implements an interface of DeepPavlov [*Skill*](deeppavlov/core/skill/skill.py) to skip its wrapping with DeepPavlov [*DefaultStatelessSkill*](deeppavlov/skills/default_skill/default_skill.py). -If you want to get custom `/start` and `/help` Telegram messages for the running model you should: -* Add section to [*deeppavlov/utils/settings/models_info.json*](deeppavlov/utils/settings/models_info.json) with your custom Telegram messages -* In model config file specify `metadata.labels.telegram_utils` parameter with name which refers to the added section of [*deeppavlov/utils/settings/models_info.json*](deeppavlov/utils/settings/models_info.json) +You can train it in the same simple way: -You can also serve DeepPavlov models for: -* Microsoft Bot Framework ([see developer guide for the detailed instructions](http://docs.deeppavlov.ai/en/latest/devguides/ms_bot_integration.html)) -* Amazon Alexa ([see developer guide for the detailed instructions](http://docs.deeppavlov.ai/en/latest/devguides/amazon_alexa.html)) +```python + from deeppavlov import train_model + + model = train_model(, download=True) +``` + +* `download=True` downloads pretrained model, therefore the pretrained +model will be, first, loaded and then train (optional). + +Dataset will be downloaded regardless of whether there was ``-d`` flag or +not. 
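Since `train_model` returns the trained pipeline itself, it can usually be queried right away, in the same way as a pipeline obtained from `build_model`. The config and example inputs below are only an illustration.

```python
 # Sketch: train a config and immediately get predictions from the returned pipeline.
 from deeppavlov import configs, train_model

 model = train_model(configs.classifiers.insults_kaggle_conv_bert, download=True)
 print(model(['you are stupid', 'have a nice day']))  # one predicted label per input string
```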
+ +To train on your own data you need to modify dataset reader path in the +[train config doc](http://docs.deeppavlov.ai/en/master/intro/config_description.html#train-config). +The data format is specified in the corresponding model doc page. + +You can also calculate metrics on the dataset specified in your config file: + +```python + from deeppavlov import evaluate_model + + model = evaluate_model(, download=True) +``` -For `riseapi` mode you should specify api settings (host, port, etc.) in [*deeppavlov/utils/settings/server_config.json*](deeppavlov/utils/settings/server_config.json) configuration file. If provided, values from *model_defaults* section override values for the same parameters from *common_defaults* section. Model names in *model_defaults* section should be similar to the class names of the models main component. -Here is [detailed info on the DeepPavlov REST API](http://docs.deeppavlov.ai/en/latest/devguides/rest_api.html) +There are also available integrations with various messengers, see +[Telegram Bot doc page](http://docs.deeppavlov.ai/en/master/integrations/telegram.html) +and others in the Integrations section for more info. -All DeepPavlov settings files are stored in `deeppavlov/utils/settings` by default. You can get full path to it with `python -m deeppavlov.settings settings`. Also you can move it with with `python -m deeppavlov.settings settings -p ` (all your configuration settings will be preserved) or move it to default location with `python -m deeppavlov.settings settings -d` (all your configuration settings will be RESET to default ones). -For `predict` you can specify path to input file with `-f` or `--input-file` parameter, otherwise, data will be taken -from stdin. -Every line of input text will be used as a pipeline input parameter, so one example will consist of as many lines, -as many input parameters your pipeline expects. -You can also specify batch size with `-b` or `--batch-size` parameter. +## Breaking Changes -# Breaking changes +**Breaking changes in version 0.5.0** +- dependencies have to be reinstalled for most pipeline configurations +- models depending on `tensorflow` require `CUDA 10.0` to run on GPU instead of `CUDA 9.0` +- scikit-learn models have to be redownloaded or retrained **Breaking changes in version 0.5.0** - dependencies have to be reinstalled for most pipeline configurations @@ -164,48 +233,23 @@ was changed from `MODELS_PATH` to `MODEL_PATH`. - As of `version 0.1.0` all models, embeddings and other downloaded data for provided configurations are by default downloaded to the `.deeppavlov` directory in current user's home directory. This can be changed on per-model basis by modifying - a `ROOT_PATH` [variable](http://docs.deeppavlov.ai/en/latest/intro/config_description.html#variables) + a `ROOT_PATH` [variable](http://docs.deeppavlov.ai/en/master/intro/configuration.html#variables) or related fields one by one in model's configuration file. -- In configuration files, for all components, dataset readers and iterators `"name"` and `"class"` fields are combined +- In configuration files, for all features/models, dataset readers and iterators `"name"` and `"class"` fields are combined into the `"class_name"` field. - `deeppavlov.core.commands.infer.build_model_from_config()` was renamed to `build_model` and can be imported from the `deeppavlov` module directly. 
- The way arguments are passed to metrics functions during training and evaluation was changed and - [documented](http://docs.deeppavlov.ai/en/latest/intro/config_description.html#metrics). - - -# Documentation - -[docs.deeppavlov.ai](http://docs.deeppavlov.ai/) - -# Docker images - -We have built several DeepPavlov based Docker images, which include: -* DeepPavlov based Jupyter notebook Docker image; -* Docker images which serve some of our models and allow to access them via REST API (`riseapi` mode). + [documented](http://docs.deeppavlov.ai/en/0.4.0/intro/config_description.html#metrics). -Here is our [DockerHub repository](https://hub.docker.com/u/deeppavlov/) with images and deployment instructions. - -# Tutorials - -Jupyter notebooks explaining how to use DeepPalov for different tasks can be found in [/examples/](https://github.com/deepmipt/DeepPavlov/tree/master/examples) - -# License +## License DeepPavlov is Apache 2.0 - licensed. -# Support and collaboration - -If you have any questions, bug reports or feature requests, please feel free to post on our [Github Issues](https://github.com/deepmipt/DeepPavlov/issues) page. Please tag your issue with `bug`, `feature request`, or `question`. Also we’ll be glad to see your pull requests to add new datasets, models, embeddings, etc. In addition, we would like to invite everyone to join our [community forum](https://forum.ipavlov.ai/), where you can ask the DeepPavlov community any questions, share ideas, and find like-minded people. - -# The Team - -

- -

+## The Team DeepPavlov is built and maintained by [Neural Networks and Deep Learning Lab](https://mipt.ru/english/research/labs/neural-networks-and-deep-learning-lab) at [MIPT](https://mipt.ru/english/) within [iPavlov](http://ipavlov.ai/) project (part of [National Technology Initiative](https://asi.ru/eng/nti/)) and in partnership with [Sberbank](http://www.sberbank.com/). diff --git a/docs/apiref/skills/default_skill.rst b/docs/apiref/skills/default_skill.rst index eee11ba057..f6f954eb88 100644 --- a/docs/apiref/skills/default_skill.rst +++ b/docs/apiref/skills/default_skill.rst @@ -1,6 +1,6 @@ deeppavlov.skills.default_skill =============================== -Skill used for wrapping DeepPavlov components. +Skill used for wrapping DeepPavlov models. .. automodule:: deeppavlov.skills.default_skill.default_skill :members: diff --git a/docs/components/data_processors.rst b/docs/components/data_processors.rst deleted file mode 100644 index 1123b221e4..0000000000 --- a/docs/components/data_processors.rst +++ /dev/null @@ -1,75 +0,0 @@ -Data processors -=============== - -Preprocessors -------------- - -Preprocessor is a component that processes batch of samples. - -* Already implemented universal preprocessors of **tokenized texts** (each sample is a list of tokens): - - - **CharSplitter** (registered as ``char_splitter``) splits every token in given batch of tokenized samples to a sequence of characters. - - - **Mask** (registered as ``mask``) returns binary mask of corresponding length (padding up to the maximum length per batch. - - - **PymorphyRussianLemmatizer** (registered as ``pymorphy_russian_lemmatizer``) performs lemmatization for Russian language. - - - **Sanitizer** (registered as ``sanitizer``) removes all combining characters like diacritical marks from tokens. - - -* Already implemented universal preprocessors of **non-tokenized texts** (each sample is a string): - - - **DirtyCommentsPreprocessor** (registered as ``dirty_comments_preprocessor``) preprocesses samples converting samples to lowercase, paraphrasing English combinations with apostrophe ``'``, transforming more than three the same symbols to two symbols. - - - **StrLower** (registered as ``str_lower``) converts samples to lowercase. - - -* Already implemented universal preprocessors of another type of features: - - - **OneHotter** (registered as ``one_hotter``) performs one-hotting operation for the batch of samples where each sample is an integer label or a list of integer labels (can be combined in one batch). If ``multi_label`` parameter is set to ``True``, returns one one-dimensional vector per sample with several elements equal to ``1``. - - -Tokenizers ----------- - -Tokenizer is a component that processes batch of samples (each sample is a text string). - - - **LazyTokenizer** (registered as ``lazy_tokenizer``) tokenizes using ``nltk.word_tokenize``. - - - **NLTKTokenizer** (registered as ``nltk_tokenizer``) tokenizes using tokenizers from ``nltk.tokenize``, e.g. ``nltk.tokenize.wordpunct_tokenize``. - - - **NLTKMosesTokenizer** (registered as ``nltk_moses_tokenizer``) tokenizes and detokenizes using ``nltk.tokenize.moses.MosesDetokenizer``, ``nltk.tokenize.moses.MosesTokenizer``. - - - **RuSentTokenizer** (registered as ``ru_sent_tokenizer``) is a rule-based tokenizer for Russian language. - - - **RussianTokenizer** (registered as ``ru_tokenizer``) tokenizes or lemmatizes Russian texts using ``nltk.tokenize.toktok.ToktokTokenizer``. 
- - - **StreamSpacyTokenizer** (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts with spacy ``en_core_web_sm`` models by default. - - - **SplitTokenizer** (registered as ``split_tokenizer``) tokenizes using string method ``split``. - - -Embedders ---------- - -Embedder is a component that converts every token in a tokenized batch to a vector of particular dimensionality (optionally, returns a single vector per sample). - - - **GloVeEmbedder** (registered as ``glove``) reads embedding file in GloVe format (file starts with ``number_of_words embeddings_dim line`` followed by lines ``word embedding_vector``). If ``mean`` returns one vector per sample - mean of embedding vectors of tokens. - - - **FasttextEmbedder** (registered as ``fasttext``) reads embedding file in fastText format. If ``mean`` returns one vector per sample - mean of embedding vectors of tokens. - - - **BoWEmbedder** (registered as ``bow``) performs one-hot encoding of tokens using pre-built vocabulary. - - - **TfidfWeightedEmbedder** (registered as ``tfidf_weighted``) accepts embedder, tokenizer (for detokenization, by default, detokenize with joining with space), TFIDF vectorizer or counter vocabulary, optionally accepts tags vocabulary (to assign additional multiplcative weights to particular tags). If ``mean`` returns one vector per sample - mean of embedding vectors of tokens. - - - **ELMoEmbedder** (registered as ``elmo``) converts tokens to pre-trained contextual representations from large-scale bidirectional language models. See examples `here `__. - -Vectorizers ------------ - -Vectorizer is a component that converts batch of text samples to batch of vectors. - - - **SklearnComponent** (registered as ``sklearn_component``) is a DeepPavlov wrapper for most of sklearn estimators, vectorizers etc. For example, to get TFIDF-vecotrizer one should assign in config ``model_class`` to ``sklearn.feature_extraction.text:TfidfVectorizer``, ``infer_method`` to ``transform``, pass ``load_path``, ``save_path`` and other sklearn model parameters. - - - **HashingTfIdfVectorizer** (registered as ``hashing_tfidf_vectorizer``) implements hashing version of usual TFIDF-vecotrizer. It creates a TFIDF matrix from collection of documents of size ``[n_documents X n_features(hash_size)]``. - diff --git a/docs/devguides/contribution_guide.rst b/docs/devguides/contribution_guide.rst new file mode 100644 index 0000000000..010ddac1a1 --- /dev/null +++ b/docs/devguides/contribution_guide.rst @@ -0,0 +1,111 @@ + +Contribution Guide +===================== + +We are happy when you share your research with us and when you improve our +code! There is an easy way to contribute to our project, follow the steps +below. Your commit will be reviewed and added to our dev branch, and will be +added to master branch with the nearest release. Moreover, if you are a +dedicated contributor, you have a chance to get our t-shirt, get invited to +one of our events or even join our team ; ) + +How to contribute: + +#. Don't start the coding first. You should **post an** + `issue `_ to discuss the + features you want to add. If our team or other contributors accept your offer + or give a +1, assign the issue to yourself. Now proceed with coding : ) + +#. **Write readable code** and keep it + `PEP8 `_-ed, **add docstrings** + and keep them consistent with the + `Google Style `_. + Pay attention that we support typing annotations in every function + declaration. 
+ + Accompany code with **clear comments** to let other people understand the + flow of your mind. + + If you create new models, refer to the :doc:`Registry your model + ` section to add it to the DeepPavlov registry of + models. + +#. **Clone and/or update** your checked out **copy of DeepPavlov** to ensure + you have the most recent commits from the master branch: + + .. code:: bash + + git clone git@github.com:deepmipt/DeepPavlov.git + cd DeepPavlov/ + git fetch origin + git checkout dev + git pull + +#. **Create a new branch and switch** to it. Give it a meaningful name: + + .. code:: bash + + git checkout -b what_my_code_does_branch + +#. We ask you to **add some tests**. This will help us maintain the + framework, and this will help users to understand the feature you introduce. + Examples of implemented tests are available in `tests/ + `_ + directory. + +#. Please, **update the documentation**, if you committed significant changes + to our code. + +#. **Commit your changes and push** your feature branch to your GitHub fork. + Don't forget to reference the GitHub issue associated with your task. + Squash your commits into a single commit with git's interactive rebase. + Create a new branch if necessary. + + .. code:: bash + + git add my_files + git commit -m "fix: resolve issue #271" + git push origin my_branch + + Follow the `semantic commit notation `_ + for the name of the commit. + +#. **Create a new pull request** to get your feature branch merged into dev + for others to use. You’ll first need to ensure your feature branch contains + the latest changes from dev. + + .. code:: bash + + # (external contribs): make a new pull request: + + # merge latest dev changes into your feature branch + git fetch origin + git checkout dev + git pull origin dev + git checkout my_branch + git merge dev # you may need to manually resolve merge conflicts + +#. Once your change has been successfully merged, you can **remove the source + branch** and ensure your local copy is up to date: + + .. code:: bash + + git fetch origin + git checkout dev + git pull + git branch -d my_branch + git branch -d -r origin/my_branch + +#. **Relax and wait** : ) + +Some time after that your commit will be reassigned to somebody from our team +to check your code. +If the code is okay and all tests work fine, your commit will be approved and +added to the framework. Your research will become a part of a common big work +and other people will happily use it and thank you :D + +If you still have any questions, either on the contribution process or about +the framework itself, please ask us at our forum ``_. +Follow us on Facebook to get news on releases, new features, approved +contributions and resolved issues ``_ + diff --git a/docs/devguides/extending.rst b/docs/devguides/registry.rst similarity index 97% rename from docs/devguides/extending.rst rename to docs/devguides/registry.rst index 74b9f40f7c..a2a9cd4dde 100644 --- a/docs/devguides/extending.rst +++ b/docs/devguides/registry.rst @@ -1,5 +1,5 @@ -Extending the library -===================== +Register your model +=================== In order to extend the library, you need to register your classes and functions; it is done in two steps. 
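As a rough, hedged illustration of what registration looks like in code (not a substitute for the two steps this page describes), a custom class is usually registered with the same ``register`` decorator that built-in components such as ``proba2labels`` use. The name ``my_component`` and the class body below are hypothetical.

.. code:: python

    # Sketch: a custom component registered under a hypothetical name,
    # mirroring how built-in components (e.g. proba2labels) are registered.
    from deeppavlov.core.common.registry import register
    from deeppavlov.core.models.component import Component


    @register('my_component')
    class MyComponent(Component):
        def __init__(self, **kwargs) -> None:
            pass

        def __call__(self, batch: list) -> list:
            # components operate on batches of samples
            return [sample.lower() for sample in batch]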
diff --git a/docs/devguides/yandex_alice.rst b/docs/devguides/yandex_alice.rst deleted file mode 100644 index 97b4bc2abc..0000000000 --- a/docs/devguides/yandex_alice.rst +++ /dev/null @@ -1,53 +0,0 @@ -Yandex Alice integration -======================== - - -Pipelines -~~~~~~~~~ - - -Any DeepPavlov pipeline can be launched as a skill for Yandex.Alice. - -Configure host, port, model endpoint, GET request arguments in ``deeppavlov/utils/settings/server_config.json`` or see default values there. - -Use your own certificate for HTTPS if you have; otherwise, generate self-signed one like that: - -:: - - openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 -subj "/CN=MY_DOMAIN_OR_IP" -keyout my.key -out my.crt - -Then run - -:: - - python -m deeppavlov riseapi --api-mode alice --https --key my.key --cert my.crt [-d] [-p ] - - -Optional ``-d`` key is for dependencies download before service start. - -Optional ``-p`` key is used to override the port number. - -Now set up and test your dialog (https://dialogs.yandex.ru/developer/). Detailed documentation of the platform could be -found on https://tech.yandex.ru/dialogs/alice/doc/about-docpage/, while other library options described in -:doc:`REST API ` section. - - -Agents -~~~~~~ - -You can also run :doc:`agents ` as Alice skills: - -.. code:: python - - from deeppavlov.agents.default_agent.default_agent import DefaultAgent - from deeppavlov.agents.processors.highest_confidence_selector import HighestConfidenceSelector - from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill - from deeppavlov.utils.alice import start_agent_server - - skill_hello = PatternMatchingSkill(['Привет, мир!'], patterns=['привет', 'здравствуй', 'добрый день']) - skill_bye = PatternMatchingSkill(['Пока, мир', 'Ещё увидимся'], patterns=['пока', 'чао', 'увидимся', 'до свидания']) - skill_fallback = PatternMatchingSkill(['Извини, я не понимаю', 'Я умею здороваться )']) - - agent = DefaultAgent([skill_hello, skill_bye, skill_fallback], skills_processor=HighestConfidenceSelector()) - - start_agent_server(agent, host='0.0.0.0', port=7051, endpoint='/agent', ssl_key='my.key', ssl_cert='my.crt') diff --git a/docs/intro/hypersearch.rst b/docs/features/hypersearch.rst similarity index 100% rename from docs/intro/hypersearch.rst rename to docs/features/hypersearch.rst diff --git a/docs/components/bert.rst b/docs/features/models/bert.rst similarity index 86% rename from docs/components/bert.rst rename to docs/features/models/bert.rst index b31b1b876b..c8a847ee02 100644 --- a/docs/components/bert.rst +++ b/docs/features/models/bert.rst @@ -19,15 +19,16 @@ We have trained BERT-base model for other languages: - RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ - SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ +- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__ RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took -multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT training was done in the same manner as RuBERT. +multilingual version of BERT-base as initialization for RuBERT [1]_. 
-We have trained BERT-base model for conversational language style: -- Conversational BERT, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__ +SlavicBERT was trained on Russian News and four Wikipedias: Bulgarian, Czech, Polish, and Russian. +Subtoken vocabulary was built using this data. Multilingual BERT was used as an initialization for SlavicBERT. Conversational BERT was trained on the English part of Twitter, Reddit, DailyDialogues [3]_, OpenSubtitles [4]_, Debates [5]_, Blogs [6]_, Facebook News Comments. -We used this training data to build vocabulary of English subtokens and took +We used this training data to build the vocabulary of English subtokens and took English cased version of BERT-base as initialization for English Conversational BERT. Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and @@ -55,7 +56,7 @@ BERT for Named Entity Recognition (Sequence Tagging) ---------------------------------------------------- Pre-trained BERT model can be used for sequence tagging. Examples of usage of BERT for sequence tagging can be -found :doc:`here `. The module used for tagging is :class:`~deeppavlov.models.bert.bert_ner.BertNerModel`. +found :doc:`here `. The module used for tagging is :class:`~deeppavlov.models.bert.bert_ner.BertNerModel`. To tag each word representations of the first sub-word elements are extracted. So for each word there is only one vector produced. These representations are passed to a dense layer or Bi-RNN layer to produce distribution over tags. There is also an optional CRF layer on the top. @@ -70,7 +71,7 @@ Context Question Answering on `SQuAD `. +:doc:`Context Question Answering documentation page `. BERT for Ranking ---------------- @@ -86,12 +87,23 @@ Additional components :class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertSepRankerPredictorPreprocessor` and :class:`~deeppavlov.models.bert.bert_ranker.BertSepRankerPredictor` are for usage in the ``interact`` mode where the task for ranking is to retrieve the best possible response from some provided response base with the help of -the trained model. Working examples with the trained models are given :doc:`here `. -Statistics are available :doc:`here `. +the trained model. Working examples with the trained models are given :doc:`here `. +Statistics are available :doc:`here `. + +Using custom BERT in DeepPavlov +------------------------------- + +The previous sections describe the BERT based models implemented in DeepPavlov. +To change the BERT model used for initialization in any downstream task mentioned above the following parameters of +the :doc:`config ` file must be changed to match new BERT path: + +* download URL in the ``metadata.download.url`` part of the config +* ``bert_config_file``, ``pretrained_bert`` in the BERT based Component +* ``vocab_file`` in the ``bert_preprocessor`` .. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213. .. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682. .. [3] Yanran Li, Hui Su, Xiaoyu Shen, Wenjie Li, Ziqiang Cao, and Shuzi Niu. DailyDialog: A Manually Labelled Multi-turn Dialogue Dataset. IJCNLP 2017. .. [4] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. 
In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016) .. [5] Justine Zhang, Ravi Kumar, Sujith Ravi, Cristian Danescu-Niculescu-Mizil. Proceedings of NAACL, 2016. -.. [6] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. \ No newline at end of file +.. [6] J. Schler, M. Koppel, S. Argamon and J. Pennebaker (2006). Effects of Age and Gender on Blogging in Proceedings of 2006 AAAI Spring Symposium on Computational Approaches for Analyzing Weblogs. diff --git a/docs/components/classifiers.rst b/docs/features/models/classifiers.rst similarity index 99% rename from docs/components/classifiers.rst rename to docs/features/models/classifiers.rst index 0a4c770b62..c951f34f98 100644 --- a/docs/components/classifiers.rst +++ b/docs/features/models/classifiers.rst @@ -126,7 +126,7 @@ English. **deeppavlov.models.bert.BertClassifierModel** (see :doc:`here `) provides easy to use solution for classification problem using pre-trained BERT. Several **pre-trained English, multi-lingual and Russian BERT** models are provided in -:doc:`our BERT documentation `. +:doc:`our BERT documentation `. Two main components of BERT classifier pipeline in DeepPavlov are ``deeppavlov.models.preprocessors.BertPreprocessor`` (see :doc:`here `) @@ -418,4 +418,4 @@ References .. [7] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018. -.. [8] Devlin J. et al. Bert: Pre-training of deep bidirectional transformers for language understanding //arXiv preprint arXiv:1810.04805. – 2018. \ No newline at end of file +.. [8] Devlin J. et al. Bert: Pre-training of deep bidirectional transformers for language understanding //arXiv preprint arXiv:1810.04805. – 2018. diff --git a/docs/features/models/index.rst b/docs/features/models/index.rst new file mode 100644 index 0000000000..1e3893dc12 --- /dev/null +++ b/docs/features/models/index.rst @@ -0,0 +1,19 @@ +Models +====== + +.. toctree:: + :glob: + :maxdepth: 1 + + BERT-based models + Context Question Answering + Classification + Morphological Tagger + Named Entity Recognition + Neural Ranking + Slot filling + Spelling Correction + TF-IDF Ranking + Popularity Ranking + Knowledge Base Question answering + diff --git a/docs/components/kbqa.rst b/docs/features/models/kbqa.rst similarity index 89% rename from docs/components/kbqa.rst rename to docs/features/models/kbqa.rst index d3e2c346ef..6141eed08b 100644 --- a/docs/components/kbqa.rst +++ b/docs/features/models/kbqa.rst @@ -5,15 +5,15 @@ Description ----------- The Knowledge Base Question Answering model uses Wikidata to answer question. To find the answer the following -components are used: -:doc:`NER component ` performs entity discovery. In a given question it finds a substring which +models are used: +:doc:`NER model ` performs entity discovery. In a given question it finds a substring which is an entity, possible mentioned in a Knowledge Base. -:doc:`Classification component ` classifies the question into a set of predefined relations from +:doc:`Classification model ` classifies the question into a set of predefined relations from Wikidata. Substring extracted by the NER model is used for entity linking. Entity linking preforms matching the substring with one of the Wikidata entities. 
Matching is based on Levenshtein distance between the substring and an entity description. The result of the matching procedure is a set of candidate entities. The reset is search of the -entity among this set with one of the top-k relations predicted by classification component. +entity among this set with one of the top-k relations predicted by classification model. Use the model diff --git a/docs/components/morphotagger.rst b/docs/features/models/morphotagger.rst similarity index 99% rename from docs/components/morphotagger.rst rename to docs/features/models/morphotagger.rst index 8fd4a9f5d1..dabe43b0b3 100644 --- a/docs/components/morphotagger.rst +++ b/docs/features/models/morphotagger.rst @@ -488,7 +488,7 @@ Chainer The ``chainer`` part of the configuration file contains the specification of the neural network model and supplementary things such as vocabularies. Chainer refers to an instance of :class:`~deeppavlov.core.common.chainer.Chainer`, see -:doc:`config_description ` for a complete description. +:doc:`configuration ` for a complete description. The major part of ``chainer`` is ``pipe``. The ``pipe`` contains vocabularies and the network itself as well @@ -602,7 +602,7 @@ When an additional vectorizer is used, the first line is changed to ``"word_vectorizers": [["#pymorphy_vectorizer.dim", 128]]`` is appended. Config includes general parameters of :class:`~deeppavlov.core.models.component.Component` class, -described in the :doc:`config_description ` and specific +described in the :doc:`configuration ` and specific :class:`~deeppavlov.models.morpho_tagger.morpho_tagger.MorphoTagger` parameters. The latter include @@ -650,4 +650,4 @@ and produces the output of the format 7 married VERB Tense=Past|VerbForm=Part|Voice=Pass 8 . PUNCT _ -To generate output in 10 column CONLL-U format add ``"format_mode": "ud"`` to the described section. \ No newline at end of file +To generate output in 10 column CONLL-U format add ``"format_mode": "ud"`` to the described section. diff --git a/docs/components/ner.rst b/docs/features/models/ner.rst similarity index 99% rename from docs/components/ner.rst rename to docs/features/models/ner.rst index 7239e617a2..145e21f18c 100644 --- a/docs/components/ner.rst +++ b/docs/features/models/ner.rst @@ -5,7 +5,7 @@ Train and use the model ----------------------- There are two main types of models available: standard RNN based and BERT based. To see details about BERT based -models see :doc:`here `. +models see :doc:`here `. Any pre-trained model can be used for inference from both Command Line Interface (CLI) and Python. Before using the model make sure that all required packages are installed using the command: diff --git a/docs/components/neural_ranking.rst b/docs/features/models/neural_ranking.rst similarity index 98% rename from docs/components/neural_ranking.rst rename to docs/features/models/neural_ranking.rst index c2b048bbca..6023a0dae8 100644 --- a/docs/components/neural_ranking.rst +++ b/docs/features/models/neural_ranking.rst @@ -1,7 +1,7 @@ Ranking and paraphrase identification ===================================== -This library component solves the tasks of ranking and paraphrase identification based on semantic similarity +This library model solves the tasks of ranking and paraphrase identification based on semantic similarity which is trained with siamese neural networks. The trained network can retrieve the response closest semantically to a given context from some database or answer whether two sentences are paraphrases or not. 
It is possible to build automatic semantic FAQ systems with such neural architectures. @@ -39,7 +39,7 @@ Further the trained representation-based model can be run for inference over the python -m deeppavlov interact ranking_ubuntu_v2_bert_sep_interact [-d] -Statistics on the models quality are available :doc:`here `. +Statistics on the models quality are available :doc:`here `. Building your own response base for bert ranking ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/components/popularity_ranking.rst b/docs/features/models/popularity_ranking.rst similarity index 100% rename from docs/components/popularity_ranking.rst rename to docs/features/models/popularity_ranking.rst diff --git a/docs/components/slot_filling.rst b/docs/features/models/slot_filling.rst similarity index 95% rename from docs/components/slot_filling.rst rename to docs/features/models/slot_filling.rst index dc53038b63..9634e15b4c 100644 --- a/docs/components/slot_filling.rst +++ b/docs/features/models/slot_filling.rst @@ -1,9 +1,9 @@ Neural Named Entity Recognition and Slot Filling ================================================ -This component solves Slot-Filling task using Levenshtein search and different neural network architectures for NER. -To read about NER without slot filling please address :doc:`NER documentation `. -This component serves for solving DSTC 2 Slot-Filling task. In most of the cases, NER task can be formulated as: +This model solves Slot-Filling task using Levenshtein search and different neural network architectures for NER. +To read about NER without slot filling please address :doc:`NER documentation `. +This model serves for solving DSTC 2 Slot-Filling task. In most of the cases, NER task can be formulated as: *Given a sequence of tokens (words, and maybe punctuation symbols) provide a tag from a predefined set of tags for each token in the @@ -47,7 +47,7 @@ Slot Filling is a typical step after the NER. It can be formulated as: *Given an entity of a certain type and a set of all possible values of this entity type provide a normalized form of the entity.* -In this component, the Slot Filling task is solved by Levenshtein +In this model, the Slot Filling task is solved by Levenshtein Distance search across all known entities of a given type. For example, there is an entity of "food" type: diff --git a/docs/components/spelling_correction.rst b/docs/features/models/spelling_correction.rst similarity index 100% rename from docs/components/spelling_correction.rst rename to docs/features/models/spelling_correction.rst diff --git a/docs/components/squad.rst b/docs/features/models/squad.rst similarity index 100% rename from docs/components/squad.rst rename to docs/features/models/squad.rst diff --git a/docs/components/tfidf_ranking.rst b/docs/features/models/tfidf_ranking.rst similarity index 100% rename from docs/components/tfidf_ranking.rst rename to docs/features/models/tfidf_ranking.rst diff --git a/docs/intro/features.rst b/docs/features/overview.rst similarity index 96% rename from docs/intro/features.rst rename to docs/features/overview.rst index 409c6c57da..697dd5b1ac 100644 --- a/docs/intro/features.rst +++ b/docs/features/overview.rst @@ -1,11 +1,13 @@ Features ======== -Components ----------- +.. contents:: :local: + +Models +------ -:doc:`NER component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +NER model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are two models for Named Entity Recognition task in DeepPavlov: BERT-based and Bi-LSTM+CRF. 
The models predict tags (in BIO format) for tokens @@ -38,10 +40,10 @@ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01 | DSTC2 | | :config:`ner_dstc2.json ` | 97.1 | +---------------------------------------------------------+-------+---------------------------------------------------------------------------+-------------+ -:doc:`Slot filling components ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Slot filling models :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Based on fuzzy Levenshtein search to extract normalized slot values from text. The components either rely on NER results +Based on fuzzy Levenshtein search to extract normalized slot values from text. The models either rely on NER results or perform needle in haystack search. +---------------------------------------------------------------------------------------------------------------------------+------------------+ @@ -51,10 +53,10 @@ or perform needle in haystack search. +---------------------------------------------------------------------------------------------------------------------------+------------------+ -:doc:`Classification component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Classification model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Component for classification tasks (intents, sentiment, etc) on word-level. Shallow-and-wide CNN, Deep CNN, BiLSTM, +Model for classification tasks (intents, sentiment, etc) on word-level. Shallow-and-wide CNN, Deep CNN, BiLSTM, BiLSTM with self-attention and other models are presented. The model also allows multilabel classification of texts. Several pre-trained models are available and presented in Table below. @@ -142,66 +144,8 @@ trained on Reddit dataset. .. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017 -:doc:`Goal-oriented bot ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Based on Hybrid Code Networks (HCNs) architecture from `Jason D. Williams, Kavosh Asadi, -Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control -with supervised and reinforcement learning – 2017 `__. -It allows to predict responses in a goal-oriented dialog. The model is -customizable: embeddings, slot filler and intent classifier can be switched on and off on demand. 
- -Available pre-trained models and their comparison with existing benchmarks: - -+----------------+------+-------------------------------------------------------------------------------------+---------------+---------+------------+------------------+ -| Dataset | Lang | Model | Metric | Valid | Test | Downloads | -+================+======+=====================================================================================+===============+=========+============+==================+ -| `DSTC 2`_ [*]_ | En | :config:`bot with slot filler ` | Turn Accuracy | 0.521 | 0.529 | 400 Mb | -+ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ -| | | :config:`bot with slot filler & intents & attention ` | | 0.555 | **0.561** | 8.5 Gb | -+----------------+ +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ -| `DSTC 2`_ | | Bordes and Weston (2016) | | -- | 0.411 | -- | -+ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ -| | | Eric and Manning (2017) | | -- | 0.480 | -- | -+ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ -| | | Perez and Liu (2016) | | -- | 0.487 | -- | -+ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ -| | | Williams et al. (2017) | | -- | **0.556** | -- | -+----------------+------+-------------------------------------------------------------------------------------+---------------+---------+------------+------------------+ - -.. _`DSTC 2`: http://camdial.org/~mh521/dstc/ - -.. [*] There were a few :ref:`modifications ` to the original dataset. - - -:doc:`Seq2seq goal-oriented bot ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Dialogue agent predicts responses in a goal-oriented dialog and is able to handle -multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, -and point-of-interest navigation). The model is end-to-end differentiable and -does not need to explicitly model dialogue state or belief trackers. - -Comparison of deeppavlov pretrained model with others: - -+-------------------+------+----------------------------------------------------+------------------+-----------------+-----------+ -| Dataset | Lang | Model | Valid BLEU | Test BLEU | Downloads | -+===================+======+====================================================+==================+=================+===========+ -| `Stanford Kvret`_ | En | :config:`KvretNet ` | 0.131 | **0.132** | 10 Gb | -+ + +----------------------------------------------------+------------------+-----------------+-----------+ -| | | KvretNet, Mihail Eric et al. (2017) | -- | **0.132** | -- + -+ + +----------------------------------------------------+------------------+-----------------+-----------+ -| | | CopyNet, Mihail Eric et al. (2017) | -- | 0.110 | -- + -+ + +----------------------------------------------------+------------------+-----------------+-----------+ -| | | Attn Seq2Seq, Mihail Eric et al. (2017) | -- | 0.102 | -- + -+ + +----------------------------------------------------+------------------+-----------------+-----------+ -| | | Rule-based, Mihail Eric et al. 
(2017) | -- | 0.066 | -- + -+-------------------+------+----------------------------------------------------+------------------+-----------------+-----------+ - -.. _`Stanford Kvret`: https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/ - -:doc:`Automatic spelling correction component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Automatic spelling correction model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Pipelines that use candidates search in a static dictionary and an ARPA language model to correct spelling errors. @@ -233,8 +177,8 @@ on Automatic Spelling Correction for Russian: -:doc:`Ranking component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Ranking model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The main neural ranking model based on `LSTM-based deep learning models for non-factoid answer selection `__. The model performs ranking of responses or contexts from some database by their @@ -389,8 +333,8 @@ References: -:doc:`TF-IDF Ranker component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +TF-IDF Ranker model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Based on `Reading Wikipedia to Answer Open-Domain Questions `__. The model solves the task of document retrieval for a given query. @@ -401,8 +345,9 @@ Based on `Reading Wikipedia to Answer Open-Domain Questions ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Question Answering model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Models in this section solve the task of looking for an answer on a question in a given context (`SQuAD `__ task format). There are two models for this task in DeepPavlov: BERT-based and R-Net. Both models predict answer start and end @@ -432,14 +377,14 @@ In the case when answer is not necessary present in given context we have :confi model. This model outputs empty string in case if there is no answer in context. -:doc:`Morphological tagging component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Morphological tagging model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Based on character-based approach to morphological tagging `Heigold et al., 2017. An extensive empirical evaluation of character-based morphological tagging for 14 languages `__. A state-of-the-art model for Russian and several other languages. Model takes as input tokenized sentences and outputs the corresponding sequence of morphological labels in `UD format `__. The table below -contains word and sentence accuracy on UD2.0 datasets. For more scores see :doc:`full table `. +contains word and sentence accuracy on UD2.0 datasets. For more scores see :doc:`full table `. .. table:: :widths: auto @@ -473,8 +418,8 @@ contains word and sentence accuracy on UD2.0 datasets. For more scores see :doc: .. _`UD2.0`: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-1983 .. _`UD Pipe 1.2`: http://ufal.mff.cuni.cz/udpipe -:doc:`Frequently Asked Questions (FAQ) component ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Frequently Asked Questions (FAQ) model :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Set of pipelines for FAQ task: classifying incoming question into set of known questions and return prepared answer. 
You can build different pipelines based on: tf-idf, weighted fasttext, cosine similarity, logistic regression. @@ -483,8 +428,67 @@ You can build different pipelines based on: tf-idf, weighted fasttext, cosine si Skills ------ -:doc:`eCommerce bot ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Goal-oriented bot :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Based on Hybrid Code Networks (HCNs) architecture from `Jason D. Williams, Kavosh Asadi, +Geoffrey Zweig, Hybrid Code Networks: practical and efficient end-to-end dialog control +with supervised and reinforcement learning – 2017 `__. +It allows to predict responses in a goal-oriented dialog. The model is +customizable: embeddings, slot filler and intent classifier can be switched on and off on demand. + +Available pre-trained models and their comparison with existing benchmarks: + ++----------------+------+-------------------------------------------------------------------------------------+---------------+---------+------------+------------------+ +| Dataset | Lang | Model | Metric | Valid | Test | Downloads | ++================+======+=====================================================================================+===============+=========+============+==================+ +| `DSTC 2`_ [*]_ | En | :config:`bot with slot filler ` | Turn Accuracy | 0.521 | 0.529 | 400 Mb | ++ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ +| | | :config:`bot with slot filler & intents & attention ` | | 0.555 | **0.561** | 8.5 Gb | ++----------------+ +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ +| `DSTC 2`_ | | Bordes and Weston (2016) | | -- | 0.411 | -- | ++ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ +| | | Eric and Manning (2017) | | -- | 0.480 | -- | ++ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ +| | | Perez and Liu (2016) | | -- | 0.487 | -- | ++ + +-------------------------------------------------------------------------------------+ +---------+------------+------------------+ +| | | Williams et al. (2017) | | -- | **0.556** | -- | ++----------------+------+-------------------------------------------------------------------------------------+---------------+---------+------------+------------------+ + +.. _`DSTC 2`: http://camdial.org/~mh521/dstc/ + +.. [*] There were a few :ref:`modifications ` to the original dataset. + + +Seq2seq goal-oriented bot :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Dialogue agent predicts responses in a goal-oriented dialog and is able to handle +multiple domains (pretrained bot allows calendar scheduling, weather information retrieval, +and point-of-interest navigation). The model is end-to-end differentiable and +does not need to explicitly model dialogue state or belief trackers. 
+ +Comparison of deeppavlov pretrained model with others: + ++-------------------+------+----------------------------------------------------+------------------+-----------------+-----------+ +| Dataset | Lang | Model | Valid BLEU | Test BLEU | Downloads | ++===================+======+====================================================+==================+=================+===========+ +| `Stanford Kvret`_ | En | :config:`KvretNet ` | 0.131 | **0.132** | 10 Gb | ++ + +----------------------------------------------------+------------------+-----------------+-----------+ +| | | KvretNet, Mihail Eric et al. (2017) | -- | **0.132** | -- + ++ + +----------------------------------------------------+------------------+-----------------+-----------+ +| | | CopyNet, Mihail Eric et al. (2017) | -- | 0.110 | -- + ++ + +----------------------------------------------------+------------------+-----------------+-----------+ +| | | Attn Seq2Seq, Mihail Eric et al. (2017) | -- | 0.102 | -- + ++ + +----------------------------------------------------+------------------+-----------------+-----------+ +| | | Rule-based, Mihail Eric et al. (2017) | -- | 0.066 | -- + ++-------------------+------+----------------------------------------------------+------------------+-----------------+-----------+ + +.. _`Stanford Kvret`: https://nlp.stanford.edu/blog/a-new-multi-turn-multi-domain-task-oriented-dialogue-dataset/ + + +eCommerce bot :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The eCommerce bot intends to retrieve product items from catalog in sorted order. In addition, it asks an user to provide additional information to specify the search. @@ -493,8 +497,8 @@ The eCommerce bot intends to retrieve product items from catalog in sorted order About **130 Mb** on disc required for eCommerce bot with TfIdf-based ranker and **500 Mb** for BLEU-based ranker. -:doc:`ODQA ` -~~~~~~~~~~~~~~~~~~~~~~~~~~ +ODQA :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ An open domain question answering skill. The skill accepts free-form questions about the world and outputs an answer based on its Wikipedia knowledge. @@ -516,8 +520,8 @@ based on its Wikipedia knowledge. AutoML -------------------- -:doc:`Hyperparameters optimization ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Hyperparameters optimization :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hyperparameters optimization (either by cross-validation or neural evolution) for DeepPavlov models that requires only some small changes in a config file. @@ -526,14 +530,14 @@ that requires only some small changes in a config file. Embeddings ---------- -:doc:`Pre-trained embeddings for the Russian language ` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Pre-trained embeddings :doc:`[docs] ` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Word vectors for the Russian language trained on joint `Russian Wikipedia `__ and `Lenta.ru `__ corpora. 
-Examples of some components +Examples of some models --------------------------- - Run goal-oriented bot with Telegram interface: diff --git a/docs/intro/pretrained_vectors.rst b/docs/features/pretrained_vectors.rst similarity index 81% rename from docs/intro/pretrained_vectors.rst rename to docs/features/pretrained_vectors.rst index a147aac436..7b78658024 100644 --- a/docs/intro/pretrained_vectors.rst +++ b/docs/features/pretrained_vectors.rst @@ -1,5 +1,37 @@ Pre-trained embeddings -============================ +====================== + +BERT +---- + +We are publishing several pre-trained BERT models: + +* RuBERT for Russian language +* Slavic BERT for Bulgarian, Czech, Polish, and Russian +* Conversational BERT for informal English + +Description of these models is available in the :doc:`BERT section ` of the docs. + +License +~~~~~~~ + +The pre-trained models are distributed under the `License Apache +2.0 `__. + +Downloads +~~~~~~~~~ + +The models can be run with the original `BERT repo `_ code. The download links are: + ++----------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ +| Description | Model parameters | Download link | ++======================+====================================================+============================================================================================================================================+ +| RuBERT | vocab size = 120K, parameters = 180M, size = 700MB | `[rubert_cased_L-12_H-768_A-12] `__ | ++----------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ +| Slavic BERT | vocab size = 120K, parameters = 180M, size = 700MB | `[bg_cs_pl_ru_cased_L-12_H-768_A-12] `__ | ++----------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ +| Conversational BERT | vocab size = 30K, parameters = 110M, size = 400MB | `[conversational_cased_L-12_H-768_A-12] `__ | ++----------------------+----------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------+ ELMo ---- diff --git a/docs/skills/aiml_skill.rst b/docs/features/skills/aiml_skill.rst similarity index 100% rename from docs/skills/aiml_skill.rst rename to docs/features/skills/aiml_skill.rst diff --git a/docs/skills/ecommerce.rst b/docs/features/skills/ecommerce.rst similarity index 100% rename from docs/skills/ecommerce.rst rename to docs/features/skills/ecommerce.rst diff --git a/docs/skills/faq.rst b/docs/features/skills/faq.rst similarity index 100% rename from docs/skills/faq.rst rename to docs/features/skills/faq.rst diff --git a/docs/skills/go_bot.rst b/docs/features/skills/go_bot.rst similarity index 99% rename from docs/skills/go_bot.rst rename to docs/features/skills/go_bot.rst index a7a8e3f5e9..bfcffabb95 100644 --- a/docs/skills/go_bot.rst +++ b/docs/features/skills/go_bot.rst @@ -366,4 +366,4 @@ References 2016 `_ -.. |alt text| image:: ../_static/gobot_diagram.png +.. 
|alt text| image:: ../../_static/gobot_diagram.png diff --git a/docs/features/skills/index.rst b/docs/features/skills/index.rst new file mode 100644 index 0000000000..6ef60eee19 --- /dev/null +++ b/docs/features/skills/index.rst @@ -0,0 +1,15 @@ +Skills +====== + +.. toctree:: + :glob: + :maxdepth: 1 + + Goal-Oriented Dialogue Bot + Open-Domain Question Answering + Pattern Matching + Sequence-To-Sequence Dialogue Bot + Frequently Asked Questions Answering + eCommerce Bot + AIML + diff --git a/docs/skills/odqa.rst b/docs/features/skills/odqa.rst similarity index 100% rename from docs/skills/odqa.rst rename to docs/features/skills/odqa.rst diff --git a/docs/skills/pattern_matching.rst b/docs/features/skills/pattern_matching.rst similarity index 82% rename from docs/skills/pattern_matching.rst rename to docs/features/skills/pattern_matching.rst index 882cde2561..adc687d8e8 100644 --- a/docs/skills/pattern_matching.rst +++ b/docs/features/skills/pattern_matching.rst @@ -5,5 +5,3 @@ A :doc:`basic skill implementation` that one of predefined responses chosen at random. Skill's confidence equals ``1`` for incoming utterances that match any of predefined patterns or ``0`` for utterances that do not. If no patterns were defined for a skill, its confidence will always be equal to ``0.5``. - -Its usage example can be found in the :doc:`Hello bot! ` tutorial. diff --git a/docs/skills/seq2seq_go_bot.rst b/docs/features/skills/seq2seq_go_bot.rst similarity index 99% rename from docs/skills/seq2seq_go_bot.rst rename to docs/features/skills/seq2seq_go_bot.rst index 7ef0981b1d..f76634aed3 100644 --- a/docs/skills/seq2seq_go_bot.rst +++ b/docs/features/skills/seq2seq_go_bot.rst @@ -209,4 +209,4 @@ References .. [2] `A New Multi-Turn, Multi-Domain, Task-Oriented Dialogue Dataset - 2017 `_ -.. |alt text| image:: ../_static/kvret_diagram.png +.. |alt text| image:: ../../_static/kvret_diagram.png diff --git a/docs/index.rst b/docs/index.rst index 7ed9a424fd..8fc6ba862b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,63 +5,53 @@ Welcome to DeepPavlov's documentation! :glob: :maxdepth: 1 - Hello bot! + QuickStart Installation - Conceptual overview - Features - Configuration files - Pre-trained embeddings - AutoML + General concepts + Configuration file .. toctree:: :glob: - :maxdepth: 1 - :caption: Components - - Data Processors - BERT-based models - Context Question Answering - Classification - Morphological Tagger - Named Entity Recognition - Neural Ranking - Slot filling - Spelling Correction - TF-IDF Ranking - Popularity Ranking - Knowledge Base Question answering + :maxdepth: 2 + :caption: Features + + Overview + Models + Skills + Pre-trained embeddings + AutoML .. toctree:: :glob: - :maxdepth: 1 - :caption: Skills + :maxdepth: 3 + :caption: Integrations - Goal-Oriented Dialogue Bot - Open-Domain Question Answering - Pattern Matching - Sequence-To-Sequence Dialogue Bot - Frequently Asked Questions Answering - eCommerce Bot - AIML + REST API + Telegram integration + Yandex Alice integration + Amazon Alexa integration + Microsoft Bot Framework integration + Amazon AWS deployment + Deeppavlov settings .. toctree:: :glob: :maxdepth: 3 - :caption: Package Reference + :caption: Developer Guides - apiref/* + Contribution guide + Registry your model .. 
toctree:: :glob: :maxdepth: 3 - :caption: Developer Guides - - devguides/* + :caption: Package Reference + apiref/* Indices and tables diff --git a/docs/devguides/amazon_alexa.rst b/docs/integrations/amazon_alexa.rst similarity index 93% rename from docs/devguides/amazon_alexa.rst rename to docs/integrations/amazon_alexa.rst index 3c8ba8aa4f..4385bd344c 100644 --- a/docs/devguides/amazon_alexa.rst +++ b/docs/integrations/amazon_alexa.rst @@ -1,8 +1,8 @@ Amazon Alexa integration ======================== -DeepPavlov components or skills can be made available for inference via Amazon Alexa. Because of Alexa predominantly -conversational nature (raw text in, raw text out), the best results can be achieved with components with raw text both +DeepPavlov models can be made available for inference via Amazon Alexa. Because of Alexa predominantly +conversational nature (raw text in, raw text out), the best results can be achieved with models with raw text both in input and output (ODQA, SQuAD, etc.). Also we **highly** recommend you to study `Alexa skills building basics `__ @@ -14,7 +14,7 @@ Further instructions are given counting on the fact that you are already familia The whole integrations process takes two main steps: 1. Skill setup in Amazon Alexa Developer console -2. DeepPavlov skill/component REST service mounting +2. DeepPavlov skill/model REST service mounting 1. Skill setup -------------- @@ -167,12 +167,12 @@ console: Please note, that in both cases you should have only one intent with only one slot defined in Alexa Development Console. -2. DeepPavlov skill/component REST service mounting +2. DeepPavlov skill/model REST service mounting --------------------------------------------------- Alexa sends request to the https endpoint which was set in the **Endpoint** section of Alexa Development Console. -You should deploy DeepPavlov skill/component REST service on this +You should deploy DeepPavlov skill/model REST service on this endpoint or redirect it to your REST service. Full REST endpoint URL can be obtained by the swagger ``apidocs/`` endpoint. We remind you that Alexa requires https endpoint with valid certificate from CA. `Here is the guide `__ @@ -182,7 +182,7 @@ Your intent and slot names defined in Alexa Development Console should be the sa DeepPavlov settings file ``deeppavlov/utils/settings/server_config.json``. JSON examples from this guide use default values from the settings file. -DeepPavlov skill/component can be made available for Amazon Alexa as a REST service by: +DeepPavlov skill/model can be made available for Amazon Alexa as a REST service by: .. code:: bash @@ -196,13 +196,13 @@ before service start. Optional ``-p`` key can be provided to override the port value from a settings file. -Optional ``--stateful`` flag should be provided for stateful skills/components. +Optional ``--stateful`` flag should be provided for stateful skills/models. -Optional ``--multi-instance`` can be provided if you wish to raise separate skill/component instance +Optional ``--multi-instance`` can be provided if you wish to raise separate skill/model instance for **each** conversation. -You should use ``--no-default-skill`` optional flag if your component implements an interface of DeepPavlov *Skill* +You should use ``--no-default-skill`` optional flag if your model implements an interface of DeepPavlov *Skill* to skip its wrapping with DeepPavlov *DefaultStatelessSkill*. REST service properties (host, port, https options) are provided in ``deeppavlov/utils/settings/server_config.json``. 
Please note, -that all command line parameters override corresponding config ones. \ No newline at end of file +that all command line parameters override corresponding config ones. diff --git a/docs/devguides/aws_ec2.rst b/docs/integrations/aws_ec2.rst similarity index 97% rename from docs/devguides/aws_ec2.rst rename to docs/integrations/aws_ec2.rst index 3e86a7f0aa..be69ee2122 100644 --- a/docs/devguides/aws_ec2.rst +++ b/docs/integrations/aws_ec2.rst @@ -28,7 +28,7 @@ Deployment process consists of two main stages: :width: 800 4. Proceed to Step 4. Your instance storage size should be no less than 50 GiB to - store ODQA components. + store ODQA models. .. image:: ../_static/aws_ec2/04_add_storage.png :width: 800 @@ -113,7 +113,7 @@ Deployment process consists of two main stages: ``python -m deeppavlov install en_odqa_pop_infer_enwiki20180211`` -8. Download ODQA components (it will take quite a time): +8. Download ODQA models (it will take quite a time): ``python -m deeppavlov download en_odqa_pop_infer_enwiki20180211`` diff --git a/docs/devguides/ms_bot_integration.rst b/docs/integrations/ms_bot.rst similarity index 85% rename from docs/devguides/ms_bot_integration.rst rename to docs/integrations/ms_bot.rst index a476cc7641..4f1443ea50 100644 --- a/docs/devguides/ms_bot_integration.rst +++ b/docs/integrations/ms_bot.rst @@ -1,13 +1,13 @@ Microsoft Bot Framework integration =================================== -Each library component or skill can be made available for +Each library model or skill can be made available for inference via Microsoft Bot Framework. The whole process takes two main steps: 1. Web App Bot setup in Microsoft Azure -2. DeepPavlov skill/component REST service mounting +2. DeepPavlov skill/model REST service mounting 1. Web App Bot setup -------------------- @@ -47,13 +47,13 @@ The whole process takes two main steps: 3.1 Navigate to your bot *Settings* menu. - 3.2 Input your DeepPavlov skill/component REST service URL + 3.2 Input your DeepPavlov skill/model REST service URL to the *Messaging endpoint* pane. Note, that Microsoft Bot Framework requires https endpoint with valid certificate from CA. 3.3 Save somewhere *Microsoft App ID* (*App ID*). To get *App Secret* you need to proceed to the *Manage* ling near the *Microsoft App ID* pane. - You will need both during your DeepPavlov skill/component REST service start. + You will need both during your DeepPavlov skill/model REST service start. .. image:: ../_static/ms_bot_framework/04_bot_settings.png :width: 1500 @@ -67,18 +67,18 @@ The whole process takes two main steps: .. image:: ../_static/ms_bot_framework/05_bot_channels.png :width: 1500 -2. DeepPavlov skill/component REST service mounting +2. DeepPavlov skill/model REST service mounting --------------------------------------------------- MS Bot Framework sends messages from all channels to the https endpoint which was set in the **Web App Bot connection configuration** section. -You should deploy DeepPavlov skill/component REST service on this +You should deploy DeepPavlov skill/model REST service on this endpoint or terminate it to your REST service. Full REST endpoint URL can be obtained by the swagger ``apidocs/`` endpoint. We remind you that Microsoft Bot Framework requires https endpoint with valid certificate from CA. -Each DeepPavlov skill/component can be made available for MS Bot Framework +Each DeepPavlov skill/model can be made available for MS Bot Framework as a REST service by: .. code:: bash @@ -96,12 +96,12 @@ before service start. 
Optional ``-p`` key can be provided to override the port value from a settings file. -Optional ``--stateful`` flag should be provided for stateful skills/components. +Optional ``--stateful`` flag should be provided for stateful skills/models. -Optional ``--multi-instance`` can be provided if you wish to raise separate skill/component instance +Optional ``--multi-instance`` can be provided if you wish to raise separate skill/model instance for **each** conversation. -You should use ``--no-default-skill`` optional flag if your component implements an interface of DeepPavlov *Skill* +You should use ``--no-default-skill`` optional flag if your model implements an interface of DeepPavlov *Skill* to skip its wrapping with DeepPavlov *DefaultStatelessSkill*. REST service properties (host, port) are provided in ``deeppavlov/utils/settings/server_config.json``. You can also store your diff --git a/docs/devguides/rest_api.rst b/docs/integrations/rest_api.rst similarity index 67% rename from docs/devguides/rest_api.rst rename to docs/integrations/rest_api.rst index 6dbc36d841..98a93ec442 100644 --- a/docs/devguides/rest_api.rst +++ b/docs/integrations/rest_api.rst @@ -1,22 +1,38 @@ REST API ======== -Each library component or skill can be easily made available for +Each DeepPavlov model can be easily made available for inference as a REST web service. The general method is: -``python -m deeppavlov riseapi [-d] [-p ]`` +.. code:: bash -(optional ``-d`` key is for dependencies download before service start) + python -m deeppavlov riseapi [-d] [-p ] + + +* ``-d``: downloads model specific data before starting the service. +* ``-p ``: sets the port to ````. Overrides default + settings from ``deeppavlov/utils/settings/server_config.json``. + +The command will print the used host and port. Default web service properties +(host, port, model endpoint, GET request arguments) can be modified via changing +``deeppavlov/utils/settings/server_config.json`` file. + +To interact with the REST API via graphical interface open +``:/apidocs`` in a browser (Flasgger UI). + +Advanced configuration +~~~~~~~~~~~~~~~~~~~~~~ + +By modifying ``deeppavlov/utils/settings/server_config.json`` you can change +host, port, model endpoint, GET request arguments and other properties of the +API service. -Web service properties (host, port, model endpoint, GET request -arguments) are provided in ``deeppavlov/utils/settings/server_config.json``, -but port can be overridden with the ``-p`` key in command line. Properties from ``common_defaults`` section are used by default unless -they are overridden by component-specific properties, provided in +they are overridden by model-specific properties, provided in ``model_defaults`` section of the ``server_config.json``. -Component-specific properties are bound to the component by -``server_utils`` label in ``metadata/labels`` section of the component -config. Value of ``server_utils`` label from component config should +Model-specific properties are bound to the model by +``server_utils`` label in ``metadata/labels`` section of the model +config. Value of ``server_utils`` label from model config should match with properties key from ``model_defaults`` section of ``server_config.json``. @@ -27,33 +43,30 @@ of ``server_config.json``. Therefore, ``model_endpoint`` parameter in from ``model_defaults/GoalOrientedBot``. Model argument names are provided as list in ``model_args_names`` -parameter, where arguments order corresponds to component API. 
+parameter, where arguments order corresponds to model API. When inferencing model via REST api, JSON payload keys should match -component arguments names from ``model_args_names``. -Default argument name for one argument components is *"context"*. -Here are POST requests examples for some of the library components: +model arguments names from ``model_args_names``. +Default argument name for one argument models is *"context"*. +Here are POST requests examples for some of the library models: +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| Component | POST request JSON payload example | +| Model | POST request JSON payload example | +=========================================+=================================================================================================================================================+ -| **One argument components** | +| **One argument models** | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| NER component | {"context":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]} | +| NER model | {"context":["Elon Musk launched his cherry Tesla roadster to the Mars orbit"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| Intent classification component | {"context":["I would like to go to a restaurant with Asian cuisine this evening"]} | +| Intent classification model | {"context":["I would like to go to a restaurant with Asian cuisine this evening"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| Automatic spelling correction component | {"context":["errror"]} | +| Automatic spelling correction model | {"context":["errror"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| Ranking component | {"context":["What is the average cost of life insurance services?"]} | +| Ranking model | {"context":["What is the average cost of life insurance services?"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| (Seq2seq) Goal-oriented bot | {"context":["Hello, can you help me to find and book a restaurant this evening?"]} | +| Goal-oriented bot | {"context":["Hello, can you help me to find and book a restaurant this evening?"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| **Multiple arguments components** | +| **Multiple arguments models** | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ -| Question Answering component | | {"context":["After 1765, growing philosophical and political differences strained the 
relationship between Great Britain and its colonies."], | +| Question Answering model | | {"context":["After 1765, growing philosophical and political differences strained the relationship between Great Britain and its colonies."], | | | |  "question":["What strained the relationship between Great Britain and its colonies?"]} | +-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+ - -Flasgger UI for API testing is provided on ``:/apidocs`` -when running a component in ``riseapi`` mode. diff --git a/docs/devguides/settings.rst b/docs/integrations/settings.rst similarity index 100% rename from docs/devguides/settings.rst rename to docs/integrations/settings.rst diff --git a/docs/integrations/telegram.rst b/docs/integrations/telegram.rst new file mode 100644 index 0000000000..c5356650e4 --- /dev/null +++ b/docs/integrations/telegram.rst @@ -0,0 +1,66 @@ + +Telegram integration +======================== + +Any model specified by a DeepPavlov config can be launched as a Telegram bot. +You can do it using command line interface or using python. + +Command line interface +~~~~~~~~~~~~~~~~~~~~~~ + +To run a model specified by the ```` config file as a telegram bot +with a ````: + +.. code:: bash + + python -m deeppavlov interactbot -t [-d] [--no-default-skill] + + +* ``-t ``: specifies telegram token as ````. +* ``-d``: downloads model specific data before starting the service. +* ``-no-default-skill``: states that your model is already implements an + interface of a :class:`~deeppavlov.core.skill.skill.Skill` and doesn't + need additional wrapping into a stateless skill + :class:`~deeppavlov.skills.default_skill.default_skill.DefaultStatelessSkill` (models from + Skills section require the flag). + +The command will print the used host and port. Default web service properties +(host, port, model endpoint, GET request arguments) can be modified via changing +``deeppavlov/utils/settings/server_config.json`` file. Advanced API +configuration is described in :doc:`REST API ` section. + +If you want to get custom ``/start`` and ``/help`` Telegram messages for the running model you should: + +* Add section to ``deeppavlov/utils/settings/models_info.json`` with your custom Telegram messages +* In model config file specify ``metadata.labels.telegram_utils`` parameter with name which + refers to the added section of ``deeppavlov/utils/settings/models_info.json`` + +Python +~~~~~~ + +To run a model specified by a DeepPavlov config ```` as as +Telegram bot, you have to turn it to a :class:`~deeppavlov.core.skill.skill.Skill` +and then make it an :class:`~deeppavlov.core.agent.agent.Agent`. + +.. 
code:: python + + from deeppavlov import build_model + from deeppavlov.skills.default_skill.default_skill import DefaultStatelessSkill + from deeppavlov.agents.default_agent.default_agent import DefaultAgent + from deeppavlov.agents.processors.highest_confidence_selector import HighestConfidenceSelector + from deeppavlov.utils.telegram.telegram_ui import init_bot_for_model + + model = build_model("", download=True) + + # Step 1: make it a Skill + skill = DefaultStatelessSkill(model) + # Step 2: make it an Agent + agent = DefaultAgent(skills=[skill]) + # Step 3: run server + init_bot_for_model(agent, token="", name="my_model_name") + +If your model is already a subclass of :class:`~deeppavlov.core.skill.skill.Skill` +or a subclass of :class:`~deeppavlov.core.agent.agent.Agent` (see +:doc:`skills ` and :doc:`agents `) you can skip +corresponding steps. + diff --git a/docs/integrations/yandex_alice.rst b/docs/integrations/yandex_alice.rst new file mode 100644 index 0000000000..93e72b199e --- /dev/null +++ b/docs/integrations/yandex_alice.rst @@ -0,0 +1,62 @@ +Yandex Alice integration +======================== + +Any model specified by a DeepPavlov config can be launched as a skill for +Yandex.Alice. You can do it using command line interface or using python. + +Command line interface +~~~~~~~~~~~~~~~~~~~~~~ + +To interact with Alice you will require your own HTTPS certificate. To generate +a new one -- run: + +:: + + openssl req -new -newkey rsa:4096 -days 365 -nodes -x509 -subj "/CN=MY_DOMAIN_OR_IP" -keyout my.key -out my.crt + +Then run: + +:: + + python -m deeppavlov riseapi --api-mode alice --https --key my.key --cert my.crt [-d] [-p ] + + +* ``-d``: download model specific data before starting the service. +* ``-p ``: sets the port to ````. Overrides default + settings from ``deeppavlov/utils/settings/server_config.json``. + +Now set up and test your dialog (https://dialogs.yandex.ru/developer/). +Detailed documentation of the platform could be found on +https://tech.yandex.ru/dialogs/alice/doc/about-docpage/. Advanced API +configuration is described in :doc:`REST API ` section. + + +Python +~~~~~~ + +To run a model specified by a DeepPavlov config ```` as an Alice +skill, firstly, you have to turn it to a :class:`~deeppavlov.core.skill.skill.Skill` +and then make it an :class:`~deeppavlov.core.agent.agent.Agent`. + +.. code:: python + + from deeppavlov import build_model + from deeppavlov.skills.default_skill.default_skill import DefaultStatelessSkill + from deeppavlov.agents.default_agent.default_agent import DefaultAgent + from deeppavlov.agents.processors.highest_confidence_selector import HighestConfidenceSelector + from deeppavlov.utils.alice import start_agent_server + + model = build_model("", download=True) + + # Step 1: make it a Skill + skill = DefaultStatelessSkill(model) + # Step 2: make it an Agent + agent = DefaultAgent(skills=[skill]) + # Step 3: run server + start_agent_server(agent, host='0.0.0.0', port=7051, endpoint='/agent', ssl_key='my.key', ssl_cert='my.crt') + +If your model is already a subclass of :class:`~deeppavlov.core.skill.skill.Skill` +or a subclass of :class:`~deeppavlov.core.agent.agent.Agent` (see +:doc:`skills ` and :doc:`agents `) you can skip +corresponding steps. 
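+
+For instance, :class:`~deeppavlov.skills.pattern_matching_skill.PatternMatchingSkill` already
+implements the *Skill* interface, so it can be passed to an agent without the
+``DefaultStatelessSkill`` wrapping (a sketch that reuses the certificate generated above):
+
+.. code:: python
+
+    from deeppavlov.agents.default_agent.default_agent import DefaultAgent
+    from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill
+    from deeppavlov.utils.alice import start_agent_server
+
+    # PatternMatchingSkill is already a Skill, so Step 1 is not needed
+    hello = PatternMatchingSkill(['Привет, мир!'], patterns=['привет', 'здравствуй'])
+    fallback = PatternMatchingSkill(['Извини, я не понимаю'])
+
+    agent = DefaultAgent([hello, fallback])
+    start_agent_server(agent, host='0.0.0.0', port=7051, endpoint='/agent',
+                       ssl_key='my.key', ssl_cert='my.crt')
+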
+ diff --git a/docs/intro/config_description.rst b/docs/intro/configuration.rst similarity index 56% rename from docs/intro/config_description.rst rename to docs/intro/configuration.rst index 45d5417fea..c828409f8a 100644 --- a/docs/intro/config_description.rst +++ b/docs/intro/configuration.rst @@ -1,5 +1,5 @@ -Configuration files -=================== +Configuration file +================== An NLP pipeline config is a JSON file that contains one required element ``chainer``: @@ -117,7 +117,7 @@ unsupervised settings). This process takes multiple epochs with periodic validat :meth:`~deeppavlov.core.models.nn_model.NNModel.train_on_batch` method has to be implemented for each :class:`~deeppavlov.core.models.nn_model.NNModel`. -Training is triggered by :func:`~deeppavlov.core.commands.train.train_evaluate_model_from_config` function. +Training is triggered by :func:`~deeppavlov.train_model` function. Train config @@ -188,6 +188,7 @@ Train Parameters :class:`nn_trainer `). All other parameters will be passed as keyword arguments to the trainer class's constructor. + Metrics _______ @@ -259,3 +260,151 @@ A particular format of returned data should be defined in :meth:`__call__`. Inference is triggered by :func:`~deeppavlov.core.commands.infer.interact_model` function. There is no need in a separate JSON for inference. + +Model Configuration +------------------- + +Each DeepPavlov model is determined by its configuration file. You can use +existing config files or create yours. You can also choose a config file and +modify preprocessors/tokenizers/embedders/vectorizers there. The components +below have the same interface and are responsible for the same functions, +therefore they can be used in the same parts of a config pipeline. + +Here is a list of useful +:class:`~deeppavlov.core.models.component.Component`\ s aimed to preprocess, +postprocess and vectorize your data. + +Preprocessors +~~~~~~~~~~~~~ + +Preprocessor is a component that processes batch of samples. + +* Already implemented universal preprocessors of **tokenized texts** (each + sample is a list of tokens): + + - :class:`~deeppavlov.models.preprocessors.char_splitter.CharSplitter` + (registered as ``char_splitter``) splits every token in given batch of + tokenized samples to a sequence of characters. + + - :class:`~deeppavlov.models.preprocessors.mask.Mask` (registered as + ``mask``) returns binary mask of corresponding length (padding up to the + maximum length per batch. + + - :class:`~deeppavlov.models.preprocessors.russian_lemmatizer.PymorphyRussianLemmatizer` + (registered as ``pymorphy_russian_lemmatizer``) performs lemmatization + for Russian language. + + - :class:`~deeppavlov.models.preprocessors.sanitizer.Sanitizer` + (registered as ``sanitizer``) removes all combining characters like + diacritical marks from tokens. + +* Already implemented universal preprocessors of **non-tokenized texts** + (each sample is a string): + + - :class:`~deeppavlov.models.preprocessors.dirty_comments_preprocessor.DirtyCommentsPreprocessor` + (registered as ``dirty_comments_preprocessor``) preprocesses samples + converting samples to lowercase, paraphrasing English combinations with + apostrophe ``'``, transforming more than three the same symbols to two + symbols. + + - :class:`~deeppavlov.models.preprocessors.str_lower.StrLower` (registered + as ``str_lower``) converts samples to lowercase. 
+ +* Already implemented universal preprocessors of another type of features: + + - :class:`~deeppavlov.models.preprocessors.one_hotter.OneHotter` + (registered as ``one_hotter``) performs one-hotting operation for the + batch of samples where each sample is an integer label or a list of + integer labels (can be combined in one batch). If ``multi_label`` + parameter is set to ``True``, returns one one-dimensional vector per + sample with several elements equal to ``1``. + + +Tokenizers +~~~~~~~~~~ + +Tokenizer is a component that processes batch of samples (each sample is a text +string). + + - :class:`~deeppavlov.models.tokenizers.lazy_tokenizer.LazyTokenizer` + (registered as ``lazy_tokenizer``) tokenizes using ``nltk.word_tokenize``. + + - :class:`~deeppavlov.models.tokenizers.nltk_tokenizer.NLTKTokenizer` + (registered as ``nltk_tokenizer``) tokenizes using tokenizers from + ``nltk.tokenize``, e.g. ``nltk.tokenize.wordpunct_tokenize``. + + - :class:`~deeppavlov.models.tokenizers.nltk_moses_tokenizer.NLTKMosesTokenizer` + (registered as ``nltk_moses_tokenizer``) tokenizes and detokenizes using + ``nltk.tokenize.moses.MosesDetokenizer``, + ``nltk.tokenize.moses.MosesTokenizer``. + + - :class:`~deeppavlov.models.tokenizers.ru_sent_tokenizer.RuSentTokenizer` + (registered as ``ru_sent_tokenizer``) is a rule-based tokenizer for + Russian language. + + - :class:`~deeppavlov.models.tokenizers.ru_tokenizer.RussianTokenizer` + (registered as ``ru_tokenizer``) tokenizes or lemmatizes Russian texts + using ``nltk.tokenize.toktok.ToktokTokenizer``. + + - :class:`~deeppavlov.models.tokenizers.spacy_tokenizer.StreamSpacyTokenizer` + (registered as ``stream_spacy_tokenizer``) tokenizes or lemmatizes texts + with spacy ``en_core_web_sm`` models by default. + + - :class:`~deeppavlov.models.tokenizers.split_tokenizer.SplitTokenizer` + (registered as ``split_tokenizer``) tokenizes using string method + ``split``. + + +Embedders +~~~~~~~~~ + +Embedder is a component that converts every token in a tokenized batch to a +vector of a particular dimension (optionally, returns a single vector per +sample). + + - :class:`~deeppavlov.models.embedders.glove_embedder.GloVeEmbedder` + (registered as ``glove``) reads embedding file in GloVe format (file + starts with ``number_of_words embeddings_dim line`` followed by lines + ``word embedding_vector``). If ``mean`` returns one vector per + sample --- mean of embedding vectors of tokens. + + - :class:`~deeppavlov.models.embedders.fasttext_embedder.FasttextEmbedder` + (registered as ``fasttext``) reads embedding file in fastText format. + If ``mean`` returns one vector per sample - mean of embedding vectors + of tokens. + + - :class:`~deeppavlov.models.embedders.bow_embedder.BoWEmbedder` + (registered as ``bow``) performs one-hot encoding of tokens using + pre-built vocabulary. + + - :class:`~deeppavlov.models.embedders.tfidf_weighted_embedder.TfidfWeightedEmbedder` + (registered as ``tfidf_weighted``) accepts embedder, tokenizer (for + detokenization, by default, detokenize with joining with space), TFIDF + vectorizer or counter vocabulary, optionally accepts tags vocabulary (to + assign additional multiplcative weights to particular tags). If ``mean`` + returns one vector per sample - mean of embedding vectors of tokens. + + - :class:`~deeppavlov.models.embedders.elmo_embedder.ELMoEmbedder` + (registered as ``elmo``) converts tokens to pre-trained contextual + representations from large-scale bidirectional language models. See + examples `here `__. 
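+
+These components share the same batch-in / batch-out interface, so besides being listed in a
+config ``pipe`` they can be combined directly from Python. A minimal sketch (the GloVe file
+path below is an assumption -- use any embedding file you have locally):
+
+.. code:: python
+
+    from deeppavlov.models.preprocessors.str_lower import StrLower
+    from deeppavlov.models.tokenizers.nltk_tokenizer import NLTKTokenizer
+    from deeppavlov.models.embedders.glove_embedder import GloVeEmbedder
+
+    batch = ['Hello world!']
+
+    # every component consumes and returns a batch of samples
+    lowered = StrLower()(batch)            # ['hello world!']
+    tokenized = NLTKTokenizer()(lowered)   # [['hello', 'world', '!']]
+
+    # the embedder converts every token to a vector of fixed dimension
+    embedder = GloVeEmbedder(load_path='glove.6B.100d.txt')
+    vectors = embedder(tokenized)
+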
+ +Vectorizers +~~~~~~~~~~~ + +Vectorizer is a component that converts batch of text samples to batch of +vectors. + + - :class:`~deeppavlov.models.sklearn.sklearn_component.SklearnComponent` + (registered as ``sklearn_component``) is a DeepPavlov wrapper for most + of sklearn estimators, vectorizers etc. For example, to get + TFIDF-vectorizer one should assign in config ``model_class`` to + ``sklearn.feature_extraction.text:TfidfVectorizer``, ``infer_method`` + to ``transform``, pass ``load_path``, ``save_path`` and other sklearn + model parameters. + + - :class:`~deeppavlov.models.vectorizers.hashing_tfidf_vectorizer.HashingTfIdfVectorizer` + (registered as ``hashing_tfidf_vectorizer``) implements hashing version + of usual TFIDF-vecotrizer. It creates a TFIDF matrix from collection of + documents of size ``[n_documents X n_features(hash_size)]``. + diff --git a/docs/intro/hello_bot.ipynb b/docs/intro/hello_bot.ipynb deleted file mode 100644 index 3557e98906..0000000000 --- a/docs/intro/hello_bot.ipynb +++ /dev/null @@ -1,119 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hello bot!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Open in [Colaboratory](https://colab.research.google.com/github/deepmipt/DeepPavlov/blob/master/docs/intro/hello_bot.ipynb)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "!pip install -q deeppavlov" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Import key components to build HelloBot. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from deeppavlov.skills.pattern_matching_skill import PatternMatchingSkill\n", - "from deeppavlov.agents.default_agent.default_agent import DefaultAgent\n", - "from deeppavlov.agents.processors.highest_confidence_selector import HighestConfidenceSelector" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create skills as pre-defined responses for a user's input containing specific keywords and regular expressions. Every skill returns response and confidence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "hello = PatternMatchingSkill(responses=['Hello world!'], patterns=[\"hi\", \"hello\", \"good day\"])\n", - "bye = PatternMatchingSkill(['Goodbye world!', 'See you around'],\n", - " patterns=[\"bye\", \"chao\", \"see you\"])\n", - "fallback = PatternMatchingSkill([\"I don't understand, sorry\", 'I can say \"Hello world!\"'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Agent executes skills and then takes response from the skill with the highest confidence." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agent = DefaultAgent([hello, bye, fallback], skills_selector=HighestConfidenceSelector())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Give the floor to the HelloBot!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "agent(['Hello', 'Bye', 'Or not'])" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/intro/installation.rst b/docs/intro/installation.rst index 3837dc4e8a..8506bee5b4 100644 --- a/docs/intro/installation.rst +++ b/docs/intro/installation.rst @@ -1,21 +1,29 @@ Installation ============ -0. Currently we support ``Linux`` and ``Windows`` platforms and ``Python 3.6`` + +We support ``Linux`` and ``Windows`` platforms, ``Python 3.6`` and ``Python 3.7``. + +.. note:: + * ``Python 3.5`` is not supported! - * ``Windows`` platform requires ``Git`` for Windows (for example, `git `__ ), ``Visual Studio 2015/2017`` with ``C++`` build tools installed! -1. Create a virtual environment with ``Python 3.6``: + * installation for ``Windows`` requires ``Git`` for Windows (for example, + `git `_ ), ``Visual Studio 2015/2017`` + with ``C++`` build tools installed! + + +#. Create a virtual environment: .. code:: bash - virtualenv env + python -m venv env -2. Activate the environment: +#. Activate the environment: * Linux - .. code:: bash + .. code:: source ./env/bin/activate @@ -25,9 +33,21 @@ Installation .\env\Scripts\activate.bat -3. Install the package inside this virtual environment: +#. Install the package inside this virtual environment: .. code:: bash pip install deeppavlov + +Docker Images +------------- + +We have built several DeepPavlov based Docker images, which include: + + * DeepPavlov based Jupyter notebook Docker image; + * Docker images which serve some of our models and allow to access them + via REST API (``riseapi`` mode). + +Here is our `DockerHub repository `_ with +images and deployment instructions. diff --git a/docs/intro/overview.rst b/docs/intro/overview.rst index 651f5ffca8..e4ed9e0158 100644 --- a/docs/intro/overview.rst +++ b/docs/intro/overview.rst @@ -24,27 +24,28 @@ Key Concepts (e.g. answer question by FAQ, booking tickets etc.). However, for some tasks a success of interaction is defined as continuous engagement (e.g. chit-chat). -- ``Component`` is a reusable functional part of ``Skill``. +- ``Model`` is any NLP model that doesn't necessarily communicates + with user in natural language. +- ``Component`` is a reusable functional part of ``Model`` or ``Skill``. - ``Rule-based Models`` cannot be trained. - ``Machine Learning Models`` can be trained only stand alone. - ``Deep Learning Models`` can be trained independently and in an end-to-end mode being joined in a chain. - ``Skill Manager`` performs selection of the ``Skill`` to generate response. -- ``Chainer`` builds an agent/component pipeline from heterogeneous +- ``Chainer`` builds an agent/model pipeline from heterogeneous components (Rule-based/ML/DL). It allows to train and infer models in a pipeline as a whole. The smallest building block of the library is ``Component``. ``Component`` stands for any kind of function in an NLP pipeline. It can be implemented as a neural network, a non-neural ML model or a -rule-based system. Besides that, ``Component`` can have nested -structure, i.e. 
a ``Component`` can include other ``Component`` s.
+rule-based system.
 
-``Component`` s can be joined into a ``Skill``. ``Skill`` solves a
-larger NLP task compared to ``Component``. However, in terms of
-implementation ``Skill``\ s are not different from ``Component``\ s. The
-only restriction of ``Skill``\ s is that their input and output should
+``Component``\ s can be joined into a ``Model`` or a ``Skill``. ``Model``
+solves a larger NLP task compared to ``Component``. However, in terms of
+implementation ``Model``\ s are not different from ``Component``\ s. The
+difference of a ``Skill`` from a ``Model`` is that its input and output should
 both be strings. Therefore, ``Skill``\ s are usually associated with
 dialogue tasks.
 
@@ -58,3 +59,4 @@ DeepPavlov is built on top of machine learning frameworks
 `TensorFlow `__ and `Keras `__.
 Other external libraries can be used to build basic components.
+
diff --git a/docs/intro/quick_start.rst b/docs/intro/quick_start.rst
new file mode 100644
index 0000000000..c2fb7195a3
--- /dev/null
+++ b/docs/intro/quick_start.rst
@@ -0,0 +1,130 @@
+QuickStart
+------------
+
+There is a bunch of great pre-trained NLP models in DeepPavlov. Each model is
+determined by its config file.
+
+The list of models is available on :doc:`the doc page ` or in
+the ``deeppavlov.configs`` (Python):
+
+  .. code:: python
+
+      from deeppavlov import configs
+
+Once you have decided on the model (+ config file), there are two ways to
+train, evaluate and infer it:
+
+* via `Command line interface (CLI)`_ and
+* via `Python`_.
+
+Before choosing an interface, install the model's package requirements (CLI):
+
+  .. code:: bash
+
+      python -m deeppavlov install 
+
+  * where ```` is the path to the chosen model's config file (e.g.
+    ``deeppavlov/configs/ner/slotfill_dstc2.json``) or just the name without
+    the `.json` extension (e.g. ``slotfill_dstc2``)
+
+
+Command line interface (CLI)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To get predictions from a model interactively through CLI, run
+
+  .. code:: bash
+
+      python -m deeppavlov interact [-d]
+
+  * ``-d`` downloads required data -- pretrained model files and embeddings
+    (optional).
+
+You can train it in the same simple way:
+
+  .. code:: bash
+
+      python -m deeppavlov train [-d]
+
+  The dataset will be downloaded regardless of whether the ``-d`` flag was
+  set.
+
+  To train on your own data you need to modify the dataset reader path in the
+  `train section doc `__. The data format is
+  specified in the corresponding model doc page.
+
+There are even more actions you can perform with configs:
+
+  .. code:: bash
+
+      python -m deeppavlov [-d]
+
+  * ```` can be
+    * ``download`` to download model's data (same as ``-d``),
+    * ``train`` to train the model on the data specified in the config file,
+    * ``evaluate`` to calculate metrics on the same dataset,
+    * ``interact`` to interact via CLI,
+    * ``riseapi`` to run a REST API server (see :doc:`docs
+      `),
+    * ``interactbot`` to run as a Telegram bot (see :doc:`docs
+      `),
+    * ``interactmsbot`` to run a Microsoft Bot Framework server (see
+      :doc:`docs `),
+    * ``predict`` to get predictions for samples from `stdin` or from
+      `` if ``-f `` is specified.
+  * ```` specifies the path (or name) of the model's config file
+  * ``-d`` downloads required data
+
+
+Python
+~~~~~~
+
+To get predictions from a model interactively through Python, run
+
+  ..
code:: python + + from deeppavlov import build_model + + model = build_model(, download=True) + + # get predictions for 'input_text1', 'input_text2' + model(['input_text1', 'input_text2']) + + * where ``download=True`` downloads required data from web -- pretrained model + files and embeddings (optional), + * ```` is path to the chosen model's config file (e.g. + ``"deeppavlov/configs/ner/ner_ontonotes_bert_mult.json"``) or + ``deeppavlov.configs`` attribute (e.g. + ``deeppavlov.configs.ner.ner_ontonotes_bert_mult`` without quotation marks). + +You can train it in the same simple way: + + .. code:: python + + from deeppavlov import train_model + + model = train_model(, download=True) + + * ``download=True`` downloads pretrained model, therefore the pretrained + model will be, first, loaded and then train (optional). + + Dataset will be downloaded regardless of whether there was ``-d`` flag or + not. + + To train on your own data you need to modify dataset reader path in the + `train section doc `__. The data format is + specified in the corresponding model doc page. + +You can also calculate metrics on the dataset specified in your config file: + + .. code:: python + + from deeppavlov import evaluate_model + + model = evaluate_model(, download=True) + +There are also available integrations with various messengers, see +:doc:`Telegram Bot doc page ` and others in the +Integrations section for more info. + From 6ee4d9e89f30555ed288ee78e297d4cfad640118 Mon Sep 17 00:00:00 2001 From: Anton Kiselev Date: Mon, 29 Jul 2019 14:50:46 +0300 Subject: [PATCH 17/18] feat: add a DSL for creating rule-based skills (#928) * dsl skill * on_invalid_command in config * optional arguments * tests * Refactoring + Preprocessing in config * Typo * Refactoring * Docstrings * .rst documentation * class_name in config * utterance_batch and user_ids_batch + refactoring * typo fixed + docstring refactoring * Optional added * refactoring * refactoring context * redundant bracket * copyright + refactoring * return types * return types * docstring * refactoring docstrings * hotfix * docs: adapt DSL documentation to the new structure * docs: docstings refactored * docs: refactoring Co-Authored-By: Anton Kiselev --- deeppavlov/configs/dsl_skill/dsl_skill.json | 40 ++++ deeppavlov/skills/dsl_skill/__init__.py | 0 deeppavlov/skills/dsl_skill/context.py | 55 +++++ deeppavlov/skills/dsl_skill/dsl_skill.py | 224 ++++++++++++++++++ .../skills/dsl_skill/handlers/__init__.py | 2 + .../skills/dsl_skill/handlers/handler.py | 68 ++++++ .../dsl_skill/handlers/regex_handler.py | 81 +++++++ deeppavlov/skills/dsl_skill/utils.py | 23 ++ docs/apiref/skills/dsl_skill.rst | 5 + docs/features/skills/dsl_skill.rst | 45 ++++ docs/features/skills/index.rst | 1 + tests/test_dsl_skill.py | 109 +++++++++ 12 files changed, 653 insertions(+) create mode 100644 deeppavlov/configs/dsl_skill/dsl_skill.json create mode 100644 deeppavlov/skills/dsl_skill/__init__.py create mode 100644 deeppavlov/skills/dsl_skill/context.py create mode 100644 deeppavlov/skills/dsl_skill/dsl_skill.py create mode 100644 deeppavlov/skills/dsl_skill/handlers/__init__.py create mode 100644 deeppavlov/skills/dsl_skill/handlers/handler.py create mode 100644 deeppavlov/skills/dsl_skill/handlers/regex_handler.py create mode 100644 deeppavlov/skills/dsl_skill/utils.py create mode 100644 docs/apiref/skills/dsl_skill.rst create mode 100644 docs/features/skills/dsl_skill.rst create mode 100644 tests/test_dsl_skill.py diff --git a/deeppavlov/configs/dsl_skill/dsl_skill.json 
b/deeppavlov/configs/dsl_skill/dsl_skill.json new file mode 100644 index 0000000000..296c0708ee --- /dev/null +++ b/deeppavlov/configs/dsl_skill/dsl_skill.json @@ -0,0 +1,40 @@ +{ + "chainer": { + "in": [ + "utterances_batch", + "user_ids_batch" + ], + "out": [ + "responses_batch", + "confidences_batch" + ], + "pipe": [ + { + "class_name": "ru_tokenizer", + "in": "utterances_batch", + "lowercase": true, + "out": "utterance_tokens_batch" + }, + { + "class_name": "DSLSkill", + "on_invalid_command": "Sorry, I do not understand you", + "null_confidence": 0.0, + "in": [ + "utterance_tokens_batch", + "user_ids_batch" + ], + "out": [ + "responses_batch", + "confidences_batch" + ] + } + ] + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models" + } + } +} \ No newline at end of file diff --git a/deeppavlov/skills/dsl_skill/__init__.py b/deeppavlov/skills/dsl_skill/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/deeppavlov/skills/dsl_skill/context.py b/deeppavlov/skills/dsl_skill/context.py new file mode 100644 index 0000000000..7f79cc951f --- /dev/null +++ b/deeppavlov/skills/dsl_skill/context.py @@ -0,0 +1,55 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Optional, Union, Dict + +import json + +from deeppavlov.skills.dsl_skill.utils import UserId + + +class UserContext: + """ + UserContext object stores information that the current skill currently knows about the user. + + Args: + user_id: id of user + message: current message + current_state: current user state + payload: custom payload dictionary, or a JSON-serialized string of such dictionary + + Attributes: + handler_payload: stores information generated by the selected handler + + """ + + def __init__( + self, + user_id: Optional[UserId] = None, + message: Optional[str] = None, + current_state: Optional[str] = None, + payload: Optional[Union[Dict, str]] = None, + ): + self.user_id = user_id + self.message = message + self.current_state = current_state + self.handler_payload = {} + + # some custom data added by skill creator + self.payload = payload + if payload == '' or payload is None: + self.payload = {} + elif isinstance(payload, str): + self.payload = json.loads(payload) diff --git a/deeppavlov/skills/dsl_skill/dsl_skill.py b/deeppavlov/skills/dsl_skill/dsl_skill.py new file mode 100644 index 0000000000..9a6cef47f3 --- /dev/null +++ b/deeppavlov/skills/dsl_skill/dsl_skill.py @@ -0,0 +1,224 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABCMeta +from collections import defaultdict +from functools import partial +from itertools import zip_longest, starmap +from typing import List, Optional, Dict, Callable, Tuple + +from deeppavlov.core.common.registry import register +from deeppavlov.skills.dsl_skill.context import UserContext +from deeppavlov.skills.dsl_skill.handlers import Handler, RegexHandler +from deeppavlov.skills.dsl_skill.utils import SkillResponse, UserId + + +class DSLMeta(ABCMeta): + """ + This metaclass is used for creating a skill. Skill is register by its class name in registry. + + Example: + + .. code:: python + + class ExampleSkill(metaclass=DSLMeta): + @DSLMeta.handler(commands=["hello", "hey"]) + def __greeting(context: UserContext): + response = "Hello, my friend!" + confidence = 1.0 + return response, confidence + + Attributes: + name: class name + state_to_handler: dict with states as keys and lists of Handler objects as values + user_to_context: dict with user ids as keys and UserContext objects as values + universal_handlers: list of handlers that can be activated from any state + + """ + skill_collection: Dict[str, 'DSLMeta'] = {} + + def __init__(cls, name: str, + bases, + namespace, + **kwargs): + super().__init__(name, bases, namespace, **kwargs) + cls.name = name + cls.state_to_handler = defaultdict(list) + cls.user_to_context = defaultdict(UserContext) + cls.universal_handlers = [] + + handlers = [attribute for attribute in namespace.values() if isinstance(attribute, Handler)] + + for handler in handlers: + if handler.state is None: + cls.universal_handlers.append(handler) + else: + cls.state_to_handler[handler.state].append(handler) + + cls.handle = partial(DSLMeta.__handle, cls) + cls.__call__ = partial(DSLMeta.__handle_batch, cls) + cls.__init__ = partial(DSLMeta.__init__class, cls) + register()(cls) + DSLMeta.__add_to_collection(cls) + + def __init__class(cls, + on_invalid_command: str = "Простите, я вас не понял", + null_confidence: float = 0, + *args, **kwargs) -> None: + """ + Initialize Skill class + + Args: + on_invalid_command: message to be sent on message with no associated handler + null_confidence: the confidence when DSL has no handler that fits request + """ + # message to be sent on message with no associated handler + cls.on_invalid_command = on_invalid_command + cls.null_confidence = null_confidence + + def __handle_batch(cls: 'DSLMeta', + utterances_batch: List[str], + user_ids_batch: List[UserId]) -> Tuple[List, ...]: + """Returns skill inference result. + Returns batches of skill inference results, estimated confidence + levels and up to date states corresponding to incoming utterance + batch. + + Args: + utterances_batch: A batch of utterances of str type. + user_ids_batch: A batch of user ids. + + Returns: + response_batch: A batch of arbitrary typed skill inference results. + confidence_batch: A batch of float typed confidence levels for each of + skill inference result. 
+ + """ + return (*map(list, zip(*starmap(cls.handle, zip_longest(utterances_batch, user_ids_batch)))),) + + @staticmethod + def __add_to_collection(cls: 'DSLMeta') -> None: + """ + Adds Skill class to Skill classes collection + + Args: + cls: Skill class + + """ + DSLMeta.skill_collection[cls.name] = cls + + @staticmethod + def __handle(cls: 'DSLMeta', + utterance: str, + user_id: UserId) -> SkillResponse: + """ + Handles what is going to be after a message from user arrived. + Simple usage: + skill([], []) + + Args: + cls: instance of callee's class + utterance: a message to be handled + user_id: id of a user + + Returns: + result: handler function's result if succeeded + + """ + context = cls.user_to_context[user_id] + + context.user_id = user_id + context.message = utterance + + current_handler = cls.__select_handler(context) + return cls.__run_handler(current_handler, context) + + def __select_handler(cls, + context: UserContext) -> Optional[Callable]: + """ + Selects handler with the highest priority that could be triggered from the passed context. + + Returns: + handler function that is selected and None if no handler fits request + + """ + available_handlers = cls.state_to_handler[context.current_state] + available_handlers.extend(cls.universal_handlers) + available_handlers.sort(key=lambda h: h.priority, reverse=True) + for handler in available_handlers: + if handler.check(context): + handler.expand_context(context) + return handler.func + + def __run_handler(cls, handler: Optional[Callable], + context: UserContext) -> SkillResponse: + """ + Runs specified handler for current context + + Args: + handler: handler to be run. If None, on_invalid_command is returned + context: user context + + Returns: + SkillResponse + + """ + if handler is None: + return SkillResponse(cls.on_invalid_command, cls.null_confidence) + try: + return SkillResponse(*handler(context=context)) + except Exception as exc: + return SkillResponse(str(exc), 1.0) + + @staticmethod + def handler(commands: Optional[List[str]] = None, + state: Optional[str] = None, + context_condition: Optional[Callable] = None, + priority: int = 0) -> Callable: + """ + Decorator to be used in skills' classes. + Sample usage: + + .. code:: python + + class ExampleSkill(metaclass=DSLMeta): + @DSLMeta.handler(commands=["hello", "hey"], state="greeting") + def __greeting(context: UserContext): + response = "Hello, my friend!" + confidence = 1.0 + return response, confidence + + Args: + priority: integer value to indicate priority. If multiple handlers satisfy + all the requirements, the handler with the greatest priority value will be used + context_condition: function that takes context and + returns True if this handler should be enabled + and False otherwise. 
If None, no condition is checked + commands: phrases/regexs on what the function wrapped + by this decorator will trigger + state: state name + + Returns: + function decorated into Handler class + + """ + if commands is None: + commands = [".*"] + + def decorator(func: Callable) -> Handler: + return RegexHandler(func, commands, + context_condition=context_condition, + priority=priority, state=state) + + return decorator diff --git a/deeppavlov/skills/dsl_skill/handlers/__init__.py b/deeppavlov/skills/dsl_skill/handlers/__init__.py new file mode 100644 index 0000000000..320eeddce9 --- /dev/null +++ b/deeppavlov/skills/dsl_skill/handlers/__init__.py @@ -0,0 +1,2 @@ +from .handler import * +from .regex_handler import * diff --git a/deeppavlov/skills/dsl_skill/handlers/handler.py b/deeppavlov/skills/dsl_skill/handlers/handler.py new file mode 100644 index 0000000000..c041404e82 --- /dev/null +++ b/deeppavlov/skills/dsl_skill/handlers/handler.py @@ -0,0 +1,68 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional + +from deeppavlov.skills.dsl_skill.context import UserContext +from deeppavlov.skills.dsl_skill.utils import SkillResponse + + +class Handler: + """ + Handler instance helps DSLMeta class distinguish functions wrapped + by @DSLMeta.handler to add them to handlers storage. + It also checks if the handler function should be triggered based on the given context. + + Attributes: + func: handler function + state: state in which handler can be activated + priority: priority of the function. If 2 or more handlers can be activated, handler + with the highest priority is selected + context_condition: predicate that accepts user context and checks if the handler should be activated. Example: + `lambda context: context.user_id != 1` checks if user_id is not equal to 1. + That means a user with id 1 will be always ignored by the handler. + + """ + + def __init__(self, + func: Callable, + state: Optional[str] = None, + context_condition: Optional[Callable] = None, + priority: int = 0): + self.func = func + self.state = state + self.context_condition = context_condition + self.priority = priority + + def __call__(self, context: UserContext) -> SkillResponse: + return self.func(context) + + def check(self, context: UserContext) -> bool: + """ + Checks: + - if the handler function should be triggered based on the given context via context condition. 
+ + Args: + context: user context + + Returns: + True, if handler should be activated, False otherwise + """ + if self.context_condition is not None: + return self.context_condition(context) + return True + + def expand_context(self, context: UserContext) -> UserContext: + context.handler_payload = {} + return context diff --git a/deeppavlov/skills/dsl_skill/handlers/regex_handler.py b/deeppavlov/skills/dsl_skill/handlers/regex_handler.py new file mode 100644 index 0000000000..f658ca427e --- /dev/null +++ b/deeppavlov/skills/dsl_skill/handlers/regex_handler.py @@ -0,0 +1,81 @@ +# Copyright 2019 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import re +from typing import List, Callable, Optional + +from deeppavlov.skills.dsl_skill.context import UserContext +from .handler import Handler + + +class RegexHandler(Handler): + """ + This handler checks whether the message that is passed to it is matched by a regex. + + Adds the following key to ```context.handler_payload```: + - 'regex_groups' - groups parsed from regular expression in command, by name + + Attributes: + func: handler function + state: state in which handler can be activated + priority: priority of the function. If 2 or more handlers can be activated, function + with the highest priority is selected + context_condition: predicate that accepts user context and checks if the handler should be activated. + Example: `lambda context: context.user_id != 1` checks if user_id is not equal to 1. + That means a user with id 1 will be always ignored by the handler. + commands: handler is activated if regular expression from this list is matched with a user message + + """ + + def __init__(self, + func: Callable, + commands: Optional[List[str]] = None, + state: Optional[str] = None, + context_condition: Optional[Callable] = None, + priority: int = 0): + super().__init__(func, state, context_condition, priority) + self.commands = [re.compile(command) for command in commands] + + def check(self, context: UserContext) -> bool: + """ + Checks: + - if the handler function should be triggered based on the given context via context condition. + - if at least one of the commands is matched to the `context.message`. 
+
+        Args:
+            context: user context
+
+        Returns:
+            True, if handler should be activated, False otherwise
+        """
+        is_previous_matches = super().check(context)
+        if not is_previous_matches:
+            return False
+
+        message = context.message
+        return any(re.search(regexp, ' '.join(message)) for regexp in self.commands)
+
+    def expand_context(self, context: UserContext) -> UserContext:
+        context.handler_payload = {'regex_groups': {}}
+        message = context.message
+        for regexp in self.commands:
+            match = re.search(regexp, ' '.join(message))
+            if match is not None:
+                for group_ind, span in enumerate(match.regs):
+                    context.handler_payload['regex_groups'][group_ind] = message[span[0]: span[1]]
+                for group_name, group_ind in regexp.groupindex.items():
+                    context.handler_payload['regex_groups'][group_name] = \
+                        context.handler_payload['regex_groups'][group_ind]
+                return context
diff --git a/deeppavlov/skills/dsl_skill/utils.py b/deeppavlov/skills/dsl_skill/utils.py
new file mode 100644
index 0000000000..b2d1e9a7dc
--- /dev/null
+++ b/deeppavlov/skills/dsl_skill/utils.py
@@ -0,0 +1,23 @@
+# Copyright 2019 Neural Networks and Deep Learning lab, MIPT
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Union, NamedTuple
+
+UserId = Union[str, int]
+
+
+class SkillResponse(NamedTuple):
+    response: str
+    confidence: float
diff --git a/docs/apiref/skills/dsl_skill.rst b/docs/apiref/skills/dsl_skill.rst
new file mode 100644
index 0000000000..facaf57a38
--- /dev/null
+++ b/docs/apiref/skills/dsl_skill.rst
@@ -0,0 +1,5 @@
+deeppavlov.skills.dsl_skill
+========================================
+
+.. automodule:: deeppavlov.skills.dsl_skill.dsl_skill
+    :members:
diff --git a/docs/features/skills/dsl_skill.rst b/docs/features/skills/dsl_skill.rst
new file mode 100644
index 0000000000..c705b92c33
--- /dev/null
+++ b/docs/features/skills/dsl_skill.rst
@@ -0,0 +1,45 @@
+DSL Skill
+======================
+
+A :doc:`DSL implementation`. DSL helps to easily create user-defined
+skills for dialog systems.
+
+When the DSL skill matches an utterance and finds a response, it outputs the
+response together with a confidence value.
+
+When no match occurs, the DSL skill returns the `on_invalid_command` argument
+("Простите, я вас не понял" by default) as the utterance and sets the
+confidence to the `null_confidence` attribute (0 by default).
+
+`on_invalid_command` and `null_confidence` can be changed in the model config.
+
+
+Quick Start
+-----------
+
+The DeepPavlov library has a default config for DSLSkill here:
+:config:`configs/dsl_skill/dsl_skill.json `
+
+Usage
+^^^^^^^^
+
+.. code:: python
+
+    from deeppavlov import configs, build_model
+    from deeppavlov.core.common.file import read_json
+    from deeppavlov.skills.dsl_skill.dsl_skill import DSLMeta
+
+
+    class DSLSkill(metaclass=DSLMeta):
+        @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"])
+        def greeting(context):
+            response = "Hello, my friend!"
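+            # each handler returns a (response, confidence) pair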
+ confidence = 1.0 + return response, confidence + + + skill_config = read_json(configs.dsl_skill.dsl_skill) + + skill = build_model(skill_config, download=True) + utterance = "Hello" + user_id = 1 + response = skill([utterance], [user_id]) + print(response) diff --git a/docs/features/skills/index.rst b/docs/features/skills/index.rst index 6ef60eee19..9c8aa8ae62 100644 --- a/docs/features/skills/index.rst +++ b/docs/features/skills/index.rst @@ -12,4 +12,5 @@ Skills Frequently Asked Questions Answering eCommerce Bot AIML + DSL diff --git a/tests/test_dsl_skill.py b/tests/test_dsl_skill.py new file mode 100644 index 0000000000..b4d4bff049 --- /dev/null +++ b/tests/test_dsl_skill.py @@ -0,0 +1,109 @@ +from logging import getLogger + +from deeppavlov import configs, build_model +from deeppavlov.core.common.file import read_json +from deeppavlov.skills.dsl_skill.dsl_skill import DSLMeta +from deeppavlov.utils.pip_wrapper.pip_wrapper import install_from_config + +log = getLogger(__name__) + + +class DSLSkill(metaclass=DSLMeta): + @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) + def greeting(context): + response = "Hello, my friend!" + confidence = 1.0 + return response, confidence + + +class StateSkill(metaclass=DSLMeta): + @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) + def greeting(context): + response = "Hello, my friend!" + confidence = 1.0 + context.current_state = "state1" + return response, confidence + + @DSLMeta.handler(commands=["bye"], + state="state1") + def bye(context): + response = "bb!" + confidence = 1.0 + return response, confidence + + +class ContextConditionSkill(metaclass=DSLMeta): + @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"], + context_condition=lambda context: context.user_id != 1) + def greeting(context): + response = "Hello, my friend!" + confidence = 1.0 + return response, confidence + + +class TestDSLSkill: + def setup(self): + self.skill_config = read_json(configs.dsl_skill.dsl_skill) + install_from_config(self.skill_config) + + def test_simple_skill(self): + user_messages_sequence = [ + "Hello", + "Hi", + "Tell me a joke", + "Sup", + "Ok, goodbye" + ] + + skill = build_model(self.skill_config, download=True) + history_of_responses = [] + for user_id, each_utt in enumerate(user_messages_sequence): + log.info(f"User says: {each_utt}") + responses_batch = skill([each_utt], [user_id]) + log.info(f"Bot says: {responses_batch[0]}") + history_of_responses.append(responses_batch) + + # check the first greeting message in 0th batch + assert "Hello, my friend!" in history_of_responses[0][0] + # check the second greeting message in 0th batch + assert "Hello, my friend!" in history_of_responses[1][0] + # check `on_invalid_command` + assert "Sorry, I do not understand you" in history_of_responses[2][0] + + def test_switch_state(self): + user_messages_sequence = [ + "Hello", + "bye", + "bye" + ] + + self.skill_config["chainer"]["pipe"][1]["class_name"] = "StateSkill" + skill = build_model(self.skill_config, download=True) + + history_of_responses = [] + for user_id, each_utt in enumerate(user_messages_sequence): + log.info(f"User says: {each_utt}") + responses_batch = skill([each_utt], [user_id % 2]) + log.info(f"Bot says: {responses_batch[0]}") + history_of_responses.append(responses_batch) + assert "Hello, my friend!" in history_of_responses[0][0] + assert "Sorry, I do not understand you" in history_of_responses[1][0] + assert "bb!" 
in history_of_responses[2][0] + + def test_context_condition(self): + user_messages_sequence = [ + "Hello", + "Hi" + ] + + self.skill_config["chainer"]["pipe"][1]["class_name"] = "ContextConditionSkill" + skill = build_model(self.skill_config, download=True) + + history_of_responses = [] + for user_id, each_utt in enumerate(user_messages_sequence): + log.info(f"User says: {each_utt}") + responses_batch = skill([each_utt], [user_id]) + log.info(f"Bot says: {responses_batch[0]}") + history_of_responses.append(responses_batch) + assert "Hello, my friend!" in history_of_responses[0][0] + assert "Sorry, I do not understand you" in history_of_responses[1][0] From 43171500ab5efefb1a03932c4383611bb9fe10ca Mon Sep 17 00:00:00 2001 From: Aleksey Lymar Date: Mon, 29 Jul 2019 15:33:35 +0300 Subject: [PATCH 18/18] docs: fix dsl_skill example to not use a state --- deeppavlov/skills/dsl_skill/dsl_skill.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/skills/dsl_skill/dsl_skill.py b/deeppavlov/skills/dsl_skill/dsl_skill.py index 9a6cef47f3..ce2317ecdb 100644 --- a/deeppavlov/skills/dsl_skill/dsl_skill.py +++ b/deeppavlov/skills/dsl_skill/dsl_skill.py @@ -193,7 +193,7 @@ def handler(commands: Optional[List[str]] = None, .. code:: python class ExampleSkill(metaclass=DSLMeta): - @DSLMeta.handler(commands=["hello", "hey"], state="greeting") + @DSLMeta.handler(commands=["hello", "hi", "sup", "greetings"]) def __greeting(context: UserContext): response = "Hello, my friend!" confidence = 1.0