From 9f20a31be1c815de327379acf408dcdebe23f32a Mon Sep 17 00:00:00 2001 From: PaGul Date: Wed, 8 May 2019 15:45:33 +0300 Subject: [PATCH 01/12] Fix some documentation problems --- docs/components/neural_ranking.rst | 2 +- docs/intro/features.rst | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/components/neural_ranking.rst b/docs/components/neural_ranking.rst index dd5047b626..af00bea3a6 100644 --- a/docs/components/neural_ranking.rst +++ b/docs/components/neural_ranking.rst @@ -217,7 +217,7 @@ inference, one can use the following code in python: from deeppavlov import build_model, configs - para_model = build_model(configs.ranking.deeppavlov/configs/ranking/paraphrase_ident_tune_interact, download=True) + para_model = build_model(configs.ranking.paraphrase_ident_tune_interact, download=True) para_model(['9 мая метрополитен Петербурга будет работать круглосуточно&Петербургское метро в ночь на 10 мая будет работать круглосуточно']) >>> 'This is a paraphrase.' diff --git a/docs/intro/features.rst b/docs/intro/features.rst index b064000446..253774f392 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -66,7 +66,7 @@ Several pre-trained models are available and presented in Table below. + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ | | | | :config:`BERT ` | | 0.9673 | 0.9636 | 800 Mb | +------------------+--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ -| 7 intents | `SNIPS-2017`_ [7]_ | | :config:`DSTC 2 emb ` | F1-macro | 0.8591 | -- | 800 Mb | +| 7 intents | `SNIPS-2017`_ [1]_ | | :config:`DSTC 2 emb ` | F1-macro | 0.8591 | -- | 800 Mb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ | | | | :config:`Wiki emb ` | | 0.9820 | -- | 8.5 Gb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ @@ -86,7 +86,7 @@ Several pre-trained models are available and presented in Table below. + +--------------------+ +-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ | |`RuSentiment`_ | | :config:`RuWiki+Lenta emb ` | F1-weighted | 0.6541 | 0.7016 | 6.2 Gb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ -| | | | :config:`Twitter emb super-convergence ` [6]_ | | 0.7301 | 0.7576 | 3.4 Gb | +| | | | :config:`Twitter emb super-convergence ` [2]_ | | 0.7301 | 0.7576 | 3.4 Gb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ | | | | :config:`ELMo ` | | 0.7519 | 0.7875 | 700 Mb | + + + +-------------------------------------------------------------------------------------------------+ +--------+--------+-----------+ @@ -95,8 +95,8 @@ Several pre-trained models are available and presented in Table below. | Intent |`Yahoo-L31`_ | | :config:`Yahoo-L31 on ELMo ` pre-trained on `Yahoo-L6`_ | ROC-AUC | 0.9412 | -- | 700 Mb | +------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ -.. 
[6] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018. -.. [7] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018. +.. [1] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018. +.. [2] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018. .. _`DSTC 2`: http://camdial.org/~mh521/dstc/ .. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines @@ -293,15 +293,15 @@ Available pre-trained models for paraphrase identification: +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ | Dataset |Model config | Val (accuracy)| Test (accuracy)| Val (F1)| Test (F1)| Val (log_loss)| Test (log_loss)|Downloads | +========================+=============================================================================================+===============+================+=========+==========+===============+================+==========+ - |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser ` | 83.8 | 75.4 | 87.9 | 80.9 | 0.468 | 0.616 |5938M | + |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_fastText ` | 83.8 | 75.4 | 87.9 | 80.9 | 0.468 | 0.616 |5938M | +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser ` | 82.7 | 76.0 | 87.3 | 81.4 | 0.391 | 0.510 |5938M | + |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_elmo ` | 82.7 | 76.0 | 87.3 | 81.4 | 0.391 | 0.510 |5938M | +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_tune ` | 82.9 | 76.7 | 87.3 | 82.0 | 0.392 | 0.479 |5938M | +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ |`paraphraser.ru`_ |:config:`paraphrase_bert ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- |1330M | +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp ` | 87.1 | 87.0 | 83.0 | 82.6 | 0.300 | 0.305 |8134M | + |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp_bilstm ` | 87.1 | 87.0 | 83.0 | 82.6 | 0.300 | 0.305 |8134M | +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp ` | 87.7 | 87.5 | 84.0 | 83.8 | 0.287 | 0.298 |8136M | 
+------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ From e91e4d1e53096502a034e196a3fbd6d6509de163 Mon Sep 17 00:00:00 2001 From: PaGul Date: Wed, 8 May 2019 15:53:57 +0300 Subject: [PATCH 02/12] Fix some documentation problems[2] --- docs/intro/features.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 253774f392..8981c64dbf 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -95,8 +95,8 @@ Several pre-trained models are available and presented in Table below. | Intent |`Yahoo-L31`_ | | :config:`Yahoo-L31 on ELMo ` pre-trained on `Yahoo-L6`_ | ROC-AUC | 0.9412 | -- | 700 Mb | +------------------+--------------------+------+-------------------------------------------------------------------------------------------------+-------------+--------+--------+-----------+ -.. [1] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018. -.. [2] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018. +.. [1] Coucke A. et al. Snips voice platform: an embedded spoken language understanding system for private-by-design voice interfaces //arXiv preprint arXiv:1805.10190. – 2018. +.. [2] Smith L. N., Topin N. Super-convergence: Very fast training of residual networks using large learning rates. – 2018. .. _`DSTC 2`: http://camdial.org/~mh521/dstc/ .. _`SNIPS-2017`: https://github.com/snipsco/nlu-benchmark/tree/master/2017-06-custom-intent-engines From 5784813eb13e32775ffe110d189f4a10871cf3b4 Mon Sep 17 00:00:00 2001 From: PaGul Date: Wed, 8 May 2019 17:15:54 +0300 Subject: [PATCH 03/12] Fix some documentation problems[3] --- docs/intro/features.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 8981c64dbf..8136923e1b 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -135,6 +135,7 @@ trained on Reddit dataset. | Shallow-and-wide CNN | **0.9956** | **0.9973** | **0.9968** | **0.9871** | **0.9998** | **0.9752** | **0.9854** | +------------------------+-----------------+------------------+---------------+--------------+--------------+----------------------+------------------------+ +.. [3] https://www.slideshare.net/KonstantinSavenkov/nlu-intent-detection-benchmark-by-intento-august-2017 :doc:`Goal-oriented bot ` From 3b3efe83e479d08ec6fca8958e8d71b467115271 Mon Sep 17 00:00:00 2001 From: leonid Date: Tue, 14 May 2019 13:04:23 +0300 Subject: [PATCH 04/12] fix: paraphraser table, citing 3 --- docs/intro/features.rst | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 8136923e1b..8f8d9ff332 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -109,7 +109,7 @@ Several pre-trained models are available and presented in Table below. As no one had published intent recognition for DSTC-2 data, the comparison of the presented model is given on **SNIPS** dataset. The -evaluation of model scores was conducted in the same way as in [3] to +evaluation of model scores was conducted in the same way as in [3]_ to compare with the results from the report of the authors of the dataset. 
The results were achieved with tuning of parameters and embeddings trained on Reddit dataset. @@ -291,21 +291,21 @@ Available pre-trained models for paraphrase identification: .. table:: :widths: auto - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - | Dataset |Model config | Val (accuracy)| Test (accuracy)| Val (F1)| Test (F1)| Val (log_loss)| Test (log_loss)|Downloads | - +========================+=============================================================================================+===============+================+=========+==========+===============+================+==========+ - |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_fastText ` | 83.8 | 75.4 | 87.9 | 80.9 | 0.468 | 0.616 |5938M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_elmo ` | 82.7 | 76.0 | 87.3 | 81.4 | 0.391 | 0.510 |5938M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_tune ` | 82.9 | 76.7 | 87.3 | 82.0 | 0.392 | 0.479 |5938M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`paraphraser.ru`_ |:config:`paraphrase_bert ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- |1330M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp_bilstm ` | 87.1 | 87.0 | 83.0 | 82.6 | 0.300 | 0.305 |8134M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp ` | 87.7 | 87.5 | 84.0 | 83.8 | 0.287 | 0.298 |8136M | - +------------------------+---------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + | Dataset |Model config | Val (accuracy)| Test (accuracy)| Val (F1)| Test (F1)| Val (log_loss)| Test (log_loss)|Downloads | + +========================+===============================================================================================+===============+================+=========+==========+===============+================+==========+ + |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_ft ` | 83.8 | 75.4 | 87.9 | 80.9 | 0.468 | 0.616 |5938M | + 
+------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_elmo ` | 82.7 | 76.0 | 87.3 | 81.4 | 0.391 | 0.510 |5938M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_tune ` | 82.9 | 76.7 | 87.3 | 82.0 | 0.392 | 0.479 |5938M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`paraphraser.ru`_ |:config:`paraphrase_bert ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- |1330M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp_bilstm ` | 87.1 | 87.0 | 83.0 | 82.6 | 0.300 | 0.305 |8134M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp ` | 87.7 | 87.5 | 84.0 | 83.8 | 0.287 | 0.298 |8136M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ .. _`paraphraser.ru`: https://paraphraser.ru/ .. 
_`Quora Question Pairs`: https://www.kaggle.com/c/quora-question-pairs/data From 7c3e57094fd1c4a4eab0a0c1ba4a87d66dc7d4fd Mon Sep 17 00:00:00 2001 From: Mary Trofimova Date: Tue, 14 May 2019 14:14:33 +0300 Subject: [PATCH 05/12] fix: ner bert minor fixes (#833) * refactor: rm unnecessary configs * fix: less batches during tests * docs: add scores & examples --- .../configs/ner/ner_conll2003_bert_ctx.json | 133 --------------- .../ner/ner_conll2003_bert_ctx_di.json | 127 -------------- .../ner/ner_conll2003_bert_search.json | 158 ------------------ .../configs/ner/ner_ontonotes_bert_mult.json | 2 + docs/components/ner.rst | 27 ++- docs/intro/features.rst | 36 ++-- 6 files changed, 43 insertions(+), 440 deletions(-) delete mode 100644 deeppavlov/configs/ner/ner_conll2003_bert_ctx.json delete mode 100644 deeppavlov/configs/ner/ner_conll2003_bert_ctx_di.json delete mode 100644 deeppavlov/configs/ner/ner_conll2003_bert_search.json diff --git a/deeppavlov/configs/ner/ner_conll2003_bert_ctx.json b/deeppavlov/configs/ner/ner_conll2003_bert_ctx.json deleted file mode 100644 index 1ebf0edc9b..0000000000 --- a/deeppavlov/configs/ner/ner_conll2003_bert_ctx.json +++ /dev/null @@ -1,133 +0,0 @@ -{ - "dataset_reader": { - "class_name": "conll2003_reader", - "data_path": "{DOWNLOADS_PATH}/conll2003/", - "dataset_name": "conll2003", - "provide_pos": false, - "provide_context": true, - "context_size": 3 - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": ["x", "left_context", "right_context"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "bert_context_add", - "vocab_file": "{BERT_PATH}/vocab.txt", - "left_context_size": 3, - "right_context_size": 3, - "left_context_rate": 0.5, - "max_seq_length": 510, - "in": ["x", "left_context", "right_context", "y"], - "out": ["x_rich", "y_rich"] - }, - { - "class_name": "bert_ner_preprocessor", - "vocab_file": "{BERT_PATH}/vocab.txt", - "do_lower_case": false, - "max_seq_length": 512, - "max_subword_length": 15, - "token_maksing_prob": 0.0, - "in": ["x_rich", "y_rich"], - "out": ["x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask", "y_subword"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] - }, - { - "id": "tag_vocab", - "class_name": "simple_vocab", - "unk_token": ["O"], - "pad_with_zeros": true, - "fit_on": ["y"], - "save_path": "{NER_PATH}/tag.dict", - "load_path": "{NER_PATH}/tag.dict", - "in": ["y_subword"], - "out": ["y_subword_ind"] - }, - { - "class_name": "bert_ner", - "n_tags": "#tag_vocab.len", - "use_crf": false, - "return_probas": false, - "bert_config_file": "{BERT_PATH}/bert_config.json", - "pretrained_bert": "{BERT_PATH}/bert_model.ckpt", - "save_path": "{NER_PATH}/model", - "load_path": "{NER_PATH}/model", - "attention_probs_keep_prob": 0.5, - "keep_prob": 0.3, - "ema_decay": 0.9, - "ema_variables_on_cpu": true, - "focal_alpha": null, - "focal_gamma": null, - "optimizer": "tf.train:AdamOptimizer", - "learning_rate": 1.6443457513284065e-05, - "learning_rate_drop_patience": 30, - "learning_rate_drop_div": 3, - "min_learning_rate": 5e-7, - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], - "in_y": ["y_subword_ind"], - "out": ["y_pred_ind"] - }, - { - "ref": "tag_vocab", - "in": ["y_pred_ind"], - "out": ["y_pred"] - } - ], - "out": ["x", "y_pred"] - }, - "train": { - "epochs": 30, - "batch_size": 8, - "metrics": [ - { - "name": "ner_f1", - "inputs": [ - "y", - "y_pred" - ] - } - ], - "validation_patience": 120, - "val_every_n_batches": 
10, - "test_best": true, - "validate_best": true, - "log_every_n_batches": 10, - "tensorboard_log_dir": "{NER_PATH}/logs", - "show_examples": false, - "evaluation_targets": [ - "valid", - "test" - ], - "class_name": "nn_trainer" - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "~/.deeppavlov/downloads", - "MODELS_PATH": "~/.deeppavlov/models", - "BERT_PATH": "~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12", - "NER_PATH": "~/.deeppavlov/models/ner_conll2003_bert_ctx" - }, - "requirements": [ - "{ROOT_PATH}/requirements/tf.txt", - "{ROOT_PATH}/requirements/bert_dp.txt" - ], - "labels": { - "telegram_utils": "NERCoNLL2003Model", - "server_utils": "NER" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - } - ] - } -} diff --git a/deeppavlov/configs/ner/ner_conll2003_bert_ctx_di.json b/deeppavlov/configs/ner/ner_conll2003_bert_ctx_di.json deleted file mode 100644 index 995dd2562f..0000000000 --- a/deeppavlov/configs/ner/ner_conll2003_bert_ctx_di.json +++ /dev/null @@ -1,127 +0,0 @@ -{ - "dataset_reader": { - "class_name": "conll2003_reader", - "data_path": "{DOWNLOADS_PATH}/conll2003/", - "dataset_name": "conll2003", - "provide_pos": false, - "provide_doc_ids": true - }, - "dataset_iterator": { - "class_name": "document_bert_ner_iterator", - "bert_tokenizer_vocab_file": "{BERT_PATH}/vocab.txt", - "left_context_rate": 0.5, - "max_seq_length": 510, - "shuffle": true - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "bert_ner_preprocessor", - "vocab_file": "{BERT_PATH}/vocab.txt", - "do_lower_case": false, - "max_seq_length": 512, - "max_subword_length": 15, - "token_maksing_prob": 0.0, - "in": ["x", "y"], - "out": ["x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask", "y_subword"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] - }, - { - "id": "tag_vocab", - "class_name": "simple_vocab", - "unk_token": ["O"], - "pad_with_zeros": true, - "fit_on": ["y"], - "save_path": "{NER_PATH}/tag.dict", - "load_path": "{NER_PATH}/tag.dict", - "in": ["y_subword"], - "out": ["y_subword_ind"] - }, - { - "class_name": "bert_ner", - "n_tags": "#tag_vocab.len", - "use_crf": false, - "return_probas": false, - "bert_config_file": "{BERT_PATH}/bert_config.json", - "pretrained_bert": "{BERT_PATH}/bert_model.ckpt", - "save_path": "{NER_PATH}/model", - "load_path": "{NER_PATH}/model", - "attention_probs_keep_prob": 0.5, - "keep_prob": 0.3, - "ema_decay": 0.9, - "ema_variables_on_cpu": true, - "clip_norm": null, - "focal_alpha": null, - "focal_gamma": null, - "optimizer": "tf.train:AdamOptimizer", - "learning_rate": 1.6443457513284065e-05, - "learning_rate_drop_patience": 30, - "learning_rate_drop_div": 3, - "min_learning_rate": 5e-7, - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], - "in_y": ["y_subword_ind"], - "out": ["y_pred_ind"] - }, - { - "ref": "tag_vocab", - "in": ["y_pred_ind"], - "out": ["y_pred"] - } - ], - "out": ["x", "y_pred"] - }, - "train": { - "epochs": 30, - "batch_size": 8, - "metrics": [ - { - "name": "ner_f1", - "inputs": [ - "y", - "y_pred" - ] - } - ], - "validation_patience": 120, - "val_every_n_batches": 10, - "test_best": true, - "validate_best": true, - "log_every_n_batches": 10, - "tensorboard_log_dir": "{NER_PATH}/logs", - "show_examples": false, - "evaluation_targets": [ - "valid", - "test" - ], - "class_name": "nn_trainer" - }, - 
"metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "~/.deeppavlov/downloads", - "MODELS_PATH": "~/.deeppavlov/models", - "BERT_PATH": "~/.deeppavlov/downloads/bert_models/cased_L-12_H-768_A-12", - "NER_PATH": "~/.deeppavlov/models/ner_conll2003_bert_ctx_di" - }, - "requirements": [ - "{ROOT_PATH}/requirements/tf.txt", - "{ROOT_PATH}/requirements/bert_dp.txt" - ], - "labels": { - "telegram_utils": "NERCoNLL2003Model", - "server_utils": "NER" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - } - ] - } -} diff --git a/deeppavlov/configs/ner/ner_conll2003_bert_search.json b/deeppavlov/configs/ner/ner_conll2003_bert_search.json deleted file mode 100644 index 5b36bce2a3..0000000000 --- a/deeppavlov/configs/ner/ner_conll2003_bert_search.json +++ /dev/null @@ -1,158 +0,0 @@ -{ - "dataset_reader": { - "class_name": "conll2003_reader", - "data_path": "{DOWNLOADS_PATH}/conll2003/", - "dataset_name": "conll2003", - "provide_pos": false - }, - "dataset_iterator": { - "class_name": "data_learning_iterator" - }, - "chainer": { - "in": ["x"], - "in_y": ["y"], - "pipe": [ - { - "class_name": "bert_ner_preprocessor", - "vocab_file": "{BERT_PATH}/vocab.txt", - "do_lower_case": false, - "max_seq_length": 512, - "max_subword_length": 15, - "in": ["x", "y"], - "out": ["x_subword_tokens", "x_subword_tok_ids", "pred_subword_mask", "y_subword"] - }, - { - "class_name": "mask", - "in": ["x_subword_tokens"], - "out": ["x_subword_mask"] - }, - { - "id": "tag_vocab", - "class_name": "simple_vocab", - "unk_token": ["O"], - "pad_with_zeros": true, - "fit_on": ["y"], - "save_path": "{NER_PATH}/tag.dict", - "load_path": "{NER_PATH}/tag.dict", - "in": ["y_subword"], - "out": ["y_subword_ind"] - }, - { - "class_name": "bert_ner", - "n_tags": "#tag_vocab.len", - "return_probas": false, - "bert_config_file": "{BERT_PATH}/bert_config.json", - "pretrained_bert": "{BERT_PATH}/bert_model.ckpt", - "save_path": "{NER_PATH}/model", - "keep_prob": { - "random_choice": [0.02, 0.1, 0.3] - }, - "ema_decay": { - "random_choice": [null, 0.7, 0.9, 0.99] - }, - "ema_variables_on_cpu": false, - "encoder_layer_ids": { - "random_choice": [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], [-4, -3, -2, -1], [0, 5, 11], [-1] - ] - }, - "hidden_keep_prob": { - "random_choice": [0.2, 0.6, null] - }, - "attention_probs_keep_prob": { - "random_choice": [0.2, 0.6, null] - }, - "optimizer": { - "random_choice": ["tf.train:AdamOptimizer", null] - }, - "learning_rate": 1.6443457513284065e-05, - "learning_rate_drop_patience": 30, - "learning_rate_drop_div": 3, - "min_learning_rate": 5e-7, - "in": ["x_subword_tok_ids", "x_subword_mask", "pred_subword_mask"], - "in_y": ["y_subword_ind"], - "out": ["y_subword_pred_ind"] - }, - { - "ref": "tag_vocab", - "in": ["y_subword_pred_ind"], - "out": ["y_subword_pred"] - }, - { - "class_name": "deeppavlov.models.bert.bert_ner:MaskCutter", - "in": ["y_subword_pred", "pred_subword_mask"], - "out": ["y_pred_cut"] - }, - { - "class_name": "deeppavlov.models.bert.bert_ner:MaskCutter", - "in": ["y_subword", "pred_subword_mask"], - "out": ["y_cut"] - }, - { - "class_name": "deeppavlov.models.bert.bert_ner:MaskCutter", - "in": ["x_subword_tokens", "pred_subword_mask"], - "out": ["x_tokens_cut"] - } - ], - "out": ["x_tokens_cut", "y_pred_cut"] - }, - "train": { - "epochs": 30, - "batch_size": 8, - "metrics": [ - { - "name": "ner_f1", - "inputs": [ - "y_cut", - "y_pred_cut" - ] - } - ], - 
"validation_patience": 120, - "val_every_n_batches": 10, - "log_every_n_batches": -1, - "show_examples": false, - "validate_best": true, - "test_best": true, - "evaluation_targets": [ - "valid", - "test" - ], - "class_name": "nn_trainer" - }, - "pipeline_search": { - "root": "{ROOT_PATH}/experiments", - "exp_name": "bert_ner_experiment", - "do_test": true, - "search_type": "random", - "sample_num": 31, - "plot": true, - "save_best": true, - "multiprocessing": true, - "use_all_gpus": [2, 4, 7, 8], - "gpu_memory_fraction": 0.8 - }, - "metadata": { - "variables": { - "ROOT_PATH": "~/.deeppavlov", - "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", - "MODELS_PATH": "{ROOT_PATH}/models", - "BERT_PATH": "{DOWNLOADS_PATH}/bert_models/cased_L-12_H-768_A-12", - "NER_PATH": "{MODELS_PATH}/tmp" - }, - "requirements": [ - "{ROOT_PATH}/requirements/tf.txt", - "{ROOT_PATH}/requirements/bert_dp.txt" - ], - "labels": { - "telegram_utils": "NERCoNLL2003Model", - "server_utils": "NER" - }, - "download": [ - { - "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/cased_L-12_H-768_A-12.zip", - "subdir": "{DOWNLOADS_PATH}/bert_models" - } - ] - } -} diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json index 1e25553a22..827e9e07a9 100644 --- a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json +++ b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json @@ -89,6 +89,8 @@ "log_every_n_batches": 20, "tensorboard_log_dir": "{NER_PATH}/logs", + "pytest_max_batches": 2, + "pytest_batch_size": 8, "show_examples": false, "evaluation_targets": ["valid", "test"], "class_name": "nn_trainer" diff --git a/docs/components/ner.rst b/docs/components/ner.rst index c66747e184..5146da3dc7 100644 --- a/docs/components/ner.rst +++ b/docs/components/ner.rst @@ -58,6 +58,7 @@ Models can be used from Python using the following code: from deeppavlov import configs, build_model ner_model = build_model(configs.ner.ner_ontonotes_bert, download=True) + ner_model(['Bob Ross lived in Florida']) >>> [[['Bob', 'Ross', 'lived', 'in', 'Florida']], [['B-PERSON', 'I-PERSON', 'O', 'O', 'B-GPE']]] @@ -66,6 +67,7 @@ The model also can be trained from the Python: .. code:: python from deeppavlov import configs, train_model + ner_model = train_model(configs.ner.ner_ontonotes_bert) The data for training should be placed in the folder provided in the config: @@ -74,7 +76,9 @@ The data for training should be placed in the folder provided in the config: from deeppavlov import configs, train_model from deeppavlov.core.commands.utils import parse_config + config_dict = parse_config(configs.ner.ner_ontonotes_bert) + print(config_dict['dataset_reader']['data_path']) >>> '~/.deeppavlov/downloads/ontonotes' @@ -103,17 +107,27 @@ transfer are presented in the table below. +---------+-------+ -The following Python code can be used to infer this model: +The following Python code can be used to infer the model: .. 
code:: python from deeppavlov import configs, build_model ner_model = build_model(configs.ner.ner_ontonotes_bert_mult, download=True) + + ner_model(['Curling World Championship will be held in Antananarivo']) + >>> (['Curling', 'World', 'Championship', 'will', 'be', 'held', 'in', 'Antananarivo']], + [['B-EVENT', 'I-EVENT', 'I-EVENT', 'O', 'O', 'O', 'O', 'B-GPE']) + + ner_model(['Mistrzostwa Świata w Curlingu odbędą się w Antananarivo']) + >>> (['Mistrzostwa', 'Świata', 'w', 'Curlingu', 'odbędą', 'się', 'w', 'Antananarivo']], + [['B-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'O', 'O', 'O', 'B-GPE']) + ner_model(['Чемпионат мира по кёрлингу пройдёт в Антананариву']) - >>> (['Чемпионат', 'мира', 'по', 'кёрлингу', 'пройдёт', 'в', 'Антананариву'], ['B-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'O', 'O', 'B-GPE']) + >>> (['Чемпионат', 'мира', 'по', 'кёрлингу', 'пройдёт', 'в', 'Антананариву'], + ['B-EVENT', 'I-EVENT', 'I-EVENT', 'I-EVENT', 'O', 'O', 'B-GPE']) -The list of tags is and their description is in the table below. +The list of available tags and their descriptions are presented below. +--------------+--------------------------------------------------------+ | PERSON | People including fictional | @@ -245,11 +259,11 @@ are 80%, 10%, 10%, respectively. Few-shot Language-Model based ----------------------------- -It is possible to get a clod-start baseline from just a few samples of labeled data in a couple of seconds. The solution +It is possible to get a cold-start baseline from just a few samples of labeled data in a couple of seconds. The solution is based on a Language Model trained on open domain corpus. On top of the LM a SVM classification layer is placed. It is possible to start from as few as 10 sentences containing entities of interest. -The data for training this model should be collected the following way. Given a collection of `N` sentences without +The data for training this model should be collected in the following way. Given a collection of `N` sentences without markup, sequentially markup sentences until the total number of sentences with entity of interest become equal `K`. During the training both sentences with and without markup are used. @@ -283,6 +297,7 @@ To train and use the model from python code the following snippet can be used: from deeppavlov import configs, train_model ner_model = train_model(configs.ner.ner_few_shot_ru, download=True) + ner_model(['Example sentence']) Warning! This model can take a lot of time and memory if the number of sentences is greater than 1000! @@ -307,7 +322,9 @@ To use existing few-shot model use the following python interface can be used: .. code:: python from deeppavlov import configs, build_model + ner_model = build_model(configs.ner.ner_few_shot_ru) + ner_model([['Example', 'sentence']]) ner_model(['Example sentence']) diff --git a/docs/intro/features.rst b/docs/intro/features.rst index b064000446..29ac165d7e 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -18,23 +18,25 @@ The second model reproduces architecture from the paper `Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition `__ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. 
-+---------------------------------------------------------+------+-----------------------------------------------------------------+-------------+ -| Dataset | Lang | Model | Test F1 | -+=========================================================+======+=================================================================+=============+ -| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 97.7 | -+ + +-----------------------------------------------------------------+-------------+ -| (Collection 3) | | :config:`ner_rus.json ` | 95.1 | -+---------------------------------------------------------+------+-----------------------------------------------------------------+-------------+ -| ConLL-2003 | En | :config:`ner_conll2003_bert.json ` | 91.5 | -+ + +-----------------------------------------------------------------+-------------+ -| | | :config:`ner_conll2003.json ` | 89.9 | -+---------------------------------------------------------+ +-----------------------------------------------------------------+-------------+ -| OntoNotes | | :config:`ner_ontonotes_bert.json ` | 88.4 | -+ + +-----------------------------------------------------------------+-------------+ -| | | :config:`ner_ontonotes.json ` | 87.1 | -+---------------------------------------------------------+ +-----------------------------------------------------------------+-------------+ -| DSTC2 | | :config:`ner_dstc2.json ` | 97.1 | -+---------------------------------------------------------+------+-----------------------------------------------------------------+-------------+ ++---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ +| Dataset | Lang | Model | Test F1 | ++=========================================================+======+===========================================================================+=============+ +| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 97.7 | ++ + +---------------------------------------------------------------------------+-------------+ +| (Collection 3) | | :config:`ner_rus.json ` | 95.1 | ++---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ +| ConLL-2003 | En | :config:`ner_conll2003_bert.json ` | 91.5 | ++ + +---------------------------------------------------------------------------+-------------+ +| | | :config:`ner_conll2003.json ` | 89.9 | ++---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ +| | | :config:`ner_ontonotes_bert_mult.json ` | 88.9 | ++ + +---------------------------------------------------------------------------+-------------+ +| OntoNotes | | :config:`ner_ontonotes_bert.json ` | 88.4 | ++ + +---------------------------------------------------------------------------+-------------+ +| | | :config:`ner_ontonotes.json ` | 87.1 | ++---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ +| DSTC2 | | :config:`ner_dstc2.json ` | 97.1 | ++---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ :doc:`Slot filling components ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 
90efe549db0a607856c4a7ada27ab91b2279e7cf Mon Sep 17 00:00:00 2001 From: Sergey1704 Date: Mon, 27 May 2019 16:44:42 +0300 Subject: [PATCH 06/12] feat: do not end yandex.dialogs session after one message (#838) --- deeppavlov/utils/alice/alice.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/utils/alice/alice.py b/deeppavlov/utils/alice/alice.py index d0ef655ded..51073f6450 100644 --- a/deeppavlov/utils/alice/alice.py +++ b/deeppavlov/utils/alice/alice.py @@ -59,7 +59,7 @@ def interact_alice(agent: Agent): response = { 'response': { - 'end_session': True, + 'end_session': False, 'text': '' }, "session": { From 037890e02fe6eb78e092a62116e90fdad6249091 Mon Sep 17 00:00:00 2001 From: yurakuratov Date: Mon, 27 May 2019 16:47:05 +0300 Subject: [PATCH 07/12] fix: bert documentation pages (#836) * fix: bert documentation pages * fix: try to fix setup.py * fix: restore paraphraser rubert scores * docs: fix external imports * feat: upd bert doc page, add slavic bert link --- docs/components/bert.rst | 30 ++++++++++++++++-------------- docs/conf.py | 3 ++- docs/intro/features.rst | 4 +++- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/docs/components/bert.rst b/docs/components/bert.rst index b36746a50c..6a5ac4a0ba 100644 --- a/docs/components/bert.rst +++ b/docs/components/bert.rst @@ -12,15 +12,16 @@ There are several pre-trained BERT models released by Google Research, more deta - BERT-base, English, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ - BERT-base, English, uncased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ - BERT-large, English, cased, 24-layer, 1024-hidden, 16-heads, 340M parameters: download from `[google] `__ -- BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__, `[deeppavlov] `__ -- BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__ +- BERT-base, multilingual, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: download from `[google] `__, `[deeppavlov] `__ +- BERT-base, Chinese, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: download from `[google] `__ -We have trained BERT-base for Russian Language: +We have trained BERT-base model for other languages: -- RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 110M parameters: `[deeppavlov] `__ +- RuBERT, Russian, cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ +- SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__ RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took -multilingual version of BERT-base as initialization for RuBERT. +multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT training was done the same manner as RuBERT. Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and ranking. We provide pre-trained models and examples on how to use BERT with DeepPavlov. @@ -28,13 +29,13 @@ ranking. We provide pre-trained models and examples on how to use BERT with Deep BERT for Classification ----------------------- -**deeppavlov.models.bert.BertClassifierModel** (see :doc:`here `) provides easy to use -solution for classification problem using pre-trained BERT. 
-One can use several pre-trained English, multi-lingual and Russian BERT models that are listed above. +:class:`~deeppavlov.models.bert.bert_classifier.BertClassifierModel` provides easy to use solution for classification problem +using pre-trained BERT. One can use several pre-trained English, multi-lingual and Russian BERT models that are +listed above. Two main components of BERT classifier pipeline in DeepPavlov are -``deeppavlov.models.preprocessors.BertPreprocessor`` (see :doc:`here `) -and ``deeppavlov.models.bert.BertClassifierModel`` (see :doc:`here `). +:class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertPreprocessor` and +:class:`~deeppavlov.models.bert.bert_classifier.BertClassifierModel`. Non-processed texts should be given to ``bert_preprocessor`` for tokenization on subtokens, encoding subtokens with their indices and creating tokens and segment masks. If one processed classes to one-hot labels in pipeline, ``one_hot_labels`` should be set to ``true``. @@ -52,7 +53,7 @@ To tag each word representations of the first sub-word elements are extracted. S These representations are passed to a dense layer or Bi-RNN layer to produce distribution over tags. There is also an optional CRF layer on the top. -Multilingual BERT models allows to perform zero-shot transfer across languages. To use our 19 tags NER for over a +Multilingual BERT model allows to perform zero-shot transfer across languages. To use our 19 tags NER for over a hundred languages see :ref:`ner_multi_bert`. @@ -61,13 +62,13 @@ BERT for Context Question Answering (SQuAD) Context Question Answering on `SQuAD `__ dataset is a task of looking for an answer on a question in a given context. This task could be formalized as predicting answer start and end position in a given context. :class:`~deeppavlov.models.bert.bert_squad.BertSQuADModel` uses two linear -transformations to predict probability that currents subtoken is start/end position of an answer. For details check +transformations to predict probability that current subtoken is start/end position of an answer. For details check :doc:`Context Question Answering documentation page `. BERT for Ranking ---------------- There are two main approaches in text ranking. The first one is interaction-based which is relatively accurate but -works slow and the second one is representation-based which is less accurate but faster [1]_. +works slow and the second one is representation-based which is less accurate but faster [2]_. The interaction-based ranking based on BERT is represented in the DeepPavlov with two main components :class:`~deeppavlov.models.preprocessors.bert_preprocessor.BertRankerPreprocessor` and :class:`~deeppavlov.models.bert.bert_ranker.BertRankerModel` @@ -81,4 +82,5 @@ where the task for ranking is to retrieve the best possible response from some p the trained model. Working examples with the trained models are given :doc:`here `. Statistics are available :doc:`here `. -.. [1] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682. \ No newline at end of file +.. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213. +.. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682. 
\ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index fe10373103..eed018f2b7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -190,7 +190,8 @@ # -- Extension configuration ------------------------------------------------- -autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', 'sortedcontainers', 'russian_tagsets'] +autodoc_mock_imports = ['tensorflow', 'tensorflow_hub', 'fastText', 'nltk', 'gensim', 'kenlm', 'spacy', 'lxml', + 'sortedcontainers', 'russian_tagsets', 'bert_dp', 'aiml'] extlinks = { 'config': (f'https://github.com/deepmipt/DeepPavlov/blob/{release}/deeppavlov/configs/%s', None) diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 5c70ea6757..87161aeaeb 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -302,7 +302,9 @@ Available pre-trained models for paraphrase identification: +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ |`paraphraser.ru`_ |:config:`paraphrase_ident_paraphraser_tune ` | 82.9 | 76.7 | 87.3 | 82.0 | 0.392 | 0.479 |5938M | +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ - |`paraphraser.ru`_ |:config:`paraphrase_bert ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- |1330M | + |`paraphraser.ru`_ |:config:`paraphrase_bert_multilingual ` | 87.4 | 79.3 | 90.2 | 83.4 | -- | -- |1330M | + +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ + |`paraphraser.ru`_ |:config:`paraphrase_rubert ` | 90.2 | 84.9 | 92.3 | 87.9 | -- | -- |1325M | +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ |`Quora Question Pairs`_ |:config:`paraphrase_ident_qqp_bilstm ` | 87.1 | 87.0 | 83.0 | 82.6 | 0.300 | 0.305 |8134M | +------------------------+-----------------------------------------------------------------------------------------------+---------------+----------------+---------+----------+---------------+----------------+----------+ From c5c5c3c62bfd0f9a8fc19a30a71f208874ee047f Mon Sep 17 00:00:00 2001 From: Mary Trofimova Date: Fri, 31 May 2019 18:33:41 +0300 Subject: [PATCH 08/12] feat: compress bert models (#849) * feat: make cased tokenization default * feat: allow null pretrained_bert parameter * feat: allow iob tags in conll-reader * feat: do not save backup bars * feat: train and test on chunks from docs * feat: add docstart token * feat: upload new models on share * docs: update model scores ans sizes * docs: emphasize on multi model * refactor: remove unused if branch * refactor: remove debug print --- .../configs/ner/ner_conll2003_bert.json | 2 +- .../configs/ner/ner_ontonotes_bert.json | 2 +- .../configs/ner/ner_ontonotes_bert_mult.json | 2 +- deeppavlov/configs/ner/ner_rus_bert.json | 2 +- .../document_bert_ner_iterator.py | 53 ++++++++---------- .../dataset_readers/conll2003_reader.py | 27 ++++++++- deeppavlov/models/bert/bert_classifier.py | 16 +++--- deeppavlov/models/bert/bert_ner.py | 55 
++++++++++++------- deeppavlov/models/bert/bert_ranker.py | 16 +++--- deeppavlov/models/bert/bert_squad.py | 14 ++--- .../models/preprocessors/bert_preprocessor.py | 2 +- docs/components/ner.rst | 39 +++++++------ docs/intro/features.rst | 38 ++++++------- 13 files changed, 147 insertions(+), 121 deletions(-) diff --git a/deeppavlov/configs/ner/ner_conll2003_bert.json b/deeppavlov/configs/ner/ner_conll2003_bert.json index 271c552bb9..18d8abb26e 100644 --- a/deeppavlov/configs/ner/ner_conll2003_bert.json +++ b/deeppavlov/configs/ner/ner_conll2003_bert.json @@ -113,7 +113,7 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_bert.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_conll2003_bert_v1.tar.gz", "subdir": "{MODELS_PATH}" }, { diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert.json b/deeppavlov/configs/ner/ner_ontonotes_bert.json index 2ae080e8d9..6707f00f5b 100644 --- a/deeppavlov/configs/ner/ner_ontonotes_bert.json +++ b/deeppavlov/configs/ner/ner_ontonotes_bert.json @@ -114,7 +114,7 @@ "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert_v1.tar.gz", "subdir": "{MODELS_PATH}" }, { diff --git a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json index 827e9e07a9..3e6b9415d1 100644 --- a/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json +++ b/deeppavlov/configs/ner/ner_ontonotes_bert_mult.json @@ -114,7 +114,7 @@ "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert_mult.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_ontonotes_bert_mult_v1.tar.gz", "subdir": "{MODELS_PATH}" }, { diff --git a/deeppavlov/configs/ner/ner_rus_bert.json b/deeppavlov/configs/ner/ner_rus_bert.json index da1fa0d009..855beaa9ab 100644 --- a/deeppavlov/configs/ner/ner_rus_bert.json +++ b/deeppavlov/configs/ner/ner_rus_bert.json @@ -113,7 +113,7 @@ }, "download": [ { - "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_rus_bert_v2.tar.gz", + "url": "http://files.deeppavlov.ai/deeppavlov_data/ner_rus_bert_v1.tar.gz", "subdir": "{MODELS_PATH}" }, { diff --git a/deeppavlov/dataset_iterators/document_bert_ner_iterator.py b/deeppavlov/dataset_iterators/document_bert_ner_iterator.py index e7bca28d57..3d0f392ad8 100644 --- a/deeppavlov/dataset_iterators/document_bert_ner_iterator.py +++ b/deeppavlov/dataset_iterators/document_bert_ner_iterator.py @@ -49,19 +49,21 @@ class DocumentBertNerIterator(DataLearningIterator): def __init__(self, data: Dict[str, List[Tuple[Any, Any]]], bert_tokenizer_vocab_file: str, + do_lower_case: bool = False, left_context_rate: float = 0.5, max_seq_length: int = None, - max_num_sentences: int = None, + one_sample_per_doc: bool = False, seed: int = None, shuffle: bool = True, *args, **kwargs) -> None: self.max_seq_length = max_seq_length or float('inf') + self.one_sample_per_doc = one_sample_per_doc self.left_context_rate = left_context_rate self.shuffle = shuffle vocab_file = str(expand_path(bert_tokenizer_vocab_file)) self.tokenizer = FullTokenizer(vocab_file=vocab_file, - do_lower_case=False) + do_lower_case=do_lower_case) self.random = Random(seed) self.train = data.get('train', []) @@ -99,35 +101,25 @@ def gen_batches(self, batch_size: int, data_type: str = 'train', if num_docs == 0: return - if data_type == 'train': - # one sample per document - order = list(range(num_docs)) - if shuffle: - 
self.random.shuffle(order) - - if batch_size < 0: - batch_size = num_docs - - for i in range((num_docs - 1) // batch_size + 1): - doc_batch = [doc_data[o][1] - for o in order[i * batch_size: (i + 1) * batch_size]] - yield tuple(zip(*[self.sample_from_doc(doc) for doc in doc_batch])) + # get all sentences from document + doc_chunks = [self.chunks_from_doc(doc) for doc_id, doc in doc_data] + if self.one_sample_per_doc: + samples = [next(chunk) for chunk in doc_chunks] else: - # get all sentences from document - samples = [s for doc_id, doc in doc_data for s in self.chunks_from_doc(doc)] - num_samples = len(samples) + samples = [s for chunk in doc_chunks for s in chunk] + num_samples = len(samples) - order = list(range(num_samples)) + order = list(range(num_samples)) - if shuffle: - self.random.shuffle(order) + if shuffle: + self.random.shuffle(order) - if batch_size < 0: - batch_size = num_samples + if batch_size < 0: + batch_size = num_samples - for i in range((num_samples - 1) // batch_size + 1): - yield tuple(zip(*[samples[o] - for o in order[i * batch_size: (i + 1) * batch_size]])) + for i in range((num_samples - 1) // batch_size + 1): + yield tuple(zip(*[samples[o] + for o in order[i * batch_size: (i + 1) * batch_size]])) def get_instances(self, data_type: str = 'train') -> Tuple[tuple, tuple]: data = self.data[data_type] @@ -193,7 +185,7 @@ def chunks_from_doc(self, doc: List[Tuple[Any, Any]]) -> List[Tuple[Any, Any]]: if len(rich_sample_ids) != max(rich_sample_ids) + 1: raise RuntimeError("can't split doc {doc} into chunks") - @staticmethod + @staticmethod def get_context_indices(samples: List[List[str]], sample_id: int, subtokenizer: FullTokenizer, @@ -206,7 +198,7 @@ def get_context_indices(samples: List[List[str]], l_ctx = samples[:sample_id] r_ctx = samples[sample_id + 1:] - subtoks_len = len([st for t in toks + subtoks_len = len([st for t in toks for st in subtokenizer.tokenize(t)]) l_i, r_i = 0, 0 while (l_i < len(l_ctx)) or (r_i < len(r_ctx)): @@ -216,7 +208,7 @@ def get_context_indices(samples: List[List[str]], subtoks = [st for t in l_ctx[-l_i-1] for st in subtokenizer.tokenize(t)] if subtoks_len + len(subtoks) > max_subtokens_length: - break + break subtoks_len += len(subtoks) rich_sample_indices = [sample_id - l_i - 1] + rich_sample_indices l_i += 1 @@ -224,10 +216,9 @@ def get_context_indices(samples: List[List[str]], # add one sentence from right_context subtoks = [st for t in r_ctx[r_i] for st in subtokenizer.tokenize(t)] if subtoks_len + len(subtoks) > max_subtokens_length: - break + break subtoks_len += len(subtoks) rich_sample_indices.append(sample_id + r_i + 1) r_i += 1 return rich_sample_indices - diff --git a/deeppavlov/dataset_readers/conll2003_reader.py b/deeppavlov/dataset_readers/conll2003_reader.py index 70e30bcb1c..c204ddb4c4 100644 --- a/deeppavlov/dataset_readers/conll2003_reader.py +++ b/deeppavlov/dataset_readers/conll2003_reader.py @@ -13,9 +13,13 @@ def read(self, data_path: str, dataset_name: str = None, provide_pos: bool = False, - provide_doc_ids: bool = False): + provide_doc_ids: bool = False, + iob: bool = False, + docstart_token: str = None): self.provide_pos = provide_pos self.provide_doc_ids = provide_doc_ids + self.iob = iob + self.docstart_token = docstart_token self.num_docs = 0 self.x_is_tuple = self.provide_pos or self.provide_doc_ids data_path = Path(data_path) @@ -59,8 +63,12 @@ def parse_ner_file(self, file_name: Path): pos_tags = [] tags = [] self.num_docs += 1 + if self.docstart_token is not None: + tokens = [self.docstart_token] + 
pos_tags = ['O'] + tags = ['O'] elif len(line) < 2: - if len(tokens) > 0: + if (len(tokens) > 0) and (tokens != [self.docstart_token]): x = tokens if not self.x_is_tuple else (tokens,) if self.provide_pos: x = x + (pos_tags,) @@ -87,5 +95,20 @@ def parse_ner_file(self, file_name: Path): x = x + (self.num_docs,) samples.append((x, tags)) self.num_docs += 1 + + if self.iob: + return [(x, self._iob2_to_iob(tags)) for x, tags in samples] return samples + + @staticmethod + def _iob2_to_iob(tags): + iob_tags = [] + + for n, tag in enumerate(tags): + if tag.startswith('B-') and (not n or (tags[n - 1][2:] != tag[2:])): + tag = tag.replace("B-", "I-") + iob_tags.append(tag) + + return iob_tags + diff --git a/deeppavlov/models/bert/bert_classifier.py b/deeppavlov/models/bert/bert_classifier.py index 937209c7c7..d5c7b616e9 100644 --- a/deeppavlov/models/bert/bert_classifier.py +++ b/deeppavlov/models/bert/bert_classifier.py @@ -93,14 +93,14 @@ def __init__(self, bert_config_file, n_classes, keep_prob, if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) - if tf.train.checkpoint_exists(pretrained_bert) \ - and not tf.train.checkpoint_exists(str(self.load_path.resolve())): - logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) - # Exclude optimizer and classification variables from saved variables - var_list = self._get_saveable_variables( - exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) - saver = tf.train.Saver(var_list) - saver.restore(self.sess, pretrained_bert) + if tf.train.checkpoint_exists(pretrained_bert) \ + and not tf.train.checkpoint_exists(str(self.load_path.resolve())): + logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) + # Exclude optimizer and classification variables from saved variables + var_list = self._get_saveable_variables( + exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) + saver = tf.train.Saver(var_list) + saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load() diff --git a/deeppavlov/models/bert/bert_ner.py b/deeppavlov/models/bert/bert_ner.py index 60181c73af..d5c3524161 100644 --- a/deeppavlov/models/bert/bert_ner.py +++ b/deeppavlov/models/bert/bert_ner.py @@ -136,14 +136,14 @@ def __init__(self, if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) - if tf.train.checkpoint_exists(pretrained_bert) \ - and not tf.train.checkpoint_exists(str(self.load_path.resolve())): - log.info('[initializing model with Bert from {}]'.format(pretrained_bert)) - # Exclude optimizer and classification variables from saved variables - var_list = self._get_saveable_variables( - exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'ner', 'EMA')) - saver = tf.train.Saver(var_list) - saver.restore(self.sess, pretrained_bert) + if tf.train.checkpoint_exists(pretrained_bert) \ + and not tf.train.checkpoint_exists(str(self.load_path.resolve())): + log.info('[initializing model with Bert from {}]'.format(pretrained_bert)) + # Exclude optimizer and classification variables from saved variables + var_list = self._get_saveable_variables( + exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'ner', 'EMA')) + saver = tf.train.Saver(var_list) + saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load() @@ -434,6 +434,20 @@ def token_from_subtoken(units: tf.Tensor, mask: tf.Tensor) -> tf.Tensor: return tensor + def _decode_crf(self, feed_dict: 
Dict[tf.Tensor, np.ndarray]) -> List[np.ndarray]: + logits, trans_params, mask, seq_lengths = self.sess.run([self.logits, + self._transition_params, + self.y_masks_ph, + self.seq_lengths], + feed_dict=feed_dict) + # iterate over the sentences because no batching in viterbi_decode + y_pred = [] + for logit, sequence_length in zip(logits, seq_lengths): + logit = logit[:int(sequence_length)] # keep only the valid steps + viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, trans_params) + y_pred += [viterbi_seq] + return y_pred + def _build_feed_dict(self, input_ids, input_masks, y_masks, token_types=None, y=None): feed_dict = { self.input_ids_ph: input_ids, @@ -506,19 +520,18 @@ def __call__(self, pred = self.sess.run(self.y_probas, feed_dict=feed_dict) return pred - def _decode_crf(self, feed_dict: Dict[tf.Tensor, np.ndarray]) -> List[np.ndarray]: - logits, trans_params, mask, seq_lengths = self.sess.run([self.logits, - self._transition_params, - self.y_masks_ph, - self.seq_lengths], - feed_dict=feed_dict) - # iterate over the sentences because no batching in viterbi_decode - y_pred = [] - for logit, sequence_length in zip(logits, seq_lengths): - logit = logit[:int(sequence_length)] # keep only the valid steps - viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(logit, trans_params) - y_pred += [viterbi_seq] - return y_pred + def save(self, exclude_scopes=('Optimizer', 'EMA/BackupVariables')) -> None: + if self.ema: + self.sess.run(self.ema.switch_to_train_op) + return super().save(exclude_scopes=exclude_scopes) + + def load(self, + exclude_scopes=('Optimizer', + 'learning_rate', + 'momentum', + 'EMA/BackupVariables'), + **kwargs) -> None: + return super().load(exclude_scopes=exclude_scopes, **kwargs) class ExponentialMovingAverage: diff --git a/deeppavlov/models/bert/bert_ranker.py b/deeppavlov/models/bert/bert_ranker.py index e9c7921e78..727f4ee31d 100644 --- a/deeppavlov/models/bert/bert_ranker.py +++ b/deeppavlov/models/bert/bert_ranker.py @@ -153,14 +153,14 @@ def __init__(self, bert_config_file, keep_prob=0.9, if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) - if tf.train.checkpoint_exists(pretrained_bert) \ - and not tf.train.checkpoint_exists(str(self.load_path.resolve())): - logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) - # Exclude optimizer and classification variables from saved variables - var_list = self._get_saveable_variables( - exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) - assignment_map = self.get_variables_to_restore(var_list, pretrained_bert) - tf.train.init_from_checkpoint(pretrained_bert, assignment_map) + if tf.train.checkpoint_exists(pretrained_bert) \ + and not tf.train.checkpoint_exists(str(self.load_path.resolve())): + logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) + # Exclude optimizer and classification variables from saved variables + var_list = self._get_saveable_variables( + exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias')) + assignment_map = self.get_variables_to_restore(var_list, pretrained_bert) + tf.train.init_from_checkpoint(pretrained_bert, assignment_map) self.sess.run(tf.global_variables_initializer()) diff --git a/deeppavlov/models/bert/bert_squad.py b/deeppavlov/models/bert/bert_squad.py index d7cc0f423a..7f5cfd6491 100644 --- a/deeppavlov/models/bert/bert_squad.py +++ b/deeppavlov/models/bert/bert_squad.py @@ -91,13 +91,13 @@ def __init__(self, 
bert_config_file: str, if pretrained_bert is not None: pretrained_bert = str(expand_path(pretrained_bert)) - if tf.train.checkpoint_exists(pretrained_bert) \ - and not tf.train.checkpoint_exists(str(self.load_path.resolve())): - logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) - var_list = self._get_saveable_variables( - exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'squad')) - saver = tf.train.Saver(var_list) - saver.restore(self.sess, pretrained_bert) + if tf.train.checkpoint_exists(pretrained_bert) \ + and not tf.train.checkpoint_exists(str(self.load_path.resolve())): + logger.info('[initializing model with Bert from {}]'.format(pretrained_bert)) + var_list = self._get_saveable_variables( + exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'squad')) + saver = tf.train.Saver(var_list) + saver.restore(self.sess, pretrained_bert) if self.load_path is not None: self.load() diff --git a/deeppavlov/models/preprocessors/bert_preprocessor.py b/deeppavlov/models/preprocessors/bert_preprocessor.py index a6b31ae538..2889cd8710 100644 --- a/deeppavlov/models/preprocessors/bert_preprocessor.py +++ b/deeppavlov/models/preprocessors/bert_preprocessor.py @@ -100,7 +100,7 @@ class BertNerPreprocessor(Component): def __init__(self, vocab_file: str, - do_lower_case: bool = True, + do_lower_case: bool = False, max_seq_length: int = 512, max_subword_length: int = None, token_maksing_prob: float = 0.0, diff --git a/docs/components/ner.rst b/docs/components/ner.rst index 5146da3dc7..7239e617a2 100644 --- a/docs/components/ner.rst +++ b/docs/components/ner.rst @@ -30,26 +30,25 @@ Here is the list of all available configs: .. table:: :widths: auto - +----------------------------------------------------------------------+--------------------+-----------------+------------+------------+ - | Model | Dataset | Embeddings Size | Model Size | F1 score | - +======================================================================+====================+=================+============+============+ - | :config:`ner_rus_bert ` | Collection3 [1]_ | --- | 2.1 GB | **97.7** | - +----------------------------------------------------------------------+ +-----------------+------------+------------+ - | :config:`ner_rus ` | | 1.0 GB | 5.6 MB | 95.1 | - +----------------------------------------------------------------------+--------------------+-----------------+------------+------------+ - | :config:`ner_conll2003_bert ` | CoNLL-2003 | --- | 2.1 GB | **91.5** | - +----------------------------------------------------------------------+ +-----------------+------------+------------+ - | :config:`ner_conll2003 ` | | 331 MB | 3.1 MB | 89.9 | - +----------------------------------------------------------------------+--------------------+-----------------+------------+------------+ - | :config:`ner_ontonotes_bert_mult ` | OntoNotes | --- | 2.1 GB | **88.9** | - +----------------------------------------------------------------------+ +-----------------+------------+------------+ - | :config:`ner_ontonotes_bert ` | | --- | 2.1 GB | 88.4 | - +----------------------------------------------------------------------+ +-----------------+------------+------------+ - | :config:`ner_ontonotes ` | | 331 MB | 7.8 MB | 86.4 | - +----------------------------------------------------------------------+--------------------+-----------------+------------+------------+ - | :config:`ner_dstc ` | DSTC2 | --- | 626 KB | 97.1 | - 
+----------------------------------------------------------------------+--------------------+-----------------+------------+------------+ - + +----------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+ + | Model | Dataset | Language | Embeddings Size | Model Size | F1 score | + +======================================================================+====================+==========+=================+============+============+ + | :config:`ner_rus_bert ` | Collection3 [1]_ | Ru | 700 MB | 1.4 GB | **98.1** | + +----------------------------------------------------------------------+ + +-----------------+------------+------------+ + | :config:`ner_rus ` | | | 1.0 GB | 5.6 MB | 95.1 | + +----------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+ + | :config:`ner_ontonotes_bert_mult ` | Ontonotes | Multi | 700 MB | 1.4 GB | **88.8** | + +----------------------------------------------------------------------+ +----------+-----------------+------------+------------+ + | :config:`ner_ontonotes_bert ` | | En | 400 MB | 800 MB | 88.6 | + +----------------------------------------------------------------------+ + +-----------------+------------+------------+ + | :config:`ner_ontonotes ` | | | 331 MB | 7.8 MB | 86.4 | + +----------------------------------------------------------------------+--------------------+ +-----------------+------------+------------+ + | :config:`ner_conll2003_bert ` | CoNLL-2003 | | 400 MB | 850 MB | **91.7** | + +----------------------------------------------------------------------+ + +-----------------+------------+------------+ + | :config:`ner_conll2003 ` | | | 331 MB | 3.1 MB | 89.9 | + +----------------------------------------------------------------------+--------------------+ +-----------------+------------+------------+ + | :config:`ner_dstc2 ` | DSTC2 | | --- | 626 KB | 97.1 | + +----------------------------------------------------------------------+--------------------+----------+-----------------+------------+------------+ Models can be used from Python using the following code: diff --git a/docs/intro/features.rst b/docs/intro/features.rst index 87161aeaeb..0b0babd695 100644 --- a/docs/intro/features.rst +++ b/docs/intro/features.rst @@ -18,25 +18,25 @@ The second model reproduces architecture from the paper `Application of a Hybrid Bi-LSTM-CRF model to the task of Russian Named Entity Recognition `__ which is inspired by Bi-LSTM+CRF architecture from https://arxiv.org/pdf/1603.01360.pdf. 
-+---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ -| Dataset | Lang | Model | Test F1 | -+=========================================================+======+===========================================================================+=============+ -| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 97.7 | -+ + +---------------------------------------------------------------------------+-------------+ -| (Collection 3) | | :config:`ner_rus.json ` | 95.1 | -+---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ -| ConLL-2003 | En | :config:`ner_conll2003_bert.json ` | 91.5 | -+ + +---------------------------------------------------------------------------+-------------+ -| | | :config:`ner_conll2003.json ` | 89.9 | -+---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ -| | | :config:`ner_ontonotes_bert_mult.json ` | 88.9 | -+ + +---------------------------------------------------------------------------+-------------+ -| OntoNotes | | :config:`ner_ontonotes_bert.json ` | 88.4 | -+ + +---------------------------------------------------------------------------+-------------+ -| | | :config:`ner_ontonotes.json ` | 87.1 | -+---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ -| DSTC2 | | :config:`ner_dstc2.json ` | 97.1 | -+---------------------------------------------------------+------+---------------------------------------------------------------------------+-------------+ ++---------------------------------------------------------+-------+---------------------------------------------------------------------------+-------------+ +| Dataset | Lang | Model | Test F1 | ++=========================================================+=======+===========================================================================+=============+ +| Persons-1000 dataset with additional LOC and ORG markup | Ru | :config:`ner_rus_bert.json ` | 98.1 | ++ + +---------------------------------------------------------------------------+-------------+ +| (Collection 3) | | :config:`ner_rus.json ` | 95.1 | ++---------------------------------------------------------+-------+---------------------------------------------------------------------------+-------------+ +| Ontonotes | Multi | :config:`ner_ontonotes_bert_mult.json ` | 88.8 | ++ +-------+---------------------------------------------------------------------------+-------------+ +| | En | :config:`ner_ontonotes_bert.json ` | 88.6 | ++ + +---------------------------------------------------------------------------+-------------+ +| | | :config:`ner_ontonotes.json ` | 87.1 | ++---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ +| ConLL-2003 | | :config:`ner_conll2003_bert.json ` | 91.7 | ++ + +---------------------------------------------------------------------------+-------------+ +| | | :config:`ner_conll2003.json ` | 89.9 | ++---------------------------------------------------------+ +---------------------------------------------------------------------------+-------------+ +| DSTC2 | | :config:`ner_dstc2.json ` | 97.1 | 
++---------------------------------------------------------+-------+---------------------------------------------------------------------------+-------------+ :doc:`Slot filling components ` ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 38366629e7269497073283301155729c04729843 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Fri, 31 May 2019 18:34:04 +0300 Subject: [PATCH 09/12] docs: add a detailed error message for when fastText is not installed (#855) * docs: add a detailed error message for when fastText is not installed * docs: parse required fasttext package string from fasttext.txt --- .../models/embedders/fasttext_embedder.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/deeppavlov/models/embedders/fasttext_embedder.py b/deeppavlov/models/embedders/fasttext_embedder.py index f421f37ebd..2962accd45 100644 --- a/deeppavlov/models/embedders/fasttext_embedder.py +++ b/deeppavlov/models/embedders/fasttext_embedder.py @@ -15,7 +15,22 @@ from logging import getLogger from typing import Iterator -import fastText +try: + import fastText +except ModuleNotFoundError as e: + import re + import sys + from pathlib import Path + + ft_req_path = Path(__file__).resolve().parents[2].joinpath('requirements', 'fasttext.txt') + packages = ft_req_path.read_text(encoding='utf8').strip() + packages = re.sub(r'\s+', ' ', packages) + + raise ModuleNotFoundError(f'{e}\n\nYou can install fastText by running\n' + f'{sys.executable} -m pip install {packages}\n' + 'or for your deeppavlov pipeline configuration\n' + f'{sys.executable} -m deeppavlov install ') + import numpy as np from overrides import overrides From 801f2cccd781232fcb21a6c08dc0ebe1f7dce79b Mon Sep 17 00:00:00 2001 From: Aleksey Lymar Date: Mon, 3 Jun 2019 12:26:42 +0300 Subject: [PATCH 10/12] chore: release 0.3.1 --- deeppavlov/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py index 3b6a40dcfd..770aa153ee 100644 --- a/deeppavlov/__init__.py +++ b/deeppavlov/__init__.py @@ -37,7 +37,7 @@ def evaluate_model(config: [str, Path, dict], download: bool = False, recursive: except ImportError: 'Assuming that requirements are not yet installed' -__version__ = '0.3.0' +__version__ = '0.3.1' __author__ = 'Neural Networks and Deep Learning lab, MIPT' __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.' __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot'] From df4604acdb821e7ccff464ee6aff448765688399 Mon Sep 17 00:00:00 2001 From: Aleksei Lymar Date: Mon, 3 Jun 2019 18:18:00 +0300 Subject: [PATCH 11/12] Docs/0.3.1 (#868) * docs: add breaking changes chapter for release 0.3.0 * docs: move demo link to the top of README.md * docs: set examples link absolute --- README.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c5d5e74b97..0545c7cc98 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,13 @@ DeepPavlov is an open-source conversational AI library built on [TensorFlow](htt * development of production ready chat-bots and complex conversational systems, * NLP and dialog systems research. +### Demo + +Demo of selected features is available at [demo.ipavlov.ai](https://demo.ipavlov.ai/) + + +### Breaking changes in version 0.3.0! +- component option `fit_on_batch` in configuration files was removed and replaced with adaptive usage of the `fit_on` parameter. ### Breaking changes in version 0.2.0! 
- `utils` module was moved from repository root in to `deeppavlov` module
@@ -115,10 +122,6 @@ print(HelloBot(['Hello!', 'Boo...', 'Bye.']))
 pip install deeppavlov
 ```
 
-# Demo
-
-Demo of selected features is available at [demo.ipavlov.ai](https://demo.ipavlov.ai/)
-
 # Quick start
 
 To use our pre-trained models, you should first install their requirements:
@@ -177,7 +180,7 @@ Here is our [DockerHub repository](https://hub.docker.com/u/deeppavlov/) with im
 
 # Tutorials
 
-Jupyter notebooks and videos explaining how to use DeepPalov for different tasks can be found in [/examples/](examples)
+Jupyter notebooks explaining how to use DeepPavlov for different tasks can be found in [/examples/](https://github.com/deepmipt/DeepPavlov/tree/master/examples)
 
 # License

From 0214bd4ca2a676057d287a5011a20fb7ee2ddaeb Mon Sep 17 00:00:00 2001
From: Mary Trofimova
Date: Mon, 3 Jun 2019 18:31:47 +0300
Subject: [PATCH 12/12] docs: fix bert module documentation (#869)

---
 docs/components/bert.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/components/bert.rst b/docs/components/bert.rst
index 6a5ac4a0ba..6080a8395b 100644
--- a/docs/components/bert.rst
+++ b/docs/components/bert.rst
@@ -21,10 +21,10 @@ We have trained BERT-base model for other languages:
 
 - SlavicBERT, Slavic (bg, cs, pl, ru), cased, 12-layer, 768-hidden, 12-heads, 180M parameters: `[deeppavlov] `__
 
 RuBERT was trained on the Russian part of Wikipedia and news data. We used this training data to build vocabulary of Russian subtokens and took
-multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT training was done the same manner as RuBERT.
+multilingual version of BERT-base as initialization for RuBERT [1]_. SlavicBERT training was done in the same manner as RuBERT.
 
 Here, in DeepPavlov, we made it easy to use pre-trained BERT for downstream tasks like classification, tagging, question answering and
-ranking. We provide pre-trained models and examples on how to use BERT with DeepPavlov.
+ranking. We also provide pre-trained models and examples on how to use BERT with DeepPavlov.
 
 BERT for Classification
 -----------------------
@@ -38,7 +38,7 @@ Two main components of BERT classifier pipeline in DeepPavlov are
 :class:`~deeppavlov.models.bert.bert_classifier.BertClassifierModel`.
 Non-processed texts should be given to ``bert_preprocessor`` for tokenization on subtokens, encoding subtokens with their indices and creating tokens and segment masks.
-If one processed classes to one-hot labels in pipeline, ``one_hot_labels`` should be set to ``true``.
+In case of using one-hot encoded classes in the pipeline, set ``one_hot_labels`` to ``true``.
 ``bert_classifier`` has a dense layer of number of classes size upon pooled outputs of Transformer encoder, it is followed by ``softmax`` activation (``sigmoid`` if ``multilabel`` parameter is set to ``true`` in config).
@@ -83,4 +83,4 @@ the trained model.
 Working examples with the trained models are given :doc:`here Statistics are available :doc:`here `.
 
 .. [1] Kuratov, Y., Arkhipov, M. (2019). Adaptation of Deep Bidirectional Multilingual Transformers for Russian Language. arXiv preprint arXiv:1905.07213.
-.. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions. arXiv preprint arXiv:1809.01682.
\ No newline at end of file
+.. [2] McDonald, R., Brokos, G. I., & Androutsopoulos, I. (2018). Deep relevance ranking using enhanced document-query interactions.
arXiv preprint arXiv:1809.01682.