From 9f1dd1171494cf8582837230a04be020e4e895ab Mon Sep 17 00:00:00 2001
From: yurakuratov
Date: Fri, 29 Nov 2019 16:44:50 +0300
Subject: [PATCH] feat: use BERT as an extractive summarizer (#1076)

* fix: prepare registry import error
* feat: add bert as extractive summarizer
* fix: some codestyle fixes
* fix: DP registy
* fix: bert summarizer docstring
* fix: codestyle changes
* fix: docstrings
* docs: add BertAsSummarizer to apiref
* docs: add Summarization section to BERT doc page
* fix: docstrings and inherit from TFModel
* chore: update package version to 0.7.1
---
 deeppavlov/__init__.py                        |   2 +-
 .../summarization/bert_as_summarizer.json     |  41 ++++
 .../bert_as_summarizer_with_init.json         |  41 ++++
 deeppavlov/core/common/registry.json          |   1 +
 deeppavlov/models/bert/bert_as_summarizer.py  | 216 ++++++++++++++++++
 docs/apiref/models/bert.rst                   |   5 +
 docs/features/models/bert.rst                 |  12 +
 7 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 deeppavlov/configs/summarization/bert_as_summarizer.json
 create mode 100644 deeppavlov/configs/summarization/bert_as_summarizer_with_init.json
 create mode 100644 deeppavlov/models/bert/bert_as_summarizer.py

diff --git a/deeppavlov/__init__.py b/deeppavlov/__init__.py
index 277241fef9..02ab1d7808 100644
--- a/deeppavlov/__init__.py
+++ b/deeppavlov/__init__.py
@@ -39,7 +39,7 @@ def evaluate_model(config: [str, Path, dict], download: bool = False, recursive:
 except ImportError:
     'Assuming that requirements are not yet installed'
 
-__version__ = '0.7.0'
+__version__ = '0.7.1'
 __author__ = 'Neural Networks and Deep Learning lab, MIPT'
 __description__ = 'An open source library for building end-to-end dialog systems and training chatbots.'
 __keywords__ = ['NLP', 'NER', 'SQUAD', 'Intents', 'Chatbot']
diff --git a/deeppavlov/configs/summarization/bert_as_summarizer.json b/deeppavlov/configs/summarization/bert_as_summarizer.json
new file mode 100644
index 0000000000..1cad9c12ee
--- /dev/null
+++ b/deeppavlov/configs/summarization/bert_as_summarizer.json
@@ -0,0 +1,41 @@
+{
+  "chainer": {
+    "in": ["texts"],
+    "pipe": [
+      {
+        "class_name": "bert_as_summarizer",
+        "bert_config_file": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/bert_config.json",
+        "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/bert_model.ckpt",
+        "vocab_file": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/vocab.txt",
+        "max_summary_length": 100,
+        "max_summary_length_in_tokens": true,
+        "lang": "ru",
+        "do_lower_case": false,
+        "max_seq_length": 512,
+        "in": ["texts"],
+        "out": ["summarized_text"]
+      }
+    ],
+    "out": ["summarized_text"]
+  },
+  "metadata": {
+    "variables": {
+      "ROOT_PATH": "~/.deeppavlov",
+      "DOWNLOADS_PATH": "{ROOT_PATH}/downloads",
+      "MODELS_PATH": "{ROOT_PATH}/models",
+      "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs"
+    },
+    "requirements": [
+      "{DEEPPAVLOV_PATH}/requirements/tf.txt",
+      "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt"
+    ],
+    "download": [
+      {
+        "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz",
+        "subdir": "{DOWNLOADS_PATH}/bert_models"
+      }
+    ]
+  }
+}
+
+
\ No newline at end of file
diff --git a/deeppavlov/configs/summarization/bert_as_summarizer_with_init.json b/deeppavlov/configs/summarization/bert_as_summarizer_with_init.json
new file mode 100644
index 0000000000..7f85029c64
--- /dev/null
+++ b/deeppavlov/configs/summarization/bert_as_summarizer_with_init.json
@@ -0,0 +1,41 @@
+{
+  "chainer": {
+    "in": ["texts", "init_sentences"],
+    "pipe": [
+      {
"class_name": "bert_as_summarizer", + "bert_config_file": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/bert_config.json", + "pretrained_bert": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/bert_model.ckpt", + "vocab_file": "{DOWNLOADS_PATH}/bert_models/rubert_cased_L-12_H-768_A-12_v2/vocab.txt", + "max_summary_length": 100, + "max_summary_length_in_tokens": true, + "lang": "ru", + "do_lower_case": false, + "max_seq_length": 512, + "in": ["texts", "init_sentences"], + "out": ["summarized_text"] + } + ], + "out": ["summarized_text"] + }, + "metadata": { + "variables": { + "ROOT_PATH": "~/.deeppavlov", + "DOWNLOADS_PATH": "{ROOT_PATH}/downloads", + "MODELS_PATH": "{ROOT_PATH}/models", + "CONFIGS_PATH": "{DEEPPAVLOV_PATH}/configs" + }, + "requirements": [ + "{DEEPPAVLOV_PATH}/requirements/tf.txt", + "{DEEPPAVLOV_PATH}/requirements/bert_dp.txt" + ], + "download": [ + { + "url": "http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v2.tar.gz", + "subdir": "{DOWNLOADS_PATH}/bert_models" + } + ] + } + } + + \ No newline at end of file diff --git a/deeppavlov/core/common/registry.json b/deeppavlov/core/common/registry.json index 9f995f778a..92e97c955f 100644 --- a/deeppavlov/core/common/registry.json +++ b/deeppavlov/core/common/registry.json @@ -6,6 +6,7 @@ "api_router": "deeppavlov.models.api_requester.api_router:ApiRouter", "basic_classification_iterator": "deeppavlov.dataset_iterators.basic_classification_iterator:BasicClassificationDatasetIterator", "basic_classification_reader": "deeppavlov.dataset_readers.basic_classification_reader:BasicClassificationDatasetReader", + "bert_as_summarizer": "deeppavlov.models.bert.bert_as_summarizer:BertAsSummarizer", "bert_classifier": "deeppavlov.models.bert.bert_classifier:BertClassifierModel", "bert_sequence_tagger": "deeppavlov.models.bert.bert_sequence_tagger:BertSequenceTagger", "bert_syntax_parser": "deeppavlov.models.syntax_parser.network:BertSyntaxParser", diff --git a/deeppavlov/models/bert/bert_as_summarizer.py b/deeppavlov/models/bert/bert_as_summarizer.py new file mode 100644 index 0000000000..66efdad7a4 --- /dev/null +++ b/deeppavlov/models/bert/bert_as_summarizer.py @@ -0,0 +1,216 @@ +# Copyright 2017 Neural Networks and Deep Learning lab, MIPT +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from logging import getLogger +from typing import List, Optional + +import numpy as np +import tensorflow.compat.v1 as tf + +from bert_dp.modeling import BertConfig, BertModel, create_initializer, get_assignment_map_from_checkpoint +from deeppavlov.core.commands.utils import expand_path +from deeppavlov.core.common.registry import register +from deeppavlov.core.models.tf_model import TFModel +from deeppavlov.models.preprocessors.bert_preprocessor import BertPreprocessor + +logger = getLogger(__name__) + + +@register('bert_as_summarizer') +class BertAsSummarizer(TFModel): + """Naive Extractive Summarization model based on BERT. 
+
+    The BERT model was trained on the Masked Language Modeling (MLM) and Next Sentence Prediction (NSP) tasks.
+    The NSP head was trained to detect, given ``[CLS] text_a [SEP] text_b [SEP]``, whether text_b follows text_a
+    in the original document.
+
+    This NSP head can be used to stack sentences from a long document, based on an initial sentence:
+
+    summary_0 = init_sentence
+
+    summary_1 = summary_0 + argmax(nsp_score(candidates))
+
+    summary_2 = summary_1 + argmax(nsp_score(candidates))
+
+    ...
+
+    where ``candidates`` are all sentences from the document.
+
+    Args:
+        bert_config_file: path to the BERT configuration file
+        pretrained_bert: path to a pretrained BERT checkpoint
+        vocab_file: path to the BERT vocabulary
+        max_summary_length: limit on the summary length; measured in sentences if
+            ``max_summary_length_in_tokens`` is set to ``False``, else in tokens.
+        max_summary_length_in_tokens: whether to measure summary length in tokens instead of sentences.
+            Defaults to ``False``.
+        max_seq_length: max sequence length in subtokens, including ``[SEP]`` and ``[CLS]`` tokens.
+            ``max_seq_length`` is used by BERT when computing NSP scores. Defaults to ``128``.
+        do_lower_case: set ``True`` if lowercasing is needed. Defaults to ``False``.
+        lang: selects the sentence tokenizer: ru_sent_tokenize for ``'ru'`` and nltk.sent_tokenize
+            for other languages. Defaults to ``'ru'``.
+    """
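+
+    # A toy walk-through of the greedy loop above (the picked sentences are
+    # hypothetical, for illustration only): for a document with sentences
+    # [s0, s1, s2, s3] and init_sentence = s0,
+    #   step 1: score nsp(s0, s1), nsp(s0, s2), nsp(s0, s3) and append the argmax, e.g. s2;
+    #   step 2: score nsp('s0 s2', s1), nsp('s0 s2', s3) and append the argmax,
+    # and so on, until appending the best candidate would exceed max_summary_length.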
+
+    def __init__(self, bert_config_file: str,
+                 pretrained_bert: str,
+                 vocab_file: str,
+                 max_summary_length: int,
+                 max_summary_length_in_tokens: Optional[bool] = False,
+                 max_seq_length: Optional[int] = 128,
+                 do_lower_case: Optional[bool] = False,
+                 lang: Optional[str] = 'ru',
+                 **kwargs) -> None:
+
+        self.max_summary_length = max_summary_length
+        self.max_summary_length_in_tokens = max_summary_length_in_tokens
+        self.bert_config = BertConfig.from_json_file(str(expand_path(bert_config_file)))
+
+        self.bert_preprocessor = BertPreprocessor(vocab_file=vocab_file, do_lower_case=do_lower_case,
+                                                  max_seq_length=max_seq_length)
+
+        self.tokenize_reg = re.compile(r"[\w']+|[^\w ]")
+
+        if lang == 'ru':
+            from ru_sent_tokenize import ru_sent_tokenize
+            self.sent_tokenizer = ru_sent_tokenize
+        else:
+            from nltk import sent_tokenize
+            self.sent_tokenizer = sent_tokenize
+
+        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
+        self.sess_config.gpu_options.allow_growth = True
+        self.sess = tf.Session(config=self.sess_config)
+
+        self._init_graph()
+
+        self.sess.run(tf.global_variables_initializer())
+
+        if pretrained_bert is not None:
+            pretrained_bert = str(expand_path(pretrained_bert))
+
+            if tf.train.checkpoint_exists(pretrained_bert):
+                logger.info('[initializing model with Bert from {}]'.format(pretrained_bert))
+                tvars = tf.trainable_variables()
+                assignment_map, _ = get_assignment_map_from_checkpoint(tvars, pretrained_bert)
+                tf.train.init_from_checkpoint(pretrained_bert, assignment_map)
+
+    def _init_graph(self):
+        self._init_placeholders()
+
+        self.bert = BertModel(config=self.bert_config,
+                              is_training=self.is_train_ph,
+                              input_ids=self.input_ids_ph,
+                              input_mask=self.input_masks_ph,
+                              token_type_ids=self.token_types_ph,
+                              use_one_hot_embeddings=False,
+                              )
+
+        # next sentence prediction head
+        with tf.variable_scope("cls/seq_relationship"):
+            output_weights = tf.get_variable(
+                "output_weights",
+                shape=[2, self.bert_config.hidden_size],
+                initializer=create_initializer(self.bert_config.initializer_range))
+            output_bias = tf.get_variable(
+                "output_bias", shape=[2], initializer=tf.zeros_initializer())
+
+            nsp_logits = tf.matmul(self.bert.get_pooled_output(), output_weights, transpose_b=True)
+            nsp_logits = tf.nn.bias_add(nsp_logits, output_bias)
+            self.nsp_probs = tf.nn.softmax(nsp_logits, axis=-1)
+
+    def _init_placeholders(self):
+        self.input_ids_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='ids_ph')
+        self.input_masks_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='masks_ph')
+        self.token_types_ph = tf.placeholder(shape=(None, None), dtype=tf.int32, name='token_types_ph')
+
+        self.is_train_ph = tf.placeholder_with_default(False, shape=[], name='is_train_ph')
+
+    def _build_feed_dict(self, input_ids, input_masks, token_types):
+        feed_dict = {
+            self.input_ids_ph: input_ids,
+            self.input_masks_ph: input_masks,
+            self.token_types_ph: token_types,
+        }
+        return feed_dict
+
+    def _get_nsp_predictions(self, sentences: List[str], candidates: List[str]):
+        """Compute the NextSentence probability for every ``(sentence_i, candidate_i)`` pair.
+
+        [CLS] sentence_i [SEP] candidate_i [SEP]
+
+        Args:
+            sentences: list of sentences
+            candidates: list of candidates to be the next sentence
+
+        Returns:
+            probability for each pair that the candidate is the next sentence
+        """
+        features = self.bert_preprocessor(texts_a=sentences, texts_b=candidates)
+
+        input_ids = [f.input_ids for f in features]
+        input_masks = [f.input_mask for f in features]
+        input_type_ids = [f.input_type_ids for f in features]
+
+        feed_dict = self._build_feed_dict(input_ids, input_masks, input_type_ids)
+        nsp_probs = self.sess.run(self.nsp_probs, feed_dict=feed_dict)
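+        # The seq_relationship head follows the original BERT pre-training convention
+        # (assumed for the loaded checkpoint): class 0 means "text_b is the actual next
+        # sentence", class 1 means "random sentence", so column 0 holds the probability
+        # that the candidate follows the given text.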
+        return nsp_probs[:, 0]
+
+    def __call__(self, texts: List[str], init_sentences: Optional[List[str]] = None) -> List[List[str]]:
+        """Builds a summary for each text in ``texts``.
+
+        Args:
+            texts: texts to build summaries for
+            init_sentences: ``init_sentences[i]`` is used as the first sentence of the summary for ``texts[i]``.
+                Defaults to ``None``.
+
+        Returns:
+            List[List[str]]: summaries, each given as a list of sentences
+        """
+        summaries = []
+        # build a summary for each (text, init_sentence) pair
+        if init_sentences is None:
+            init_sentences = [None] * len(texts)
+
+        for text, init_sentence in zip(texts, init_sentences):
+            text_sentences = self.sent_tokenizer(text)
+
+            if init_sentence is None:
+                init_sentence = text_sentences[0]
+                text_sentences = text_sentences[1:]
+
+            # remove duplicates
+            text_sentences = list(set(text_sentences))
+            # remove init_sentence from text sentences
+            text_sentences = [sent for sent in text_sentences if sent != init_sentence]
+
+            summary = [init_sentence]
+            if self.max_summary_length_in_tokens:
+                # get length in tokens
+                def get_length(x):
+                    return len(self.tokenize_reg.findall(' '.join(x)))
+            else:
+                # get length as number of sentences
+                get_length = len
+
+            candidates = text_sentences[:]
+            while len(candidates) > 0:
+                # todo: use batches
+                candidates_scores = [self._get_nsp_predictions([' '.join(summary)], [cand]) for cand in candidates]
+                best_candidate_idx = np.argmax(candidates_scores)
+                best_candidate = candidates[best_candidate_idx]
+                del candidates[best_candidate_idx]
+                if get_length(summary + [best_candidate]) > self.max_summary_length:
+                    break
+                summary = summary + [best_candidate]
+            summaries += [summary]
+        return summaries
+
+    def train_on_batch(self, **kwargs):
+        # BertAsSummarizer is inference-only: the NSP head weights come from the pretrained checkpoint
+        raise NotImplementedError
diff --git a/docs/apiref/models/bert.rst b/docs/apiref/models/bert.rst
index b9510ea17f..48dbdbe329 100644
--- a/docs/apiref/models/bert.rst
+++ b/docs/apiref/models/bert.rst
@@ -61,3 +61,8 @@ deeppavlov.models.bert
 .. autoclass:: deeppavlov.models.bert.bert_ranker.BertSepRankerPredictor
 
     .. automethod:: __call__
+
+.. autoclass:: deeppavlov.models.bert.bert_as_summarizer.BertAsSummarizer
+
+    .. automethod:: __call__
+    .. automethod:: _get_nsp_predictions
\ No newline at end of file
diff --git a/docs/features/models/bert.rst b/docs/features/models/bert.rst
index 2f5a6a3feb..e3e8c7b57f 100644
--- a/docs/features/models/bert.rst
+++ b/docs/features/models/bert.rst
@@ -110,6 +110,18 @@
 where the task for ranking is to retrieve the best possible response from some pool of responses using
 the trained model. Working examples with the trained models are given :doc:`here `.
 Statistics are available :doc:`here `.
 
+BERT for Extractive Summarization
+---------------------------------
+The BERT model was trained on the Masked Language Modeling (MLM) and Next Sentence Prediction (NSP) tasks.
+The NSP head was trained to detect, given ``[CLS] text_a [SEP] text_b [SEP]``, whether text_b follows text_a in the original document.
+This NSP head can be used to stack sentences from a long document, based on an initial sentence. The first sentence
+of a document can be used as the initial one. :class:`~deeppavlov.models.bert.bert_as_summarizer.BertAsSummarizer` relies on
+pretrained BERT models and does not require training on a summarization dataset.
+We provide two configuration files:
+
+- :config:`BertAsSummarizer <summarization/bert_as_summarizer.json>` takes the first sentence of a document as the initial one.
+- :config:`BertAsSummarizer with init <summarization/bert_as_summarizer_with_init.json>` uses a provided initial sentence.
+
 Using custom BERT in DeepPavlov
 -------------------------------
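
A minimal usage sketch for the configs added in this patch, assuming the patched package (0.7.1) is installed. ``build_model`` and the ``configs`` path registry are existing DeepPavlov API; the ``configs.summarization.*`` attributes below simply mirror the JSON files added above:

    from deeppavlov import build_model, configs

    # Build the pipeline from bert_as_summarizer.json; download=True fetches the
    # RuBERT checkpoint listed in the config's "download" section.
    summarizer = build_model(configs.summarization.bert_as_summarizer, download=True)

    # The chainer takes a batch of texts and returns, for each text, the summary
    # as a list of selected sentences (the first sentence seeds the summary).
    summaries = summarizer(['Первое предложение текста. Второе предложение. Третье предложение.'])

    # bert_as_summarizer_with_init.json additionally takes an initial sentence per text:
    # summarizer = build_model(configs.summarization.bert_as_summarizer_with_init, download=True)
    # summaries = summarizer([text], [init_sentence])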