Showing 24 changed files with 52,787 additions and 3 deletions.
@@ -1,10 +1,39 @@
# gpt2-ml

# GPT2 for Multiple Languages

[Open in Colab](https://colab.research.google.com/github/imcaspar/gpt2-ml/blob/master/pretrained_model_demo.ipynb)
[Issues](https://github.com/imcaspar/gpt2-ml/issues)

[**中文说明**](./README.md) | [**English**](./README_EN.md)
- [x] Simplified and cleaned-up GPT-2 training code (based on grover)
- [x] Ported the BERT tokenizer with multi-language support (see the tokenizer sketch below)
- [x] 1.5-billion-parameter GPT-2 Chinese pretrained model

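The tokenizer item above refers to the ported BERT `FullTokenizer`. A minimal sketch of tokenizing Chinese text with it, assuming the repo's `tokenization` module and the `bert-base-chinese-vocab.txt` vocab file that `prepare_data.py` (later in this commit) also uses:

```python
# Minimal sketch; assumes the repo's ported tokenization module and the
# bert-base-chinese-vocab.txt file also used by prepare_data.py.
from tokenization import tokenization

tokenizer = tokenization.FullTokenizer(
    vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)

tokens = tokenizer.tokenize("15亿参数 GPT2 中文预训练模型")
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(input_ids)
```
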
## Pretrained model
1.5-billion-parameter Chinese pretrained model: [download from Google Drive](https://drive.google.com/open?id=1_6Py_UEGSAMt2RCq_dxsGNfpF4jMhm-5)

The training corpus comes from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus); after cleaning, the total text is roughly 15 GB.

Trained for 100k steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types).
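After downloading and extracting the checkpoint, TF 1.x's checkpoint utilities give a quick look at the variable shapes. A minimal sketch only; `models/mega` is a hypothetical extraction directory, not the archive's documented layout:

```python
# Minimal sketch; "models/mega" is a hypothetical extraction directory.
import tensorflow as tf

ckpt = tf.train.latest_checkpoint("models/mega")
for name, shape in tf.train.list_variables(ckpt)[:10]:
    print(name, shape)  # embedding and transformer-layer weight shapes
```
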
## Google Colab

## Training
## Reference
https://github.com/google-research/bert

https://github.com/rowanz/grover

Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC)
@@ -0,0 +1,12 @@
{
  "vocab_size": 50270,
  "hidden_size": 768,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 1024,
  "num_attention_heads": 12,
  "num_hidden_layers": 12
}
@@ -0,0 +1,13 @@
{
  "vocab_size": 50270,
  "hidden_size": 1024,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 1024,
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "max_batch_size_per_core": 3
}
@@ -0,0 +1,12 @@
{
  "vocab_size": 21130,
  "hidden_size": 1536,
  "attention_probs_dropout_prob": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "initializer_range": 0.014142135623731,
  "intermediate_size": 6144,
  "max_position_embeddings": 1024,
  "num_attention_heads": 24,
  "num_hidden_layers": 48
}
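The three JSON files above are grover-style GPT-2 configs of increasing size; the last one appears to be the 1.5B-parameter Chinese model from the README. As a back-of-envelope sanity check on that figure, a parameter estimate from the config fields, assuming standard GPT-2 weight shapes (tied input/output embeddings, 4*h^2 attention weights and 2*h*intermediate MLP weights per layer, with biases and LayerNorm ignored):

```python
# Rough GPT-2 parameter estimate from a grover-style config.
# Assumption: tied embeddings, 4*h*h attention weights and
# 2*h*intermediate MLP weights per layer; biases/LayerNorm ignored.
def estimate_params(cfg):
    h = cfg["hidden_size"]
    embeddings = (cfg["vocab_size"] + cfg["max_position_embeddings"]) * h
    per_layer = 4 * h * h + 2 * h * cfg["intermediate_size"]
    return embeddings + cfg["num_hidden_layers"] * per_layer

mega = {  # fields copied from the last config above
    "vocab_size": 21130, "hidden_size": 1536, "intermediate_size": 6144,
    "max_position_embeddings": 1024, "num_hidden_layers": 48,
}
print("~{:.2f}B parameters".format(estimate_params(mega) / 1e9))  # ~1.39B, i.e. the "1.5B" model
```
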
@@ -0,0 +1,210 @@
""" | ||
Turn a merged corpus into tfrecord files. | ||
NOTE: You will want to do this using several processes. I did this on an AWS machine with 72 CPUs using GNU parallel | ||
as that's where I had the deduplicated RealNews dataset. | ||
""" | ||
import argparse | ||
import ujson as json | ||
# from sample.encoder import get_encoder, tokenize_for_grover_training, detokenize, sliding_window, create_int_feature | ||
import random | ||
import tensorflow as tf | ||
import collections | ||
import os | ||
from tempfile import TemporaryDirectory | ||
|
||
from tokenization import tokenization | ||
|
||
parser = argparse.ArgumentParser(description='SCRAPE!')
parser.add_argument(
    '-fold',
    dest='fold',
    default=0,
    type=int,
    help='which fold we are on'
)
parser.add_argument(
    '-num_folds',
    dest='num_folds',
    default=1,
    type=int,
    help='Number of folds (corresponding to both the number of training files and the number of testing files)',
)
parser.add_argument(
    '-seed',
    dest='seed',
    default=1337,
    type=int,
    help='which seed to use'
)
parser.add_argument(
    '-base_fn',
    dest='base_fn',
    default='news2016zh_',
    type=str,
    help='We will output files that are like {base_fn}_{n}.tfrecord for n in 0, ..., 1023'
)

parser.add_argument(
    '-input_fn',
    dest='input_fn',
    default='realnews.jsonl',
    type=str,
    help='Base filename to use. THIS MUST BE A LOCAL FILE.'
)
parser.add_argument(
    '-max_seq_length',
    dest='max_seq_length',
    default=1025,
    type=int,
    help='Max sequence length',
)


args = parser.parse_args()
random.seed(args.seed + args.fold)

# encoder = get_encoder()
tokenizer = tokenization.FullTokenizer(
    vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)


class S3TFRecordWriter(object):
    """TFRecordWriter that targets a local path, an s3:// URI, or a gs:// URI.

    For cloud destinations, records are written to a temporary local file and
    uploaded to the bucket when the writer is closed.
    """

    def __init__(self, fn):
        self.fn = fn
        if fn.startswith('s3://'):
            # Write locally first, then upload to S3 on close().
            from boto3.s3.transfer import TransferConfig
            import boto3
            self.gclient = None
            self.s3client = boto3.client('s3')
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(
                os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split(
                's3://', 1)[1].split('/', 1)
        elif fn.startswith('gs://'):
            # Write locally first, then upload to Google Cloud Storage on close().
            from google.cloud import storage
            self.s3client = None
            self.gclient = storage.Client()
            self.storage_dir = TemporaryDirectory()
            self.writer = tf.python_io.TFRecordWriter(
                os.path.join(self.storage_dir.name, 'temp.tfrecord'))
            self.bucket_name, self.file_name = self.fn.split(
                'gs://', 1)[1].split('/', 1)

        else:
            # Plain local file.
            self.s3client = None
            self.gclient = None
            self.bucket_name = None
            self.file_name = None
            self.storage_dir = None
            self.writer = tf.python_io.TFRecordWriter(fn)

    def write(self, x):
        self.writer.write(x)

    def close(self):
        self.writer.close()

        if self.s3client is not None:
            from boto3.s3.transfer import TransferConfig
            config = TransferConfig(multipart_threshold=1024 * 25, max_concurrency=10,
                                    multipart_chunksize=1024 * 25, use_threads=True)
            self.s3client.upload_file(
                os.path.join(self.storage_dir.name, 'temp.tfrecord'),
                self.bucket_name,
                self.file_name,
                ExtraArgs={'ACL': 'public-read'}, Config=config,
            )
            self.storage_dir.cleanup()
        if self.gclient is not None:
            bucket = self.gclient.get_bucket(self.bucket_name)
            blob = bucket.blob(self.file_name)
            blob.upload_from_filename(os.path.join(
                self.storage_dir.name, 'temp.tfrecord'))
            self.storage_dir.cleanup()

    def __enter__(self):
        # Called when entering "with" context.
        return self

    def __exit__(self, *_):
        # Called when exiting "with" context: close the writer and upload if needed.
        print("CALLING CLOSE")
        self.close()

||
def article_iterator(tokenizer): | ||
""" Iterate through the provided filename + tokenize""" | ||
assert os.path.exists(args.input_fn) | ||
for (dirpath, dirnames, filenames) in os.walk(args.input_fn): | ||
for filename in filenames: | ||
with open(os.path.join(dirpath, filename), 'r') as f: | ||
for l_no, l in enumerate(f): | ||
if l_no % args.num_folds == args.fold: | ||
article = json.loads(l) | ||
|
||
line = tokenization.convert_to_unicode( | ||
article['text']) # for news2016zh text body | ||
tokens = tokenizer.tokenize(line) | ||
input_ids = tokenizer.convert_tokens_to_ids(tokens) | ||
|
||
article['input_ids'] = input_ids | ||
|
||
article['inst_index'] = (l_no // args.num_folds) | ||
if article['inst_index'] < 100: | ||
print('---\nINPUT{}. {}\n---\nTokens: {}\n'.format(article['inst_index'], | ||
tokens, | ||
input_ids | ||
), flush=True) | ||
if len(article['input_ids']) <= 64: # min size of article | ||
continue | ||
yield article | ||
|
||
|
||
def create_int_feature(values): | ||
feature = tf.train.Feature( | ||
int64_list=tf.train.Int64List(value=list(values))) | ||
return feature | ||
|
||
|
||
def buffered_and_sliding_window_article_iterator(tokenizer, final_desired_size=1025): | ||
""" We apply a sliding window to fix long sequences, and use a buffer that combines short sequences.""" | ||
for article in article_iterator(tokenizer): | ||
if len(article['input_ids']) >= final_desired_size: | ||
article['input_ids'] = article['input_ids'][0:final_desired_size-1] | ||
while len(article['input_ids']) < final_desired_size: | ||
article['input_ids'].append(0) | ||
yield article | ||
|
||
|
||
# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with S3TFRecordWriter(train_file) as train_writer:
    for article in buffered_and_sliding_window_article_iterator(tokenizer,
                                                                final_desired_size=max(args.max_seq_length + 1, 1025)):
        writer2use = train_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(
            features=tf.train.Features(feature=features))

        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nIndex {}.\nTokens: {}\n---\nInput ids: {}\n\n".format(article['inst_index'],
                                                                              tokenizer.convert_ids_to_tokens(
                                                                                  article['input_ids']),
                                                                              article['input_ids']
                                                                              ), flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(
                article['inst_index'], total_written), flush=True)
print("DONE UPLOADING", flush=True)
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

NUM_FOLDS=1024
MAX_SEQ_LENGTH=1024
FN=${1}
OUT_BUCKET=${2}

rm -rf logs_${MAX_SEQ_LENGTH}
mkdir logs_${MAX_SEQ_LENGTH}
parallel -j $(nproc --all) --will-cite "python prepare_data.py -fold {1} -num_folds ${NUM_FOLDS} -base_fn gs://${OUT_BUCKET}/data_${MAX_SEQ_LENGTH}/ -input_fn ${FN} -max_seq_length ${MAX_SEQ_LENGTH} > logs_${MAX_SEQ_LENGTH}/log{1}.txt" ::: $(seq 0 $((${NUM_FOLDS}-1)))
@@ -0,0 +1,8 @@
pandas==0.24.2
regex==2019.4.14
h5py==2.9.0
numpy==1.16.2
tensorboard==1.13.1
tensorflow-gpu==1.13.1
tqdm==4.31.1
requests==2.22.0
@@ -0,0 +1,9 @@
pandas==0.24.2
regex==2019.4.14
h5py==2.9.0
numpy==1.16.2
tensorboard==1.13.1
tensorflow==1.13.1
tensorflow-estimator==1.13.0
tqdm==4.31.1
requests==2.22.0