Commit

init commit
imcaspar committed Nov 6, 2019
1 parent 51b6a32 commit 6d3a4a0
Showing 24 changed files with 52,787 additions and 3 deletions.
Binary file added .github/logo.png
Binary file added .github/loss.png
35 changes: 32 additions & 3 deletions README.md
@@ -1,10 +1,39 @@
# gpt2-ml
![gpt2-ml](./.github/logo.png)
# GPT2 for Multiple Languages

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/imcaspar/gpt2-ml/blob/master/pretrained_model_demo.ipynb)
![GitHub](https://img.shields.io/github/license/imcaspar/gpt2-ml)
![GitHub All Releases](https://img.shields.io/github/downloads/imcaspar/gpt2-ml/total)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/imcaspar/gpt2-ml/issues)
![GitHub stars](https://img.shields.io/github/stars/imcaspar/gpt2-ml?style=social)
![Twitter Follow](https://img.shields.io/twitter/follow/imcaspar?style=social)

GPT2 for Multiple Languages
[**中文说明**](./README.md) | [**English**](./README_EN.md)

- [x] Simplified and reorganized the GPT2 training code (based on grover)
- [x] Ported the BERT tokenizer and added multilingual support (see the sketch below)
- [x] 1.5B-parameter GPT2 Chinese pretrained model
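A minimal sketch of how the ported tokenizer is used, mirroring the calls in `dataset/prepare_data.py` from this commit; the example sentence is arbitrary, and the snippet assumes it is run from the repository root with the BERT Chinese vocab file present:

```python
from tokenization import tokenization

# Same construction as in dataset/prepare_data.py
tokenizer = tokenization.FullTokenizer(
    vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)

tokens = tokenizer.tokenize("这是一个多语言分词的例子。")   # arbitrary example sentence
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(input_ids)
```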


## Pretrained Model
1.5B-parameter Chinese pretrained model: [download from Google Drive](https://drive.google.com/open?id=1_6Py_UEGSAMt2RCq_dxsGNfpF4jMhm-5)

The training corpus comes from [THUCNews](http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews) and [nlp_chinese_corpus](https://github.com/brightmart/nlp_chinese_corpus); after cleaning, the total text volume is roughly 15 GB.

Trained for 100k steps on a [Cloud TPU Pod v3-256](https://cloud.google.com/tpu/docs/types-zones#types).

![loss](./.github/loss.png)


## Google Colab


## Training


## Reference
https://github.com/google-research/bert

https://github.com/rowanz/grover

Research supported with Cloud TPUs from Google's TensorFlow Research Cloud (TFRC)
Empty file added README_EN.md
Empty file.
12 changes: 12 additions & 0 deletions configs/base.json
@@ -0,0 +1,12 @@
{
"vocab_size": 50270,
"hidden_size": 768,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.02,
"intermediate_size": 3072,
"max_position_embeddings": 1024,
"num_attention_heads": 12,
"num_hidden_layers": 12
}
13 changes: 13 additions & 0 deletions configs/large.json
@@ -0,0 +1,13 @@
{
"vocab_size": 50270,
"hidden_size": 1024,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.02,
"intermediate_size": 4096,
"max_position_embeddings": 1024,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"max_batch_size_per_core": 3
}
12 changes: 12 additions & 0 deletions configs/mega.json
@@ -0,0 +1,12 @@
{
"vocab_size": 21130,
"hidden_size": 1536,
"attention_probs_dropout_prob": 0.1,
"hidden_dropout_prob": 0.1,
"hidden_act": "gelu",
"initializer_range": 0.014142135623731,
"intermediate_size": 6144,
"max_position_embeddings": 1024,
"num_attention_heads": 24,
"num_hidden_layers": 48
}
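For orientation, a back-of-the-envelope estimate of the parameter count implied by configs/mega.json, assuming a standard GPT-2 layout with tied input/output embeddings (a rough sketch, not an exact accounting of the released checkpoint):

```python
# Rough GPT-2 parameter count from configs/mega.json (biases and LayerNorm weights omitted).
V, H, L, I, P = 21130, 1536, 48, 6144, 1024   # vocab, hidden, layers, intermediate, positions

embeddings = V * H + P * H            # token + position embeddings (output head tied to token embeddings)
per_layer = 4 * H * H + 2 * H * I     # QKV + attention output projection, plus the two MLP matrices
total = embeddings + L * per_layer
print("~{:.2f}B parameters".format(total / 1e9))   # ≈ 1.39B, i.e. the advertised ~1.5B scale
```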
Empty file added dataset/README.md
Empty file.
210 changes: 210 additions & 0 deletions dataset/prepare_data.py
@@ -0,0 +1,210 @@
"""
Turn a merged corpus into tfrecord files.
NOTE: You will want to do this using several processes. I did this on an AWS machine with 72 CPUs using GNU parallel
as that's where I had the deduplicated RealNews dataset.
"""
import argparse
import ujson as json
# from sample.encoder import get_encoder, tokenize_for_grover_training, detokenize, sliding_window, create_int_feature
import random
import tensorflow as tf
import collections
import os
from tempfile import TemporaryDirectory

from tokenization import tokenization

parser = argparse.ArgumentParser(description='SCRAPE!')
parser.add_argument(
'-fold',
dest='fold',
default=0,
type=int,
help='which fold we are on'
)
parser.add_argument(
'-num_folds',
dest='num_folds',
default=1,
type=int,
help='Number of folds (corresponding to both the number of training files and the number of testing files)',
)
parser.add_argument(
'-seed',
dest='seed',
default=1337,
type=int,
help='which seed to use'
)
parser.add_argument(
'-base_fn',
dest='base_fn',
default='news2016zh_',
type=str,
    help='Base output path; shards are written as {base_fn}train_wiki19_{fold:04d}.tfrecord'
)

parser.add_argument(
'-input_fn',
dest='input_fn',
default='realnews.jsonl',
type=str,
    help='Input directory of jsonl files (walked recursively). THIS MUST BE LOCAL.'
)
parser.add_argument(
'-max_seq_length',
dest='max_seq_length',
default=1025,
type=int,
help='Max sequence length',
)


args = parser.parse_args()
random.seed(args.seed + args.fold)

#encoder = get_encoder()
tokenizer = tokenization.FullTokenizer(
vocab_file="bert-base-chinese-vocab.txt", do_lower_case=True)


class S3TFRecordWriter(object):
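    """Write a tfrecord to `fn`; if `fn` starts with 's3://' or 'gs://', buffer locally and upload on close."""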
def __init__(self, fn):
self.fn = fn
if fn.startswith('s3://'):
from boto3.s3.transfer import TransferConfig
import boto3
self.gclient = None
            self.s3client = boto3.client('s3')
self.storage_dir = TemporaryDirectory()
self.writer = tf.python_io.TFRecordWriter(
os.path.join(self.storage_dir.name, 'temp.tfrecord'))
self.bucket_name, self.file_name = self.fn.split(
's3://', 1)[1].split('/', 1)
elif fn.startswith('gs://'):
from google.cloud import storage
self.s3client = None
self.gclient = storage.Client()
self.storage_dir = TemporaryDirectory()
self.writer = tf.python_io.TFRecordWriter(
os.path.join(self.storage_dir.name, 'temp.tfrecord'))
self.bucket_name, self.file_name = self.fn.split(
'gs://', 1)[1].split('/', 1)

else:
self.s3client = None
self.gclient = None
self.bucket_name = None
self.file_name = None
self.storage_dir = None
self.writer = tf.python_io.TFRecordWriter(fn)

def write(self, x):
self.writer.write(x)

def close(self):
self.writer.close()

if self.s3client is not None:
from boto3.s3.transfer import TransferConfig
config = TransferConfig(multipart_threshold=1024 * 25, max_concurrency=10,
multipart_chunksize=1024 * 25, use_threads=True)
self.s3client.upload_file(
os.path.join(self.storage_dir.name, 'temp.tfrecord'),
self.bucket_name,
self.file_name,
ExtraArgs={'ACL': 'public-read'}, Config=config,
)
self.storage_dir.cleanup()
if self.gclient is not None:
bucket = self.gclient.get_bucket(self.bucket_name)
blob = bucket.blob(self.file_name)
blob.upload_from_filename(os.path.join(
self.storage_dir.name, 'temp.tfrecord'))
self.storage_dir.cleanup()

def __enter__(self):
# Called when entering "with" context.
return self

def __exit__(self, *_):
# Called when exiting "with" context.
        # Upload the finished tfrecord to remote storage, if writing to S3/GCS
print("CALLING CLOSE")
self.close()


def article_iterator(tokenizer):
""" Iterate through the provided filename + tokenize"""
assert os.path.exists(args.input_fn)
for (dirpath, dirnames, filenames) in os.walk(args.input_fn):
for filename in filenames:
with open(os.path.join(dirpath, filename), 'r') as f:
for l_no, l in enumerate(f):
if l_no % args.num_folds == args.fold:
article = json.loads(l)

line = tokenization.convert_to_unicode(
article['text']) # for news2016zh text body
tokens = tokenizer.tokenize(line)
input_ids = tokenizer.convert_tokens_to_ids(tokens)

article['input_ids'] = input_ids

article['inst_index'] = (l_no // args.num_folds)
if article['inst_index'] < 100:
                            print('---\nINPUT {}.\nTokens: {}\nIds: {}\n'.format(article['inst_index'],
tokens,
input_ids
), flush=True)
if len(article['input_ids']) <= 64: # min size of article
continue
yield article


def create_int_feature(values):
feature = tf.train.Feature(
int64_list=tf.train.Int64List(value=list(values)))
return feature


def buffered_and_sliding_window_article_iterator(tokenizer, final_desired_size=1025):
""" We apply a sliding window to fix long sequences, and use a buffer that combines short sequences."""
for article in article_iterator(tokenizer):
if len(article['input_ids']) >= final_desired_size:
article['input_ids'] = article['input_ids'][0:final_desired_size-1]
while len(article['input_ids']) < final_desired_size:
article['input_ids'].append(0)
yield article


# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with S3TFRecordWriter(train_file) as train_writer:
for article in buffered_and_sliding_window_article_iterator(tokenizer,
final_desired_size=max(args.max_seq_length + 1, 1025)):
writer2use = train_writer
assert len(article['input_ids']) == (args.max_seq_length + 1)

features = collections.OrderedDict()
features["input_ids"] = create_int_feature(article['input_ids'])
tf_example = tf.train.Example(
features=tf.train.Features(feature=features))

writer2use.write(tf_example.SerializeToString())
total_written += 1

# DEBUG
if article['inst_index'] < 5:
print("~~~\nIndex {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(article['inst_index'],
tokenizer.convert_ids_to_tokens(
article['input_ids']),
article['input_ids']
), flush=True)
if article['inst_index'] % 1000 == 0:
print("{} articles, {} written".format(
article['inst_index'], total_written), flush=True)
print("DONE UPLOADING", flush=True)
10 changes: 10 additions & 0 deletions dataset/prepare_data.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

NUM_FOLDS=1024
MAX_SEQ_LENGTH=1024
FN=${1}
OUT_BUCKET=${2}

rm -rf logs_${MAX_SEQ_LENGTH}
mkdir logs_${MAX_SEQ_LENGTH}
parallel -j $(nproc --all) --will-cite "python prepare_data.py -fold {1} -num_folds ${NUM_FOLDS} -base_fn gs://${OUT_BUCKET}/data_${MAX_SEQ_LENGTH}/ -input_fn ${FN} -max_seq_length ${MAX_SEQ_LENGTH} > logs_${MAX_SEQ_LENGTH}/log{1}.txt" ::: $(seq 0 $((${NUM_FOLDS}-1)))
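For example, assuming a cleaned corpus directory `./corpus` and a GCS bucket named `my-bucket` (both names hypothetical), `bash prepare_data.sh ./corpus my-bucket` launches one `prepare_data.py` process per fold via GNU parallel (up to `$(nproc)` at a time), writes the shards to `gs://my-bucket/data_1024/`, and keeps per-fold logs under `logs_1024/`.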
8 changes: 8 additions & 0 deletions requirements-gpu.txt
@@ -0,0 +1,8 @@
pandas==0.24.2
regex==2019.4.14
h5py==2.9.0
numpy==1.16.2
tensorboard==1.13.1
tensorflow-gpu==1.13.1
tqdm==4.31.1
requests==2.22.0
9 changes: 9 additions & 0 deletions requirements-tpu.txt
@@ -0,0 +1,9 @@
pandas==0.24.2
regex==2019.4.14
h5py==2.9.0
numpy==1.16.2
tensorboard==1.13.1
tensorflow==1.13.1
tensorflow-estimator==1.13.0
tqdm==4.31.1
requests==2.22.0
