# tokenize_data_uft.py (forked from harubaru/convogpt)
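"""Tokenizes plain-text data into fixed-length chunks for unsupervised finetuning.

Reads a single .txt file (or every .txt file in a directory), tokenizes the
contents with a HuggingFace tokenizer, splits the token stream into chunks of
at most --max-length tokens, and writes the result to an Arrow IPC file with a
single `input_ids` column.

Example invocation (paths and tokenizer name below are purely illustrative):

    python tokenize_data_uft.py \
        --input-path ./data/ \
        --output-file ./dataset.arrow \
        --tokenizer-path some-org/some-tokenizer \
        --max-length 2048
"""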
import argparse
import logging
import os

import numpy as np
import pyarrow as pa
from transformers import AddedToken, AutoTokenizer, PreTrainedTokenizer

LOG = logging.getLogger(__name__)

logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] %(message)s',
    level=logging.DEBUG,
)


def main() -> None:
    args = _parse_args_from_argv()
    assert os.path.isfile(args.input_path) or os.path.isdir(args.input_path), f'File or directory "{args.input_path}" not found!'

    LOG.info("Loading tokenizer...")

    # OpenLLaMA's fast tokenizer is broken on the stable release of transformers.
    # TODO(TG): When the newest transformers version with the fixed tokenizer is
    # released, do a version check.
    is_openllama = 'open_llama' in args.tokenizer_path or 'open-llama' in args.tokenizer_path
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, use_fast=not is_openllama)

    if args.add_special_tokens is not None:
        # MAINTENANCE(11b): Big fat warning: the snippet below is copy-pasted
        # into ``./training/hf_trainer.py``. Make sure to always keep both
        # implementations in sync.
        special_token_contents = args.add_special_tokens.split(",")
        special_tokens = [
            AddedToken(
                # Heads up: this is very poorly documented in HuggingFace and
                # some old forum discussions mention that it's apparently
                # exclusive to the Rust-based tokenizers? If anything seems
                # funky about the special token behavior, this is a good place
                # to look.
                content, lstrip=True, rstrip=True)
            for content in special_token_contents
        ]

        tokenizer.add_special_tokens(
            {"additional_special_tokens": special_tokens})

    # Check whether the input is a single file or a directory of .txt files.
    LOG.info("Done! About to tokenize file(s)...")
    if os.path.isfile(args.input_path):
        all_file_tokens, total_num_tokens = _tokenize_file(tokenizer, args.input_path, args.max_length)
    else:
        # args.input_path is a directory.
        all_file_tokens: list[np.ndarray] = []
        total_num_tokens = 0
        # Find all .txt files in the directory, which could potentially
        # contain other files as well.
        txt_files = filter(lambda x: x.endswith(".txt"), os.listdir(args.input_path))
        txt_files = [os.path.join(args.input_path, f) for f in txt_files]
        # Tokenize each file and collect its chunks.
        for file in txt_files:
            file_tokens, num_tokens = _tokenize_file(tokenizer, file, args.max_length)
            all_file_tokens += file_tokens
            total_num_tokens += num_tokens

    _save_as_arrow_file(all_file_tokens, args.output_file)
    LOG.info(f"Done! Output file saved to {args.output_file}.")
    LOG.info(f"Dataset contains {total_num_tokens:,} tokens.")


def _parse_args_from_argv() -> argparse.Namespace:
    '''Parses command-line arguments.'''
    parser = argparse.ArgumentParser(description="Dataset tokenizer utility.")
    parser.add_argument(
        "-i",
        "--input-path",
        required=True,
        help="Path to the input .txt file or folder containing .txt files.",
    )
    parser.add_argument(
        "-o",
        "--output-file",
        required=True,
        help="Path to the output binarized and tokenized file.",
    )
    parser.add_argument(
        "-t",
        "--tokenizer-path",
        required=True,
        help="Path to the HF tokenizer to use.",
    )
    parser.add_argument(
        "-s",
        "--add-special-tokens",
        type=str,
        default=None,
        help="Comma-separated list of extra special tokens to add to the tokenizer before tokenizing.",
    )
    parser.add_argument(
        "-l",
        "--max-length",
        type=int,
        default=2048,
        help="The maximum number of tokens the model will take in a batch."
        " The tokenized dataset is split into chunks of this length; if the"
        " total number of tokens does not divide evenly, an extra chunk holds"
        " the remaining tokens. Defaults to 2048.",
    )
    return parser.parse_args()


def _tokenize_file(tokenizer: PreTrainedTokenizer, filepath: str, max_length: int, append_eos: bool = True) -> tuple[list[np.ndarray], int]:
    '''
    Opens a single text document and converts its contents into a list of token arrays.

    Params:
        tokenizer: The tokenizer used to tokenize the file.
        filepath: The path to the text document that will be tokenized.
        max_length: The maximum number of tokens per chunk.
        append_eos: Whether to append the tokenizer's EOS token to the file contents.

    Returns:
        A tuple of (list of token arrays, total number of tokens).
    '''
    LOG.info(f"Loading file {filepath} into memory and tokenizing...")
    is_llama = tokenizer.eos_token == "</s>"
    with open(filepath, "r", encoding="utf-8") as f:
        # Read the entire .txt file into memory.
        # Good luck!
        file_contents = f.read()

    if append_eos:
        if is_llama:
            file_contents += f" {tokenizer.eos_token}"
        else:
            file_contents += tokenizer.eos_token

    tokenized_contents = tokenizer(file_contents, return_tensors="np").input_ids[0]
    num_tokens = len(tokenized_contents)
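    # Illustrative example of the chunking below (numbers are hypothetical):
    # with num_tokens = 5000 and max_length = 2048, closest_ctxlen_factor is
    # 4096, so we get two full 2048-token chunks plus a 904-token remainder.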
    # Do some list slicing to capture chunks of `max_length` tokens...
    closest_ctxlen_factor = (num_tokens // max_length) * max_length
    splitable_tkn_chunks = tokenized_contents[:closest_ctxlen_factor]
    remainder_tokens = tokenized_contents[closest_ctxlen_factor:]
    if closest_ctxlen_factor > 0:
        # We use array_split rather than split so that `tokenized_contents`
        # ends up as a plain `list` of arrays.
        tokenized_contents = np.array_split(splitable_tkn_chunks, closest_ctxlen_factor // max_length)
    else:
        # The file is shorter than `max_length`, so there are no full chunks.
        tokenized_contents = []
    # ...then append what's left over, if the token count doesn't divide evenly.
    if num_tokens > closest_ctxlen_factor:
        tokenized_contents.append(remainder_tokens)

    LOG.info(f"Done! File {filepath} has been tokenized.")
    return tokenized_contents, num_tokens


def _save_as_arrow_file(tokens: list[np.ndarray], output_file: str) -> None:
    '''
    Saves a list of token arrays to an Arrow IPC file.

    Params:
        tokens: A list of numpy arrays of tokens, each no longer than the model's context length.
        output_file: The path of the file which will be saved.
    '''
    LOG.info("Writing to arrow file and saving...")
    pa_arrays = [pa.array(t) for t in tokens]
    schema = pa.schema([pa.field('input_ids', pa_arrays[0].type)])

    with pa.OSFile(output_file, 'wb') as sink:
        with pa.ipc.new_file(sink, schema=schema) as writer:
            for chunk in pa_arrays:
                batch = pa.record_batch([chunk], schema=schema)
                writer.write(batch)
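
# The output can be read back with pyarrow's IPC reader, e.g. (illustrative
# sketch; "dataset.arrow" is a hypothetical path):
#
#     import pyarrow as pa
#     with pa.memory_map("dataset.arrow", "r") as source:
#         table = pa.ipc.open_file(source).read_all()
#     input_ids = table["input_ids"]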


if __name__ == "__main__":
    main()