import os
import sys
-import ipdb
-import time
import random
import shutil
-import ntpath
import numpy as np
import argparse

sys.path.append('..')

from core.task import StoryTuringTest
-from core.utils import load_save_json
from core.exp_record import ExpRecorder
from core.semantic_modifier import SemanticModifier
from exp_config import TRAIN_CONFIG, TRAIN_DEBUG_CONFIG, SYSTEM_CONFIG, SEED_OFFSET, TEST_PERCENT, VAL_PERCENT, \
    ...
from pytorch_lightning import seed_everything


-# python3.6 train_roberta.py --epoch 1 --debug_N 100
-# python3.6 train_roberta.py --epoch 20 --per_device_train_batch_size 32 --gradient_accumulation_steps 4
-
-# python3.6 train_roberta.py --epoch 10 --per_device_train_batch_size 32 --gradient_accumulation_steps 4 --data_dir ../data/5billion_sort
-# python3.6 train_roberta.py --epoch 10 --per_device_train_batch_size 32 --gradient_accumulation_steps 4 --data_dir ../data/5billion_sort_no_reverse
-# python3.6 train_roberta.py --epoch 10 --per_device_train_batch_size 32 --gradient_accumulation_steps 4 --data_dir ../data/5billion_sort_unique_no_reverse
-# python3.6 train_roberta.py --epoch 10 --per_device_train_batch_size 32 --gradient_accumulation_steps 4 --data_dir ../data/5billion_shuffle_unique_no_reverse
-
def compute_metrics(eval_predict):
    predict_prob, labels = eval_predict
    predict_label = np.argmax(predict_prob, axis=1)
@@ -166,11 +154,6 @@ def main():
    else:
        load_complete = True

-    # Using the tokenizer's vocab directly doesn't seem to work either: the leading entries are all
-    # ASCII codes, and the rest don't look strictly sorted by word frequency
-    # tokenizer_keys = list(tokenizer.vocab.keys())
-    # ipdb.set_trace()
-    # read whole dataset into memory
-
    # (0.) read dataset
    story_turing_test = StoryTuringTest(tokenizer, dataset_name=dataset_name)
    whole_texts, whole_labels = story_turing_test.read_cn_novel_whole_data(data_dir, semantic_change)
@@ -315,11 +298,6 @@ def main():
    model_save_dir = os.path.abspath(model_save_dir)
    model.save_pretrained(model_save_dir)
    print(f"Save best model ckpt to {model_save_dir}")
-    #
-    # train_result_save_path = os.path.join(model_save_dir, 'train_result.json')
-    # test_result_save_path = os.path.join(model_save_dir, 'test_result.json')
-    # load_save_json(train_result_save_path, 'save', data=train_result)
-    # load_save_json(test_result_save_path, 'save', data=test_result)


if __name__ == '__main__':
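
Note: the diff truncates compute_metrics right after the argmax. For context, a minimal sketch of how a Trainer-style metrics hook like this is typically completed, assuming plain accuracy as the metric (the actual body is not shown in this commit):

import numpy as np

def compute_metrics(eval_predict):
    # The Trainer passes a (logits, labels) pair covering the whole eval set
    predict_prob, labels = eval_predict
    # Pick the highest-scoring class for each example
    predict_label = np.argmax(predict_prob, axis=1)
    # Hypothetical continuation: report plain accuracy over the eval set
    accuracy = float((predict_label == labels).mean())
    return {'accuracy': accuracy}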