toy_corpus.py
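"""Build a toy corpus for smoke-testing the pipeline.

Fits a first-order Markov chain (a bigram transition matrix) to a small
seed text, samples a synthetic corpus of `-m` tokens from it, splits the
result into `-d` JSON documents, and writes per-document token files for
the downstream co-occurrence step.
"""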
import argparse
import glob
import json
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

def main(args):
    # A small, repetitive seed text over a tiny vocabulary; '<e>' marks
    # the end of a sentence.
    base_text = """
    The bright sun shine in the light blue sky <e>
    The sun was shine in the sky <e>
    The sun was shine bright in the blue sky <e>
    In the blue sky the bright sun shine <e>
    In the blue sky the sun shine bright <e>
    In the sky the light shine <e>
    Shine in the light blue sky was the bright sun <e>
    Shine in the sky the bright sun <e>
    A bright sun shine the blue sky <e>
    Bright the sun shine in the blue sky <e>
    The light shine bright in the sky <e>
    The sun shine bright in the light sky <e>
    The sun shine bright in the blue sky <e>
    Was shine bright the sun in the sky <e>
    """
    # Tokenize: collapse newlines, lowercase, and drop empty strings.
    tokens = base_text.replace('\n', ' ').strip()
    tokens = [w.lower() for w in tokens.split(' ') if w != '']
    # Estimate a bigram transition matrix from the seed text:
    # tm.loc[i, j] = P(next word = j | current word = i).
    # normalize='index' makes each row a probability distribution.
    tm = pd.crosstab(
        pd.Series(tokens[:-1], name='i'),
        pd.Series(tokens[1:], name='j'),
        normalize='index')
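    # Sanity check on the seed text: 'blue' is always followed by 'sky',
    # so tm.loc['blue', 'sky'] == 1.0, while rows such as tm.loc['the']
    # spread probability over several continuations. Sampling from the
    # rows of tm therefore reproduces the bigram statistics of base_text.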
    # Sample args.m tokens from the chain, starting at the end-of-sentence
    # marker '<e>' so generation begins at a sentence boundary.
    sim_text = []
    last_word = '<e>'
    for _ in tqdm(range(args.m)):
        p = tm.loc[last_word]
        next_word = np.random.choice(p.index, p=p.values)
        sim_text.append(next_word)
        last_word = next_word
    sim_text_txt = ' '.join(sim_text)
    # Rewrite the '<e>' markers as sentence-final periods.
    sim_text_txt = sim_text_txt.replace(' <e>', '.')
    # Save to the required BBB format: split the sentences into args.d
    # batches and write each batch as one JSON document.
    print('[INFO] Saving corpus.')
    sentences = sim_text_txt.split('.')
    sentences = np.array_split(sentences, args.d)
    os.makedirs(os.path.join(args.saveto, 'json'), exist_ok=True)
    for i, batch in enumerate(sentences):
        text = '.'.join(batch)
        doc = {'text': text, 'filedate': '1990'}
        with open(os.path.join(args.saveto, 'json', f"{i}.json"), "w") as wf:
            json.dump(doc, wf)
    # Create the input files for the co-occurrence matrices: one .txt per
    # document, one token per line in "<word>\tNone\tNone" format.
    os.makedirs(os.path.join(args.saveto, 'cooccur', '1990'), exist_ok=True)
    for fname in glob.glob(os.path.join(args.saveto, 'json', '*.json')):
        with open(fname, 'r') as f:
            doc = json.load(f)
        text = doc['text']
        name = os.path.basename(fname).replace('.json', '.txt')
        with open(os.path.join(args.saveto, 'cooccur', '1990', name), 'w') as fw:
            words = [w for w in text.split(' ') if w != '']
            for w in words:
                w = w.replace('.', '')  # drop sentence-final periods
                fw.write(f"{w}\tNone\tNone\n")
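    # At this point the layout under args.saveto is (illustrative):
    #   json/0.json ... json/{d-1}.json                  one JSON doc per batch
    #   cooccur/1990/0.txt ... cooccur/1990/{d-1}.txt    one token per line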
    # Save the word dict: if a vocabulary file from an earlier
    # preprocessing run exists, re-save it as the pickle the training
    # code expects.
    final_vocab_file = os.path.join(args.saveto, 'processed', 'vocab_freq.npy')
    if os.path.exists(final_vocab_file):
        vocab_dict = np.load(final_vocab_file, allow_pickle=True).item()
        # Ensure the 'info' directory exists before writing.
        os.makedirs(os.path.join(args.saveto, 'info'), exist_ok=True)
        with open(os.path.join(args.saveto, 'info', 'word-dict.pkl'), 'wb') as vf:
            pickle.dump(vocab_dict, vf)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-saveto", type=str, required=False,
                        help='Output directory; overwritten below from -run_location and -name')
    parser.add_argument("-m", type=int, help='Number of tokens to sample')
    parser.add_argument("-d", type=int, help='Number of documents')
    parser.add_argument("-run_location", type=str, required=True, choices=['local', 'sherlock'])
    parser.add_argument("-name", type=str, required=True)
    args = parser.parse_args()

    if args.run_location == 'sherlock':
        base_dir = Path('/oak/stanford/groups/deho/legal_nlp/WEB')
    elif args.run_location == 'local':
        base_dir = Path(__file__).parent
    args.saveto = base_dir / f"data/{args.name}"

    main(args)
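
# Example invocation (argument values are illustrative):
#   python toy_corpus.py -run_location local -name toy -m 10000 -d 10
# writes data/toy/json/*.json and data/toy/cooccur/1990/*.txt under the
# directory containing this script.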