# arrays.py (forked from kvfrans/twitch)
import numpy as np
import json
import re
# Script: turn the sentences stored in database.txt into a 2-D array of word
# ids (one row per sentence, at most MAX_WORDS ids per row, zero-padded on the
# right), shuffle the rows, and save the result to data.npy.

# Number of word slots kept per sentence; longer sentences are truncated.
MAX_WORDS = 20
# Id written for any word that is missing from words.json.
# NOTE(review): presumably chosen to sit just past the vocabulary
# (original note: "33233 words") -- confirm against the consumer of data.npy.
UNKNOWN_ID = 33234

# Read the whole corpus as a single string. Newlines are removed outright
# (not replaced by spaces), so "<eos>" is the only sentence delimiter.
with open('database.txt') as data_file:
    sentences = data_file.read().replace('\n', '')

# Vocabulary lookup, indexed by word string below.
with open('words.json') as data_file:
    wordsdata = json.load(data_file)

split_sentences = sentences.split("<eos>")

# Original notes on the corpus this was written for:
#   152401 sentences
#   33233 words
# Allocate one row per candidate sentence instead of hard-coding 152401, so a
# corpus of a different size cannot index past the end of the array. Unused
# trailing rows are trimmed after the loop.
nparray = np.zeros((len(split_sentences), MAX_WORDS))

# Number of rows filled so far; also the index of the next row to write.
count = 0
for raw_sentence in split_sentences:
    # NOTE(review): the original also built a punctuation-stripped copy of the
    # sentence (regex '[^0-9a-zA-Z ]') but never used it. Tokenization is kept
    # on the lower-cased raw text so ids stay aligned with words.json; confirm
    # whether stripping was actually intended.
    words = raw_sentence.lower().split(" ")

    # Skip fragments with fewer than two space-separated tokens (this includes
    # the empty string left after a trailing "<eos>").
    if len(words) < 2:
        continue

    for w, word in enumerate(words[:MAX_WORDS]):
        nparray[count][w] = wordsdata.get(word, UNKNOWN_ID)

    # Advance only after the row is written. Incrementing first (as the
    # original did) left row 0 empty and caused the final sentence to be cut
    # off by the trim below.
    count += 1

# Keep only the rows that were actually filled.
nparray = nparray[:count, :]
# Parenthesised single argument prints identically on Python 2 and Python 3.
print(nparray.shape)
# Shuffles the rows in place (first axis only); word order within a row is kept.
np.random.shuffle(nparray)
np.save("data.npy", nparray)