train_recommender.py
import os
import pickle
import re
import time

import gensim
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

import utils as ut
def train_models(input_data_path='final_perfume_data_temp.csv', model_dir="models"):
    """Train the TF-IDF/LSA and doc2vec models and pickle the resulting artifacts."""
    start_time = time.time()

    # Load the perfume dataset and clean up the text columns.
    df1 = pd.read_csv(input_data_path, engine='python')
    df1['Notes'] = df1['Notes'].fillna(' ')
    df1 = ut.preprocess_columns(df1, "Notes",
                                [ut.make_lower_case, ut.remove_punctuation, ut.rem_numbers, ut.stem_words])
    df1 = ut.preprocess_columns(df1, 'Description',
                                [ut.make_lower_case, ut.remove_punctuation, ut.decontractions,
                                 ut.remove_stop_words, ut.stem_words, ut.fullstops, ut.rem_numbers])
    df1['all_details'] = df1['Description'] + " " + df1['Notes']

    # This is where our pickle files will be stored.
    if not os.path.exists(model_dir):
        os.mkdir(model_dir)

    print(df1['Notes'])
    print(df1['Description'])
    # ****************************************************************************
    print('TF-IDF Vectorization...')
    tf = TfidfVectorizer(analyzer='word',
                         min_df=10,
                         ngram_range=(1, 2),
                         stop_words='english')
    tf.fit(df1['all_details'])
    tfidf = tf.transform(df1['all_details'])
    print("Shape of the TF-IDF matrix ", tfidf.shape)

    print('Saving the vectorizer')
    with open(os.path.join(model_dir, "tf-idf_vectorizer.pkl"), "wb") as pkl_head:
        pickle.dump(tf, pkl_head)
    # ****************************************************************************
    print()
    print('Dimensionality Reduction using SVD...')
    # n_components must be smaller than the number of TF-IDF features.
    svd = TruncatedSVD(n_components=2000)
    latent_matrix = svd.fit_transform(tfidf)
    print("Shape of the Latent Matrix ", latent_matrix.shape)

    print('Saving the SVD Model..')
    with open(os.path.join(model_dir, "svd_model.pkl"), "wb") as pkl_head:
        pickle.dump(svd, pkl_head)

    # ****************************************************************************
    # Index the latent vectors by perfume name so they can be looked up later.
    doc_labels = df1.Name
    svd_feature_matrix = pd.DataFrame(latent_matrix, index=doc_labels)
    print("Shape of the LSA Feature Matrix ", svd_feature_matrix.shape)

    print('Saving the LSA Embeddings Model..')
    with open(os.path.join(model_dir, "lsa_embeddings.pkl"), "wb") as pkl_head:
        pickle.dump(svd_feature_matrix, pkl_head)
# ****************************************************************************
descriptions = df1.Description.values.tolist()
print()
print("Gensim doc2vec Model..")
documents = []
for i in range(len(df1)):
mystr = descriptions[i]
documents.append(re.sub("[^\w]", " ", mystr).split())
formatted_documents = [gensim.models.doc2vec.TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
model = gensim.models.doc2vec.Doc2Vec(vector_size=512, min_count=5, epochs=1000, seed=0, window=3, dm=1)
model.build_vocab(formatted_documents)
model.train(formatted_documents, total_examples=model.corpus_count, epochs=model.epochs)
fname = get_tmpfile(os.path.join(model_dir, "doc2vec_model"))
model.save(os.path.join(model_dir, "doc2vec_model"))
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(model_dir, "doc2vec_model"))
    # ****************************************************************************
    # In gensim < 4.0 the per-document vectors live in model.docvecs.vectors_docs;
    # in gensim >= 4.0 use model.dv.vectors instead.
    doc2vec_feature_matrix = pd.DataFrame(model.docvecs.vectors_docs, index=df1.Name)
    print("Shape of the doc2vec Feature Matrix ", doc2vec_feature_matrix.shape)

    print("Saving the doc2vec Embeddings Model..")
    with open(os.path.join(model_dir, "doc2vec_embeddings.pkl"), "wb") as pkl_head:
        pickle.dump(doc2vec_feature_matrix, pkl_head)

    total_time = (time.time() - start_time) / 60
    print()
    print("Training Completed.. Total time taken {:.2f} minutes".format(total_time))