-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfa_recent.py
129 lines (108 loc) · 4.18 KB
/
fa_recent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
extract features from my emails for prediction
"""
# pylint: disable=line-too-long
import configparser
import pickle
import re #may the force of regex be with you
import sys
from collections import Counter

import pandas as pd
import requests
from sklearn import preprocessing

from imaptocsv import usefulHeaders
config = configparser.ConfigParser()
config.read('config.ini')

if __name__ == "__main__":
    # Build a feature frame for recent e-mails that matches the schema of the
    # labeled training set, then label-encode it with the encoders fitted at
    # training time. Reads/writes the paths configured in config.ini.
    print("LOADING DATA")
    # NOTE(review): the original popped an unused `filename` off sys.argv; removed.
    df = pd.read_csv(config['DATA']['recent_file'], sep=';', index_col=False)
    df_train = pd.read_csv(config['DATA']['labeled_data_file'], sep=';', index_col=False)
    print("shape of train dataset: ", df_train.shape)
    print("original shape of input dataset: ", df.shape)
    # NOTE(review): the original `df.assign(weekday=...)` discarded its result
    # (assign returns a new frame), so it was a no-op and has been removed.

    # Merge the two spellings of the carbon-copy header into a single 'CC' column.
    try:
        df['CC'].fillna(df['Cc'], inplace=True)
        del df['Cc']
    except KeyError:
        print("Apparantly no Cc headers found in dataframe")

    # Keep only the headers the rest of the pipeline knows about.
    for columnName in list(df.columns.values):
        if columnName not in usefulHeaders():
            df.drop(columnName, axis=1, inplace=True)
    print("input dataset after deleting stuff not in UsefulHeaders: ", df.shape)

    # Add every feature column from the training set (skipping its leading index
    # column) so the prediction frame has the same schema. BUG FIX: the original
    # `df.assign(columnName=...)` discarded its result and would have created a
    # literal "columnName" column anyway; it then only set row 0, leaving NaN in
    # every other row. Missing features are now initialized to 0 for all rows.
    for columnName in df_train.columns[1:]:
        if columnName not in df.columns:
            df[columnName] = 0
        else:
            print(columnName, " apparently already in df")
    print("input dataset after adding columns from train dataset: ", df.shape)

    # CREATING DICTIONARIES — mark word occurrences and recipient addresses.
    # Patterns are compiled once, outside the per-row loop.
    subject_pattern = re.compile(r'\b\w{3,15}\b')
    body_pattern = re.compile(r'\b[^\d \t+_/&;-=]{4,15}\b')
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    for i in range(len(df)):
        subject = str(df['NewSubject'][i]).lower()
        body = str(df['NewMessageText'][i]).lower()

        # Subject words are stored under '<word>_s' columns as presence flags.
        for word in subject_pattern.findall(subject):
            columnName = word + '_s'
            if columnName in df.columns:
                df.loc[i, columnName] = 1

        # Body words are stored as per-row frequency counts.
        # BUG FIX: the original tested and wrote the stale `columnName` left
        # over from the subject loop; body counts are keyed by the bare word
        # (presumably matching the training columns — TODO confirm).
        for word, count in Counter(body_pattern.findall(body)).items():
            if word in df.columns:
                df.loc[i, word] = count

        # Recipient addresses (To + CC) become presence flags.
        toPeople = email_pattern.findall(str(df["To"][i]) + ', ' + str(df["CC"][i]))
        for address in toPeople:
            if address in df.columns:
                df.loc[i, address] = 1
    print("input dataset after filling occurences: ", df.shape)
    # removing old TO column
    df.drop("To", axis=1, inplace=True)

    # WORKING ON VALUES
    for i in range(len(df)):
        # Reduce 'From' to the bare e-mail address; 'n/a' when none is found.
        match = email_pattern.search(str(df["From"][i]))
        df.loc[i, "From"] = match.group(0) if match else 'n/a'
        # 'Answered' becomes a 0/1 flag (the CSV stores it as the string "1").
        if df["Answered"][i] != "1":
            df.loc[i, 'Answered'] = 0
        # Keep only the first 3 characters of the Date header (the weekday
        # abbreviation); NaN dates are not subscriptable and raise TypeError.
        try:
            df.loc[i, 'Date'] = str(df['Date'][i][:3])
        except TypeError:
            print("could not parse date " + str(df['Date'][i]))
            df.loc[i, 'Date'] = 'n/a'

    print("WRITING PROCESSED FILE")
    df.to_csv(config['DATA']['recent_processed_file'], sep=';')

    # labeling — apply the encoders fitted at training time; unseen values
    # raise ValueError from LabelEncoder.transform and fall back to 0.
    # TODO: replace the per-row transform calls with one vectorized pass.
    for toField in ['From', 'Content-Language', 'Date']:
        with open(toField + "-Encoder.p", "rb") as encoder_file:
            encoder = pickle.load(encoder_file)
        for i in range(len(df)):
            try:
                df.loc[i, toField] = encoder.transform([df[toField][i]])[0]
            except ValueError:
                print("could not find ", df[toField][i], "in label encoder")
                df.loc[i, toField] = 0

    # Drop the raw text/address columns now that features are extracted.
    df.drop("CC", axis=1, inplace=True)
    df.drop("NewSubject", axis=1, inplace=True)
    df.drop("NewMessageText", axis=1, inplace=True)
    print("shape of resulting dataframe: ", df.shape)
    print("WRITING LABELED FILE")
    df.to_csv(config['DATA']['recent_labeled_file'], sep=';')