-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdecisionTree.py
160 lines (123 loc) · 4.75 KB
/
decisionTree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# scipy.interp was a deprecated alias of numpy.interp and has been removed
# from SciPy (1.12+); import the canonical NumPy function (same signature).
from numpy import interp
from sklearn import tree
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.tree import export_graphviz
from graphviz import Source
import pickle
from names import names

# Output artefact paths.
tree_saved_file = 'treePredictor.sav'
feature_mask = 'featuresMask.csv'
roc_file = 'ROC_plot_TREE.png'
confusion_matrix_file = 'confusion_matrix_plot_TREE.png'

# Load the pre-processed dataset ('|'-separated; column names from names.py).
data_csv = pd.read_csv('dropped.csv', delimiter='|', names=names, header=0)
# NOTE(review): rebinding `tree` shadows the imported sklearn.tree module;
# kept as-is because the rest of the script refers to the model as `tree`.
tree = tree.DecisionTreeClassifier()
# convert class to binary (0, 1) from benign, malicious
lb = LabelBinarizer()
data_csv['class'] = lb.fit_transform(data_csv['class'].values)
print(data_csv.groupby(['class']).size())
# drop the URL column
data_csv.drop(columns=['url'], axis=1, inplace=True)
array = data_csv.values
# load Y with the classes (assumed to be the last column), as int
Y = array[:, -1]
Y = Y.astype(int)
# drop the Class column
data_csv.drop(columns=['class'], axis=1, inplace=True)
array = data_csv.values
# load X with the features: after dropping 'url' and 'class' every remaining
# column is a feature.  (The original `array[:, 0:-1]` silently discarded the
# last feature column.)
X = array
# set up for 10 fold cross validation
splits = 10
kf = KFold(n_splits=splits)
kf.get_n_splits(X, Y)
summation = 0
# Running total of the per-fold 2x2 confusion matrices.  Must start as an
# integer zero matrix: the original `[2, 2]` list broadcast +2 into every
# cell of the first fold's matrix, inflating all reported counts.
matrix_sum = np.zeros((2, 2), dtype=int)
tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)
i = 0
# 10-fold cross validation: fit the tree on each training split, accumulate
# the confusion matrix and accuracy, and collect a per-fold ROC curve.
for train_index, test_index in kf.split(X, Y):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    tree.fit(X_train, Y_train)
    prediction = tree.predict(X_test)
    matrix = confusion_matrix(Y_test, prediction)
    matrix_sum = matrix_sum + matrix
    summation += accuracy_score(Y_test, prediction)
    # Compute the ROC curve from class-1 probabilities.  Reuse the model
    # already fitted on this fold — the original refit on the identical
    # training split a second time just to call predict_proba.
    probas_ = tree.predict_proba(X_test)
    fpr, tpr, thresholds = roc_curve(Y_test, probas_[:, 1])
    # Resample the fold's TPR onto the common FPR grid so folds can be
    # averaged later (np.interp replaces the removed scipy.interp).
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i += 1
print("Confusion Matrix")
print(matrix_sum)
# Unpack the summed confusion matrix.  sklearn's confusion_matrix puts true
# labels on rows and predictions on columns, ordered by ascending label —
# LabelBinarizer maps benign -> 0 and malicious -> 1 — so with "malicious"
# as the positive class the layout is [[TN, FP], [FN, TP]].  The original
# read TP/TN from the swapped corners, inverting every derived rate.
TN = matrix_sum[0][0]
FP = matrix_sum[0][1]
FN = matrix_sum[1][0]
TP = matrix_sum[1][1]
print("True Positive --- " + str(TP))
print("False Positive ---" + str(FP))
print("True Negative --- " + str(TN))
print("False Negative ---" + str(FN))
# Derived rates, all as percentages formatted to two decimals.
overall_accuracy = format((TP + TN) / (TP + TN + FP + FN) * 100, '.2f')
true_positive_rate = format(TP / (TP + FN) * 100, '.2f')
true_negative_rate = format(TN / (TN + FP) * 100, '.2f')
false_positive_rate = format(FP / (TN + FP) * 100, '.2f')
false_negative_rate = format(FN / (FN + TP) * 100, '.2f')
precision = format(TP / (TP + FP) * 100, '.2f')
# average accuracy of the model across the k folds
average = (summation / splits) * 100
average = format(average, '.2f')
print("Average Accuracy of model: " + average + '%')
print("Overall Accuracy: " + overall_accuracy + '%')
print("True Positive Rate: " + true_positive_rate + '%')
print("True Negative Rate: " + true_negative_rate + '%')
print("False Positive Rate: " + false_positive_rate + '%')
print("False Negative Rate: " + false_negative_rate + '%')
print("Precision of model: " + precision)
# Build and save the cross-validated ROC figure: the mean curve over all
# folds plus a shaded +/- 1 standard-deviation band.
avg_tpr = np.mean(tprs, axis=0)
avg_tpr[-1] = 1.0  # force the curve to terminate at (1, 1)
plt.plot(mean_fpr, avg_tpr, color='b', lw=2, alpha=.8,
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (auc(mean_fpr, avg_tpr),
                                                        np.std(aucs)))
spread = np.std(tprs, axis=0)
band_hi = np.minimum(avg_tpr + spread, 1)
band_lo = np.maximum(avg_tpr - spread, 0)
plt.fill_between(mean_fpr, band_lo, band_hi, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')
# Pad the axes slightly so curves touching 0/1 stay visible.
plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Curve Decision Tree URL Prediction \n with accuracy of ' + average + ' percent')
plt.legend(loc="lower right")
plt.savefig(roc_file)
# Plot the Confusion Matrix.  plot_confusion_matrix creates a brand-new
# figure, so the title must be set AFTER it — the original set the title
# first, which landed on the already-saved ROC figure and was lost.
fig, ax = plot_confusion_matrix(conf_mat=matrix_sum, figsize=(10, 5))
plt.title('Confusion Matrix Decision Tree URL Prediction \n with accuracy of ' + average + ' percent')
plt.savefig(confusion_matrix_file)
num_nodes = tree.tree_.node_count
print("The binary tree structure has %s nodes" % num_nodes)
# dump the model for later use; a context manager closes the file handle
# (the original `open(...)` inside pickle.dump was never closed)
with open(tree_saved_file, 'wb') as model_file:
    pickle.dump(tree, model_file)