-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1A.py
92 lines (56 loc) · 2.44 KB
/
1A.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import numpy as np
import random
import csv
from scipy.stats import norm
from math import log
from operator import itemgetter
# Number of repeated shuffled train/validation evaluations to run.
folds = 10
# Fraction of the dataset held out for validation each round.
split = 0.2
# Global 2x2 confusion matrix, accumulated across all folds:
# rows = true class (0/1), columns = predicted class (0/1).
confusion_matrix = np.zeros((2, 2), dtype=int)
class my_naive_bayes:
    """Gaussian naive Bayes classifier for numeric features.

    Training data is a 2-D array-like whose last column holds the class
    label and whose remaining columns hold numeric feature values.  One
    univariate normal distribution is fitted per (class, feature) pair.
    """

    def __init__(self, training_data):
        # Log of each class's prior probability, keyed by class label.
        self.prior_dist_log = {}
        # Per-class dict of {feature index: frozen scipy norm distribution}.
        self.conditional_dist = {}
        self.classes = []
        if training_data is not None:
            self.train(training_data)

    def train(self, data):
        """Fit class priors and per-feature Gaussians from *data*.

        data: 2-D array-like; last column is the class label.
        """
        num_training = len(data)
        # sorted() gives a deterministic class order (a bare set does not).
        classes = np.array(sorted(set(item[-1] for item in data)))
        self.classes = classes
        for class_name in classes:
            # Build each class's feature matrix separately: class groups can
            # have different sizes, so stacking them into one ndarray (as a
            # ragged array) is invalid on modern NumPy.
            group = np.array(
                [item[:-1] for item in data if item[-1] == class_name],
                dtype=float)
            self.prior_dist_log[class_name] = log(len(group) / num_training)
            self.conditional_dist[class_name] = {}
            for j, column in enumerate(group.T):
                mu = np.mean(column)
                sigma = np.std(column)
                self.conditional_dist[class_name][j] = norm(mu, sigma)

    def inference(self, data):
        """Return the class label with the highest posterior log-probability.

        data: feature vector (no label column), indexable by feature index.
        """
        # Clamp pdf values to the smallest positive float so log() never
        # sees zero (e.g. for far-out-of-distribution feature values).
        tiny = np.finfo(float).tiny
        best_class = None
        best_score = -float("inf")
        for class_name in self.classes:
            score = self.prior_dist_log[class_name]
            for i, distribution in self.conditional_dist[class_name].items():
                score += log(max(tiny, distribution.pdf(data[i])))
            if score > best_score:
                best_class, best_score = class_name, score
        return best_class
def read_csv(path: str = "./data/datasets_14370_19291_pima-indians-diabetes.csv"):
    """Load a comma-separated numeric dataset as a 2-D uint8 array."""
    data = np.genfromtxt(path, dtype=np.uint8, delimiter=',')
    return data
def main():
    """Run repeated shuffled holdout evaluation and write the results.

    Accumulates predictions into the global confusion_matrix across all
    folds, then writes the matrix and the overall error rate to
    ./performenceA.txt.
    """
    all_training_data = read_csv()
    data_len = len(all_training_data)
    split_at = int(data_len * split)
    for i in range(folds):
        # Shuffle via an index permutation: calling random.shuffle directly
        # on a 2-D ndarray swaps row *views*, which duplicates rows instead
        # of permuting them and silently corrupts the dataset.
        order = list(range(data_len))
        random.Random(i + 12).shuffle(order)
        all_training_data = all_training_data[order]
        training = all_training_data[split_at:]
        valid = all_training_data[:split_at]
        my_classifier = my_naive_bayes(training)
        for x in valid:
            # Row = true label, column = predicted label.
            confusion_matrix[x[-1]][my_classifier.inference(x[:-1])] += 1
    with open("./performenceA.txt", "w") as f:
        f.write(str(confusion_matrix))
        f.write("\nerror: %" + str(100 *
                (confusion_matrix[0][1] + confusion_matrix[1][0]) / np.sum(confusion_matrix)))
# Script entry point: run the evaluation only when executed directly.
if __name__ == "__main__":
    main()