-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomain_outliers.py
82 lines (67 loc) · 2.35 KB
/
domain_outliers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from matplotlib import rc_file_defaults
__author__ = 'matic'
import orange
# Debug trace emitted once, at import time of this module.
# Single-argument parenthesised form prints the same text on Python 2 and 3.
print("Before generate")
print("After generate")
import random
def _write_orange_tab(source_path, target_path, document_classes):
    """Convert a space-separated matrix file into an Orange tab-delimited file.

    The attribute count is taken from the first line of ``source_path``.
    Three header rows are written (names ``w0..wN-1`` plus ``klass``, a row
    of ``d`` markers, and a flag row with ``c`` in the last column), followed
    by one row per non-blank input line with the matching entry of
    ``document_classes`` appended as the last column.

    Both files are managed by ``with`` so they are closed even when a write
    fails (e.g. ``document_classes`` is shorter than the input file).
    """
    with open(source_path, 'r') as source:
        with open(target_path, 'w') as target:
            line = source.readline()
            num_attrs = len(line.split("\n")[0].split(" "))
            print(num_attrs)

            # Header row 1: attribute names followed by the class column name.
            target.write("".join("w" + str(i) + "\t" for i in range(num_attrs)))
            target.write("klass\n")
            # Header row 2: every column, including the class, is marked "d".
            target.write("d\t" * num_attrs + "d\n")
            # Header row 3: empty flags for attributes, "c" on the last column.
            target.write("\t" * num_attrs + "c\n")

            # readline() is used throughout (rather than file iteration)
            # because the first line has already been consumed above.
            row = 0
            while line:
                if line != "\n":
                    words = line.split("\n")[0].replace(" ", "\t")
                    target.write(words + "\t" + document_classes[row] + "\n")
                line = source.readline()
                # NOTE(review): the counter advances for blank lines too, so
                # document_classes is indexed by physical line number --
                # confirm this matches how callers build document_classes.
                row += 1


def find_domain_outliers(prefix, document_classes, learner=orange.BayesLearner,
                         folds=10):
    """Return indices of documents misclassified under k-fold cross-validation.

    The matrix in ``prefix + "temp.dat"`` (one document per line, values
    separated by single spaces) is converted to ``temp_orange.tab`` and loaded
    as an ``orange.ExampleTable``. Each example is tagged with a ``doc_id``
    meta attribute holding its position in the table. The data are then split
    into ``folds`` parts; for every part, ``learner`` is trained on the
    remaining examples and each example of the held-out part whose predicted
    class differs from its own class is reported.

    Parameters:
        prefix: string prepended to ``"temp.dat"`` to locate the input file.
        document_classes: sequence of class label strings, indexed by input
            line number.
        learner: callable that takes the training table and returns a
            classifier callable on a single example.
        folds: number of cross-validation folds (default 10, the value that
            used to be hard-coded).

    Returns:
        list of int ``doc_id`` values of the misclassified examples, in the
        order they were encountered (fold by fold).

    Raises:
        IOError: if the input file cannot be opened.
        IndexError: if ``document_classes`` has fewer entries than the input
            file has lines.

    Note: ``temp_orange.tab`` is written to the current working directory
    under a fixed name, so concurrent calls would overwrite each other's file.
    """
    tab_path = "temp_orange.tab"
    _write_orange_tab(prefix + "temp.dat", tab_path, document_classes)

    data = orange.ExampleTable(tab_path)

    # Attach a meta attribute remembering each example's original position,
    # so that it survives the fold selection below.
    doc_id_var = orange.FloatVariable("doc_id")
    meta_id = orange.newmetaid()
    data.domain.addmeta(meta_id, doc_id_var)
    for position, ex in enumerate(data):
        ex["doc_id"] = position
        print("%s %s" % (position, ex))

    # k-fold cross-validation: every example is classified exactly once by a
    # model that was not trained on it.
    noisy_indices = []
    selection = orange.MakeRandomIndicesCV(data, folds=folds)
    print('Before for loop')
    for test_fold in range(folds):
        train_data = data.select(selection, test_fold, negate=1)
        test_data = data.select(selection, test_fold)

        print('Before classifier construction')
        classifier = learner(train_data)
        print('After classifier construction')

        for example in test_data:
            predicted = classifier(example)
            # `!= None` is kept on purpose instead of `is not None`: the
            # classifier returns an Orange value object that may overload
            # comparison -- NOTE(review): confirm before changing.
            if predicted != None and predicted != example.getclass():
                noisy_indices.append(int(example["doc_id"].value))

        # Progress report as a percentage of processed folds.
        print(str(int((test_fold + 1) * 1.0 / folds * 100)) + "/100")

    print(noisy_indices)
    return noisy_indices