-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiagonalization_script.py
185 lines (144 loc) · 7.17 KB
/
diagonalization_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
# Script start-up: announce the start, switch to the data directory and pull
# in the helper modules. Statement order is kept on purpose: the working
# directory is changed before the project-local modules are imported.
print("start\tOK")
import os
import sys
import string
# Raw string literal: the path is full of backslashes. As a normal literal it
# only worked because none of \U \m \w \c \C \D is an escape sequence in a
# Python 2 byte string; under Python 3 "\U" is a syntax error, and renaming a
# folder to start with e.g. "t" or "n" would silently corrupt the path.
# NOTE(review): this directory is machine-specific; relative paths passed on
# the command line are resolved against it.
os.chdir(r'C:\Users\matic\workspace\crossbee\crossbee\Content\Data')
print(os.getcwd())
from collections import defaultdict
from math import log
from diagonalization_helpers import *
from pyroc import ROCData,plot_multiple_roc
import json
# Module-level defaults.
# `prefix` is a placeholder here; it is replaced further down by the directory
# passed on the command line. `every` is the sampling stride used while
# reading the dataset (a line is considered only when its index is a multiple
# of `every`; 1 means every line).
prefix, every = "", 1

# Disabled legacy inputs, kept for reference:
#   full_text_file = open(prefix + "document_raw.txt", 'r')
#   hevristic_scores = prefix + "HevrisitcsScores.txt"
#   fline = full_text_file.readline()

# Per-document containers, keyed by the running document number.
text_per_document = dict()
full_text_per_document = dict()
class_per_document = dict()

# Reminder on how to turn a string into a (large) integer:
#   >>> import hashlib
#   >>> int(hashlib.md5('Hello, world!').hexdigest(), 16)
#   144653930895353261282233826065192032313L
# (The original note "CONVERT DOCUMENT STRING TO" was left unfinished;
# presumably it referred to this conversion.)
count = 0
# Load the dataset.
# The first command-line argument names the directory that holds
# "documents.lndoc"; the same directory is used as the prefix for every file
# written later in the script.
if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: diagonalization_script.py <data_directory>")
prefix = sys.argv[1] + "/"
print("File in: " + prefix)
#prefix='C:\\Users\\matic\\AppData\\Local\\Temp\\aahpoqs5.lcp\\'

# Each non-empty line is tab-separated: field 1 is the class label and
# field 2 is the space-separated document text (field 0 is not used here).
text_per_document = {}
class_per_document = {}
i = 0      # index of the current line in the file
count = 0  # number of documents kept so far; used as the document key
# `with` guarantees the file is closed even if a malformed line raises.
with open(prefix + "documents.lndoc", 'r') as dataset_file:
    for line in dataset_file:
        if line != "\n" and i % every == 0:
            spl = line.split("\t")
            class_per_document[count] = spl[1]
            # Drop *all* empty tokens (produced by repeated or trailing
            # spaces); list.remove("") only removed the first one, so an
            # empty "word" could end up in the vocabulary.
            text_per_document[count] = [
                token for token in spl[2].split("\n")[0].split(" ") if token
            ]
            count += 1
        i += 1
# Distinct class labels, in arbitrary (set) order.
classes = list(set(class_per_document.values()))
#TF-IDF and BOW
# Vocabulary: every distinct token that occurs in any document.
words = set()
tf_idfs = {}
for tokens in text_per_document.values():
    words.update(tokens)

# Document frequency: in how many documents each word occurs. Going through
# set(...) makes a word count once per document regardless of repetitions.
word_count = defaultdict(int)
for tokens in text_per_document.values():
    for distinct_word in set(tokens):
        word_count[distinct_word] += 1

# Number of documents, used as the numerator of the idf term.
len_train_text = len(text_per_document)

# (word, document frequency) pairs, most widespread words first.
words_sorted_by_frequency = sorted(
    word_count.items(), key=lambda pair: pair[1], reverse=True)
#-----------------CALCULATE TF-IDFS-----------------
# For every document, weight each of its words by
#   tf * log(N / df)
# where tf is the raw count of the word in the document, N the number of
# documents and df the number of documents containing the word.
print("compute tf-idf")
for train in text_per_document:
    # Raw term frequencies of this document.
    term_frequency = defaultdict(int)
    for word in text_per_document[train]:
        term_frequency[word] += 1

    weights = {}
    for word in term_frequency:
        idf = log(len_train_text / float(word_count[word]))
        weights[word] = term_frequency[word] * idf
    tf_idfs[train] = weights

# Alphabetically ordered vocabulary; list positions serve as column indices
# for the rest of the script.
sorted_words = sorted(words)
#nums=0
#tfnnull=0
#max_tfidf=max([item for t in tf_idfs.values() for item in t.values()])
# Switches for the "inverse" matrix variants. Both are off, so only the plain
# branch (the former `else:` case) is live; the other two branches are kept
# below as commented-out reference code.
with_inverse = False
inverse_only = False
# if with_inverse:
#     write_to_init_file_inv(class_per_document,text_per_document,sorted_words,prefix+"init.dat_inv")
#     write_to_init_file(class_per_document,text_per_document,sorted_words,prefix+"init.dat")
#     _,col_perm_rev_inv=get_permutations(prefix,filename=prefix+"init_inv.dat")
#     col_perm_rev,row_perm_rev=get_permutations(prefix,filename=prefix+"init.dat")
# elif inverse_only:
#     col_perm_rev_inv=False
#     write_to_init_file_inv(class_per_document,text_per_document,sorted_words,prefix+"init.dat_inv")
#     col_perm_rev,row_perm_rev=get_permutations(prefix,filename=prefix+"init_inv.dat")
# else:
# NOTE(review): the disabled branches write "init.dat_inv" but read
# "init_inv.dat"; reconcile the file names before re-enabling them.

# Plain variant: no inverse column permutation is available.
col_perm_rev_inv = False
write_to_init_file(class_per_document, text_per_document, sorted_words,
                   prefix + "init.dat")
col_perm_rev, row_perm_rev = get_permutations(prefix)

# Invert the column permutation (swap keys and values).
col_perm = dict((value, key) for key, value in col_perm_rev.items())

# Same inversion for the inverse-variant permutation; stays False when that
# permutation was not computed.
col_perm_inv = (
    dict((value, key) for key, value in col_perm_rev_inv.items())
    if col_perm_rev_inv
    else False
)
#-----------------CROSSBEE SCORES-----------------
# Both score tables are switched off; False is what gets handed to
# draw_matrix for each of them.
jursic_word_score = False
max_word_score = False

#-----------------HEVRISTICS-----------------
from hevristic_functions import *

# Sanity print: vocabulary size and the sizes of the row/column permutations.
print("%d %d %d" % (len(sorted_words), len(row_perm_rev), len(col_perm_rev)))

# Per-word "green"/"blue" tallies over the whole matrix and over the diagonal
# band (exact meaning is defined by the helper functions; presumably one
# colour per document class).
greens_per_word, blues_per_word = calculate_colours(
    prefix, sorted_words, row_perm_rev, col_perm_rev,
    class_per_document, classes)
greens_on_diag_per_word, blues_on_diag_per_word = calculate_diag_colours(
    prefix, sorted_words, row_perm_rev, col_perm_rev,
    class_per_document, classes)

labels = ['Hevristika1', 'Hevristika2', 'Hevristika3', 'Hevristika4']
hevristics = [hevristic1, hevristic2, hevristic3, hevristic4]


def _both_colours_key(entry):
    """Return 1 when neither colour tally of the entry's word equals 0, else 0.

    `entry[0]` is the word. A word missing from a tally yields None, which is
    also "not equal to 0" and therefore counts as 1.
    """
    word = entry[0]
    if greens_per_word.get(word) != 0 and blues_per_word.get(word) != 0:
        return 1
    return 0


# One ranking per heuristic. The sort is stable, so within the "both colours"
# group and within the rest the heuristic's own order is preserved; the
# former group is moved to the front.
scores = []
for hevristic in hevristics:
    ranking = hevristic(greens_per_word, blues_per_word,
                        greens_on_diag_per_word, blues_on_diag_per_word)
    scores.append(sorted(ranking, key=_both_colours_key, reverse=True))
#b-term generation
# Disabled: a hand-curated b-term list, filtered to the words whose green and
# blue tallies are both non-zero. Kept for reference.
# if sell!="ideal_toy":
#     b_term_list=['5_ht','5_hydroxytryptamine','5_hydroxytryptamine_receptor','anti_aggregation','anticonvulsant','anti_inflammatory','antimigraine','arterial_spasm','brain_serotonin','calcium_antagonist','calcium_blocker','calcium_channel','calcium_channel_blocker','cerebral_vasospasm','convulsion','convulsive','coronary_spasm','cortical_spread_depression','diltiazem','epilepsy','epileptic','epileptiform','hypoxia','indomethacin','inflammatory','nifedipine','paroxysmal','platelet_aggregation','platelet_function','prostacyclin','prostaglandin','prostaglandin_e1','prostaglandin_synthesis','reactivity','seizure','serotonin','spasm','spread','spread_depression','stress','substance_p','vasospasm','verapamil']
#
#     b_terms=set(word for word in b_term_list if greens_per_word.get(word)!=0 and blues_per_word.get(word)!=0)
#
#     print [(word,greens_per_word.get(word),blues_per_word.get(word)) for word in b_term_list]
#     print b_term_list
#     print b_terms

# With the block above disabled, no b-terms are marked.
b_terms = set()
#-----------------DRAW IMAGES-----------------
# Image 1: the initial matrix, drawn with identity permutations and an empty
# mapping in the inverse-permutation slot.
identity_cols = identity_permutation(len(col_perm_rev))
identity_rows = identity_permutation(len(row_perm_rev))
draw_matrix(sorted_words, jursic_word_score, max_word_score,
            identity_cols, identity_rows, class_per_document,
            prefix + "1_inital", prefix + "init", b_terms, {}, classes)

# Images 2, 3 and 5: identical call shape, only the two file names differ
# (presumably output image name and input matrix name, both relative to
# `prefix`).
for image_name, matrix_name in (
    ("2_after_col_perm", "min_flips_output_2_columns_permuted_matrix"),
    ("3_banded_matrix", "min_flips_output_6_visual_banded_matrix"),
    ("5_after_row_perm", "min_flips_output_7_original_banded_matrix"),
):
    draw_matrix(sorted_words, jursic_word_score, max_word_score,
                col_perm_rev, row_perm_rev, class_per_document,
                prefix + image_name, prefix + matrix_name,
                b_terms, col_perm_inv, classes)
#-----------------GENERATE JAVASCRIPT FILE-----------------
#doc_outliers=find_domain_outliers(prefix,class_per_document)
#generate_js_file(sorted_words,jursic_word_score,max_word_score,col_perm_rev,row_perm_rev,class_per_document,text_per_document,prefix,b_terms,greens_per_word,blues_per_word,classes,col_perm_rev_inv,col_perm_inv,doc_outliers)
#-----------------PLOT ROC-----------------
# Print the ranking produced by the fourth heuristic and draw the final image.
print("Results")
#create image where word columns are sorted by hevristic4 scores
best_hevristic_scores_permutation = {}
hevristic_4_words_by_score = [a[0] for a in scores[3]]

# Rank of every word in the heuristic-4 ordering. Built once so the loop below
# does a constant-time lookup instead of `word in list` + `list.index`
# (quadratic in the vocabulary size). setdefault keeps the first occurrence,
# which is exactly what list.index returned.
rank_of_word = {}
for rank, ranked_word in enumerate(hevristic_4_words_by_score):
    rank_of_word.setdefault(ranked_word, rank)

# Words the heuristic did not score are numbered after all scored ones.
missing_j = len(hevristic_4_words_by_score)
for j, old_j in col_perm_rev.items():
    word = sorted_words[old_j]
    if word in rank_of_word:
        score_j = rank_of_word[word]
    else:
        score_j = missing_j
        missing_j += 1
    # NOTE(review): score_j is computed but never stored, so
    # best_hevristic_scores_permutation stays empty and the image drawn below
    # is not actually reordered by score. The intended mapping (between j,
    # old_j and score_j) depends on what draw_matrix expects in this slot -
    # TODO confirm and fill the dict here.

# One "<word>TAB<score>" line per ranked word (%d prints the integer part).
for word, score in scores[3]:
    print("%s\t%d" % (word, score))

draw_matrix(sorted_words, jursic_word_score, max_word_score,
            col_perm_rev, row_perm_rev, class_per_document,
            prefix + "6_after_scores_perm",
            prefix + "min_flips_output_7_original_banded_matrix",
            b_terms, best_hevristic_scores_permutation, classes)
#print json.dumps(word,score in scores[3])