-
Notifications
You must be signed in to change notification settings - Fork 0
/
textMining.py
91 lines (66 loc) · 2.18 KB
/
textMining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from Bio import Entrez
import math
import time
import random
# Pause for a random whole number of seconds (1 to 3, inclusive) once at
# start-up, before any other work is done.
# NOTE(review): presumably meant as politeness towards the NCBI servers --
# confirm; as written it runs only once, not before each request.
time.sleep(random.randint(1, 3))
def probability(abstracts, keywords):
    """Return the fraction of abstracts that contain every keyword.

    Args:
        abstracts: Sized collection with one entry per abstract; each entry
            is a container that supports the ``in`` operator (the script
            passes lists of extracted keywords).
        keywords: Re-iterable collection of terms that must all be present
            in an abstract for it to be counted.

    Returns:
        A float between 0.0 and 1.0. Returns 0.0 when ``abstracts`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    total = len(abstracts)
    if total == 0:
        # No documents at all: define the probability as zero so callers
        # can treat "no data" the same way as "never observed".
        return 0.0
    matches = sum(
        1 for words in abstracts if all(term in words for term in keywords)
    )
    return float(matches) / total
# Build two lookup tables from the gene list file:
#   kegg_names: KEGG id -> list of gene names (the first one is later used
#               as the display name when scores are stored)
#   name_kegg:  gene name -> KEGG id
# Line layout implied by the parsing below:
#   "<kegg_id>\t<name1>, <name2>, ...;<ignored remainder>"
kegg_names = {}
name_kegg = {}
# Raw string: same path as before, without the invalid "\P" escape sequence.
with open(r'C:\Users\PARK\genes.txt', 'r') as f:
    for line in f:
        # Only the part before the first ';' is parsed.
        fields = line.split(';')[0].split('\t')
        if len(fields) < 2:
            # Blank or malformed line (no tab separator): nothing to register.
            continue
        kegg_id = fields[0]
        # Drop empty names so that empty tokens can never match a gene.
        names = [name.strip() for name in fields[1].split(',')]
        names = [name for name in names if name]
        if not names:
            continue
        kegg_names[kegg_id] = names
        for name in names:
            name_kegg[name] = kegg_id
# Search PubMed for the disease term and download every matching record as
# plain text, one request per PubMed id.
disease = 'Thalassemia'.upper()
print('Download abstracts...')
# NOTE(review): this address looks like a scraped/obfuscated placeholder --
# replace it with a real contact address before running against NCBI.
Entrez.email = '[email protected]'
handle = Entrez.esearch(db='pubmed', term=disease, retmax=10000)
try:
    record = Entrez.read(handle)
finally:
    # Always release the network handle, even if parsing fails.
    handle.close()
pubmed_ids = record['IdList']
downloaded_abstracts = []
for cnt, pubmed_id in enumerate(pubmed_ids, start=1):
    print(cnt, '/', len(pubmed_ids))
    # 'abstract' (singular) is the documented PubMed rettype for plain-text
    # abstracts; the previous value 'abstracts' is not a documented rettype.
    fetch_handle = Entrez.efetch('pubmed', id=pubmed_id, retmode='text', rettype='abstract')
    try:
        downloaded_abstracts.append(fetch_handle.read())
    finally:
        fetch_handle.close()
# For every downloaded abstract, collect the keywords it mentions: the
# disease term itself (upper-cased) and the KEGG id of every known gene name.
# The result is one list of keywords per abstract, in download order.
keywords_in_abstract = []
for ab in downloaded_abstracts:
    keyword_box = []
    # Split on ANY whitespace: abstracts span several lines, so splitting on
    # ' ' alone fuses words across line breaks and produces empty tokens.
    for raw_word in ab.replace('.', ' ').split():
        # Prefer the token exactly as written; otherwise drop punctuation
        # stuck to its edges (e.g. "(HBB)," -> "HBB") before the lookup.
        if raw_word in name_kegg:
            w = raw_word
        else:
            w = raw_word.strip(',;:!?()[]{}"\'')
        if w.upper() == disease:
            keyword_box.append(w.upper())
        elif w in name_kegg:
            keyword_box.append(name_kegg[w])
    keywords_in_abstract.append(keyword_box)
# Score every gene against the disease with
#   log2( P(gene, disease) / (P(gene) * P(disease)) )
# where each probability is the fraction of abstracts containing the term(s).
# Genes for which any of the three probabilities is zero get no score.
print('Calculating MI....')
scores = {}
p_disease = probability(keywords_in_abstract, [disease])
for kegg_id, gene_names in kegg_names.items():
    p_gene = probability(keywords_in_abstract, [kegg_id])
    p_gene_disease = probability(keywords_in_abstract, [kegg_id, disease])
    if p_gene == 0 or p_disease == 0 or p_gene_disease == 0:
        # The ratio would be zero or undefined; leave this gene unscored.
        continue
    ratio = p_gene_disease / (p_gene * p_disease)
    # Scores are keyed by the first name listed for the gene.
    scores[gene_names[0]] = math.log2(ratio)
# Write the scores, highest first, as "<gene name>\t<score>" lines to the
# result file and echo each line to stdout.
# Raw string: same path as before, without the invalid "\P" escape sequence.
# The with-block closes the file even if a write fails part-way through.
with open(r'C:\Users\PARK\result.txt', 'w') as f2:
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    for key, score in ranked:
        row = key + '\t' + str(score)
        f2.write(row + '\n')
        print(row)