-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdomains_from_pubmed.py
240 lines (200 loc) · 9.55 KB
/
domains_from_pubmed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
from xml.dom import minidom
__author__ = 'matic'
from Bio import Entrez,Medline
import time
Entrez.email = "[email protected]" # Always tell NCBI who you are
#term="(\"Arabidopsis\" OR \"Oryza\" OR \"Solanum\" OR \"Nicotiana\" OR \"plant\") AND (\"signalling\" OR \"signaling\" OR \"defence\" OR \"defense\" OR \"ethylene\" OR \"jasmonate\" OR \"jasmonic acid\" OR \"salicylate\" OR \"salicylic acid\" OR \"pathogen\" OR \"virus\")"
# Prefix that restricts every PMC query below to the open-access subset.
oa = "open access[filter] AND "

# Query pairs used in earlier experiments, kept for reference.
#1. try
#term12="(redox OR reduction OR oxidation) AND (potential OR state)"
#term11="open access[filter] AND \"Arabidopsis thaliana\" AND (defence OR defense OR ethylene OR jasmonate OR \"jasmonic acid\" OR \"salicylate\" OR \"salicylic acid\" OR pathogen OR virus)"
#2. try
#term11='("Arabidopsis thaliana" OR Oryza OR Solanum OR Nicotiana) AND ("defence" OR "defense" OR "ethylene" OR "jasmonate" OR "jasmonic acid" OR "salicylate" OR "salicylic acid" OR "pathogen" OR "virus")'
#term12="((redox AND (potential OR state)) OR (reactive AND oxigen AND species))"

# Current query pair: domain 1 (glioblastoma) and domain 2 (AML).
term11 = '(Glioma OR glioblastoma OR "glioblastoma multiforme2" OR GBM OR "brain cancer")'
term12 = '(AML OR "acute myeloid leukemia")'
print("D1: " + term11)
print("D2: " + term12)
#term+" AND fulltext[filter]"


def _report_pmc_count(label, query):
    """Search open-access PMC for *query* over the last 10 years.

    Prints ``label`` followed by the hit count and returns the parsed
    esearch result.
    """
    result = Entrez.read(
        Entrez.esearch(
            db="pmc",
            term=oa + query,
            reldate=365 * 10,
            datetype="pdat",
            usehistory="y",
        )
    )
    print("%s %s" % (label, result["Count"]))
    return result


# Hit counts for each domain and for their overlap; `search_results` ends up
# holding the result of the last (combined) search, as before.
search_results = _report_pmc_count("D1 count", term11)
search_results = _report_pmc_count("D2 count", term12)
search_results = _report_pmc_count("D1+D2 count", term11 + " AND " + term12)
def abstracts_for_terms(filename, term1, klass1, term2, klass2, last_x_years=1, limit=10000000000):
    """Fetch abstracts for two queries and write them as labelled documents.

    Abstracts are downloaded with ``abstracts_for_term`` for ``term1`` and
    ``term2``. Two tab-separated files are written, named after ``filename``
    with its extension removed:

    * ``<base>.txt`` -- abstracts found by only one of the queries, one per
      line as ``<n>\\t!<klass>\\t<abstract>``, labelled ``klass1`` or
      ``klass2``.
    * ``<base>_domain_intersetion.txt`` -- abstracts found by both queries,
      labelled ``INTERS``.

    Args:
        filename: Output file name; only the part before the extension is used.
        term1: Query for the first domain.
        klass1: Label written for documents found only by ``term1``.
        term2: Query for the second domain.
        klass2: Label written for documents found only by ``term2``.
        last_x_years: How many years back to search.
        limit: Upper bound on the number of records fetched per query.
    """
    # splitext keeps dots in directory names intact (split(".")[0] did not).
    base = os.path.splitext(filename)[0]
    Entrez.email = "[email protected]"  # Always tell NCBI who you are

    # Use the caller's queries (not the module-level term11/term12) and the
    # abstract downloader, which returns the [pmid, abstract] pairs that the
    # loops below unpack.
    abstractsA = abstracts_for_term(filename, term=term1, last_x_years=last_x_years, limit=limit)
    abstractsC = abstracts_for_term(filename, term=term2, last_x_years=last_x_years, limit=limit)

    intersectionAC = set(pmid for pmid, _ in abstractsA) & set(pmid for pmid, _ in abstractsC)
    print("%i %i %i" % (len(abstractsA), len(abstractsC), len(intersectionAC)))

    # Documents that belong to exactly one domain, numbered consecutively.
    doc_no = 1
    with open(base + ".txt", "w") as out_handle:
        for klass, abstracts in ((klass1, abstractsA), (klass2, abstractsC)):
            for pmid, abstract in abstracts:
                if pmid not in intersectionAC:
                    out_handle.write(str(doc_no) + "\t!" + klass + "\t" + abstract + "\n")
                    doc_no += 1

    # Documents found by both queries. The file name spelling is kept as is
    # because other tooling may already read it.
    doc_no = 1
    with open(base + "_domain_intersetion.txt", "w") as out_handle_i:
        for pmid, abstract in abstractsA:
            if pmid in intersectionAC:
                out_handle_i.write(str(doc_no) + "\t!INTERS\t" + abstract + "\n")
                doc_no += 1
#abstracts_for_term(term="\"salicylic acid\" AND ethylene AND \"arabidopsis thaliana\"",klass="inters",initial_i=3289)
def abstracts_for_term(filename, term, last_x_years=3, limit=10000000000):
    """Download PubMed abstracts matching ``term``.

    The search covers publication dates within the last ``last_x_years``
    years. Records are fetched in MEDLINE text format in batches of 50 via
    the NCBI history server, and records without an abstract (``AB`` field)
    are skipped.

    Args:
        filename: Unused; kept so existing callers keep working.
        term: PubMed query string.
        last_x_years: How many years back to search.
        limit: Maximum number of search hits to fetch.

    Returns:
        A list of ``[pmid, abstract]`` pairs.
    """
    search_results = Entrez.read(
        Entrez.esearch(
            db="pubmed",
            term=term,
            reldate=365 * last_x_years,
            datetype="pdat",
            usehistory="y",
        )
    )
    count = min(int(search_results["Count"]), limit)
    print("Found %i results" % count)

    rez = []
    batch_size = 50
    for start in range(0, count, batch_size):
        end = min(count, start + batch_size)
        print("Going to download record %i to %i" % (start + 1, end))
        fetch_handle = Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            retstart=start,
            # Ask only for what is left, so the final batch cannot run past
            # `limit`.
            retmax=end - start,
            webenv=search_results["WebEnv"],
            query_key=search_results["QueryKey"],
        )
        try:
            for record in Medline.parse(fetch_handle):
                if "AB" in record:
                    rez.append([record["PMID"], record["AB"]])
        finally:
            # Release the connection even if parsing fails part-way.
            fetch_handle.close()
        # Pause between batches to stay gentle on the NCBI servers.
        time.sleep(5)
    return rez
import os.path
def fulltext_for_term(path, term, last_x_years=10, limit=1000000000000000, not_in=None):
    """Mirror the open-access PMC articles matching ``term`` into ``path``.

    Searches PMC (restricted by the module-level ``oa`` filter), removes
    files in ``path`` that do not correspond to a returned id (via
    ``clean_by_name``), then downloads every article that is not yet present
    as ``<path>/<id>.xml``.

    Args:
        path: Existing directory that holds the ``<id>.xml`` files.
        term: PMC query string; ``oa`` is prepended to it.
        last_x_years: How many years back to search.
        limit: Caps the total shown in progress messages.
            NOTE(review): the id list itself is not truncated to ``limit``
            (the slice was commented out in the original) -- confirm intent.
        not_in: Optional collection of ids that must not be downloaded.

    Returns:
        The full list of ids returned by the search.
    """
    search_results = Entrez.read(
        Entrez.esearch(
            db="pmc",
            term=oa + term,
            reldate=365 * last_x_years,
            retmax=100000,
            datetype="pdat",
            usehistory="y",
        )
    )
    count = min(int(search_results["Count"]), limit)
    print("Found %i results" % count)

    ids = search_results['IdList']
    clean_by_name(ids, path)

    # A set gives constant-time lookups; None/empty means "skip nothing".
    excluded = set(not_in) if not_in else set()

    for i, pmc_id in enumerate(ids):
        if pmc_id in excluded:
            continue
        file_name = path + "/" + pmc_id + ".xml"
        if os.path.exists(file_name):
            continue
        print("Going to download record %s, %i/%i" % (pmc_id, i + 1, count))
        # Fetch before opening the file: a failed download must not leave an
        # empty file behind that later runs would treat as already fetched.
        article = get_single_fullarticle(pmc_id)
        with open(file_name, 'w') as out_handle:
            out_handle.write(article)
        # Throttle requests to the NCBI servers.
        time.sleep(0.3)
    return ids
def get_single_fullarticle(pmc_id="3691888"):
    """Fetch one PMC article as XML.

    Args:
        pmc_id: PMC identifier of the article to download.

    Returns:
        The raw content read from the efetch response.
    """
    fetch_handle = Entrez.efetch(db="pmc", retmode="xml", id=pmc_id)
    try:
        return fetch_handle.read()
    finally:
        # Always release the connection, including when read() fails.
        fetch_handle.close()
def clean_by_name(ids, path):
    """Delete files in ``path`` whose base name is not one of ``ids``.

    The base name is the file name with its last four characters removed
    (for example the ``.xml`` suffix). The names selected for deletion and
    their number are printed before they are removed. Sub-directories are
    left alone, and the process working directory is never changed.

    Args:
        ids: Identifiers whose files must be kept.
        path: Directory to clean.

    Returns:
        None.
    """
    keep = set(ids)
    filelist = [
        f for f in os.listdir(path)
        if f[0:-4] not in keep and not os.path.isdir(os.path.join(path, f))
    ]
    print("%s %s" % (filelist, len(filelist)))
    for f in filelist:
        # Joined paths replace os.chdir(), so a failing remove cannot leave
        # the process stranded in another working directory.
        os.remove(os.path.join(path, f))
    return None
#abstracts_for_terms("document_raw.txt",term1="(\"Arabidopsis\" OR \"Oryza\" OR \"Solanum\" OR \"Nicotiana\" OR \"plant\") AND ( \"salicylate\" OR \"salicylic acid\")",klass1="SA",term2="(\"Arabidopsis\" OR \"Oryza\" OR \"Solanum\" OR \"Nicotiana\" OR \"plant\") AND (\"ethylene\")",klass2="ET")
#term="(\"Arabidopsis\" OR \"Oryza\" OR \"Solanum\" OR \"Nicotiana\" OR \"plant\") AND ( \"salicylate\" OR \"salicylic acid\")"
#print Entrez.esearch(db="pmc", term=term+" AND free fulltext\[filter\]",reldate=365*14, datetype="pdat", usehistory="y")
#intersection_ids=fulltext_for_term("intersection",term11+" AND "+term12,10)
#fulltext_for_term("domain1",term11,10,not_in=intersection_ids)
#fulltext_for_term("domain2",term12,10,not_in=intersection_ids)
#fulltext_for_term("tmp","",limit=100000) #random articles
"""
Mnemonic Description
25 AB Abstract
26 CI Copyright Information
27 AD Affiliation
28 IRAD Investigator Affiliation
29 AID Article Identifier
30 AU Author
31 FAU Full Author
32 CN Corporate Author
33 DCOM Date Completed
34 DA Date Created
35 LR Date Last Revised
36 DEP Date of Electronic Publication
37 DP Date of Publication
38 EDAT Entrez Date
39 GS Gene Symbol
40 GN General Note
41 GR Grant Number
42 IR Investigator Name
43 FIR Full Investigator Name
44 IS ISSN
45 IP Issue
46 TA Journal Title Abbreviation
47 JT Journal Title
48 LA Language
49 LID Location Identifier
50 MID Manuscript Identifier
51 MHDA MeSH Date
52 MH MeSH Terms
53 JID NLM Unique ID
54 RF Number of References
55 OAB Other Abstract
56 OCI Other Copyright Information
57 OID Other ID
58 OT Other Term
59 OTO Other Term Owner
60 OWN Owner
61 PG Pagination
62 PS Personal Name as Subject
63 FPS Full Personal Name as Subject
64 PL Place of Publication
65 PHST Publication History Status
66 PST Publication Status
67 PT Publication Type
68 PUBM Publishing Model
69 PMC PubMed Central Identifier
70 PMID PubMed Unique Identifier
71 RN Registry Number/EC Number
72 NM Substance Name
73 SI Secondary Source ID
74 SO Source
75 SFM Space Flight Mission
76 STAT Status
77 SB Subset
78 TI Title
79 TT Transliterated Title
80 VI Volume
81 CON Comment on
82 CIN Comment in
83 EIN Erratum in
84 EFR Erratum for
85 CRI Corrected and Republished in
86 CRF Corrected and Republished from
87 PRIN Partial retraction in
88 PROF Partial retraction of
89 RPI Republished in
90 RPF Republished from
91 RIN Retraction in
92 ROF Retraction of
93 UIN Update in
94 UOF Update of
95 SPIN Summary for patients in
96 ORI Original report in
97 """