-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathozstar18.py
executable file
·127 lines (95 loc) · 4.32 KB
/
ozstar18.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import bibtexparser as bibp
import requests
import os
# Load the bibliography once; the rest of the script reads ``bibdb.entries``.
# NOTE(review): ``common_strings=True`` is passed straight to bibtexparser --
# presumably so predefined string macros (month names) resolve; confirm
# against the bibtexparser documentation.
bib_parser = bibp.bparser.BibTexParser(common_strings=True)
with open('files/ozstar18.bib') as bibfile:
    bibdb = bib_parser.parse_file(bibfile)
# --- Unique authors ----------------------------------------------------------
# Walk every bibliography entry, reduce each author to [surname, first name],
# keep one record per (surname, first initial) pair, write the records to
# ``authors_unique.dat`` as "Surname,Name" lines and print how many there are.

# Characters deleted from a raw author string: BibTeX grouping braces and the
# punctuation of LaTeX accent commands.  The typographic apostrophe (U+2019)
# is deleted together with the ASCII one so both spellings of a name end up
# under the same key.
_AUTHOR_STRIP_CHARS = ('{', '}', '\'', '\"', '\\', '^', u"\u2019")


def _split_author_field(author_field):
    """Split a BibTeX ``author`` value into the individual raw names."""
    # Collapse newlines/runs of blanks first so the exact BibTeX separator
    # " and " can be used.  Splitting on "and " without the leading blank
    # would cut names such as "Ferdinand" in half and produce empty pieces.
    flattened = ' '.join(author_field.split())
    return [piece for piece in flattened.split(' and ') if piece]


def _parse_author(raw_name):
    """Return ``[surname, first_name]`` for one raw BibTeX author name."""
    cleaned = raw_name.strip()
    # A comma-less name that ends in a closing brace is turned into
    # "surname," so it gets an empty first name (presumably these are
    # collaboration names -- verify against the .bib file).
    if ',' not in cleaned and cleaned.endswith('}'):
        cleaned = cleaned[:-1] + ','
    for char in _AUTHOR_STRIP_CHARS:
        cleaned = cleaned.replace(char, '')
    parts = cleaned.strip().split(',')
    surname = parts[0].strip()
    # Names without any comma keep a single blank as placeholder first name;
    # anything after a second comma is ignored.
    first_name = parts[1].strip() if len(parts) > 1 else ' '
    return [surname, first_name]


unique_authors = []
total_authors = 0
unique_authors_count = 0
unique_authors_dict = dict()  # surname -> list of [surname, first_name]

for entry in bibdb.entries:
    # Entries without an ``author`` field contribute no names.
    for raw_author in _split_author_field(entry.get('author', '')):
        total_authors += 1
        author = _parse_author(raw_author)
        lastname = author[0]
        print("matching--------------------------------")
        print("last name: {0}".format(lastname))
        if lastname not in unique_authors_dict:
            unique_authors_dict[lastname] = []
            print("{0} new author".format(lastname))
        # Same surname and same first initial count as the same person.
        # Slicing with [:1] instead of indexing with [0] keeps an empty
        # first name comparable rather than raising IndexError.
        initial = author[1][:1]
        exists = any(initial == known[1][:1]
                     for known in unique_authors_dict[lastname])
        if exists:
            print("{0} {1} exists".format(author[0], author[1]))
        else:
            unique_authors_dict[lastname].append(author)
        print("--------------------------------------------------------------------------------------------------------------------")

with open('authors_unique.dat', 'w') as author_file:
    author_file.write('Surname,Name\n')
    for known_names in unique_authors_dict.values():
        for name in known_names:
            unique_authors_count += 1
            author_file.write(','.join(name) + '\n')
print(unique_authors_count)
# count institutions and create file with list of unique institutions
#
# For every entry that has an ``adsurl`` the page is downloaded, the text
# between ``start_tag`` and ``end_tag`` is cut out and split at the two-letter
# markers ("AA(", "AB(", ...).  The first three ", "-separated fields of each
# piece identify an institution; unique ones are written to
# ``institutions18.dat`` and their number is printed.
start_tag = '<b>Affiliation:</b></td><td><br></td><td align="left" valign="top">'
end_tag = '</td></tr>\n<tr><td valign="top" align="left"><b>Publication:</b></td><td><br></td><td align="left" valign="top">'
dividers = ['AA', 'AB', 'AC', 'AD', 'AE', 'AF', 'AG', 'AH', 'AI', 'AJ', 'AK', 'AL', 'AM', 'AN', 'AO', 'AP', 'AQ', 'AR',
            'AS', 'AT', 'AU', 'AV', 'AZ', 'AX', 'AY', 'AW', 'BA', 'BB', 'BC', 'BD', 'BE', 'BF', 'BG', 'BH', 'BI', 'BJ',
            'BK', 'BL', 'BM', 'BN', 'BO', 'BP', 'BQ', 'BR', 'BS', 'BT', 'BU', 'BV', 'BZ', 'BX', 'BY', 'BW']
# Seconds to wait for a page; without a timeout a stalled server would block
# the script indefinitely.
_REQUEST_TIMEOUT = 30

unique_institutions = []
# Tuple copies of the stored institutions, for constant-time duplicate checks.
_seen_institutions = set()
for entry in bibdb.entries:
    try:
        # print(entry['title'])
        entry_url = entry['adsurl']
    except KeyError as e:
        # Entry has no link to fetch: report the missing key and move on.
        print(e.args)
        continue
    try:
        page = requests.get(entry_url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # One unreachable page must not abort the whole run.
        print(e.args)
        continue
    html = page.text
    if start_tag not in html:
        continue
    entry_affiliations = html.split(start_tag)[1].split(end_tag)[0]
    # Turn every "XX(" marker into one common separator.
    for div in dividers:
        entry_affiliations = entry_affiliations.replace(div + '(', '&&&')
    for affil_long in entry_affiliations.split('&&&'):
        # The text in front of the first marker (and any other blank piece)
        # is not an institution.
        if not affil_long.strip():
            continue
        fields = affil_long.split(', ')
        affil = fields[:3] if len(fields) > 2 else [affil_long, '', '']
        key = tuple(affil)
        if key not in _seen_institutions:
            _seen_institutions.add(key)
            unique_institutions.append(affil)

# Mode 'w' replaces any previous file; only the first two fields are written.
with open('institutions18.dat', 'w') as institution_file:
    for institution in sorted(unique_institutions):
        institution_file.write(str(institution[0]) + ', ' + str(institution[1]) + '\n')
print('No of unique Institutions', len(unique_institutions))
# Runs only when the file is executed as a script: the interpreter sets
# ``__name__`` to '__main__' in that case (it is never 'main').
if __name__ == '__main__':
    print('main')