-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
191 lines (161 loc) · 8.99 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# Generate a JSON file with the author's publications, including DOI and Impact Factor
import sys
import time
import json
import os
from urllib.parse import urlparse
from scholarly import scholarly
from journal_impact_factor import load_impact_factor, add_impact_factor
from doi import get_doi, get_doi_from_title, get_doi_link, get_doi_resolved_link, get_doi_short, get_doi_short_link, are_urls_equal
from standardise import standardise_authors
from logger import print_error, print_warn, print_info, print_misc
from news_scraper import get_news_data
# --- Script setup -----------------------------------------------------------
# Exactly one command-line argument is expected: the Google Scholar author ID.
if len(sys.argv) != 2:
    print_error("Usage: python main.py scholar_id\nExample: python main.py ynWS968AAAAJ")
    sys.exit(1)
scholar_id = sys.argv[1]

# Journal-name -> impact-factor lookup used when enriching publications.
journal_impact_factor_dic = load_impact_factor()
print_info(f"Loaded {len(journal_impact_factor_dic)} impact factors from Google Sheet.")

# Output lives in scholar_data/<scholar_id>.json; create the folder on first run.
if not os.path.exists("scholar_data"):
    os.makedirs("scholar_data")
file_path = os.path.join("scholar_data", f"{scholar_id}.json")

# Load previous data, if available; an empty dict means "no earlier run".
previous_data = {}
if os.path.exists(file_path):
    with open(file_path) as previous_file:
        previous_data = json.load(previous_file)
    print_info(f"Loaded previous data for {scholar_id}.")
# --- Main pipeline ----------------------------------------------------------
# Fetch the author from Google Scholar, fill coauthors, enrich every
# publication with DOI details and an impact factor, append news data and
# write everything to `file_path` as JSON.
#
# Any failure inside the pipeline is reported through print_error and ends the
# run; sys.exit() raises SystemExit, which the handlers below do not catch.
try:
    print_misc(f"Getting author with ID: {scholar_id}")
    print_misc("This script will take a while to complete due to the rate limits of the scraping website and APIs used.")
    author = scholarly.search_author_id(scholar_id)
    if not author:
        print_error("Author not found")
        sys.exit(1)  # Exit if no author found
    author = scholarly.fill(author)

    # Fill coauthors data
    print_misc("Fetching coauthors data...")
    if 'coauthors' in author:
        filled_coauthors = []
        for coauthor in author['coauthors']:
            try:
                print_misc(f"Fetching data for coauthor: {coauthor.get('name', 'Unknown')}")
                filled_coauthor = scholarly.fill(coauthor)
                filled_coauthors.append(filled_coauthor)
                time.sleep(2)  # Be nice to Google Scholar
            except Exception as e:
                # Best effort: one failing coauthor must not abort the whole run.
                print_warn(f"Error fetching data for coauthor {coauthor.get('name', 'Unknown')}: {str(e)}")
                filled_coauthors.append(coauthor)  # Add unfilled data if there's an error
        author['coauthors'] = filled_coauthors

    # Publications saved by an earlier run (possibly only a prefix, if that
    # run was interrupted).
    previous_publications = previous_data.get('publications', []) or []
    # Venue keywords for which DOI and impact-factor lookups are skipped.
    skipped_venue_keywords = ("symposium", "conference", "workshop", "annual meeting")
    # Journals already reported as missing during THIS run; kept outside the
    # loop so each journal is added to the sheet only once.
    missing_journals = set()
    filled_publications = []
    publication_count = len(author["publications"])

    # Process each publication
    for index, pub in enumerate(author["publications"]):
        print_misc(f"Processing publication {index+1}/{publication_count}: {pub['bib']['title']}")

        # If already in the JSON file, reuse that entry but refresh its Impact Factor.
        # NOTE(review): entries are matched purely by list position; this assumes
        # Google Scholar returns publications in a stable order - confirm.
        if index < len(previous_publications):
            filled_pub = previous_publications[index]
            print_misc(f"Data already found for {filled_pub.get('pub_url', filled_pub.get('bib', {}).get('title', ''))}. Using existing data but updating Impact Factor.")
            journal_name = filled_pub.get('bib', {}).get('journal', '')
            if journal_name == "Null":
                journal_name = ''
            journal_name = journal_name.strip().lower()
            if journal_name in journal_impact_factor_dic:
                filled_pub['bib']['impact_factor'] = journal_impact_factor_dic[journal_name]
            filled_publications.append(filled_pub)
            continue

        # No cached entry for this position: fetch everything fresh.
        filled_pub = scholarly.fill(pub)
        pub_title = filled_pub.get('bib', {}).get('title', '')
        pub_url = filled_pub.get('pub_url', '')
        journal_name = filled_pub.get('bib', {}).get('journal', '')
        if journal_name == "Null":
            journal_name = ''
        print_misc(f"Journal name: {journal_name}")

        # Standardise author names
        authors = filled_pub.get('bib', {}).get('author', '')
        filled_pub['bib']['authors_standardised'] = standardise_authors(authors)

        # Compare case-insensitively: the keywords are lowercase while the
        # journal name is still in its original casing at this point.
        journal_name_lowered = journal_name.lower()
        if any(keyword in journal_name_lowered for keyword in skipped_venue_keywords):
            print_warn(f"Skipping DOI and Impact Factor for symposium, conference, workshop, or annual meeting: {journal_name}")
            filled_pub['doi'] = ""
            filled_pub['doi_link'] = ""
            filled_pub['doi_short'] = ""
            filled_pub['doi_short_link'] = ""
            filled_pub['bib']['impact_factor'] = ""
        else:
            # Get DOI. Cached entries never reach this branch (they `continue`
            # above), so there is no previous DOI data to consult here.
            print_misc(f"Getting DOI for {pub_url}")
            doi_link = None
            doi_short = None
            doi_short_link = None
            resolved_link = None

            author_last_name = author['name'].split()[-1]
            # A Scholar cluster URL carries no publisher page to scrape, e.g.
            # https://scholar.google.com/scholar?cluster=4186906934658759747&hl=en&oi=scholarr
            # so the DOI is searched by title instead.
            host = urlparse(pub_url).hostname
            if host == "scholar.google.com" and pub_title:
                doi = get_doi_from_title(pub_title, author_last_name)
            else:
                doi = get_doi(pub_url, author_last_name)

            if not doi:
                # NOTE(review): the message announces a title-based retry, but
                # no further lookup is performed after this point.
                print_warn("DOI not found. Trying to get DOI from the publication title.")
            else:
                print_info(f"DOI: {doi}")
                doi_link = get_doi_link(doi)
                print_misc(f"DOI link: {doi_link}")
                resolved_link = get_doi_resolved_link(doi)
                print_misc(f"DOI Resolves to: {resolved_link}")
                if not are_urls_equal(pub_url, resolved_link):
                    print_warn(f"Resolved link does not match publication URL:\n{pub_url}\n{resolved_link}")
                doi_short = get_doi_short(doi)
                print_misc(f"Short DOI: {doi_short}")
                doi_short_link = get_doi_short_link(doi_short)

            # Add DOI details to the publication; missing values become "".
            filled_pub['doi'] = doi or ""
            filled_pub['doi_short_link'] = doi_short_link or ""
            filled_pub['doi_short'] = doi_short or ""
            filled_pub['doi_link'] = doi_link or ""
            filled_pub['doi_resolved_link'] = resolved_link or ""

            # Get Impact Factor
            impact_factor = None
            if journal_name:
                journal_name = journal_name.strip().lower()  # Ensure journal name is lowercase for lookup
                if journal_name in journal_impact_factor_dic:
                    impact_factor = journal_impact_factor_dic[journal_name]
                elif journal_name not in missing_journals:
                    print_warn("TODO: Implement a search function if the journal name isn't exactly the same - e.g., levenshtein. OR FILL OUT IMPACT FACTOR SHEET.")
                    print_error(f"Missing impact factor for {journal_name}. Adding to Google Sheet so you can add.")
                    missing_journals.add(journal_name)
                    add_impact_factor(journal_name, '')
            else:
                print_warn("Journal name not found.")
            filled_pub['bib']['impact_factor'] = impact_factor

        # Add to list of processed publications
        filled_publications.append(filled_pub)

        # Checkpoint after every freshly fetched publication. Only the
        # publications processed so far are written, so an interrupted run
        # resumes at the first unprocessed one instead of treating
        # not-yet-enriched entries as cached.
        checkpoint = {**author, "publications": filled_publications}
        with open(file_path, "w") as f:
            json.dump(checkpoint, f, indent=4)
        print_misc("Sleeping for 1 second... Being polite with the rate of requests to Google Scholar.")
        time.sleep(1)

    # Update the author data with processed publications
    author["publications"] = filled_publications

    # Get news and RSS data
    print_info("Fetching news data...")
    rss_data = get_news_data(author['name'])
    author.update(rss_data)

    # Write author to file in JSON format
    with open(file_path, "w") as f:
        json.dump(author, f, indent=4)
    print_info(f"DONE. Author data written to {file_path}")
except AttributeError as e:
    print_error(f"AttributeError: {e}")
except Exception as e:
    print_error(f"An error occurred: {e}")