# check_retractions.py
import json
import os
import re
from pathlib import Path

import bibtexparser
import pandas as pd
from fuzzywuzzy import fuzz
from github import Github

# URLs and tokens
RETRACTION_DB_URL = "https://gitlab.com/crossref/retraction-watch-data/-/raw/main/retraction_watch.csv"
GITHUB_REPO = os.getenv("GITHUB_REPOSITORY", "unknown/repo")
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# Load retraction database
def load_retraction_db():
    """
    Fetch and load the latest Retraction Watch database efficiently.

    Returns:
        A set of DOIs for fast lookup and a list of metadata records for fuzzy matching.
    """
    print("Fetching latest Retraction Watch database...")
    df_iter = pd.read_csv(RETRACTION_DB_URL, low_memory=False, iterator=True, chunksize=50000)
    doi_set = set()
    metadata_records = []
    for chunk in df_iter:
        # DOI column: used for exact-match lookups
        if "OriginalPaperDOI" in chunk.columns:
            chunk_dois = chunk["OriginalPaperDOI"].dropna().astype(str).str.strip().str.lower()
            doi_set.update(chunk_dois)
        # Metadata columns: used for fuzzy matching of entries without a usable DOI
        if all(col in chunk.columns for col in ["Author", "Title", "Journal", "OriginalPaperDate"]):
            chunk_metadata = chunk[["Author", "Title", "Journal", "OriginalPaperDate"]].fillna("").astype(str)
            for _, row in chunk_metadata.iterrows():
                normalized_title = normalize_text(row["Title"])
                normalized_authors = normalize_text(row["Author"])
                normalized_journal = normalize_text(row["Journal"])
                normalized_year = extract_year(row["OriginalPaperDate"])
                metadata_records.append((normalized_title, normalized_authors, normalized_journal, normalized_year))
    print(f"Loaded {len(doi_set)} retracted DOIs and {len(metadata_records)} metadata records for fuzzy matching.")
    return doi_set, metadata_records

# Normalize text using regex (remove special characters, lowercase)
def normalize_text(text):
    return re.sub(r"[^a-zA-Z0-9\s]", "", text).strip().lower()

# Extract year from date string
def extract_year(date_str):
    match = re.search(r"\b(19|20)\d{2}\b", date_str)  # Match years from 1900-2099
    return match.group(0) if match else ""

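# Illustrative sanity checks for the two normalizers (hypothetical inputs;
# the behavior follows directly from the regexes above):
#   normalize_text("The CRISPR/Cas9 Revolution!")  -> "the crisprcas9 revolution"
#   extract_year("2015-06-01")                     -> "2015"
# Note that accented characters are stripped outright ("Müller" -> "mller");
# since both the .bib entries and the Retraction Watch records pass through
# the same normalization, fuzzy comparisons stay consistent.
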
# Extract DOIs and metadata from .bib files
def extract_data_from_bib():
    dois = set()
    metadata_entries = []
    for root, _, files in os.walk("."):
        for file in files:
            if file.endswith(".bib"):
                print(f"Scanning {file} for DOIs and metadata...")
                with open(os.path.join(root, file), "r", encoding="utf-8") as bibfile:
                    bib_database = bibtexparser.load(bibfile)
                for entry in bib_database.entries:
                    doi = entry.get("doi") or entry.get("DOI")
                    if doi:
                        dois.add(doi.strip().lower())
                    # Extract metadata for fuzzy matching
                    title = normalize_text(entry.get("title", ""))
                    authors = normalize_text(entry.get("author", ""))
                    journal = normalize_text(entry.get("journal", ""))
                    year = extract_year(entry.get("year", ""))
                    metadata_entries.append((title, authors, journal, year))
    return dois, metadata_entries

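# For reference, the fields read above map onto a standard BibTeX entry such
# as this hypothetical one (any .bib file under the working tree is scanned):
#   @article{smith2020,
#     author  = {Smith, Jane and Doe, John},
#     title   = {A Study of Something},
#     journal = {Journal of Examples},
#     year    = {2020},
#     doi     = {10.1234/example},
#   }
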
# Perform fuzzy matching across title, authors, journal, and year
def fuzzy_match(metadata_entries, retraction_metadata, threshold=85):
    matched_titles = []
    for bib_title, bib_authors, bib_journal, bib_year in metadata_entries:
        for ret_title, ret_authors, ret_journal, ret_year in retraction_metadata:
            title_score = fuzz.partial_ratio(bib_title, ret_title)
            author_score = fuzz.partial_ratio(bib_authors, ret_authors)
            journal_score = fuzz.partial_ratio(bib_journal, ret_journal)
            year_match = (bib_year == ret_year)  # Exact year match
            # Strong match if title + (author OR journal) + year
            if title_score >= threshold and (author_score >= threshold or journal_score >= threshold) and year_match:
                print(f"⚠️ Strong retraction match: {bib_title} ~ {ret_title} (Title: {title_score}, Author: {author_score}, Journal: {journal_score})")
                matched_titles.append(bib_title)
                break  # Report each .bib entry at most once
    return matched_titles

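# fuzz.partial_ratio scores the best-aligned substring, so a short normalized
# title contained verbatim in a longer one scores 100, e.g. (illustrative):
#   fuzz.partial_ratio("gene editing study", "retraction gene editing study") == 100
# That permissiveness is why the check above also requires an author-or-journal
# score over the threshold plus an exact year match before flagging an entry.
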
# Write a shields.io endpoint-style badge JSON summarizing retraction status
def write_badge(retracted_dois_found, fuzzy_matches_found):
    badge_dir = Path(".github/badges")
    badge_dir.mkdir(parents=True, exist_ok=True)
    badge_path = badge_dir / "retraction_status.json"
    if retracted_dois_found or fuzzy_matches_found:
        badge_data = {
            "schemaVersion": 1,
            "label": "retractions",
            "message": "⚠️ cited",
            "color": "red",
        }
    else:
        badge_data = {
            "schemaVersion": 1,
            "label": "retractions",
            "message": "none cited",
            "color": "brightgreen",
        }
    with open(badge_path, "w") as f:
        json.dump(badge_data, f, indent=2)

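# The JSON written above follows the shields.io "endpoint" badge schema, so a
# README can render it with something like the snippet below (the raw-file URL
# is a placeholder for this repository's own path):
#   ![retractions](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/OWNER/REPO/main/.github/badges/retraction_status.json)
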
# Open a GitHub issue for detected retractions
def create_github_issue(retracted_dois, fuzzy_matches):
    if not GITHUB_TOKEN:
        print("GITHUB_TOKEN not set, skipping issue creation.")
        return
    repo = Github(GITHUB_TOKEN).get_repo(GITHUB_REPO)
    issue_title = "⚠️ Retracted Articles Detected in .bib Files"
    issue_body = "### Retracted Articles Found\n"
    if retracted_dois:
        issue_body += "\n**DOI Matches:**\n"
        issue_body += "\n".join([f"- [DOI: {doi}](https://doi.org/{doi})" for doi in retracted_dois])
    if fuzzy_matches:
        issue_body += "\n\n**Fuzzy Matches (Possible Retractions):**\n"
        issue_body += "\n".join([f"- {title}" for title in fuzzy_matches])
    # Update the existing issue if one is already open; otherwise create it
    existing_issues = repo.get_issues(state="open")
    for issue in existing_issues:
        if issue.title == issue_title:
            print("Retracted articles issue already exists. Updating issue.")
            issue.edit(body=issue_body)
            return
    repo.create_issue(title=issue_title, body=issue_body)

def main():
    retracted_dois, retraction_metadata = load_retraction_db()
    extracted_dois, metadata_entries = extract_data_from_bib()
    # DOI-based matches
    retracted_dois_found = extracted_dois.intersection(retracted_dois)
    # Fuzzy matching
    fuzzy_matches_found = fuzzy_match(metadata_entries, retraction_metadata)
    if retracted_dois_found or fuzzy_matches_found:
        print(f"Found {len(retracted_dois_found)} retracted DOIs and {len(fuzzy_matches_found)} fuzzy matches.")
        create_github_issue(retracted_dois_found, fuzzy_matches_found)
    else:
        print("No retracted articles found.")
    write_badge(retracted_dois_found, fuzzy_matches_found)

if __name__ == "__main__":
    main()

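# Example local run (the environment variables mirror what GitHub Actions
# injects; the values below are placeholders):
#   export GITHUB_REPOSITORY="owner/repo"
#   export GITHUB_TOKEN="<personal access token>"  # omit to skip issue creation
#   python check_retractions.py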