import argparse
import re
import time

import requests
from bs4 import BeautifulSoup

from database.mock_database import DatabaseAuthorship
from scraper.article_scraper import GoogleScraper
from scraper.page_processor import PageProcessor
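
# Scrapes news articles into the mock authorship database. Two modes:
#   --dataset: fetch the most recent articles from one author's profile page
#              on a publication (currently only theguardian.com is supported),
#              e.g. --dataset --publication https://www.theguardian.com --author <profile-slug>
#   --query:   search news.google.com for articles matching a query,
#              e.g. --query "climate change"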


def preprocess_article(doc):
    # Collapse runs of spaces and tabs but keep newlines, so the paragraph
    # breaks produced by get_fulltext(separator="\n") survive preprocessing.
    return re.sub(r"[^\S\n]+", " ", doc)


def generate_author_dataset(site, author, narticles=10):
    """Write the `narticles` most recent articles by `author` into the mock database."""
    article_urls = []
    if "theguardian.com" in site:
        def get_listed_articles(url):
            # Fetch one page of the author's profile and collect its article links.
            try:
                page = requests.get(url, timeout=10).text
            except requests.RequestException:
                return None
            soup = BeautifulSoup(page, features="html.parser")
            article_links = soup.find_all("a")
            # Each article link appears to be listed twice on the page, so
            # take every other match to deduplicate.
            return [link.get("href") for link in article_links
                    if link.get("data-link-name") == "article"][::2]
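        # Walk the paginated profile until enough URLs are collected; stop as
        # soon as a page fails to load or repeats a link already seen.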
        pagenum = 1
        while len(article_urls) < narticles:
            time.sleep(0.2)  # rate-limit requests to the site
            url = f"{site}/profile/{author}?page={pagenum}"
            urls = get_listed_articles(url)
            if urls and not any(link in article_urls for link in urls):
                article_urls += urls
            else:
                break
            pagenum += 1
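    # Download each collected article, extract its full text and author,
    # and store the result in the mock database.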
    for article_url in article_urls[:narticles]:
        time.sleep(0.2)
        print(article_url)
        page = requests.get(article_url, timeout=10).text
        processor = PageProcessor(page)
        processed_page = preprocess_article(processor.get_fulltext(separator="\n"))
        print(len(processed_page.split("\n")))  # paragraph count, as a sanity check
        article_author = processor.get_author()  # renamed to avoid shadowing the `author` parameter
        DatabaseAuthorship.insert_article(processed_page, article_url, article_author)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", action="store_true", help="scrape articles in dataset creation mode, without Google News")
    parser.add_argument("--publication", type=str, help="URL of the news publication used in dataset mode")
    parser.add_argument("--author", type=str, help="the chosen author's profile slug on the publication")
    parser.add_argument("--narticles", type=int, default=10, help="maximum number of articles to scrape")
    parser.add_argument("--query", type=str, help="scrape articles in query mode, searching news.google.com for this value")
    args = parser.parse_args()
    if args.dataset:
        if (not args.publication) or (not args.author):
            parser.error("--dataset requires that both --author and --publication are set")
        generate_author_dataset(args.publication, args.author, args.narticles)
    else:
        if not args.query:
            parser.error("please provide a --query to search news.google.com")
        scraper = GoogleScraper(verbose=True)
        urls = scraper.find_news_urls_for_query(args.query, args.narticles)
        for url in urls:
            page = requests.get(url, timeout=10).text
            processor = PageProcessor(page)
            processed_page = preprocess_article(processor.get_fulltext(separator="\n"))
            author = processor.get_author()
            DatabaseAuthorship.insert_article(processed_page, url, author)