main.py
import re
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def request_url(url):
    # Create a request object with the given URL and a browser-like
    # User-Agent header (some sites reject urllib's default agent)
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Open the request and read the raw response bytes
    web_byte = urlopen(req).read()
    # Decode the bytes as UTF-8 to get the page HTML as a string
    webpage = web_byte.decode('utf-8')
    # Return the webpage string
    return webpage
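
# Note: urlopen raises urllib.error.HTTPError or URLError on a failed request;
# nothing below catches these, so a bad URL or network error stops the script.
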

def get_title_html(html):
    # Create a BeautifulSoup object with the HTML and the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    # Find the article header element and the H1 title element inside it
    title = soup.find("article").find(
        "div", {"class": "article-header"}).find("h1").text
    # Return the text content of the title element
    return title

def get_date_html(html):
    # Create a BeautifulSoup object with the HTML and the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    try:
        # Try to find the second paragraph element in the article header
        date = soup.find("article").find(
            "div", {"class": "article-header"}).find_all("p")[1].text
    except IndexError:
        # If there is no second paragraph element, fall back to the first one
        date = soup.find("article").find(
            "div", {"class": "article-header"}).find("p").text
    # Return the text content of the date element
    return date

def get_authors_html(html):
    # Create a BeautifulSoup object with the HTML and the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    # Initialize an empty list to store the author names
    authors_list = []
    # Find all the <a> elements whose href contains "author" and extract
    # the author name from the <span> inside each one
    for x in soup.find("article").find_all("a", {"href": re.compile("author")}):
        authors_list.append(x.find("span", {"class": "author-name"}).text)
    # Join the author names into a comma-separated string
    authors = ",".join(authors_list)
    # Return the comma-separated string of author names
    return authors

def get_txt_html(html):
    # Create a BeautifulSoup object with the HTML and the lxml parser
    soup = BeautifulSoup(html, 'lxml')
    # Use the article's first CSS class, minus its last two characters, as the
    # class-name stem shared by the article's inner divs
    stem = soup.find("article").attrs['class'][0][:-2]
    # Select the second div element with this class (assuming the first is a header)
    extract = soup.find("article").find_all(
        "div", {"class": re.compile(stem)})[1]
    # Find all elements with a class name containing "ql-align-justify"
    text = extract.find_all(class_=re.compile("ql-align-justify"))
    # Collect the text content of each of these elements
    paragraphs = []
    for x in text:
        paragraphs.append(x.text)
    # Join the paragraphs with newline characters to create the final output string
    output = "\n".join(paragraphs)
    return output

def save_to_txt(filename, lines):
    # Open the file for writing, overwriting any existing file with the same name
    with open(filename + ".txt", "w") as text_file:
        # Write each string in the list, followed by a newline character,
        # except after the last one
        for idx, val in enumerate(lines):
            text_file.write(val)
            if idx != len(lines) - 1:
                text_file.write("\n")
    # Return None, since the function only saves a file and does not return anything
    return None
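
# A possible next step for the TODO in the main block below (collecting the
# URLs of all the articles): a minimal sketch that reuses the functions above
# to scrape several articles in one run. The `urls` argument and the
# slug-derived filename are assumptions, not part of the original script.
def scrape_articles(urls):
    # For each article URL, download the page, extract the fields, and save
    # them to a .txt file named after the URL's last path segment (its slug)
    for url in urls:
        webpage = request_url(url)
        slug = url.rstrip("/").rsplit("/", 1)[-1]
        save_to_txt(slug, [
            get_title_html(webpage),
            get_date_html(webpage),
            get_authors_html(webpage),
            get_txt_html(webpage),
        ])
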
if __name__ == "__main__":
#### PENDIENTE: probar con otras URLs ####
#### PENDIENTE: conseguir URLs de todos los articulos ####
webpage = request_url(
"https://eltoque.com/diez-consejos-para-evitar-hackeos-en-tus-cuentas-de-internet")
title = get_title_html(webpage)
date = get_date_html(webpage)
author = get_authors_html(webpage)
text = get_txt_html(webpage)
save_to_txt("aver", [title, date, author, text])