-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathjoram.py
83 lines (69 loc) · 2.59 KB
/
joram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from io import BytesIO
import requests
from bs4 import BeautifulSoup
import datetime
from PyPDF2 import PdfReader
from constants import PDF_GAS_PRICE_REGEX, JORAM_LINK, JORAM_PDF_LINK
from functions import replace_gas_keys_names
# Get the current date; it is used to format the JORAM URLs below.
current_date = datetime.datetime.now()
# JORAM URL for current year's PDFs
joram_current_year_url = JORAM_LINK.format(date=current_date)
# Get the HTML response for the URL. The timeout keeps an unresponsive
# server from blocking the import of this module indefinitely.
response = requests.get(joram_current_year_url, timeout=30)
html = response.text
# Parse the HTML content using BeautifulSoup
html_content = BeautifulSoup(html, "html.parser")
# Find all the links ending with '.pdf' from the HTML page. ``href=True``
# skips anchors that have no href attribute, which would otherwise raise
# KeyError on ``link["href"]``.
pdf_links = [
    link
    for link in html_content.find_all("a", href=True)
    if link["href"].endswith(".pdf")
]


def _pdf_link_date(link):
    """Return the ``YYYY-MM-DD`` date embedded in the link's href, or None.

    None is returned when the href contains no such pattern or when the
    matched digits do not form a valid calendar date.
    """
    match = re.search(r"\d{4}-\d{2}-\d{2}", link["href"])
    if match is None:
        return None
    try:
        return datetime.datetime.strptime(match.group(), "%Y-%m-%d")
    except ValueError:
        return None


# Sort the links based on the date in the href (oldest first). Links whose
# href carries no parseable date cannot be ordered, so they are left out
# instead of aborting the whole module import with an exception.
sorted_pdf_links = sorted(
    (link for link in pdf_links if _pdf_link_date(link) is not None),
    key=_pdf_link_date,
)
def get_pdf_content_lines(pdf_raw_data):
    """Yield the extracted text of a PDF one line at a time.

    ``pdf_raw_data`` holds the raw bytes of the PDF document. Pages are
    visited in the order the reader exposes them, and the text extracted
    from each page is split into lines with ``str.splitlines``.
    """
    with BytesIO(pdf_raw_data) as buffer:
        for page in PdfReader(buffer).pages:
            yield from page.extract_text().splitlines()
def read_pdf_prices(joram_current_year_url, max_prices=3):
    """Yield the gas price matches found in a downloaded PDF.

    The PDF is fetched, its text is scanned line by line, and
    ``match.groups()`` is yielded for every line that matches
    ``PDF_GAS_PRICE_REGEX``. Scanning stops once ``max_prices`` matches
    have been produced.

    Args:
        joram_current_year_url: URL of the PDF document to download.
        max_prices: Maximum number of matches to yield. Defaults to 3,
            the limit that used to be hard-coded.

    Yields:
        The tuple of regex groups for each matching line.
    """
    # The timeout prevents a stalled download from hanging the caller.
    response = requests.get(joram_current_year_url, timeout=30)
    discovered_prices = 0
    for line in get_pdf_content_lines(response.content):
        if discovered_prices >= max_prices:
            break
        match = re.search(PDF_GAS_PRICE_REGEX, line)
        if match:
            discovered_prices += 1
            yield match.groups()
def retrieve_pdf_creation_date(pdf_url):
    """Download a PDF and return the creation date stored in its metadata.

    Args:
        pdf_url: URL of the PDF document.

    Returns:
        The ``creation_date`` attribute of the PDF metadata, or ``None``
        when the document carries no metadata at all.
    """
    # The timeout prevents a stalled download from hanging the caller.
    response = requests.get(pdf_url, timeout=30)
    with BytesIO(response.content) as f:
        metadata = PdfReader(f).metadata
        # ``metadata`` is None for PDFs without a document information
        # dictionary; guard so such files do not raise AttributeError.
        if metadata is None:
            return None
        return metadata.creation_date
def retrieve_newest_pdf_gas_info():
    """Return the gas prices from the newest PDF that contains any.

    The PDF links in ``sorted_pdf_links`` are tried from the last entry
    to the first until ``read_pdf_prices`` yields at least one price. If
    no PDF yields prices, the result describes the last PDF that was
    tried, with whatever ``replace_gas_keys_names`` returns for an empty
    mapping.

    Returns:
        A dict with two keys: ``gas_info`` (the extracted prices after
        ``replace_gas_keys_names``) and ``creation_date`` (the creation
        date of the PDF the prices were read from).

    Raises:
        LookupError: If ``sorted_pdf_links`` is empty, i.e. there is no
            PDF to read prices from.
    """
    if not sorted_pdf_links:
        # Previously this case surfaced as an UnboundLocalError further down.
        raise LookupError("No JORAM PDF links are available to read gas prices from")

    # Iterate from the end of the list without popping, so the shared
    # module-level list is not consumed and repeated calls start from
    # the newest PDF every time.
    for pdf_link in reversed(sorted_pdf_links):
        newest_pdf_filename = pdf_link["href"].split("/")[-1]
        newest_pdf_joram = JORAM_PDF_LINK.format(
            date=current_date, file=newest_pdf_filename
        )
        gas_prices = dict(read_pdf_prices(newest_pdf_joram))
        if gas_prices:
            break

    gas_prices = replace_gas_keys_names(gas_prices)
    return dict(
        gas_info=gas_prices,
        creation_date=retrieve_pdf_creation_date(newest_pdf_joram),
    )