Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
Piet Brömmel committed May 11, 2024
1 parent e1fc14e commit eb98a53
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 139 deletions.
108 changes: 108 additions & 0 deletions eva_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
08000001
08000010
08000013
08000023
08000025
08011201
08011306
08011102
08011160
08010255
08011118
08011113
08010405
08010406
08010036
08010404
08000036
08000038
08000041
08000044
08000049
08000050
08000055
08010184
08010073
08000068
08000080
08010085
08010089
08000086
08000082
08001580
08000085
08010101
08000098
08000105
08002041
08010113
08000107
08000115
08000114
08000118
08000124
08000128
08000142
08010159
08002548
08002549
08002553
08000147
08000149
08000150
08000152
08000156
08000157
08000162
08000169
08000183
08000189
08000191
08000193
08003200
08000199
08000206
08000207
08003368
08000217
08010205
08000237
08000236
08000238
08010224
08000240
08000244
08000253
08000261
08000262
08004158
08000263
08000271
08000274
08000275
08000284
08000286
08000290
08000291
08000294
08000297
08000299
08000302
08012666
08000309
08000316
08000320
08010304
08000323
08000073
08000087
08000096
08000134
08000141
08000168
08000170
08000250
08006552
08000257
08000266
08000260
108 changes: 0 additions & 108 deletions eva_name_list.txt

This file was deleted.

8 changes: 4 additions & 4 deletions fetch_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,16 @@ def main():
save_folder = Path("data") / date_str
save_folder.mkdir(exist_ok=True)

with Path("eva_name_list.txt").open("r") as f:
eva_name_list = [line.split(",") for line in f.read().split("\n")]
with Path("eva_list.txt").open("r") as f:
eva_list = f.read().split("\n")

curent_hour = datetime.now().hour
for eva, name in eva_name_list:
for eva in eva_list:
formatted_fchg_url = fchg_url.format(eva=eva)
save_api_data(formatted_fchg_url, save_folder / f"{eva}_fchg_{curent_hour:02}.xml", prettify=False)

print("curent_hour:", curent_hour)
for eva, name in eva_name_list:
for eva in eva_list:
for hour in range(curent_hour, curent_hour + 6): # fetch this hour and the next 5 hours
hour = hour % 24
formatted_plan_url = plan_url.format(eva=eva, date=date_str_url, hour=f"{hour:02}")
Expand Down
22 changes: 10 additions & 12 deletions save_eva_name_list.py → save_eva_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@
with open('20141001_IBNR.pdf', 'rb') as file:
reader = PyPDF2.PdfReader(file)
num_pages = len(reader.pages)
all_text = ""
IBNR_text = ""

for i in range(num_pages):
page = reader.pages[i]
all_text += page.extract_text() + "\n"
IBNR_text += page.extract_text() + "\n"

# create name and eva list and do some string manipulations to match the different names in the PDFs
eva_name_list = []
for eva_name in all_text.split("\n")[2:-1]:
eva_list = []
for eva_name in IBNR_text.split("\n")[2:-1]:
eva_name_split = eva_name.split(" ")
name = " ".join(eva_name_split[:-1])
# TODO: do better fuzzy matching of the names
Expand All @@ -32,14 +32,12 @@
name = name.replace("-", " ").replace("(", " (").replace(")", ") ").replace(") ", ") ").rstrip()

eva = eva_name_split[-1]
if name in biggest_stations:
eva_name_list.append((f"{eva},{name}"))
elif name.replace(" Hbf", "") in biggest_stations:
eva_name_list.append((f"{eva},{name.replace(' Hbf', '')}"))
elif f"{name} Hbf" in biggest_stations:
eva_name_list.append((f"{eva},{name} Hbf"))
if eva == "08005589": # this is an error, there are two "Solingen Hbf"
continue

if (name in biggest_stations) or (name.replace(" Hbf", "") in biggest_stations) or (f"{name} Hbf" in biggest_stations):
eva_list.append(eva)

# TODO: there is a bug: there are two "Solingen Hbf" entries, only the lower number is right

with open("eva_name_list.txt", "w") as f:
f.write("\n".join(eva_name_list))
f.write("\n".join(eva_list))
18 changes: 3 additions & 15 deletions update_data_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,10 @@
import xml.etree.ElementTree as ET
from pathlib import Path

def get_eva_to_name_dict():
with Path("eva_name_list.txt").open("r") as f:
eva_to_name = {line.split(",")[0]: line.split(",")[1] for line in f.read().split("\n")}
return eva_to_name

def get_plan_xml_rows(xml_path, eva_to_name):
eva = xml_path.name.split("_")[0]
station = eva_to_name[eva]
# TODO: get station name from the plan file. Then I always have the "official names". Delete the names from the eva list.

def get_plan_xml_rows(xml_path):
tree = ET.parse(xml_path)
root = tree.getroot()
station = root.get('station')
rows = []
for s in root.findall('s'):
s_id = s.get('id')
Expand Down Expand Up @@ -63,12 +55,11 @@ def get_plan_xml_rows(xml_path, eva_to_name):
return rows

def get_plan_db():
eva_to_name = get_eva_to_name_dict()
rows = []
for date_folder_path in Path("data").iterdir():
for xml_path in sorted(date_folder_path.iterdir()):
if "plan" in xml_path.name:
rows.extend(get_plan_xml_rows(xml_path, eva_to_name))
rows.extend(get_plan_xml_rows(xml_path))

out_df = pd.DataFrame(rows)
out_df['arrival_planned_time'] = pd.to_datetime(out_df['arrival_planned_time'], format='%y%m%d%H%M', errors='coerce')
Expand Down Expand Up @@ -101,9 +92,6 @@ def get_fchg_xml_rows(xml_path, id_to_data):

if ar_ct is None and dp_ct is None and changed_platform is None and not stop_canceled:
continue

if s_id == "2399764757688153611-2405090622-2":
print(ar_ct, dp_ct, stop_canceled)

# overwrite older data with new data
id_to_data[s_id] = {
Expand Down

0 comments on commit eb98a53

Please sign in to comment.