update

piebro · May 11, 2024 · eb98a53 · eb98a53
1 parent e1fc14e
commit eb98a53
Show file tree

Hide file tree

Showing 5 changed files with 125 additions and 139 deletions.
diff --git a/eva_list.txt b/eva_list.txt
@@ -0,0 +1,108 @@
+08000001
+08000010
+08000013
+08000023
+08000025
+08011201
+08011306
+08011102
+08011160
+08010255
+08011118
+08011113
+08010405
+08010406
+08010036
+08010404
+08000036
+08000038
+08000041
+08000044
+08000049
+08000050
+08000055
+08010184
+08010073
+08000068
+08000080
+08010085
+08010089
+08000086
+08000082
+08001580
+08000085
+08010101
+08000098
+08000105
+08002041
+08010113
+08000107
+08000115
+08000114
+08000118
+08000124
+08000128
+08000142
+08010159
+08002548
+08002549
+08002553
+08000147
+08000149
+08000150
+08000152
+08000156
+08000157
+08000162
+08000169
+08000183
+08000189
+08000191
+08000193
+08003200
+08000199
+08000206
+08000207
+08003368
+08000217
+08010205
+08000237
+08000236
+08000238
+08010224
+08000240
+08000244
+08000253
+08000261
+08000262
+08004158
+08000263
+08000271
+08000274
+08000275
+08000284
+08000286
+08000290
+08000291
+08000294
+08000297
+08000299
+08000302
+08012666
+08000309
+08000316
+08000320
+08010304
+08000323
+08000073
+08000087
+08000096
+08000134
+08000141
+08000168
+08000170
+08000250
+08006552
+08000257
+08000266
+08000260
diff --git a/eva_name_list.txt b/eva_name_list.txt
diff --git a/fetch_data.py b/fetch_data.py
@@ -42,16 +42,16 @@ def main():
     save_folder = Path("data") / date_str
     save_folder.mkdir(exist_ok=True)
 
-    with Path("eva_name_list.txt").open("r") as f:
-        eva_name_list = [line.split(",") for line in f.read().split("\n")]
+    with Path("eva_list.txt").open("r") as f:
+        eva_list = f.read().split("\n")
 
     curent_hour = datetime.now().hour
-    for eva, name in eva_name_list:
+    for eva in eva_list:
         formatted_fchg_url = fchg_url.format(eva=eva)
         save_api_data(formatted_fchg_url, save_folder / f"{eva}_fchg_{curent_hour:02}.xml", prettify=False)
 
     print("curent_hour:", curent_hour)
-    for eva, name in eva_name_list:
+    for eva in eva_list:
         for hour in range(curent_hour, curent_hour + 6): # fetch this hour and the next 5 hours
             hour = hour % 24
             formatted_plan_url = plan_url.format(eva=eva, date=date_str_url, hour=f"{hour:02}")

diff --git a/save_eva_name_list.py → save_eva_list.py b/save_eva_name_list.py → save_eva_list.py
@@ -12,15 +12,15 @@
 with open('20141001_IBNR.pdf', 'rb') as file:
     reader = PyPDF2.PdfReader(file)
     num_pages = len(reader.pages)
-    all_text = ""
+    IBNR_text = ""
 
     for i in range(num_pages):
         page = reader.pages[i]
-        all_text += page.extract_text() + "\n"
+        IBNR_text += page.extract_text() + "\n"
 
 # create name and eva list and do some string manipulations to match the different names in the PDFs
-eva_name_list = []
-for eva_name in all_text.split("\n")[2:-1]:
+eva_list = []
+for eva_name in IBNR_text.split("\n")[2:-1]:
     eva_name_split = eva_name.split(" ")
     name = " ".join(eva_name_split[:-1])
     # TODO: do better fuzzy matching of the names
@@ -32,14 +32,12 @@
         name = name.replace("-", " ").replace("(", " (").replace(")", ") ").replace(")  ", ") ").rstrip()
 
     eva = eva_name_split[-1]
-    if name in biggest_stations:
-        eva_name_list.append((f"{eva},{name}"))
-    elif name.replace(" Hbf", "") in biggest_stations:
-        eva_name_list.append((f"{eva},{name.replace(' Hbf', '')}"))
-    elif f"{name} Hbf" in biggest_stations:
-        eva_name_list.append((f"{eva},{name} Hbf"))
+    if eva == "08005589": # this is an error, there are two "Solingen Hbf"
+        continue
+
+    if (name in biggest_stations) or (name.replace(" Hbf", "") in biggest_stations) or (f"{name} Hbf" in biggest_stations):
+        eva_list.append(eva)
 
-    # TODO: there is a but that there are two "Solingen Hbf, only the lower number is right"
 
 with open("eva_name_list.txt", "w") as f:
-    f.write("\n".join(eva_name_list))
+    f.write("\n".join(eva_list))
diff --git a/update_data_csv.py b/update_data_csv.py
@@ -2,18 +2,10 @@
 import xml.etree.ElementTree as ET
 from pathlib import Path
 
-def get_eva_to_name_dict():
-    with Path("eva_name_list.txt").open("r") as f:
-        eva_to_name = {line.split(",")[0]: line.split(",")[1] for line in f.read().split("\n")}
-    return eva_to_name
-
-def get_plan_xml_rows(xml_path, eva_to_name):
-    eva = xml_path.name.split("_")[0]
-    station = eva_to_name[eva]
-    # TODO: get station name from the plan file. Then I always have the "official names". Delete the names from the eva list.
-
+def get_plan_xml_rows(xml_path):
     tree = ET.parse(xml_path)
     root = tree.getroot()
+    station = root.get('station')
     rows = []
     for s in root.findall('s'):
         s_id = s.get('id')
@@ -63,12 +55,11 @@ def get_plan_xml_rows(xml_path, eva_to_name):
     return rows
 
 def get_plan_db():
-    eva_to_name = get_eva_to_name_dict()
     rows = []
     for date_folder_path in Path("data").iterdir():
         for xml_path in sorted(date_folder_path.iterdir()):
             if "plan" in xml_path.name:
-                rows.extend(get_plan_xml_rows(xml_path, eva_to_name))
+                rows.extend(get_plan_xml_rows(xml_path))
 
     out_df = pd.DataFrame(rows)
     out_df['arrival_planned_time'] = pd.to_datetime(out_df['arrival_planned_time'], format='%y%m%d%H%M', errors='coerce')
@@ -101,9 +92,6 @@ def get_fchg_xml_rows(xml_path, id_to_data):
 
         if ar_ct is None and dp_ct is None and changed_platform is None and not stop_canceled:
             continue
-
-        if s_id == "2399764757688153611-2405090622-2":
-            print(ar_ct, dp_ct, stop_canceled)
 
         # overwrite older data with new data
         id_to_data[s_id] = {