import argparse
import datetime
import glob
import json
import math
import os
import pathlib
import random
import sys
import time
from functools import reduce
from itertools import chain
from urllib.parse import quote

import gpxpy
import gpxpy.gpx
import humanize
import requests
import sentry_sdk
import yaml
from bs4 import BeautifulSoup
from rich import print
from rich.pretty import pprint
from sentry_sdk.crons import capture_checkin
from sentry_sdk.crons.consts import MonitorStatus
# Define source URL
WEBSITE_ROOT_URL = 'https://automuseums.info'

# Define file paths
PROJECT_ROOT = pathlib.Path(__file__).parent.resolve()
CONFIG_GROUP_FILENAME = "regions.yaml"

# Define cache properties
CACHE_ROOT = os.path.join(PROJECT_ROOT, "cache")
CACHE_COUNTRY_ROOT = os.path.join(CACHE_ROOT, 'countries')

# Define output properties
OUTPUT_ROOT = os.path.join(PROJECT_ROOT, "output")
OUTPUT_ROOT_PER_COUNTRY = os.path.join(OUTPUT_ROOT, "per-country")
OUTPUT_ROOT_GROUPED = os.path.join(OUTPUT_ROOT, "grouped-by-region")
OUTPUT_ROOT_JSON = os.path.join(OUTPUT_ROOT, "json")
OUTPUT_FILENAME_PREFIX = "Automuseums.info - "

GPX_CREATOR = 'https://github.com/TheStalwart/Automuseums-gpx'
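# Resulting on-disk layout (follows from the constants above):
#   cache/homepage.html                      - cached country list
#   cache/countries/<Country>/NN.html        - cached museum index pages
#   cache/countries/<Country>/museums/*.html - cached museum pages
#   output/per-country/*.gpx
#   output/grouped-by-region/*.gpx
#   output/json/*.json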
def load_countries():
    cache_file_path = os.path.join(CACHE_ROOT, 'homepage.html')

    def download_homepage():
        print("Downloading country list...")
        r = requests.get(f"{WEBSITE_ROOT_URL}/homepage")
        homepage_contents = r.text

        with open(cache_file_path, "w", encoding="utf-8") as f:
            f.write(homepage_contents)

        if args.request_delay > 0:
            time.sleep(args.request_delay)

        return homepage_contents

    html_contents = ''
    if not os.path.isfile(cache_file_path):
        html_contents = download_homepage()
    else:
        cache_file_modification_timestamp = os.path.getmtime(cache_file_path)
        current_timestamp = time.time()
        cache_file_age_seconds = current_timestamp - cache_file_modification_timestamp
        cache_file_age_minutes = math.floor(cache_file_age_seconds / 60)
        print(f"Country cache file is {cache_file_age_minutes}/{args.cache_ttl_countrylist} minutes old")
        if cache_file_age_minutes < args.cache_ttl_countrylist:
            print("Loading cached country list...")
            with open(cache_file_path, 'r', encoding="utf-8") as f:
                html_contents = f.read()
        else:
            html_contents = download_homepage()

    # Parse homepage HTML
    soup = BeautifulSoup(html_contents, 'html.parser')
    countries = soup.find(id='block-searchmuseumsin').find_all('a')  # https://beautiful-soup-4.readthedocs.io/en/latest/#navigating-the-tree

    def define_country_properties(a_tag):
        name = a_tag.contents[0].strip()
        relative_url = a_tag['href']

        # The link to Bosnia on the main page contains an invalid (non-urlencoded) href value.
        # It's one specific invalid value; all other country links, e.g. "New Zealand", are urlencoded.
        if "&Herze" in relative_url:
            relative_url = quote(relative_url)

        cache_path = os.path.join(CACHE_COUNTRY_ROOT, name)
        cache_file_path = os.path.join(cache_path, "00.html")
        cache_timestamp = 0  # countries with missing cache keep 0 and are first in the queue to update in lowprofile mode
        if os.path.isfile(cache_file_path):
            cache_timestamp = os.path.getmtime(cache_file_path)

        return {
            'name': name,
            'relative_url': relative_url,
            'absolute_url': f"{WEBSITE_ROOT_URL}{relative_url}",
            'cache_path': cache_path,
            'cache_timestamp': cache_timestamp,
        }

    property_list = list(map(define_country_properties, countries))
    return property_list
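# Each record returned by load_countries() has this shape
# (illustrative values; URL-encoding per the Bosnia note above):
#   {
#       'name': 'New Zealand',
#       'relative_url': '/museums/New%20Zealand',
#       'absolute_url': 'https://automuseums.info/museums/New%20Zealand',
#       'cache_path': '<CACHE_COUNTRY_ROOT>/New Zealand',
#       'cache_timestamp': 1736899200.0,
#   }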
def download_country_index(selected_country):
    if not os.path.isdir(selected_country['cache_path']):
        os.mkdir(selected_country['cache_path'])

    def format_return_value(index):
        return { 'country': selected_country, 'museums': index }

    def download_index():
        print(f"Downloading [yellow]{selected_country['name']}[/yellow]...")
        index_pages = []

        # Delete old cache
        for old_cache_file in sorted(glob.glob(os.path.join(selected_country['cache_path'], "[0-9]*.html"))):
            print(f"Deleting old cache file: {old_cache_file}")
            os.remove(old_cache_file)

        # Redownload the country's index of museums
        museum_list_url = f"{WEBSITE_ROOT_URL}{selected_country['relative_url']}"
        for page_index in range(100):  # make sure we never get stuck in an infinite loop
            cached_file_name = f"{page_index}.html".rjust(7, '0')  # make all page numbers double-digit for easier sorting when loading cache
            cached_page_path = os.path.join(selected_country['cache_path'], cached_file_name)

            r = requests.get(museum_list_url, params={'page': page_index})
            print(f"Downloaded {r.url}")
            page_contents = r.text

            with open(cached_page_path, "w", encoding="utf-8") as f:
                f.write(page_contents)

            if args.request_delay > 0:
                time.sleep(args.request_delay)

            soup = BeautifulSoup(page_contents, 'html.parser')
            index_pages.append(soup)

            if not soup.find(title='Go to next page'):
                print("Link to next page not found, bailing out")
                break

        return index_pages

    cache_file_path = os.path.join(selected_country['cache_path'], "00.html")
    if not os.path.isfile(cache_file_path):
        return format_return_value(parse_country_index(download_index()))
    else:
        current_timestamp = time.time()
        cache_file_age_seconds = current_timestamp - selected_country['cache_timestamp']
        cache_file_age_hours = math.floor(cache_file_age_seconds / 60 / 60)
        print(f"[yellow]{selected_country['name']}[/yellow] index cache is {cache_file_age_hours}/{args.cache_ttl_museumlist} hours old")
        if cache_file_age_hours < args.cache_ttl_museumlist:
            print("Loading cached index...")
            index_pages = []
            sorted_cache_file_path_array = sorted(glob.glob(os.path.join(selected_country['cache_path'], "[0-9]*.html")))
            for cache_file_path in sorted_cache_file_path_array:
                print(f"Loading cache from {cache_file_path}...")
                with open(cache_file_path, 'r', encoding="utf-8") as f:
                    html_contents = f.read()
                soup = BeautifulSoup(html_contents, 'html.parser')
                index_pages.append(soup)
            return format_return_value(parse_country_index(index_pages))
        else:
            return format_return_value(parse_country_index(download_index()))
def parse_country_index(pages):
    museums = []
    for page in pages:
        museum_blocks = page.find_all(class_='node-readmore')

        def define_museum_properties(li_tag):
            a_tag = li_tag.find('a')
            name = a_tag['title'].strip()
            return { 'name': name, 'relative_url': a_tag['href'], 'absolute_url': f"{WEBSITE_ROOT_URL}{a_tag['href']}" }

        museums.extend(list(map(define_museum_properties, museum_blocks)))

    # Museum list pages will display duplicates
    # when a particular museum info page contains multiple locations.
    # We deduplicate entries when building an index of museums,
    # then produce multiple waypoints when building GPX files.
    # Museum pages listing multiple locations, as of January 2025:
    # - https://automuseums.info/czech-republic/museum-historical-motorcycles
    # - https://automuseums.info/germany/fire-museum-schw%C3%A4bisch-hall
    # - https://automuseums.info/australia/sir-henry-royce-foundation
    # - https://automuseums.info/canada/western-development-museum
    # - https://automuseums.info/russia/museum-vintage-motorcycles-and-antiques
    # - https://automuseums.info/index.php/slovakia/skoda-classic-cars-museum
    # - https://automuseums.info/switzerland/saurer-museum
    # - https://automuseums.info/uruguay/eduardo-iglesias-automobile-museum
    # - https://automuseums.info/iran/abadan-gasoline-house-museum (only one address)
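    # The museum dicts are unhashable, so set() can't deduplicate them;
    # the reduce below is an order-preserving dedup keeping the first occurrence of each entry.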
    unique_museums = reduce(lambda l, x: l.append(x) or l if x not in l else l, museums, [])  # https://stackoverflow.com/a/37163210
    return unique_museums
def load_museum_page(country, museums, museum_properties):
    cache_museum_root_path = os.path.join(country['cache_path'], 'museums')
    if not os.path.isdir(cache_museum_root_path):
        os.mkdir(cache_museum_root_path)

    # Museum page URLs encountered during debugging:
    # https://automuseums.info/czechia/automoto-museum-lucany
    # https://automuseums.info/czech-republic/museum-eastern-bloc-vehicles-%C5%BEelezn%C3%BD-brod
    # https://automuseums.info/index.php/czechia/historic-car-museum-kuks
    # https://automuseums.info/index.php/czech-republic/fire-brigade-museum-p%C5%99ibyslav
    # Also, some entries are listed multiple times on a country index page,
    # e.g. https://automuseums.info/czech-republic/museum-historical-motorcycles
    # is listed 3 times on https://automuseums.info/museums/Czechia?page=4 as of Aug 11th 2024;
    # all 3 entries have the same page link, but that page lists 3 locations.
    # These need to be exported as 3 different placemarks in the GPX file.
    # A few days after that code was written,
    # I discovered every museum page has data-history-node-id,
    # and museum pages can be loaded by /node/ID URLs, e.g. https://automuseums.info/node/1893
    name_slug = museum_properties['relative_url'].split('/')[-1]  # always use the last slug because there could be "/index.php/" in the middle
    sanitized_file_basename = "".join([x if x.isalnum() else "_" for x in name_slug])  # sanitize https://stackoverflow.com/a/295152
    cache_file_path = os.path.join(cache_museum_root_path, f"{sanitized_file_basename}.html")
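    # e.g. the slug "fire-brigade-museum-p%C5%99ibyslav" from the list above
    # is cached as "fire_brigade_museum_p_C5_99ibyslav.html"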
    def download_page():
        r = requests.get(f"{WEBSITE_ROOT_URL}{museum_properties['relative_url']}")
        print(f"Downloaded {museums.index(museum_properties) + 1}/{len(museums)} {r.url}")
        page_contents = r.text

        with open(cache_file_path, "w", encoding="utf-8") as f:
            f.write(page_contents)

        if args.request_delay > 0:
            time.sleep(args.request_delay)

        return BeautifulSoup(page_contents, 'html.parser')

    if not os.path.isfile(cache_file_path):
        return download_page(), cache_file_path
    else:
        cache_file_modification_timestamp = os.path.getmtime(cache_file_path)
        current_timestamp = time.time()
        cache_file_age_seconds = current_timestamp - cache_file_modification_timestamp
        cache_file_age_hours = math.floor(cache_file_age_seconds / 60 / 60)
        if cache_file_age_hours < args.cache_ttl_museumpage:
            print(f"Loading {cache_file_age_hours}/{args.cache_ttl_museumpage} hours old cached museum page for [yellow]{museum_properties['name']}[/yellow]...")
            with open(cache_file_path, 'r', encoding="utf-8") as f:
                html_contents = f.read()
            return BeautifulSoup(html_contents, 'html.parser'), cache_file_path
        else:
            return download_page(), cache_file_path
def parse_museum_page(page, museum_properties):
    museum_description = ''
    content_div = page.find(class_='node-content')

    links = []
    links_div = content_div.find(class_='field--name-link')
    if links_div:
        links = list(map(lambda a: { 'url': a['href'], 'title': a.text.strip() }, links_div.find_all("a")))

    body_div = content_div.find(class_='field--name-body')
    if body_div:
        # For some museums, the description is wrapped in extra <p> tags:
        # https://automuseums.info/denmark/egeskov-castle - has multiple <p> tags
        # https://automuseums.info/jordan/royal-automobile-museum - field--name-body value is enclosed in double quotes
        # Most popular apps with a GPX import feature do not support HTML tags,
        # so do a simple conversion to plain text
        museum_description = "".join(list(body_div.text)).replace("\n", "\n\n").strip().strip('"')

        # Some pages contain extra links in the description,
        # e.g. https://automuseums.info/lithuania/lithuanian-road-museum
        # Since we strip the description to plain text,
        # capture those extra links to avoid losing them.
        # Also, avoid the stray '<a id="search" name="search"></a>'
        # in https://automuseums.info/united-states/walker-transportation-collection
        # by only capturing links with text
        # https://pytutorial.com/beautifulsoup-find-by-text/
        # https://beautiful-soup-4.readthedocs.io/en/latest/#id12
        links_in_description = body_div.find_all('a', string=True)
        if links_in_description:
            links.extend(list(map(lambda a: { 'url': a['href'], 'title': a.text.strip() }, links_in_description)))

    original_name = None
    abbreviation_div = content_div.find(class_='field--name-abbreviation')
    if abbreviation_div and abbreviation_div.contents[0] and (abbreviation_div.contents[0] != museum_properties['name']):
        # If the field--name-abbreviation value is different from the main name,
        # it's usually the original museum name in the country's official language
        original_name = abbreviation_div.contents[0]

    display = None
    display_div = content_div.find(class_='field--name-display')
    if display_div and display_div.contents[0]:
        # The "Display" section on a museum page usually lists
        # what kinds of vehicles are exhibited
        display = list(map(lambda item_tag: item_tag.text, display_div.find_all(class_='field-item')))

    info = None
    info_div = content_div.find(class_='field--name-info')
    if info_div and info_div.find(class_='field-item') and info_div.find(class_='field-item').contents[0]:
        # This field usually contains extra properties,
        # e.g. opening times or an "Open by appointment" string
        info = "".join(list(info_div.find(class_='field-item').text)).strip().strip('"')

    address = None
    address_div = content_div.find(class_='field--name-address')
    if address_div and address_div.contents[0]:
        # The "Address" section is structured as a list of field-item tags,
        # each one containing a structure of spans
        # for every part of the address.
        # Do our best flattening those structures
        # to return an array of multiline strings.
        # As of January 2025, almost all multi-coordinates museums
        # are also multi-address museums
        # with address index matching coordinates index.
        # When generating GPX waypoints from a multi-location museum
        # we only pick one address and one coordinate pair per waypoint.
        address = list(map(lambda address_item_tag: address_item_tag.text.strip(), address_div.find_all(class_='field-item')))

    email = None
    email_div = content_div.find(class_='field--name-e-mail')
    if email_div and email_div.contents[0]:
        # The "E-mail" section is a list of items,
        # much like the "Display" section
        email = list(map(lambda item_tag: item_tag.text.strip(), email_div.find_all(class_='field-item')))

    phone = None
    phone_div = content_div.find(class_='field--name-phone')
    if phone_div and phone_div.contents[0]:
        # The "Phone" section is a list of items,
        # much like the "Display" section
        phone = list(map(lambda item_tag: item_tag.text.strip(), phone_div.find_all(class_='field-item')))
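    # Coordinates are read from the page's embedded Drupal settings JSON.
    # The lookups below assume it is shaped roughly like this
    # (illustrative lat/lon values; node id 1893 is the /node/ID example above):
    #   {"leaflet": {"leaflet-map-node-museum-1893-coordinates":
    #       {"features": [{"type": "point", "lat": 50.08, "lon": 14.42}, ...]}}}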
    drupal_node_id = page.find('article')['data-history-node-id']
    data_json = page.find(attrs={"data-drupal-selector": "drupal-settings-json"}).contents[0]
    data = json.loads(data_json)
    leaflet_features = data['leaflet'][f"leaflet-map-node-museum-{drupal_node_id}-coordinates"]['features']
    leaflet_points = list(filter(lambda f: f['type'] == 'point', leaflet_features))
    coordinates = list(map(lambda p: { 'lat': p['lat'], 'lon': p['lon'] }, leaflet_points))

    return {
        'description': museum_description,
        'original_name': original_name,
        'display': display,
        'info': info,
        'address': address,
        'email': email,
        'phone': phone,
        'links': links,
        'drupal_node_id': drupal_node_id,
        'coordinates': coordinates,
    }
# Init Sentry before doing anything that might raise an exception
try:
    sentry_sdk.init(
        dsn=pathlib.Path(os.path.join(PROJECT_ROOT, "sentry.dsn")).read_text(),
        # Set traces_sample_rate to 1.0 to capture 100%
        # of transactions for tracing.
        traces_sample_rate=1.0,
    )
except Exception:
    # Sentry is optional: carry on without error reporting if sentry.dsn is missing
    pass
start_datetime = datetime.datetime.now()

# Ensure cache folders exist
if not os.path.isdir(CACHE_ROOT):
    os.mkdir(CACHE_ROOT)
if not os.path.isdir(CACHE_COUNTRY_ROOT):
    os.mkdir(CACHE_COUNTRY_ROOT)
# Build ArgumentParser https://docs.python.org/3/library/argparse.html
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('--country', help='Limit scrape to one country')
arg_parser.add_argument('--cache-ttl-countrylist', type=int, default=55, help='Override country list cache time-to-live in minutes (default: %(default)s)')
arg_parser.add_argument('--cache-ttl-museumlist', type=int, default=24, help='Override museum list cache time-to-live in hours (default: %(default)s)')
arg_parser.add_argument('--cache-ttl-museumpage', type=int, default=48, help='Override museum page cache time-to-live in hours (default: %(default)s)')
arg_parser.add_argument('--request-delay', type=int, default=15, help='Delay after every HTTPS request in seconds (default: %(default)s)')
arg_parser.add_argument('--lowprofile', action='store_true', help='Update 1 country with oldest cache')
arg_parser.add_argument('--group', action='store_true', help='Generate files grouped by region')
arg_parser.add_argument('--verbose', action='store_true', help='Print data used to generate GPX files')
# During development I often diff GPX output,
# and the <time> tag makes output noisy
arg_parser.add_argument('--omit-time', action='store_true', help='Omit <time> tag from generated GPX files')
args = arg_parser.parse_args()
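# Typical invocations, for reference (the country name is illustrative;
# any name from the scraped country list works with --country):
#   python cli.py                                 # update all country indexes
#   python cli.py --country Sweden --verbose
#   python cli.py --lowprofile --request-delay 5
#   python cli.py --group --omit-time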
# Make sure we don't run more than one instance
# on the same set of cache/output folders
lock_file_path = os.path.join(PROJECT_ROOT, "cli.lock")
if os.path.isfile(lock_file_path):
    # If the script is launched in lowprofile mode,
    # but the lockfile is older than 24h,
    # assume the previous execution has failed,
    # e.g. due to host machine power failure:
    # recreate the lock and carry on
    if args.lowprofile and os.path.getmtime(lock_file_path) < time.time() - 60 * 60 * 24:
        print("[red]Deleting stale lock file[/red]")
        os.remove(lock_file_path)
    else:
        if sys.gettrace():  # https://stackoverflow.com/a/72977762/5337349
            print("[red]Lock file ignored due to debugging[/red]")
        else:
            print("[red]Another instance of the script is running, exiting[/red]")
            sys.exit(1)
open(lock_file_path, "w").close()
# Check-in with Sentry cron monitoring
sentry_lowprofile_slug = 'lowprofile'
sentry_check_in_id = ''
if args.lowprofile:
    sentry_check_in_id = capture_checkin(
        monitor_slug=sentry_lowprofile_slug,
        status=MonitorStatus.IN_PROGRESS,
    )

# Calls to stop_profiler are optional - if you don't stop the profiler, it will keep profiling
# your application until the process exits or stop_profiler is called.
sentry_sdk.profiler.start_profiler()
# Refresh country list
countries = load_countries()

country_indexes = []
if args.country:
    country_search_results = list(filter(lambda c: c['name'] == args.country, countries))
    if len(country_search_results) < 1:
        # technically, a clean exit
        # even though no useful work has been done
        os.remove(lock_file_path)
        readable_country_list = ', '.join(map(lambda country: country['name'], countries))
        sys.exit(f"Country \"{args.country}\" not found.\n\nTry any of these: {readable_country_list}")
    selected_country = country_search_results[0]
    country_indexes.append(download_country_index(selected_country))
else:
    if args.lowprofile:
        print('Keeping low profile, updating 1 country with oldest cache...')
        selected_country = sorted(countries, key=lambda c: c['cache_timestamp'])[0]
        country_indexes.append(download_country_index(selected_country))
    else:
        print('Updating all country indexes...')
        for selected_country in countries:
            country_indexes.append(download_country_index(selected_country))
for country in country_indexes:
    print(f"Loading {len(country['museums'])} museums of [yellow]{country['country']['name']}[/yellow]...")
    for museum_properties in country['museums']:
        page, cache_file_path = load_museum_page(country['country'], country['museums'], museum_properties)
        museum_properties['cache_file_path'] = cache_file_path
        museum_properties.update(parse_museum_page(page, museum_properties))
    if not args.verbose:
        print(f"Parsed [yellow]{country['country']['name']}[/yellow]: {len(country['museums'])} museums")

    if not os.path.isdir(OUTPUT_ROOT_JSON):
        os.mkdir(OUTPUT_ROOT_JSON)
    json_output_file_name = f"{country['country']['name']}.json"
    json_output_file_path = os.path.join(OUTPUT_ROOT_JSON, json_output_file_name)
    with open(json_output_file_path, "w", encoding='utf-8') as json_output_file:
        json.dump(country, json_output_file, indent=2)

if args.verbose:
    print(country_indexes)
# Generate per-country GPX files
# https://github.com/tkrajina/gpxpy/blob/dev/examples/waypoints_example.py
for country in country_indexes:
    gpx = gpxpy.gpx.GPX()
    gpx.creator = GPX_CREATOR
    gpx.name = f"Automuseums.info: {country['country']['name']}"
    gpx.description = f"Generated using {gpx.creator}"
    gpx.link = country['country']['absolute_url']
    if not args.omit_time:
        gpx.time = datetime.datetime.now(datetime.timezone.utc)

    def create_gpx_waypoint(museum, location_index):
        gpx_wps = gpxpy.gpx.GPXWaypoint()
        gpx_wps.latitude = museum['coordinates'][location_index]['lat']
        gpx_wps.longitude = museum['coordinates'][location_index]['lon']
        gpx_wps.symbol = "Museum"

        gpx_wps.name = museum['name']
        if location_index > 0:
            gpx_wps.name = f"{gpx_wps.name} ({location_index + 1})"

        gpx_wps.description = museum['description']

        # Prepend the description with the museum's original name in its native language, if available
        if museum['original_name']:
            gpx_wps.description = f"{museum['original_name']}\n\n{gpx_wps.description}"

        # Append the "Display" section listing
        # what kinds of vehicles are exhibited
        if museum['display']:
            display_item_list_formatted = "\n".join(list(map(lambda di: f"- {di}", museum['display'])))
            display_section_formatted = f"Display:\n{display_item_list_formatted}"
            gpx_wps.description = f"{gpx_wps.description}\n\n{display_section_formatted}"

        # Append the "Info" section,
        # usually containing opening times
        if museum['info']:
            gpx_wps.description = f"{gpx_wps.description}\n\n{museum['info']}"

        # Append the "Address" section
        if museum['address']:
            if len(museum['coordinates']) == len(museum['address']):
                # If the address count matches the coordinates count,
                # assume their indexes match,
                # as that is the case for the museums I tested as of January 2025.
                address_section_formatted = f"Address:\n{museum['address'][location_index]}"
                gpx_wps.description = f"{gpx_wps.description}\n\n{address_section_formatted}"
            else:
                # There is a museum in Iran
                # that has two coordinates but only one address:
                # https://automuseums.info/iran/abadan-gasoline-house-museum
                address_item_list_formatted = "\n\n".join(list(map(lambda ai: f"{ai}", museum['address'])))
                address_section_formatted = f"Address:\n{address_item_list_formatted}"
                gpx_wps.description = f"{gpx_wps.description}\n\n{address_section_formatted}"

        # Append the "E-mail" section if available
        if museum['email']:
            if len(museum['email']) > 1:
                email_item_list_formatted = "\n".join(list(map(lambda pi: f"{pi}", museum['email'])))
                email_section_formatted = f"E-mail:\n{email_item_list_formatted}"
                gpx_wps.description = f"{gpx_wps.description}\n\n{email_section_formatted}"
            else:
                # Most museums have only one email listed,
                # so collapse the entry into a single line
                gpx_wps.description = f"{gpx_wps.description}\n\nE-mail: {museum['email'][0]}"

        # Append the "Phone" section if available
        if museum['phone']:
            if len(museum['phone']) > 1:
                phone_item_list_formatted = "\n".join(list(map(lambda pi: f"{pi}", museum['phone'])))
                phone_section_formatted = f"Phone:\n{phone_item_list_formatted}"
                gpx_wps.description = f"{gpx_wps.description}\n\n{phone_section_formatted}"
            else:
                # Most museums have only one phone number listed,
                # so collapse the entry into a single line
                gpx_wps.description = f"{gpx_wps.description}\n\nPhone: {museum['phone'][0]}"

        # The GPX 1.1 schema supports multiple links per waypoint,
        # https://www.topografix.com/gpx.asp
        # https://www.topografix.com/GPX/1/1/gpx.xsd
        # but the gpxpy library assumes there can be only one link tag
        # https://github.com/tkrajina/gpxpy/issues/138
        gpx_wps.link = museum['absolute_url']

        # Besides this gpxpy issue,
        # Google My Maps ignores <link> tags in waypoints when importing,
        # so add all the links at the end of the <desc> tag
        links = museum['links'].copy()
        links.append({ 'url': museum['absolute_url'], 'title': 'Automuseums.info' })
        links_section_plaintext = "\n".join(list(map(lambda l: f"{l['title']}: {l['url']}", links)))
        gpx_wps.description = f"{gpx_wps.description}\n\n{links_section_plaintext}"

        return gpx_wps
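    # Waypoints built by create_gpx_waypoint serialize to roughly this GPX 1.1
    # element (illustrative values; <desc> carries the assembled plain-text sections):
    #   <wpt lat="50.08" lon="14.42">
    #     <name>Automoto Museum Lucany</name>
    #     <desc>...description, Display/Info/Address/E-mail/Phone, links...</desc>
    #     <link href="https://automuseums.info/czechia/automoto-museum-lucany"/>
    #     <sym>Museum</sym>
    #   </wpt>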
    for museum in country['museums']:
        for location_index in range(len(museum['coordinates'])):
            gpx.waypoints.append(create_gpx_waypoint(museum, location_index))

    if not os.path.isdir(OUTPUT_ROOT_PER_COUNTRY):
        os.mkdir(OUTPUT_ROOT_PER_COUNTRY)
    output_file_name = f"{OUTPUT_FILENAME_PREFIX}{country['country']['name']}.gpx"
    output_file_path = os.path.join(OUTPUT_ROOT_PER_COUNTRY, output_file_name)
    if len(gpx.waypoints) > 0:
        with open(output_file_path, "w", encoding="utf-8") as f:
            f.write(gpx.to_xml())
        print(f"Generated [cyan]{output_file_name}[/cyan]")
    else:
        print(f"Not generating [red]{output_file_name}[/red] due to {len(gpx.waypoints)} museums in [yellow]{country['country']['name']}[/yellow]")
# Regenerate GPX files grouped by region
if args.group:
    groups = {}

    # Load country groups from the YAML config file
    # https://stackoverflow.com/a/1774043/5337349
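    # regions.yaml is expected to map a region name to a list of country names,
    # e.g. (illustrative entry; the real contents live in the config file):
    #   Scandinavia:
    #     - Denmark
    #     - Norway
    #     - Sweden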
    with open(os.path.join(PROJECT_ROOT, CONFIG_GROUP_FILENAME)) as stream:
        try:
            groups = yaml.safe_load(stream)
            # print(f"Loaded {CONFIG_GROUP_FILENAME}:")
            # pprint(groups)
        except yaml.YAMLError as exc:
            print(exc)

    # Extend the groups definition with "All countries"
    groups['All countries'] = list(map(lambda c: c['name'], countries))

    # Load all generated per-country GPX files needed for the groups defined in the YAML config file
    required_countries = list(set(chain.from_iterable(groups.values())))

    def load_country_gpx_data(country_name):
        country_file_name = f"{OUTPUT_FILENAME_PREFIX}{country_name}.gpx"
        file_path = os.path.join(OUTPUT_ROOT_PER_COUNTRY, country_file_name)
        if not os.path.isfile(file_path):
            print(f"Warning: missing [red]{country_file_name}[/red]")
            return None
        with open(file_path, 'r', encoding="utf-8") as gpx_file:
            return gpxpy.parse(gpx_file)

    per_country_data = { name: load_country_gpx_data(name) for name in required_countries }

    if not os.path.isdir(OUTPUT_ROOT_GROUPED):
        os.mkdir(OUTPUT_ROOT_GROUPED)

    # Generate GPX files grouped by region
    for group_name, group_countries in groups.items():
        group_output_file_name = f"{OUTPUT_FILENAME_PREFIX}{group_name}.gpx"
        group_output_file_path = os.path.join(OUTPUT_ROOT_GROUPED, group_output_file_name)

        gpx = gpxpy.gpx.GPX()
        gpx.creator = GPX_CREATOR
        gpx.name = f"Automuseums.info: {group_name}"
        gpx.description = f"Generated using {gpx.creator}"
        gpx.link = WEBSITE_ROOT_URL
        if not args.omit_time:
            gpx.time = datetime.datetime.now(datetime.timezone.utc)

        for country_name in group_countries:
            if isinstance(per_country_data[country_name], gpxpy.gpx.GPX):
                gpx.waypoints.extend(per_country_data[country_name].waypoints)

        if len(gpx.waypoints) > 0:
            with open(group_output_file_path, "w", encoding="utf-8") as f:
                f.write(gpx.to_xml())
            print(f"Generated [magenta]{group_output_file_name}[/magenta]")
        else:
            print(f"Not generating [red]{group_output_file_name}[/red] due to {len(gpx.waypoints)} museums in {group_name}")
humanized_execution_duration = humanize.precisedelta(datetime.datetime.now() - start_datetime, minimum_unit="seconds", format="%.0f")
print(f"Completed in {humanized_execution_duration}")

# Clean exit
os.remove(lock_file_path)
if args.lowprofile:
    capture_checkin(
        monitor_slug=sentry_lowprofile_slug,
        check_in_id=sentry_check_in_id,
        status=MonitorStatus.OK,
    )