Skip to content

Commit 419d4ad

Browse files
authored
Merge pull request #265 from USACE/380-product-aprfc-data-products
updated aprfc qpf dag to focus acquirable download
2 parents bce9083 + 17bce85 commit 419d4ad

File tree

1 file changed

+25
-10
lines changed

1 file changed

+25
-10
lines changed

dags/cumulus/aprfc_qpf_06h.py

+25-10
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44

55
import json
6-
from datetime import datetime, timedelta
6+
from datetime import datetime, timedelta, timezone
77
import calendar
88
from bs4 import BeautifulSoup
99
import re
@@ -20,14 +20,30 @@
# Default task-level arguments shared by every task in the DAG below.
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # Anchor the start date 36 hours back from "now" (UTC-aware), truncated
    # to the top of the hour. microsecond=0 is included so the timestamp is
    # a clean hour boundary instead of carrying the sub-second remainder
    # that datetime.now() produces.
    "start_date": (datetime.now(timezone.utc) - timedelta(hours=36)).replace(
        minute=0, second=0, microsecond=0
    ),
    # NOTE(review): "catchup_by_default" is not a documented Airflow
    # default_args key — catchup is normally a DAG-level parameter. Confirm
    # whether this entry has any effect or should move to the @dag() call.
    "catchup_by_default": False,
    "email_on_failure": False,
    "email_on_retry": False,
    # Retry generously: upstream APRFC products may be published late.
    "retries": 6,
    "retry_delay": timedelta(minutes=30),
}
3030

def get_latest_files(filenames):
    """Return one filename per unique (timestamp, awips-id) key.

    Each name matching the APRFC QPF scheme contributes a key built from
    the embedded ``YYYYMMDD_HH`` timestamp and trailing awips number; for
    duplicate keys the lexicographically greatest filename wins. Names
    that do not match the scheme are dropped.
    """
    stamp_re = re.compile(r'qpf06f_has_\d+f_(\d{8}_\d{2})_awips_(\d+)')
    winners = {}

    for name in filenames:
        hit = stamp_re.search(name)
        if hit is None:
            continue
        group_key = f"{hit.group(1)}_{hit.group(2)}"
        current = winners.get(group_key)
        # Lexicographic comparison picks the "latest" name within a group.
        if current is None or name > current:
            winners[group_key] = name

    return list(winners.values())
3147

3248
# ALR QPF filename generator
3349
def get_filenames(edate, url):
@@ -37,25 +53,24 @@ def get_filenames(edate, url):
3753
for the specified date.
3854
"""
3955
d_t1 = edate.strftime("%Y%m%d")
40-
d_t2 = (edate - timedelta(hours=24)).strftime("%Y%m%d")
56+
4157

4258
page = requests.get(url)
4359
soup = BeautifulSoup(page.content, "html.parser")
4460
links = [node.get("href") for node in soup.find_all("a")]
4561
filenames = []
46-
for d in [d_t2, d_t1]:
47-
regex = f"^qpf06f_has_.*.awips_{d}\d+.grb.gz$"
48-
filenames = filenames + [link for link in links if re.match(regex, link)]
62+
regex = f"^qpf06f_has_\\d+f_\\d{{8}}_\\d{{2}}_awips_{d_t1}.*\\.grb(\\.gz)?$"
63+
filenames = [link for link in links if re.match(regex, link)]
4964

50-
return filenames
65+
return get_latest_files(filenames)
5166

5267

5368
@dag(
5469
default_args=default_args,
55-
schedule="40 14,5 * * *",
70+
schedule="20 9,15,19 * * *",
5671
tags=["cumulus", "precip", "QPF", "APRFC"],
57-
max_active_runs=2,
58-
max_active_tasks=4,
72+
max_active_runs=1,
73+
max_active_tasks=1,
5974
)
6075
def cumulus_aprfc_qpf_06h():
6176
"""This pipeline handles download, processing, and derivative product creation for \n

0 commit comments

Comments
 (0)