"""
Acquire and Process APRFC QTF 01h
"""

import json
import re
from datetime import datetime, timedelta, timezone

import requests
from bs4 import BeautifulSoup

from airflow.decorators import dag, task
from airflow.operators.python import get_current_context

import helpers.cumulus as cumulus
from helpers.downloads import trigger_download
# Default arguments
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # Rolling start_date: 36 hours before "now", truncated to the top of the hour
    "start_date": (datetime.now(timezone.utc) - timedelta(hours=36)).replace(
        minute=0, second=0
    ),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 6,
    "retry_delay": timedelta(minutes=30),
}
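# Note: "catchup" is a DAG-level argument in Airflow 2.x ("catchup_by_default"
# is an airflow.cfg setting, not a task default), so it is passed to the @dag
# decorator below instead of default_args.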
def get_latest_files(filenames):
    """Return only the newest file for each unique issuance/awips timestamp pair."""
    # Dictionary to store the latest file for each unique timestamp key
    latest_files = {}

    # Regular expression to extract the issuance timestamp (YYYYMMDD_HH)
    # and the awips timestamp
    pattern = r"ta01f_has_\d+f_(\d{8}_\d{2})_awips_(\d+)"

    for filename in filenames:
        match = re.search(pattern, filename)
        if match:
            key = match.group(1) + "_" + match.group(2)
            # Lexicographic comparison tracks chronological order here because
            # the embedded timestamps are zero-padded and fixed-width
            if key not in latest_files or filename > latest_files[key]:
                latest_files[key] = filename

    # Return the list of latest files
    return list(latest_files.values())
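# A minimal sketch of the dedup behavior, using hypothetical filenames: given
#   ta01f_has_92f_20241219_08_awips_202412150008.grb
#   ta01f_has_93f_20241219_08_awips_202412150008.grb
# both map to the key "20241219_08_202412150008", so get_latest_files() keeps
# only the lexicographically later name (the "93f" file).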
# APRFC QTF filename generator
def get_filenames(edate, url):
    """
    The hour and minute at the end of the filename cannot be predicted,
    so scrape the directory listing and collect every filename whose
    awips timestamp falls on the specified date.
    """
    d_t1 = edate.strftime("%Y%m%d")

    page = requests.get(url, timeout=60)
    # Fail fast (and let Airflow retry) if the listing is unavailable
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    links = [node.get("href") for node in soup.find_all("a")]

    regex = rf"^ta01f_has_\d+f_\d{{8}}_\d{{2}}_awips_{d_t1}.*\.grb(\.gz)?$"
    filenames = [link for link in links if re.match(regex, link)]

    return get_latest_files(filenames)
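# For example, with edate on 2024-12-15 the regex above matches
# "ta01f_has_92f_20241219_08_awips_202412150008.grb": files are selected by the
# date of the awips timestamp, not by the issuance date earlier in the name.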
@dag(
    default_args=default_args,
    schedule="21 9,15,19 * * *",
    tags=["cumulus", "temp", "QTF", "APRFC"],
    max_active_runs=1,
    max_active_tasks=1,
    catchup=False,
)
def cumulus_aprfc_qtf_01h():
    """This pipeline handles download, processing, and derivative product
    creation for APRFC QTF.

    URL Dir - https://cbt.crohms.org/akgrids
    Files matching ta01f_has_92f_20241219_08_awips_202412150008.grb - 1 hour
    """
    key_prefix = cumulus.S3_ACQUIRABLE_PREFIX
    URL_ROOT = "https://cbt.crohms.org/akgrids"
    PRODUCT_SLUG = "aprfc-qtf-01h"
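    # PRODUCT_SLUG is assumed to be registered as a key in cumulus.acquirables;
    # it is used below to look up the acquirable_id when notifying Cumulus.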
    @task()
    def download_raw_qtf():
        logical_date = get_current_context()["logical_date"]

        return_list = []
        filenames = get_filenames(logical_date, URL_ROOT)
        for filename in filenames:
            url = f"{URL_ROOT}/{filename}"
            s3_key = f"{key_prefix}/{PRODUCT_SLUG}/{filename}"
            print(f"Downloading file: {filename}")
            try:
                trigger_download(url=url, s3_bucket=cumulus.S3_BUCKET, s3_key=s3_key)
                return_list.append(
                    {
                        "execution": logical_date.isoformat(),
                        "s3_key": s3_key,
                        "filename": filename,
                    }
                )
            # Catch Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # still propagate
            except Exception:
                print(f"{filename} is not available to download")

        return json.dumps(return_list)
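    # The downloader returns a JSON string (serialized for XCom) shaped like:
    #   [{"execution": "...", "s3_key": "...", "filename": "..."}, ...]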
    @task()
    def notify_cumulus(payload):
        payload = json.loads(payload)
        for item in payload:
            print("Notifying Cumulus: " + item["filename"])
            cumulus.notify_acquirablefile(
                acquirable_id=cumulus.acquirables[PRODUCT_SLUG],
                datetime=item["execution"],
                s3_key=item["s3_key"],
            )

    notify_cumulus(download_raw_qtf())
aprfc_qtf_dag = cumulus_aprfc_qtf_01h()
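# A quick local smoke test, assuming Airflow 2.5+ (dag.test() runs the DAG
# in-process without a scheduler):
#   aprfc_qtf_dag.test()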