"""
Acquire and Process APRFC QTF 01h
"""

import json
import re
from datetime import datetime, timedelta, timezone

import requests
from bs4 import BeautifulSoup

from airflow.decorators import dag, task
from airflow.operators.python import get_current_context

import helpers.cumulus as cumulus
from helpers.downloads import trigger_download
# Default arguments
default_args = {
    "owner": "airflow",
    "depends_on_past": False,
    # Rolling start_date: 36 hours before "now", truncated to the top of the hour
    "start_date": (datetime.now(timezone.utc) - timedelta(hours=36)).replace(
        minute=0, second=0
    ),
    "email_on_failure": False,
    "email_on_retry": False,
    "retries": 6,
    "retry_delay": timedelta(minutes=30),
}
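# Note: "catchup" is a DAG-level argument in Airflow 2.x ("catchup_by_default"
# is an airflow.cfg setting, not a task default), so it is passed to the @dag
# decorator below instead of default_args.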
def get_latest_files(filenames):
    """Return only the newest file for each unique issuance/awips timestamp pair."""
    # Dictionary to store the latest file for each unique timestamp key
    latest_files = {}

    # Regular expression to extract the issuance timestamp (YYYYMMDD_HH)
    # and the awips timestamp
    pattern = r"ta01f_has_\d+f_(\d{8}_\d{2})_awips_(\d+)"

    for filename in filenames:
        match = re.search(pattern, filename)
        if match:
            key = match.group(1) + "_" + match.group(2)
            # Lexicographic comparison tracks chronological order here because
            # the embedded timestamps are zero-padded and fixed-width
            if key not in latest_files or filename > latest_files[key]:
                latest_files[key] = filename

    # Return the list of latest files
    return list(latest_files.values())
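# A minimal sketch of the dedup behavior, using hypothetical filenames: given
#   ta01f_has_92f_20241219_08_awips_202412150008.grb
#   ta01f_has_93f_20241219_08_awips_202412150008.grb
# both map to the key "20241219_08_202412150008", so get_latest_files() keeps
# only the lexicographically later name (the "93f" file).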
# APRFC QTF filename generator
def get_filenames(edate, url):
    """
    The hour and minute at the end of the filename cannot be predicted,
    so scrape the directory listing and collect every filename whose
    awips timestamp falls on the specified date.
    """
    d_t1 = edate.strftime("%Y%m%d")

    page = requests.get(url, timeout=60)
    # Fail fast (and let Airflow retry) if the listing is unavailable
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    links = [node.get("href") for node in soup.find_all("a")]

    regex = rf"^ta01f_has_\d+f_\d{{8}}_\d{{2}}_awips_{d_t1}.*\.grb(\.gz)?$"
    filenames = [link for link in links if re.match(regex, link)]

    return get_latest_files(filenames)
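# For example, with edate on 2024-12-15 the regex above matches
# "ta01f_has_92f_20241219_08_awips_202412150008.grb": files are selected by the
# date of the awips timestamp, not by the issuance date earlier in the name.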
@dag(
    default_args=default_args,
    schedule="21 9,15,19 * * *",
    tags=["cumulus", "temp", "QTF", "APRFC"],
    max_active_runs=1,
    max_active_tasks=1,
    catchup=False,
)
def cumulus_aprfc_qtf_01h():
    """This pipeline handles download, processing, and derivative product
    creation for APRFC QTF.

    URL Dir - https://cbt.crohms.org/akgrids
    Files matching ta01f_has_92f_20241219_08_awips_202412150008.grb - 1 hour
    """
    key_prefix = cumulus.S3_ACQUIRABLE_PREFIX
    URL_ROOT = "https://cbt.crohms.org/akgrids"
    PRODUCT_SLUG = "aprfc-qtf-01h"
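    # PRODUCT_SLUG is assumed to be registered as a key in cumulus.acquirables;
    # it is used below to look up the acquirable_id when notifying Cumulus.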
    @task()
    def download_raw_qtf():
        logical_date = get_current_context()["logical_date"]

        return_list = []
        filenames = get_filenames(logical_date, URL_ROOT)
        for filename in filenames:
            url = f"{URL_ROOT}/{filename}"
            s3_key = f"{key_prefix}/{PRODUCT_SLUG}/{filename}"
            print(f"Downloading file: {filename}")
            try:
                trigger_download(url=url, s3_bucket=cumulus.S3_BUCKET, s3_key=s3_key)
                return_list.append(
                    {
                        "execution": logical_date.isoformat(),
                        "s3_key": s3_key,
                        "filename": filename,
                    }
                )
            # Catch Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # still propagate
            except Exception:
                print(f"{filename} is not available to download")

        return json.dumps(return_list)
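    # The downloader returns a JSON string (serialized for XCom) shaped like:
    #   [{"execution": "...", "s3_key": "...", "filename": "..."}, ...]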
    @task()
    def notify_cumulus(payload):
        payload = json.loads(payload)
        for item in payload:
            print("Notifying Cumulus: " + item["filename"])
            cumulus.notify_acquirablefile(
                acquirable_id=cumulus.acquirables[PRODUCT_SLUG],
                datetime=item["execution"],
                s3_key=item["s3_key"],
            )

    notify_cumulus(download_raw_qtf())
aprfc_qtf_dag = cumulus_aprfc_qtf_01h()
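# A quick local smoke test, assuming Airflow 2.5+ (dag.test() runs the DAG
# in-process without a scheduler):
#   aprfc_qtf_dag.test()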