Commit 4798c3e

Checking in DAGs and Docker CWL image

1 parent 450a631
File tree

8 files changed, +362 -3 lines changed


.github/workflows/build_docker_images.yml

Lines changed: 25 additions & 0 deletions
@@ -12,6 +12,7 @@ env:
   REGISTRY: ghcr.io
   TAG: ${{ github.event.inputs.tag }}
   SPS_AIRFLOW: ${{ github.repository }}/sps-airflow
+  SPS_DOCKER_CWL: ${{ github.repository }}/sps-docker-cwl
 
 jobs:
   build-sps-airflow:
@@ -37,3 +38,27 @@ jobs:
       push: true
       tags: ${{ env.REGISTRY }}/${{ env.SPS_AIRFLOW }}:${{ env.TAG }}
       labels: ${{ steps.metascheduler.outputs.labels }}
+  build-sps-docker-cwl:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Log in to the Container registry
+        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Extract metadata (tags, labels) for SPS Docker CWL image
+        id: metascheduler
+        uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.SPS_DOCKER_CWL }}
+      - name: Build and push SPS Docker CWL image
+        uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc
+        with:
+          context: ./airflow/docker/cwl
+          file: airflow/docker/cwl/Dockerfile
+          push: true
+          tags: ${{ env.REGISTRY }}/${{ env.SPS_DOCKER_CWL }}:${{ env.TAG }}
+          labels: ${{ steps.metascheduler.outputs.labels }}
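
Since the new job reads ${{ github.event.inputs.tag }}, the workflow is presumably dispatched manually with a tag input (the trigger itself is outside this hunk); a minimal sketch using the GitHub CLI, with an illustrative tag value:

# Manually dispatch the image-build workflow (assumes a workflow_dispatch trigger with a "tag" input).
gh workflow run build_docker_images.yml -f tag=development-tag2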

airflow/dags/cwl_dag.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# DAG to execute a generic CWL workflow.
# The Airflow KubernetesPodOperator starts a Docker container that includes the Docker engine and the CWL libraries.
# The "cwl-runner" tool is invoked to execute the CWL workflow.
# Parameter cwl_workflow: the URL of the CWL workflow to execute.
# Parameter args_as_json: JSON string containing the values for the workflow-specific inputs.
from datetime import datetime
from airflow import DAG
from kubernetes.client import models as k8s
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.models.param import Param
import uuid

# The Kubernetes Pod that executes the CWL-Docker container
# Must use elevated privileges to start/stop the Docker engine
POD_TEMPLATE_FILE = "/opt/airflow/dags/docker_cwl_pod.yaml"

# The Kubernetes namespace within which the Pod is run (it must already exist)
POD_NAMESPACE = "airflow"

# Example arguments
default_cwl_workflow = "https://raw.githubusercontent.com/unity-sds/unity-sps-prototype/cwl-docker/cwl/cwl_workflows/echo_from_docker.cwl"
default_args_as_json = '{ "greeting": "Ciao", "name": "Terra" }'

# Default DAG configuration
dag_default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2024, 1, 1, 0, 0)
}

# The DAG
dag = DAG(dag_id='cwl-dag',
          description='DAG to execute a generic CWL workflow',
          tags=['cwl', 'unity-sps', 'docker'],
          is_paused_upon_creation=True,
          catchup=False,
          schedule_interval=None,
          max_active_runs=1,
          default_args=dag_default_args,
          params={
              "cwl_workflow": Param(default_cwl_workflow, type="string"),
              "args_as_json": Param(default_args_as_json, type="string"),
          })

# Environment variables
default_env_vars = {}

# This section defines the KubernetesPodOperator
cwl_task = KubernetesPodOperator(
    namespace=POD_NAMESPACE,
    name="cwl-task",
    is_delete_operator_pod=True,
    hostnetwork=False,
    startup_timeout_seconds=1000,
    get_logs=True,
    task_id="docker-cwl-task",
    full_pod_spec=k8s.V1Pod(
        metadata=k8s.V1ObjectMeta(name='docker-cwl-pod-' + uuid.uuid4().hex),
    ),
    pod_template_file=POD_TEMPLATE_FILE,
    arguments=["{{ params.cwl_workflow }}", "{{ params.args_as_json }}"],
    dag=dag)
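
For reference, a run of this DAG can override cwl_workflow and args_as_json at trigger time; a minimal sketch using the Airflow CLI, reusing the same example values as the defaults above (the inner args_as_json string must be escaped inside the --conf JSON):

# Trigger the generic CWL DAG with a custom workflow URL and JSON arguments.
airflow dags trigger cwl-dag --conf '{
  "cwl_workflow": "https://raw.githubusercontent.com/unity-sds/unity-sps-prototype/cwl-docker/cwl/cwl_workflows/echo_from_docker.cwl",
  "args_as_json": "{ \"greeting\": \"Ciao\", \"name\": \"Terra\" }"
}'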

airflow/dags/docker_cwl_pod.yaml

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Pod
metadata:
  name: docker-cwl-pod
spec:

  restartPolicy: Never

  containers:
  - name: cwl-docker
    image: ghcr.io/unity-sds/unity-sps/sps-docker-cwl:development-tag2
    command: ["/usr/share/cwl/docker_cwl_entrypoint.sh"]
    securityContext:
      privileged: true
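
The template can be sanity-checked outside Airflow before a DAG run picks it up; a minimal sketch, assuming kubectl access to the airflow namespace:

# Client-side validation of the pod template (does not create the pod).
kubectl apply --dry-run=client -n airflow -f airflow/dags/docker_cwl_pod.yaml
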
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
from datetime import datetime
from airflow import DAG
from kubernetes.client import models as k8s
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.operators.bash import BashOperator
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
import json
import uuid


# Default DAG configuration
dag_default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2024, 1, 1, 0, 0)
}

# The DAG
CWL_WORKFLOW = "https://raw.githubusercontent.com/unity-sds/unity-sps-prototype/cwl-docker/cwl/cwl_workflows/echo_from_docker.cwl"
dag = DAG(dag_id='say-hello-from-cwl-and-docker',
          description='Workflow to greet anybody, anytime',
          tags=["CWL", "World Peace", "The United Nations"],
          is_paused_upon_creation=True,
          catchup=False,
          schedule=None,
          max_active_runs=1,
          default_args=dag_default_args,
          params={
              "cwl_workflow": Param(CWL_WORKFLOW, type="string"),
              "greeting": Param("Hello", type="string"),
              "name": Param("World", type="string"),
          })

# Environment variables
default_env_vars = {}


# Task that captures the DAG-specific arguments
# and creates a JSON-formatted string for the downstream tasks
def setup(ti=None, **context):
    task_dict = {
        'greeting': context['params']['greeting'],
        'name': context['params']['name']
    }
    ti.xcom_push(key='cwl_args', value=json.dumps(task_dict))


setup_task = PythonOperator(task_id="Setup",
                            python_callable=setup,
                            dag=dag)


stage_in_task = BashOperator(
    task_id="Stage_In",
    dag=dag,
    bash_command="echo Downloading data")

# This section defines the KubernetesPodOperator
cwl_task = KubernetesPodOperator(
    namespace="airflow",
    name="CWL_Workflow",
    on_finish_action="delete_pod",
    hostnetwork=False,
    startup_timeout_seconds=1000,
    get_logs=True,
    task_id="CWL_Workflow",
    full_pod_spec=k8s.V1Pod(
        metadata=k8s.V1ObjectMeta(name='docker-cwl-pod-' + uuid.uuid4().hex)),
    pod_template_file="/opt/airflow/dags/docker_cwl_pod.yaml",
    # image="ghcr.io/unity-sds/unity-sps-prototype/unity-sps-docker-cwl:latest",
    arguments=["{{ params.cwl_workflow }}", "{{ti.xcom_pull(task_ids='Setup', key='cwl_args')}}"],
    # resources={"request_memory": "512Mi", "limit_memory": "1024Mi"},
    dag=dag)

stage_out_task = BashOperator(
    task_id="Stage_Out",
    dag=dag,
    bash_command="echo Uploading data")

setup_task >> stage_in_task >> cwl_task >> stage_out_task
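
A minimal sketch of triggering this DAG with custom greeting parameters via the Airflow CLI; the Setup task then serializes them into the JSON string consumed by the CWL step:

# Override the default "Hello" / "World" parameters for a single run.
airflow dags trigger say-hello-from-cwl-and-docker --conf '{"greeting": "Ciao", "name": "Terra"}'
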
Lines changed: 87 additions & 0 deletions
@@ -0,0 +1,87 @@
# DAG for executing the SBG Preprocess Workflow
# See https://github.com/unity-sds/sbg-workflows/blob/main/preprocess/sbg-preprocess-workflow.cwl
from datetime import datetime
from airflow import DAG
from kubernetes.client import models as k8s
from airflow.providers.cncf.kubernetes.operators.kubernetes_pod import KubernetesPodOperator
from airflow.operators.python import PythonOperator
from airflow.models.param import Param
import json
import uuid

# The Kubernetes Pod that executes the CWL-Docker container
# Must use elevated privileges to start/stop the Docker engine
POD_TEMPLATE_FILE = "/opt/airflow/dags/docker_cwl_pod.yaml"

# The Kubernetes namespace within which the Pod is run (it must already exist)
POD_NAMESPACE = "airflow"


# Default DAG configuration
dag_default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2024, 1, 1, 0, 0)
}
CWL_WORKFLOW = "https://raw.githubusercontent.com/unity-sds/sbg-workflows/1.0/preprocess/sbg-preprocess-workflow.cwl"

dag = DAG(dag_id='sbg-preprocess-cwl-dag',
          description='SBG Preprocess Workflow as CWL',
          tags=["SBG", "Unity", "SPS", "NASA", "JPL"],
          is_paused_upon_creation=True,
          catchup=False,
          schedule=None,
          max_active_runs=1,
          default_args=dag_default_args,
          params={
              "cwl_workflow": Param(CWL_WORKFLOW, type="string"),
              # "input_processing_labels": Param(["label1", "label2"], type="string[]"),
              "input_cmr_collection_name": Param("C2408009906-LPCLOUD", type="string"),
              "input_cmr_search_start_time": Param("2024-01-03T13:19:36.000Z", type="string"),
              "input_cmr_search_stop_time": Param("2024-01-03T13:19:36.000Z", type="string"),
              "input_unity_dapa_api": Param("https://d3vc8w9zcq658.cloudfront.net", type="string"),
              "input_unity_dapa_client": Param("40c2s0ulbhp9i0fmaph3su9jch", type="string"),
              "input_crid": Param("001", type="string"),
              "output_collection_id": Param("urn:nasa:unity:unity:dev:SBG-L1B_PRE___1", type="string"),
              "output_data_bucket": Param("sps-dev-ds-storage", type="string"),
          })


# Task that serializes the job arguments into a JSON string
def setup(ti=None, **context):
    task_dict = {
        'input_processing_labels': ["label1", "label2"],
        'input_cmr_collection_name': context['params']['input_cmr_collection_name'],
        'input_cmr_search_start_time': context['params']['input_cmr_search_start_time'],
        'input_cmr_search_stop_time': context['params']['input_cmr_search_stop_time'],
        'input_unity_dapa_api': context['params']['input_unity_dapa_api'],
        'input_unity_dapa_client': context['params']['input_unity_dapa_client'],
        'input_crid': context['params']['input_crid'],
        'output_collection_id': context['params']['output_collection_id'],
        'output_data_bucket': context['params']['output_data_bucket']
    }
    ti.xcom_push(key='cwl_args', value=json.dumps(task_dict))


setup_task = PythonOperator(task_id="Setup",
                            python_callable=setup,
                            dag=dag)


# Task that executes the specific CWL workflow with the previous arguments
cwl_task = KubernetesPodOperator(
    namespace=POD_NAMESPACE,
    name="SBG_Preprocess_CWL",
    on_finish_action="delete_pod",
    hostnetwork=False,
    startup_timeout_seconds=1000,
    get_logs=True,
    task_id="SBG_Preprocess_CWL",
    full_pod_spec=k8s.V1Pod(
        metadata=k8s.V1ObjectMeta(name='sbg-preprocess-cwl-pod-' + uuid.uuid4().hex)),
    pod_template_file=POD_TEMPLATE_FILE,
    arguments=["{{ params.cwl_workflow }}", "{{ti.xcom_pull(task_ids='Setup', key='cwl_args')}}"],
    # resources={"request_memory": "512Mi", "limit_memory": "1024Mi"},
    dag=dag)

setup_task >> cwl_task
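
As with the other DAGs, individual parameters can be overridden per run; a minimal sketch that adjusts only the CMR search window (the values shown are the defaults and serve as placeholders):

# Run the SBG preprocess DAG with an explicit CMR search window.
airflow dags trigger sbg-preprocess-cwl-dag --conf '{
  "input_cmr_search_start_time": "2024-01-03T13:19:36.000Z",
  "input_cmr_search_stop_time": "2024-01-03T13:19:36.000Z"
}'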

airflow/docker/custom_airflow/Dockerfile

Lines changed: 29 additions & 3 deletions
@@ -1,5 +1,31 @@
-FROM apache/airflow:2.8.1-python3.11
+# FROM apache/airflow:2.8.1-python3.11
+# cat /etc/os-release
+# "Debian GNU/Linux 12 (bookworm)"
+FROM apache/airflow
 
-COPY ./airflow/dags/ ${AIRFLOW_HOME}/dags/
+# add editor
+USER root
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+       vim \
+    && apt-get autoremove -yqq --purge \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# add git
+RUN apt-get update \
+    && apt-get install -y git \
+    && git --version
+
+USER airflow
+
+# add Python libraries
+# RUN pip install cwltool==3.1.20240112164112
+RUN pip install cwltool cwl-runner \
+    apache-airflow-providers-docker \
+    apache-airflow-providers-cncf-kubernetes \
+    kubernetes-client
+
+# add DAGs
+COPY ./airflow/dags ${AIRFLOW_HOME}/dags/
 
-RUN pip install cwltool==3.1.20240112164112
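
Because the Dockerfile copies ./airflow/dags, the build context is presumably the repository root; a minimal local-build sketch (the image tag is illustrative):

# Build the customized Airflow image from the repository root (tag is hypothetical).
docker build -f airflow/docker/custom_airflow/Dockerfile -t sps-airflow:local .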

airflow/docker/cwl/Dockerfile

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# docker:dind Dockerfile: https://github.com/docker-library/docker/blob/master/Dockerfile-dind.template
# FROM docker:dind
FROM docker:25.0.3-dind

# install Python
RUN apk add --update --no-cache python3 && ln -sf python3 /usr/bin/python
RUN apk add gcc musl-dev linux-headers python3-dev
RUN apk add --no-cache python3 py3-pip
RUN apk add vim

# install CWL libraries
RUN mkdir /usr/share/cwl \
    && cd /usr/share/cwl \
    && python -m venv venv \
    && source venv/bin/activate \
    && pip install cwltool cwl-runner docker

# install nodejs to parse Javascript in CWL files
RUN apk add --no-cache nodejs npm

# script to execute a generic CWL workflow with arguments
COPY docker_cwl_entrypoint.sh /usr/share/cwl/docker_cwl_entrypoint.sh

WORKDIR /usr/share/cwl
ENTRYPOINT ["/usr/share/cwl/docker_cwl_entrypoint.sh"]
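
Matching the context and file used by the new CI job, the image can also be built locally; a minimal sketch with an illustrative tag:

# Build the CWL runner image from its own directory, as the workflow does.
docker build -f airflow/docker/cwl/Dockerfile -t sps-docker-cwl:local airflow/docker/cwl
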
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
#!/bin/sh
# Script to execute a CWL workflow that includes Docker containers.
# The Docker engine is started before the CWL execution, and stopped afterwards.
# $1: the CWL workflow URL (example: https://raw.githubusercontent.com/unity-sds/unity-sps-prototype/cwl-docker/cwl/cwl_workflows/echo_from_docker.cwl)
# $2: the CWL job parameters as a JSON-formatted string (example: { "name": "John Doe" })
# $3: optional output directory, defaults to the current directory
# Note: $output_dir must be accessible by the Docker container that executes this script

set -ex
cwl_workflow=$1
job_args=$2
output_dir=${3:-.}
echo "Executing CWL workflow: $cwl_workflow with JSON arguments: $job_args and output directory: $output_dir"
echo $job_args > /tmp/job_args.json
cat /tmp/job_args.json

# create the output directory if it doesn't exist
mkdir -p $output_dir

# Start the Docker engine
dockerd > dockerd-logfile 2>&1 &

# Wait until the Docker engine is running
# (loop until 'docker version' exits with 0)
until docker version > /dev/null 2>&1
do
  sleep 1
done

# Execute the CWL workflow
source /usr/share/cwl/venv/bin/activate
cwl-runner --outdir $output_dir --no-match-user --no-read-only $cwl_workflow /tmp/job_args.json
deactivate

# Stop the Docker engine
pkill -f dockerd
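
Because the entrypoint starts its own Docker engine, running the image by hand requires --privileged, mirroring the securityContext in the pod template; a minimal sketch that reuses the example workflow and arguments above (the local tag is illustrative):

# Run the CWL image directly; arguments map to $1 (workflow URL) and $2 (JSON job args).
docker run --rm --privileged sps-docker-cwl:local \
  https://raw.githubusercontent.com/unity-sds/unity-sps-prototype/cwl-docker/cwl/cwl_workflows/echo_from_docker.cwl \
  '{ "greeting": "Ciao", "name": "Terra" }'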
