diff --git a/README.md b/README.md
index acf00ca..a1ea5b3 100644
--- a/README.md
+++ b/README.md
@@ -11,13 +11,25 @@
- Create and deploy code to Python shell / PySpark **AWS Glue jobs**.
-- Create, orchestrate and trigger **Sagemaker Traning Jobs and Processing Jobs**.
+- Use **AWS Sagemaker** to create ML models.
- Orchestrate the above jobs using **AWS Stepfunctions** as simply as `task1 >> task2` (see the sketch below).
- Let us [know](https://github.com/vincentclaes/datajob/discussions) **what you want to see next**.
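+
+A minimal sketch of what such a pipeline looks like; the job names and script paths are placeholders, see the examples below for complete projects:
+
+```python
+from aws_cdk import core
+
+from datajob.datajob_stack import DataJobStack
+from datajob.glue.glue_job import GlueJob
+from datajob.stepfunctions.stepfunctions_workflow import StepfunctionsWorkflow
+
+app = core.App()
+
+# the datajob stack bundles the glue jobs and the workflow that orchestrates them
+with DataJobStack(scope=app, id="data-pipeline-simple") as djs:
+    task1 = GlueJob(datajob_stack=djs, name="task1", job_path="jobs/task1.py")
+    task2 = GlueJob(datajob_stack=djs, name="task2", job_path="jobs/task2.py")
+
+    # >> chains the glue jobs into a step functions state machine
+    with StepfunctionsWorkflow(datajob_stack=djs, name="workflow") as workflow:
+        task1 >> task2
+
+app.synth()
+```
+
+Deploy it with `cdk deploy --app "python datajob_stack.py"` to create the glue jobs and the state machine.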
-> Dependencies are [AWS CDK](https://github.com/aws/aws-cdk) and [Step Functions SDK for data science](https://github.com/aws/aws-step-functions-data-science-sdk-python)
+
+:rocket: :new: :rocket:
+
+[Check out our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
+
+:rocket: :new: :rocket:
+
@@ -88,7 +100,7 @@ cd examples/data_pipeline_simple
cdk deploy --app "python datajob_stack.py"
```
-### Run
+### Execute
```shell script
datajob execute --state-machine data-pipeline-simple-workflow
@@ -103,6 +115,16 @@ The terminal will show a link to the step functions page to follow up on your pi
cdk destroy --app "python datajob_stack.py"
```
+# Examples
+
+- [Data pipeline with parallel steps](./examples/data_pipeline_parallel/)
+- [Data pipeline for processing big data using PySpark](./examples/data_pipeline_pyspark/)
+- [Data pipeline where you package and ship your project as a wheel](./examples/data_pipeline_with_packaged_project/)
+- [Machine Learning pipeline where we combine Glue jobs with Sagemaker](./examples/ml_pipeline_end_to_end/)
+
+All our examples can be found in [./examples](./examples).
+
# Functionality
diff --git a/examples/ml_pipeline_abalone/README.md b/examples/ml_pipeline_end_to_end/README.md
similarity index 91%
rename from examples/ml_pipeline_abalone/README.md
rename to examples/ml_pipeline_end_to_end/README.md
index 1ea5fa1..2b13450 100644
--- a/examples/ml_pipeline_abalone/README.md
+++ b/examples/ml_pipeline_end_to_end/README.md
@@ -1,4 +1,4 @@
-# ML Pipeline Scikitlearn
+# End to End Machine Learning Pipeline
> This example is a datajob implementation of [an official AWS Sagemaker example.](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/machine_learning_workflow_abalone/machine_learning_workflow_abalone.ipynb)
@@ -17,7 +17,10 @@ we have 5 steps in our ML pipeline:
## Deploy
- cd examples/ml_pipeline_abalone
+> !!! Don't forget to pull down the sagemaker endpoint that is created at the end of the pipeline (see the Destroy section below).
+
+ cd examples/ml_pipeline_end_to_end
export AWS_PROFILE=my-profile
export AWS_DEFAULT_REGION=eu-west-1
cdk deploy --app "python datajob_stack.py" --require-approval never
@@ -31,7 +34,7 @@ we have 5 steps in our ML pipeline:
arn:aws:cloudformation:eu-west-1:077590795309:stack/datajob-ml-pipeline-abalone/e179ec30-f45a-11eb-9731-02575f1b7adf
-execute the ml pipeline
+## Execute
datajob execute --state-machine datajob-ml-pipeline-abalone-workflow
@@ -56,6 +59,11 @@ In the end a sagemaker endpoint is created:
![sagemaker-endpoint.png](assets/sagemaker-endpoint.png)
In our example the name of the endpoint is `datajob-ml-pipeline-abalone-create-endpoint-20210803T165017`
-Pull down the sagemaker endpoint by executing the following command:
+
+## Destroy
+
+ cdk destroy --app "python datajob_stack.py"
+
+Don't forget to pull down the sagemaker endpoint:
aws sagemaker delete-endpoint --endpoint-name datajob-ml-pipeline-abalone-create-endpoint-20210803T165017
diff --git a/examples/ml_pipeline_abalone/assets/sagemaker-endpoint.png b/examples/ml_pipeline_end_to_end/assets/sagemaker-endpoint.png
similarity index 100%
rename from examples/ml_pipeline_abalone/assets/sagemaker-endpoint.png
rename to examples/ml_pipeline_end_to_end/assets/sagemaker-endpoint.png
diff --git a/examples/ml_pipeline_abalone/assets/stepfunctions-workflow.png b/examples/ml_pipeline_end_to_end/assets/stepfunctions-workflow.png
similarity index 100%
rename from examples/ml_pipeline_abalone/assets/stepfunctions-workflow.png
rename to examples/ml_pipeline_end_to_end/assets/stepfunctions-workflow.png
diff --git a/examples/ml_pipeline_abalone/datajob_stack.py b/examples/ml_pipeline_end_to_end/datajob_stack.py
similarity index 82%
rename from examples/ml_pipeline_abalone/datajob_stack.py
rename to examples/ml_pipeline_end_to_end/datajob_stack.py
index 08412e8..e764edf 100644
--- a/examples/ml_pipeline_abalone/datajob_stack.py
+++ b/examples/ml_pipeline_end_to_end/datajob_stack.py
@@ -18,16 +18,12 @@
with DataJobStack(scope=app, id="datajob-ml-pipeline-abalone") as djs:
sagemaker_default_role = get_default_sagemaker_role(datajob_stack=djs)
- sagemaker_session = sagemaker.Session(
- boto_session=boto3.session.Session(region_name=djs.env.region)
- )
- sagemaker_default_bucket_uri = (
- f"s3://{sagemaker_session.default_bucket()}/datajob-ml-pipeline-abalone"
- )
- train_path = f"{sagemaker_default_bucket_uri}/train/abalone.train"
- validation_path = f"{sagemaker_default_bucket_uri}/validation/abalone.validation"
- test_path = f"{sagemaker_default_bucket_uri}/test/abalone.test"
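+    # write the train/validation/test sets to the data bucket provisioned by this datajob stack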
+ train_path = f"s3://{djs.context.data_bucket_name}/train/abalone.train"
+ validation_path = (
+ f"s3://{djs.context.data_bucket_name}/validation/abalone.validation"
+ )
+ test_path = f"s3://{djs.context.data_bucket_name}/test/abalone.test"
prepare_dataset_step = GlueJob(
datajob_stack=djs,
@@ -48,8 +44,7 @@
train_instance_count=1,
train_instance_type="ml.m4.4xlarge",
train_volume_size=5,
- output_path=f"{sagemaker_default_bucket_uri}/single-xgboost",
- sagemaker_session=sagemaker_session,
+ output_path=f"s3://{djs.context.data_bucket_name}/single-xgboost",
)
xgb.set_hyperparameters(
diff --git a/examples/ml_pipeline_abalone/jobs/prepare_dataset.py b/examples/ml_pipeline_end_to_end/jobs/prepare_dataset.py
similarity index 100%
rename from examples/ml_pipeline_abalone/jobs/prepare_dataset.py
rename to examples/ml_pipeline_end_to_end/jobs/prepare_dataset.py