From 0a65804b0b2ba4c5aaf94c94941cb1cb10cf0eee Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 13:32:43 +0200 Subject: [PATCH 01/18] update readme --- README.md | 12 +++++++++++- examples/ml_pipeline_abalone/datajob_stack.py | 17 ++++++----------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index acf00ca..588d208 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@
- Create and deploying code to python shell / pyspark **AWS Glue jobs**. -- Create, orchestrate and trigger **Sagemaker Traning Jobs and Processing Jobs**. +- Use **AWS Sagemaker** to create ML Models. - Orchestrate the above jobs using **AWS Stepfunctions** as simple as `task1 >> task2` - Let us [know](https://github.com/vincentclaes/datajob/discussions) **what you want to see next**. @@ -103,6 +103,16 @@ The terminal will show a link to the step functions page to follow up on your pi cdk destroy --app "python datajob_stack.py" ``` +# Examples + +- [Data pipeline with parallel steps](./examples/data_pipeline_parallel/) +- [Data pipeline for processing big data using PySpark](./examples/data_pipeline_pyspark/) +- [Data pipeline where you package and ship your project as a wheel](./examples/data_pipeline_with_packaged_project/) +- [Machine Learning pipeline where we combine glue jobs with sagemaker](./examples/ml_pipeline_abalone) + +All our examples are in [./examples](./examples) + + # Functionality
diff --git a/examples/ml_pipeline_abalone/datajob_stack.py b/examples/ml_pipeline_abalone/datajob_stack.py index 08412e8..e764edf 100644 --- a/examples/ml_pipeline_abalone/datajob_stack.py +++ b/examples/ml_pipeline_abalone/datajob_stack.py @@ -18,16 +18,12 @@ with DataJobStack(scope=app, id="datajob-ml-pipeline-abalone") as djs: sagemaker_default_role = get_default_sagemaker_role(datajob_stack=djs) - sagemaker_session = sagemaker.Session( - boto_session=boto3.session.Session(region_name=djs.env.region) - ) - sagemaker_default_bucket_uri = ( - f"s3://{sagemaker_session.default_bucket()}/datajob-ml-pipeline-abalone" - ) - train_path = f"{sagemaker_default_bucket_uri}/train/abalone.train" - validation_path = f"{sagemaker_default_bucket_uri}/validation/abalone.validation" - test_path = f"{sagemaker_default_bucket_uri}/test/abalone.test" + train_path = f"s3://{djs.context.data_bucket_name}/train/abalone.train" + validation_path = ( + f"s3://{djs.context.data_bucket_name}/validation/abalone.validation" + ) + test_path = f"s3://{djs.context.data_bucket_name}/test/abalone.test" prepare_dataset_step = GlueJob( datajob_stack=djs, @@ -48,8 +44,7 @@ train_instance_count=1, train_instance_type="ml.m4.4xlarge", train_volume_size=5, - output_path=f"{sagemaker_default_bucket_uri}/single-xgboost", - sagemaker_session=sagemaker_session, + output_path=f"s3://{djs.context.data_bucket_name}/single-xgboost", ) xgb.set_hyperparameters( From 99659c8a758729544dfaa3e01a98e40b0109b790 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 13:57:33 +0200 Subject: [PATCH 02/18] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 588d208..48c5bd2 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,9 @@ - Let us [know](https://github.com/vincentclaes/datajob/discussions) **what you want to see next**.
+ :new: :rocket: -> Dependencies are [AWS CDK](https://github.com/aws/aws-cdk) and [Step Functions SDK for data science](https://github.com/aws/aws-step-functions-data-science-sdk-python)
+[Checkout how we build a Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](./examples/ml_pipeline_abalone)
From 49763c49ee00e45635342e672da865a996978e88 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 14:01:44 +0200 Subject: [PATCH 03/18] update readme --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 48c5bd2..8126580 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,10 @@ - Let us [know](https://github.com/vincentclaes/datajob/discussions) **what you want to see next**.
- :new: :rocket: -[Checkout how we build a Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](./examples/ml_pipeline_abalone) + :new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: + +[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](./examples/ml_pipeline_abalone)
From 611dbce3dc12c5cc1ce9ca418d08cf46d0d4fbf8 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 14:04:45 +0200 Subject: [PATCH 04/18] update example ml pipeline --- README.md | 4 ++-- .../README.md | 4 ++-- .../assets/sagemaker-endpoint.png | Bin .../assets/stepfunctions-workflow.png | Bin .../datajob_stack.py | 0 .../jobs/prepare_dataset.py | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename examples/{ml_pipeline_abalone => ml_pipeline_end_to_end}/README.md (97%) rename examples/{ml_pipeline_abalone => ml_pipeline_end_to_end}/assets/sagemaker-endpoint.png (100%) rename examples/{ml_pipeline_abalone => ml_pipeline_end_to_end}/assets/stepfunctions-workflow.png (100%) rename examples/{ml_pipeline_abalone => ml_pipeline_end_to_end}/datajob_stack.py (100%) rename examples/{ml_pipeline_abalone => ml_pipeline_end_to_end}/jobs/prepare_dataset.py (100%) diff --git a/README.md b/README.md index 8126580..9a656d9 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ :new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: -[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](./examples/ml_pipeline_abalone) +[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
@@ -110,7 +110,7 @@ cdk destroy --app "python datajob_stack.py" - [Data pipeline with parallel steps](./examples/data_pipeline_parallel/) - [Data pipeline for processing big data using PySpark](./examples/data_pipeline_pyspark/) - [Data pipeline where you package and ship your project as a wheel](./examples/data_pipeline_with_packaged_project/) -- [Machine Learning pipeline where we combine glue jobs with sagemaker](./examples/ml_pipeline_abalone) +- [Machine Learning pipeline where we combine glue jobs with sagemaker](examples/ml_pipeline_end_to_end) All our examples are in [./examples](./examples) diff --git a/examples/ml_pipeline_abalone/README.md b/examples/ml_pipeline_end_to_end/README.md similarity index 97% rename from examples/ml_pipeline_abalone/README.md rename to examples/ml_pipeline_end_to_end/README.md index 1ea5fa1..97857f4 100644 --- a/examples/ml_pipeline_abalone/README.md +++ b/examples/ml_pipeline_end_to_end/README.md @@ -1,4 +1,4 @@ -# ML Pipeline Scikitlearn +# End to End Machine Learning Pipeline > This example is an implementation of datajob of [an official aws sagemaker example.](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/machine_learning_workflow_abalone/machine_learning_workflow_abalone.ipynb) @@ -17,7 +17,7 @@ we have 5 steps in our ML pipeline: ## Deploy - cd examples/ml_pipeline_abalone + cd examples/ml_pipeline_end_to_end export AWS_PROFILE=my-profile export AWS_DEFAULT_REGION=eu-west-1 cdk deploy --app "python datajob_stack.py" --require-approval never diff --git a/examples/ml_pipeline_abalone/assets/sagemaker-endpoint.png b/examples/ml_pipeline_end_to_end/assets/sagemaker-endpoint.png similarity index 100% rename from examples/ml_pipeline_abalone/assets/sagemaker-endpoint.png rename to examples/ml_pipeline_end_to_end/assets/sagemaker-endpoint.png diff --git a/examples/ml_pipeline_abalone/assets/stepfunctions-workflow.png b/examples/ml_pipeline_end_to_end/assets/stepfunctions-workflow.png similarity index 100% rename from examples/ml_pipeline_abalone/assets/stepfunctions-workflow.png rename to examples/ml_pipeline_end_to_end/assets/stepfunctions-workflow.png diff --git a/examples/ml_pipeline_abalone/datajob_stack.py b/examples/ml_pipeline_end_to_end/datajob_stack.py similarity index 100% rename from examples/ml_pipeline_abalone/datajob_stack.py rename to examples/ml_pipeline_end_to_end/datajob_stack.py diff --git a/examples/ml_pipeline_abalone/jobs/prepare_dataset.py b/examples/ml_pipeline_end_to_end/jobs/prepare_dataset.py similarity index 100% rename from examples/ml_pipeline_abalone/jobs/prepare_dataset.py rename to examples/ml_pipeline_end_to_end/jobs/prepare_dataset.py From 20c08f1429253a0a780748651d1a76da72d5d6d5 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 14:34:56 +0200 Subject: [PATCH 05/18] update ml pipeline example --- examples/ml_pipeline_end_to_end/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/ml_pipeline_end_to_end/README.md b/examples/ml_pipeline_end_to_end/README.md index 97857f4..aff2bcb 100644 --- a/examples/ml_pipeline_end_to_end/README.md +++ b/examples/ml_pipeline_end_to_end/README.md @@ -17,6 +17,9 @@ we have 5 steps in our ML pipeline: ## Deploy +> !!! Don't forget to pull down the sagemaker endpoint that is created at the end of the pipeline. + + cd examples/ml_pipeline_end_to_end export AWS_PROFILE=my-profile export AWS_DEFAULT_REGION=eu-west-1 @@ -31,7 +34,7 @@ we have 5 steps in our ML pipeline: arn:aws:cloudformation:eu-west-1:077590795309:stack/datajob-ml-pipeline-abalone/e179ec30-f45a-11eb-9731-02575f1b7adf -execute the ml pipeline +## Execute datajob execute --state-machine datajob-ml-pipeline-abalone-workflow From 90feffa2f8ea995f5e387a0428617cd4faab009f Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 14:38:58 +0200 Subject: [PATCH 06/18] refactor sagemaker. we want to split base and sagemaker implementation classes --- README.md | 2 +- examples/ml_pipeline_end_to_end/README.md | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9a656d9..91e206c 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ cd examples/data_pipeline_simple cdk deploy --app "python datajob_stack.py" ``` -### Run +### Execute ```shell script datajob execute --state-machine data-pipeline-simple-workflow diff --git a/examples/ml_pipeline_end_to_end/README.md b/examples/ml_pipeline_end_to_end/README.md index aff2bcb..2b13450 100644 --- a/examples/ml_pipeline_end_to_end/README.md +++ b/examples/ml_pipeline_end_to_end/README.md @@ -59,6 +59,11 @@ In the end a sagemaker endpoint is created: ![sagemaker-endpoint.png](assets/sagemaker-endpoint.png) In our example the name of the endpoint is `datajob-ml-pipeline-abalone-create-endpoint-20210803T165017` -Pull down the sagemaker endpoint by executing the following command: + +## Destroy + + cdk destroy --app "python datajob_stack.py" + +Don't forget to pull down the sagemaker endpoint: aws sagemaker delete-endpoint --endpoint-name datajob-ml-pipeline-abalone-create-endpoint-20210803T165017 From e2e5a49799d084922140ffdadb68d71b36fb07e6 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:17:35 +0200 Subject: [PATCH 07/18] update readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 91e206c..6e75a2e 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,12 @@
+
+ :new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: +
+ [End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
From ad8b866680ba1d7234fc101be2d39e9f12e1e6ba Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:18:08 +0200 Subject: [PATCH 08/18] update readme --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e75a2e..5e63914 100644 --- a/README.md +++ b/README.md @@ -20,11 +20,10 @@
:new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: +[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
-[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) -
# Installation From eb401263a88c7b36b4049caf183ac2f329f9393d Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:21:34 +0200 Subject: [PATCH 09/18] update readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5e63914..6d07c26 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@
:new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: +
[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
From 430acc1351f4c6641064107776653da5481ad87d Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:25:10 +0200 Subject: [PATCH 10/18] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6d07c26..8e28292 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@
:new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: -
+

[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
From be748ef01cf7f07ca044035f69f3a5c5642760be Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:30:32 +0200 Subject: [PATCH 11/18] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e28292..331b9d7 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ :new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket:

-[End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) +:arrow_right: [End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :arrow_left: From b2996c5c25f5a9ee34be57e393d258e97b578cfc Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:31:37 +0200 Subject: [PATCH 12/18] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 331b9d7..f7e0177 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@
- :new: :rocket: Checkout our new example combining all the AWS services we currently support :new: :rocket: + :new: :rocket: :arrow_right: [End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :arrow_left: :new: :rocket:

-:arrow_right: [End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :arrow_left: +
From b2d5f8a914c952a0cb760d2c7172f63c0552d6d2 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:33:31 +0200 Subject: [PATCH 13/18] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f7e0177..be7003a 100644 --- a/README.md +++ b/README.md @@ -19,9 +19,9 @@
- :new: :rocket: :arrow_right: [End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :arrow_left: :new: :rocket: -

+ :new: :rocket: [Checkout our new example of an End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :rocket: :new: +

From f06660fccd3671f6a7825e4ba2a7ba4f21d60302 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:34:47 +0200 Subject: [PATCH 14/18] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index be7003a..f62b7d2 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@
- :new: :rocket: [Checkout our new example of an End-to-end Machine Learning Pipeline using Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :rocket: :new: + :new: :rocket: [Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :rocket: :new:

From 634e0581060de416260431512601607cf2efeecc Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:35:07 +0200 Subject: [PATCH 15/18] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f62b7d2..143c426 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@
- :new: :rocket: [Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :rocket: :new: + :new: [Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :new:

From 434044219e746c3bb5bf583c3a4a9760095e2eaa Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:36:14 +0200 Subject: [PATCH 16/18] update readme --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 143c426..765111c 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,11 @@
- :new: [Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) :new: + :new: :rocket: +
+[Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end) +
+:rocket: :new:

From 9ec76a62e0f4fe97a6f832fa1d04d1bfbd1e756e Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:36:37 +0200 Subject: [PATCH 17/18] update readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 765111c..6168a7a 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,11 @@
- :new: :rocket: + :rocket: :new: :rocket:
[Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
-:rocket: :new: +:rocket: :new: :rocket:

From f3f4929d7c8e60e27f3b08943bf2cefc21242566 Mon Sep 17 00:00:00 2001 From: vincent Date: Wed, 11 Aug 2021 15:36:55 +0200 Subject: [PATCH 18/18] update readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6168a7a..a1ea5b3 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,10 @@ :rocket: :new: :rocket:
+
[Check our new example of an End-to-end Machine Learning Pipeline with Glue, Sagemaker and Stepfunctions](examples/ml_pipeline_end_to_end)
+
:rocket: :new: :rocket: