diff --git a/.bumpversion.cfg b/.bumpversion.cfg index d012720..a6982b0 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1 [bumpversion:file:pyproject.toml] diff --git a/.copier-answers.yml b/.copier-answers.yml index b92ed36..b9a7809 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -7,7 +7,7 @@ description: Kedro plugin with Azure ML Pipelines support docs_url: https://kedro-azureml.readthedocs.io/ full_name: Kedro Azure ML Pipelines plugin github_url: https://github.com/getindata/kedro-azureml -initial_version: 0.4.0 +initial_version: 0.4.1 keywords: - kedro - mlops diff --git a/CHANGELOG.md b/CHANGELOG.md index 18feac0..a0ddadb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ ## [Unreleased] +## [0.4.1] - 2023-05-04 + +- [📝 Docs] Revamp the quickstart guide in documentation +- Refactor `kedro azureml init` command to be more user-friendly +- Add dependency on `kedro-datasets` to prepare for Kedro `0.19.0`; Remove `kedro.datasets.*` imports + ## [0.4.0] - 2023-04-28 - [🧑‍🔬 Experimental ] Added support for pipeline-native data passing (allows to preview intermediate data in AzureML Studio UI) by [@tomasvanpottelbergh](https://github.com/tomasvanpottelbergh) @@ -56,7 +62,9 @@ - Initial plugin release -[Unreleased]: https://github.com/getindata/kedro-azureml/compare/0.4.0...HEAD +[Unreleased]: https://github.com/getindata/kedro-azureml/compare/0.4.1...HEAD + +[0.4.1]: https://github.com/getindata/kedro-azureml/compare/0.4.0...0.4.1 [0.4.0]: https://github.com/getindata/kedro-azureml/compare/0.3.6...0.4.0 diff --git a/docs/conf.py b/docs/conf.py index 7160c77..3be8d5c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -6,6 +6,8 @@ # -- Path setup -------------------------------------------------------------- +import datetime as dt + # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. @@ -20,7 +22,7 @@ # -- Project information ----------------------------------------------------- project = "Kedro Azure ML Plugin" -copyright = "2022, GetInData" +copyright = f"{dt.datetime.utcnow().year}, GetInData" author = "GetInData" # The full version, including alpha/beta/rc tags @@ -58,7 +60,7 @@ # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] -autodoc_mock_imports = ["azureml", "pandas"] +autodoc_mock_imports = ["azureml", "pandas", "backoff", "cloudpickle"] # -- Options for HTML output ------------------------------------------------- diff --git a/docs/index.rst b/docs/index.rst index 873c604..b9c747d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,8 @@ Welcome to Kedro Azure ML Pipelines plugin documentation! 
Introduction Installation Quickstart - Data Assets + MLflow Integration + Data Assets Development diff --git a/docs/source/03_quickstart.rst b/docs/source/03_quickstart.rst index 5071136..c80a0e4 100644 --- a/docs/source/03_quickstart.rst +++ b/docs/source/03_quickstart.rst @@ -1,5 +1,9 @@ +========== Quickstart ----------- +========== + +Video-tutorial +-------------- You can go through the written quickstart here or watch the video on YouTube: @@ -10,6 +14,9 @@ YouTube: ---- +Prerequisites +------------- + Before you start, make sure that you have the following resources created in Azure and have their **names** ready to input to the plugin: @@ -18,35 +25,38 @@ created in Azure and have their **names** ready to input to the plugin: - Azure ML workspace - Azure ML Compute Cluster -Depending on the type of flow you want to use, you will also need: +Depending on the type of flow you want to use, you might also need: - Azure Storage Account and Storage Container - Azure Storage Key (will be used to execute the pipeline) - Azure Container Registry +Project initialization +---------------------- + 1. Make sure that you're logged into Azure (``az login``). 2. Prepare new virtual environment with Python >=3.8. Install the packages -.. code:: console + .. code:: console - pip install "kedro>=0.18.2,<0.19" "kedro-docker" "kedro-azureml" + pip install "kedro>=0.18.5,<0.19" "kedro-docker" "kedro-azureml" 2. Create new project (e.g. from starter) -.. code:: console + .. code:: console - kedro new --starter=spaceflights + kedro new --starter=spaceflights - Project Name - ============ - Please enter a human readable name for your new project. - Spaces, hyphens, and underscores are allowed. - [Spaceflights]: kedro_azureml_demo + Project Name + ============ + Please enter a human readable name for your new project. + Spaces, hyphens, and underscores are allowed. + [Spaceflights]: kedro_azureml_demo - The project name 'kedro_azureml_demo' has been applied to: - - The project title in /Users/marcin/Dev/tmp/kedro-azureml-demo/README.md - - The folder created for your project in /Users/marcin/Dev/tmp/kedro-azureml-demo - - The project's python package in /Users/marcin/Dev/tmp/kedro-azureml-demo/src/kedro_azureml_demo + The project name 'kedro_azureml_demo' has been applied to: + - The project title in /Users/marcin/Dev/tmp/kedro-azureml-demo/README.md + - The folder created for your project in /Users/marcin/Dev/tmp/kedro-azureml-demo + - The project's python package in /Users/marcin/Dev/tmp/kedro-azureml-demo/src/kedro_azureml_demo 3. Go to the project's directory: ``cd kedro-azureml-demo`` 4. Add ``kedro-azureml`` to ``src/requirements.txt`` @@ -54,236 +64,252 @@ Depending on the type of flow you want to use, you will also need: or set appropriate settings (`https://github.com/kedro-org/kedro-plugins/tree/main/kedro-telemetry `__). 6. Install the requirements ``pip install -r src/requirements.txt`` -7. Initialize Kedro Azure ML plugin, it requires the Azure resource - names as stated above. Experiment name can be anything you like (as - long as it's allowed by Azure ML). The environment name is the name - of the Azure ML Environment to be created in the next steps. You can - use the syntax ``@latest`` for the latest version or - ``:`` for a specific version. +7. Initialize Kedro Azure ML plugin, it requires the Azure resource names as stated above. Experiment name can be anything you like (as + long as it's allowed by Azure ML). 
+ + There are two options, which determine how you should initialize the plugin (don't worry, you can change it later 👍 ): + 1. Use docker image flow (shown in the Quickstart video) - more suitable for MLOps processes with better experiment repeatability guarantees + 2. Use code upload flow - more suitable for Data Scientists' fast experimentation and pipeline development .. code:: console - #Usage: kedro azureml init [OPTIONS] SUBSCRIPTION_ID RESOURCE_GROUP WORKSPACE_NAME - # EXPERIMENT_NAME CLUSTER_NAME STORAGE_ACCOUNT_NAME - # STORAGE_CONTAINER ENVIRONMENT_NAME - kedro azureml init + Usage: kedro azureml init [OPTIONS] SUBSCRIPTION_ID RESOURCE_GROUP + WORKSPACE_NAME EXPERIMENT_NAME CLUSTER_NAME + + Creates basic configuration for Kedro AzureML plugin + + Options: + --azureml-environment, --aml-env TEXT + Azure ML environment to use with code flow + -d, --docker-image TEXT Docker image to use + -a, --storage-account-name TEXT + Name of the storage account (if you want to + use Azure Blob Storage for temporary data) + -c, --storage-container TEXT Name of the storage container (if you want + to use Azure Blob Storage for temporary + data) + --use-pipeline-data-passing (flag) Set, to use EXPERIMENTAL pipeline + data passing + +For **docker image flow** (1.), use the following ``init`` command: + + .. code:: console + + kedro azureml init \ + --docker-image .azurecr.io/:latest -a -c + + +For **code upload flow** (2.), use the following ``init`` command: + + .. code:: console + + kedro azureml init \ + --aml-env .azurecr.io/:latest -a -c + +.. note:: + If you want to pass data between nodes using the built-in Azure ML pipeline data passing, specify + option ``--use-pipeline-data-passing`` instead of `-a` and `-c` options. -If you want to pass data between nodes using the built-in Azure ML -pipeline data passing, you can use dummy values for the storage account -and container names. In this case, adjust the ``conf/base/azureml.yml`` -to enable pipeline data passing. See :doc:`04_data_assets` for more -information about this. + Note that pipeline data passing feature is experimental 🧑‍🔬 See :doc:`04_data_assets` for more information about this. + +Adjusting the Data Catalog +-------------------------- 8. Adjust the Data Catalog - the default one stores all data locally, - whereas the plugin will automatically use Azure Blob Storage. Only - input data is required to be read locally. Final - ``conf/base/catalog.yml`` should look like this: + whereas the plugin will automatically use Azure Blob Storage / Azure ML built-in storage (if *pipeline data passing* was enabled). Only + input data is required to be read locally. -.. code:: yaml + Final ``conf/base/catalog.yml`` should look like this: - companies: - type: pandas.CSVDataSet - filepath: data/01_raw/companies.csv - layer: raw + .. code:: yaml - reviews: - type: pandas.CSVDataSet - filepath: data/01_raw/reviews.csv - layer: raw + companies: + type: pandas.CSVDataSet + filepath: data/01_raw/companies.csv + layer: raw - shuttles: - type: pandas.ExcelDataSet - filepath: data/01_raw/shuttles.xlsx - layer: raw + reviews: + type: pandas.CSVDataSet + filepath: data/01_raw/reviews.csv + layer: raw -9. Prepare an Azure ML Environment for the project: + shuttles: + type: pandas.ExcelDataSet + filepath: data/01_raw/shuttles.xlsx + layer: raw - For the project's code to run on Azure ML it needs to have an environment - with the necessary dependencies. 
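Before picking a deployment option below, it can help to peek at what ``kedro azureml init`` wrote to ``conf/base/azureml.yml``. The snippet below is a minimal sketch, not part of the plugin: it assumes PyYAML is installed and that the generated file uses the ``azure.*`` / ``docker.*`` keys visible in the plugin's configuration template elsewhere in this changeset.

.. code:: python

    # Minimal sketch: report which flow conf/base/azureml.yml is set up for.
    # Assumes PyYAML and the azure.*/docker.* keys used by the plugin's template.
    from pathlib import Path

    import yaml

    cfg = yaml.safe_load(Path("conf/base/azureml.yml").read_text())
    azure = cfg.get("azure") or {}
    docker = cfg.get("docker") or {}

    if docker.get("image"):
        print(f"Docker image flow, image: {docker['image']}")
    elif azure.get("environment_name"):
        print(f"Code upload flow, Azure ML environment: {azure['environment_name']}")
    else:
        print("Neither docker.image nor azure.environment_name is set - re-run init")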
+Pick your deployment option +--------------------------- +For the project's code to run on Azure ML it needs to have an environment +with the necessary dependencies. -You have 2 options for executing your pipeline in Azure ML - 1. Use code upload (default) - more suitable for Data Scientists' experimentation and pipeline development - 2. Use docker image flow (shown in the Quickstart video) - more suitable for MLOps processes with better experiment repeatability guarantees -Start by executing the following command: +9. Start by executing the following command: -.. code:: console + .. code:: console - kedro docker init + kedro docker init -This command creates a several files, including ``Dockerfile`` and -``.dockerignore``. These can be adjusted to match the workflow for -your project. + This command creates a several files, including ``Dockerfile`` and ``.dockerignore``. These can be adjusted to match the workflow for your project. Depending on whether you want to use code upload when submitting an experiment or not, you would need to add the code and any possible input data to the Docker image. -9.1. **If using code upload** (default) +(Option 1) Docker image flow +**************************** +This option is also shown in the video-tutorial above. -Everything apart from the section "install project requirements" -can be removed from the ``Dockerfile``. This plugin automatically creates empty ``.amlignore`` file (`see the official docs `__) -which means that all of the files (including potentially sensitive ones!) will be uploaded to Azure ML. Modify this file if needed. +.. note:: + | Note that using docker image flow means that every time you change your pipeline's code, + | you will need to build and push the docker image to ACR again. + | We recommend this option for CI/CD-automated MLOps workflows. -Ensure ``code_directory: "."`` is set in the ``azureml.yml`` config file (it's set by default). +10. Ensure that in the ``azureml.yml`` you have ``code_directory`` set to null, and ``docker.image`` is filled: + .. code:: yaml -.. collapse:: See example Dockerfile for code upload flow + code_directory: ~ + # rest of the azureml.yml file + docker: + image: your-container-registry.azurecr.io/kedro-azureml:latest - .. code-block:: dockerfile +11. Adjust the ``.dockerignore`` file to include any other files to be added to the Docker image, such as ``!data/01_raw`` for the raw data files. - ARG BASE_IMAGE=python:3.9 - FROM $BASE_IMAGE +12. Invoke docker build: - # install project requirements - COPY src/requirements.txt /tmp/requirements.txt - RUN pip install -r /tmp/requirements.txt && rm -f /tmp/requirements.txt + .. code:: console -\ + kedro docker build --docker-args "--build-arg=BASE_IMAGE=python:3.9" --image= -\Build the image: +13. Once finished, login to ACR: -.. code:: console + .. code:: console - kedro docker build --docker-args "--build-arg=BASE_IMAGE=python:3.9" --image=.azurecr.io/kedro-base-image:latest + az acr login --name -\Login to ACR and push the image: + \and push the image: -.. code:: console + .. code:: console - az acr login --name - docker push .azurecr.io/kedro-base-image:latest + docker push -\Register the Azure ML Environment: +(Option 2) Code upload flow +*************************** -.. code:: console +10. Everything apart from the section *install project requirements* +can be removed from the ``Dockerfile``. 
- az ml environment create --name --image .azurecr.io/kedro-base-image:latest + This plugin automatically creates empty ``.amlignore`` file (`see the official docs `__) + which means that all of the files (including potentially sensitive ones!) will be uploaded to Azure ML. Modify this file if needed. -\ -Now you can re-use this environment and run the pipeline without the need to build the docker image again (unless you add some dependencies to your environment, obviously :-) ). + .. collapse:: See example Dockerfile for code upload flow -.. warning:: - | Azure Code upload feature has issues with empty folders as identified in `GitHub #33 `__, where empty folders or folders with empty files might not get uploaded to Azure ML, which might result in the failing pipeline. - | We recommend to: - | - make sure that Kedro environments you intent to use in Azure have at least one non-empty file specified - | - gracefully handle folder creation in your pipeline's code (e.g. if your code depends on an existence of some folder) - | - | The plugin will do it's best to handle some of the edge-cases, but the fact that some of your files might not be captured by Azure ML SDK is out of our reach. + .. code-block:: dockerfile + ARG BASE_IMAGE=python:3.9 + FROM $BASE_IMAGE -9.2. **If using docker image flow** (shown in the Quickstart video) + # install project requirements + COPY src/requirements.txt /tmp/requirements.txt + RUN pip install -r /tmp/requirements.txt && rm -f /tmp/requirements.txt -.. note:: - | Note that using docker image flow means that every time you change your pipeline's code, - | you will need to build and push the docker image to ACR again. - | We recommend this option for CI/CD-automated MLOps workflows. +11. Ensure ``code_directory: "."`` is set in the ``azureml.yml`` config file (it's set if you've used ``--aml_env`` during ``init`` above). -Ensure that in the ``azureml.yml`` you have ``code_directory`` set to null, and ``docker.image`` is filled: -.. code:: yaml - code_directory: ~ - # rest of the azureml.yml file - docker: - image: your-container-registry.azurecr.io/kedro-azureml:latest -\ -Keep the sections in the ``Dockerfile`` and adjust the ``.dockerignore`` -file to include any other files to be added to the Docker image, -such as ``!data/01_raw`` for the raw data files. +12. Build the image: -Invoke docker build: + .. code:: console -.. code:: console + kedro docker build --docker-args "--build-arg=BASE_IMAGE=python:3.9" --image=.azurecr.io/kedro-base-image:latest - kedro docker build --docker-args "--build-arg=BASE_IMAGE=python:3.9" --image= +12. Login to ACR and push the image: -\Once finished, login to ACR: + .. code:: console -.. code:: console + az acr login --name + docker push .azurecr.io/kedro-base-image:latest - az acr login --name +13. Register the Azure ML Environment: -\and push the image: + .. code:: console -.. code:: console + az ml environment create --name --image .azurecr.io/kedro-base-image:latest + +\ +Now you can re-use this environment and run the pipeline without the need to build the docker image again (unless you add some dependencies to your environment, obviously 😉 ). + +.. warning:: + | Azure Code upload feature has issues with empty folders as identified in `GitHub #33 `__, where empty folders or folders with empty files might not get uploaded to Azure ML, which might result in the failing pipeline. 
+ | We recommend to: + | - make sure that Kedro environments you intent to use in Azure have at least one non-empty file specified + | - gracefully handle folder creation in your pipeline's code (e.g. if your code depends on an existence of some folder) + | + | The plugin will do it's best to handle some of the edge-cases, but the fact that some of your files might not be captured by Azure ML SDK is out of our reach. - docker push +Run the pipeline +---------------- -10. Run the pipeline on Azure ML Pipelines. Here, the *Azure Subscription ID* and *Storage Account Key* will be used: +14. Run the pipeline on Azure ML Pipelines. Here, the *Azure Subscription ID* and *Storage Account Key* will be used: -.. code:: console + .. code:: console - kedro azureml run -s + kedro azureml run -You will most likely see the following prompt: + If you're using Azure Blob Storage for temporary data (``-a``, ``-c`` options during init), you will most likely see the following prompt: -.. code:: console + .. code:: console - Environment variable AZURE_STORAGE_ACCOUNT_KEY not set, falling back to CLI prompt - Please provide Azure Storage Account Key for storage account : + Environment variable AZURE_STORAGE_ACCOUNT_KEY not set, falling back to CLI prompt + Please provide Azure Storage Account Key for storage account : -Input the storage account key and press [ENTER] (input will be hidden). + Input the storage account key and press [ENTER] (input will be hidden). + + If you're using *pipeline data passing* (``--use-pipeline-data-passing`` option during init), you're already set. 11. Plugin will verify the configuration (e.g. the existence of the compute cluster) and then it will create a *Job* in the Azure ML. The URL to view the job will be displayed in the console output. -12. (optional) You can also use - ``kedro azureml run -s --wait-for-completion`` +12. (optional) You can also use |br| ``kedro azureml run -s --wait-for-completion`` |br| to actively wait for the job to finish. Execution logs will be streamed to the console. -.. code:: console - - RunId: placid_pot_bdcyntnkvn - Web View: https://ml.azure.com/runs/placid_pot_bdcyntnkvn?wsid=/subscriptions//resourcegroups//workspaces/ml-ops-sandbox - - Streaming logs/azureml/executionlogs.txt - ======================================== - - [2022-07-22 11:45:38Z] Submitting 2 runs, first five are: 1ee5f43f:8cf2e387-e7ec-44cc-9615-2108891153f7,7d81aeeb:c8b837a9-1f79-4971-aae3-3191b29b42e8 - [2022-07-22 11:47:02Z] Completing processing run id c8b837a9-1f79-4971-aae3-3191b29b42e8. - [2022-07-22 11:47:25Z] Completing processing run id 8cf2e387-e7ec-44cc-9615-2108891153f7. - [2022-07-22 11:47:26Z] Submitting 1 runs, first five are: 362b9632:7867ead0-b308-49df-95ca-efa26f8583cb - [2022-07-22 11:49:27Z] Completing processing run id 7867ead0-b308-49df-95ca-efa26f8583cb. - [2022-07-22 11:49:28Z] Submitting 2 runs, first five are: 03b2293e:e9e210e7-10ab-4010-91f6-4a40aabf3a30,4f9ccafb:3c00e735-cd3f-40c7-9c1d-fe53349ca8bc - [2022-07-22 11:50:50Z] Completing processing run id e9e210e7-10ab-4010-91f6-4a40aabf3a30. - [2022-07-22 11:50:51Z] Submitting 1 runs, first five are: 7a88df7a:c95c1488-5f55-48fa-80ce-971d5412f0fb - [2022-07-22 11:51:26Z] Completing processing run id 3c00e735-cd3f-40c7-9c1d-fe53349ca8bc. - [2022-07-22 11:51:26Z] Submitting 1 runs, first five are: a79effc8:0828c39a-6f02-43f5-acfd-33543f0d6c74 - [2022-07-22 11:52:38Z] Completing processing run id c95c1488-5f55-48fa-80ce-971d5412f0fb. 
- [2022-07-22 11:52:39Z] Submitting 1 runs, first five are: 0a18d6d6:cb9c8f61-e129-4394-a795-ab70be74eb0f - [2022-07-22 11:53:03Z] Completing processing run id 0828c39a-6f02-43f5-acfd-33543f0d6c74. - [2022-07-22 11:53:04Z] Submitting 1 runs, first five are: 1af5c8de:2821dc44-3399-4a26-9cdf-1e8f5b7d6b62 - [2022-07-22 11:53:28Z] Completing processing run id cb9c8f61-e129-4394-a795-ab70be74eb0f. - [2022-07-22 11:53:51Z] Completing processing run id 2821dc44-3399-4a26-9cdf-1e8f5b7d6b62. - - Execution Summary - ================= - RunId: placid_pot_bdcyntnkvn + .. code:: console + + RunId: placid_pot_bdcyntnkvn + Web View: https://ml.azure.com/runs/placid_pot_bdcyntnkvn?wsid=/subscriptions//resourcegroups//workspaces/ml-ops-sandbox + + Streaming logs/azureml/executionlogs.txt + ======================================== + + [2022-07-22 11:45:38Z] Submitting 2 runs, first five are: 1ee5f43f:8cf2e387-e7ec-44cc-9615-2108891153f7,7d81aeeb:c8b837a9-1f79-4971-aae3-3191b29b42e8 + [2022-07-22 11:47:02Z] Completing processing run id c8b837a9-1f79-4971-aae3-3191b29b42e8. + [2022-07-22 11:47:25Z] Completing processing run id 8cf2e387-e7ec-44cc-9615-2108891153f7. + [2022-07-22 11:47:26Z] Submitting 1 runs, first five are: 362b9632:7867ead0-b308-49df-95ca-efa26f8583cb + [2022-07-22 11:49:27Z] Completing processing run id 7867ead0-b308-49df-95ca-efa26f8583cb. + [2022-07-22 11:49:28Z] Submitting 2 runs, first five are: 03b2293e:e9e210e7-10ab-4010-91f6-4a40aabf3a30,4f9ccafb:3c00e735-cd3f-40c7-9c1d-fe53349ca8bc + [2022-07-22 11:50:50Z] Completing processing run id e9e210e7-10ab-4010-91f6-4a40aabf3a30. + [2022-07-22 11:50:51Z] Submitting 1 runs, first five are: 7a88df7a:c95c1488-5f55-48fa-80ce-971d5412f0fb + [2022-07-22 11:51:26Z] Completing processing run id 3c00e735-cd3f-40c7-9c1d-fe53349ca8bc. + [2022-07-22 11:51:26Z] Submitting 1 runs, first five are: a79effc8:0828c39a-6f02-43f5-acfd-33543f0d6c74 + [2022-07-22 11:52:38Z] Completing processing run id c95c1488-5f55-48fa-80ce-971d5412f0fb. + [2022-07-22 11:52:39Z] Submitting 1 runs, first five are: 0a18d6d6:cb9c8f61-e129-4394-a795-ab70be74eb0f + [2022-07-22 11:53:03Z] Completing processing run id 0828c39a-6f02-43f5-acfd-33543f0d6c74. + [2022-07-22 11:53:04Z] Submitting 1 runs, first five are: 1af5c8de:2821dc44-3399-4a26-9cdf-1e8f5b7d6b62 + [2022-07-22 11:53:28Z] Completing processing run id cb9c8f61-e129-4394-a795-ab70be74eb0f. + [2022-07-22 11:53:51Z] Completing processing run id 2821dc44-3399-4a26-9cdf-1e8f5b7d6b62. + + Execution Summary + ================= + RunId: placid_pot_bdcyntnkvn |Kedro AzureML Pipeline execution| -MLflow integration ------------------- - -The plugin is compatible with ``mlflow`` (but not yet with -``kedro-mlflow``). You can use native mlflow logging capabilities -provided by Azure ML. See the guide here: -`https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-cli-runs?tabs=azuremlsdk `__. - -There is no additional configuration for MLflow required in order to use -it with Azure ML pipelines. All the settings are provided automatically -by the Azure ML service. - -|Kedro AzureML MLflow integration| - -.. |Kedro AzureML Pipeline execution| image:: ../images/azureml_running_pipeline.gif -.. 
|Kedro AzureML MLflow integration| image:: ../images/kedro-azureml-mlflow.png ------------ @@ -403,3 +429,7 @@ In case you need to customize pipeline run context, modifying configuration file - ``--pipeline`` allows to select a pipeline to run (by default, the ``__default__`` pipeline is started), - ``--params`` takes a JSON string with parameters override (JSONed version of ``conf/*/parameters.yml``, not the Kedro's ``params:`` syntax), - ``--env-var KEY=VALUE`` sets the OS environment variable injected to the steps during runtime (can be used multiple times). + +.. |br| raw:: html + +
\ No newline at end of file diff --git a/docs/source/04_mlflow.rst b/docs/source/04_mlflow.rst new file mode 100644 index 0000000..71827de --- /dev/null +++ b/docs/source/04_mlflow.rst @@ -0,0 +1,16 @@ +================== +MLflow integration +================== + +The plugin is compatible with ``mlflow``. You can use native mlflow logging capabilities +provided by Azure ML. See the guide here: +`https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-mlflow-cli-runs?tabs=azuremlsdk `__. + +There is no additional configuration for MLflow required in order to use +it with Azure ML pipelines. All the settings are provided automatically +by the Azure ML service via environment variables. + +|Kedro AzureML MLflow integration| + +.. |Kedro AzureML Pipeline execution| image:: ../images/azureml_running_pipeline.gif +.. |Kedro AzureML MLflow integration| image:: ../images/kedro-azureml-mlflow.png diff --git a/docs/source/04_data_assets.rst b/docs/source/05_data_assets.rst similarity index 91% rename from docs/source/04_data_assets.rst rename to docs/source/05_data_assets.rst index 97965ef..439d65a 100644 --- a/docs/source/04_data_assets.rst +++ b/docs/source/05_data_assets.rst @@ -1,5 +1,5 @@ Azure Data Assets ------------------ +================= ``kedro-azureml`` adds support for two new datasets that can be used in the Kedro catalog, the ``AzureMLFileDataSet`` and the ``AzureMLPandasDataSet`` which translate to `File/Folder dataset`_ and `Tabular dataset`_ respectively in Azure Machine Learning. Both fully support the Azure versioning mechanism and can be used like any other dataset in Kedro. Apart from these, ``kedro-azureml`` also adds the ``AzureMLPipelineDataSet`` which is used to pass data between -pipeline nodes when the pipeline is run on Azure ML and the `pipeline_data_passing` feature is enabled. +pipeline nodes when the pipeline is run on Azure ML and the *pipeline data passing* feature is enabled. By default, data is then saved and loaded using the ``PickleDataSet`` as the underlying dataset. Any other underlying dataset can be used instead by adding an ``AzureMLPipelineDataSet`` to the catalog. @@ -22,7 +22,7 @@ For details on usage, see the :ref:`API Reference` below .. _`API Reference`: API Reference -============= +------------- ..
autoclass:: kedro_azureml.datasets.AzureMLPandasDataSet :members: diff --git a/kedro_azureml/__init__.py b/kedro_azureml/__init__.py index a96ca08..2b12256 100644 --- a/kedro_azureml/__init__.py +++ b/kedro_azureml/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.4.0" +__version__ = "0.4.1" import warnings diff --git a/kedro_azureml/cli.py b/kedro_azureml/cli.py index cdf18a4..0cc49a3 100644 --- a/kedro_azureml/cli.py +++ b/kedro_azureml/cli.py @@ -57,10 +57,32 @@ def azureml_group(ctx, metadata: ProjectMetadata, env): @click.argument("workspace_name") @click.argument("experiment_name") @click.argument("cluster_name") -@click.argument("environment_name") -@click.option("-a", "--storage_account_name") -@click.option("-c", "--storage_container") -@click.option("--use-pipeline-data-passing", is_flag=True, default=False) +@click.option( + "--azureml-environment", + "--aml-env", + default=None, + type=str, + help="Azure ML environment to use with code flow", +) +@click.option( + "-d", "--docker-image", default=None, type=str, help="Docker image to use" +) +@click.option( + "-a", + "--storage-account-name", + help="Name of the storage account (if you want to use Azure Blob Storage for temporary data)", +) +@click.option( + "-c", + "--storage-container", + help="Name of the storage container (if you want to use Azure Blob Storage for temporary data)", +) +@click.option( + "--use-pipeline-data-passing", + is_flag=True, + default=False, + help="(flag) Set, to use EXPERIMENTAL pipeline data passing", +) @click.pass_obj def init( ctx: CliContext, @@ -69,7 +91,8 @@ def init( workspace_name, experiment_name, cluster_name, - environment_name, + azureml_environment: Optional[str], + docker_image: Optional[str], storage_account_name, storage_container, use_pipeline_data_passing: bool, @@ -78,13 +101,23 @@ def init( Creates basic configuration for Kedro AzureML plugin """ + # Check whether docker_image and azure_ml_environment are specified, they cannot be, they are mutually exclusive + if docker_image and azureml_environment: + raise click.UsageError( + "You cannot specify both --docker_image/-d and --azure_ml_environment/--aml_env" + ) + elif not (docker_image or azureml_environment): + raise click.UsageError( + "You must specify either --docker_image/-d or --azure_ml_environment/--aml_env" + ) + if ( not (storage_account_name and storage_container) and not use_pipeline_data_passing ): raise click.UsageError( "You need to specify storage account (-a) and container name (-c) " - "or enable pipeline data passing (--use-pipeline-data-passing)" + "or enable pipeline data passing (--use_pipeline_data_passing)" ) target_path = Path.cwd().joinpath("conf/base/azureml.yml") @@ -97,23 +130,26 @@ def init( "cluster_name": cluster_name, "storage_account_name": storage_account_name or "~", "storage_container": storage_container or "~", - "environment_name": environment_name, + "environment_name": azureml_environment or "~", "pipeline_data_passing": use_pipeline_data_passing, + "docker_image": docker_image or "~", + "code_directory": "." if azureml_environment else "~", } ) target_path.write_text(cfg) click.echo(f"Configuration generated in {target_path}") - click.echo( - click.style( - f"It's recommended to set Lifecycle management rule for storage container {storage_container} " - f"to avoid costs of long-term storage of the temporary data." 
- f"\nTemporary data will be stored under abfs://{storage_container}/{KEDRO_AZURE_BLOB_TEMP_DIR_NAME} path" # noqa - f"\nSee https://docs.microsoft.com/en-us/azure/storage/blobs/lifecycle-management-policy-configure?tabs=azure-portal", # noqa - fg="green", + if storage_account_name and storage_container: + click.echo( + click.style( + f"It's recommended to set Lifecycle management rule for storage container {storage_container} " + f"to avoid costs of long-term storage of the temporary data." + f"\nTemporary data will be stored under abfs://{storage_container}/{KEDRO_AZURE_BLOB_TEMP_DIR_NAME} path" # noqa + f"\nSee https://docs.microsoft.com/en-us/azure/storage/blobs/lifecycle-management-policy-configure?tabs=azure-portal", # noqa + fg="green", + ) ) - ) aml_ignore = Path.cwd().joinpath(".amlignore") if aml_ignore.exists(): @@ -131,14 +167,14 @@ def init( @azureml_group.command() @click.option( "-s", - "--subscription_id", + "--subscription-id", help=f"Azure Subscription ID. Defaults to env `{AZURE_SUBSCRIPTION_ID}`", default=lambda: os.getenv(AZURE_SUBSCRIPTION_ID, ""), type=str, ) @click.option( - "--azureml_environment", - "--aml_env", + "--azureml-environment", + "--aml-env", "aml_env", type=str, help="Azure ML Environment to use for pipeline execution.", @@ -233,8 +269,8 @@ def run( @azureml_group.command() @click.option( - "--azureml_environment", - "--aml_env", + "--azureml-environment", + "--aml-env", "aml_env", type=str, help="Azure ML Environment to use for pipeline execution.", diff --git a/kedro_azureml/config.py b/kedro_azureml/config.py index 6d77bdd..e233ab4 100644 --- a/kedro_azureml/config.py +++ b/kedro_azureml/config.py @@ -80,9 +80,9 @@ class KedroAzureRunnerConfig(BaseModel): # Azure ML Workspace name workspace_name: "{workspace_name}" # Azure ML Environment to use during pipeline execution - environment_name: "{environment_name}" + environment_name: {environment_name} # Path to directory to upload, or null to disable code upload - code_directory: "." + code_directory: {code_directory} # Path to the directory in the Docker image to run the code from # Ignored when code_directory is set working_directory: /home/kedro_docker @@ -114,7 +114,7 @@ class KedroAzureRunnerConfig(BaseModel): # We suggest using the Azure environment instead # See https://kedro-azureml.readthedocs.io/en/0.2.1/source/03_quickstart.html # Docker image to use during pipeline execution - image: ~ + image: {docker_image} """.strip() # This auto-validates the template above during import @@ -124,5 +124,8 @@ class KedroAzureRunnerConfig(BaseModel): ("azure.pipeline_data_passing.enabled", False), ("azure.temporary_storage.container", ""), ("azure.temporary_storage.account_name", ""), + ("azure.code_directory", None), + ("azure.environment_name", None), + ("docker.image", None), ) ) diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py index 2059774..07287a9 100644 --- a/kedro_azureml/generator.py +++ b/kedro_azureml/generator.py @@ -1,6 +1,5 @@ import logging import re -import warnings from typing import Any, Dict, Optional, Type, Union from uuid import uuid4 @@ -134,11 +133,7 @@ def _resolve_azure_environment(self) -> Union[Environment, str]: self.docker_image or (self.config.docker.image if self.config.docker else None) ): - warnings.warn( - f"Using docker image: {image} to run the pipeline." 
- f"\nWe recommend to use Azure Environments instead, follow the updated Quickstart documentation", - DeprecationWarning, - ) + logger.info(f"Using docker image: {image} to run the pipeline.") return Environment(image=image) else: return self.aml_env or self.config.azure.environment_name diff --git a/kedro_azureml/runner.py b/kedro_azureml/runner.py index 1cefa05..93807c3 100644 --- a/kedro_azureml/runner.py +++ b/kedro_azureml/runner.py @@ -3,10 +3,10 @@ from pathlib import Path from typing import Any, Dict, Optional -from kedro.extras.datasets.pickle import PickleDataSet from kedro.io import AbstractDataSet, DataCatalog from kedro.pipeline import Pipeline from kedro.runner import SequentialRunner +from kedro_datasets.pickle import PickleDataSet from pluggy import PluginManager from kedro_azureml.config import KedroAzureRunnerConfig diff --git a/kedro_azureml/utils.py b/kedro_azureml/utils.py index 82a0bf0..ec4781d 100644 --- a/kedro_azureml/utils.py +++ b/kedro_azureml/utils.py @@ -10,6 +10,8 @@ class CliContext: def update_dict(dictionary, *kv_pairs): + """Return a deep copy of dictionary with updated values for the given key-value pairs. + Supports nested dictionaries""" updated = deepcopy(dictionary) def traverse(d, key, value): diff --git a/poetry.lock b/poetry.lock index b42d7ee..b27386b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1911,6 +1911,74 @@ video-videodataset = ["opencv-python (>=4.5.5.64,<4.6.0.0)"] yaml = ["PyYAML (>=4.2,<7.0)", "pandas (>=1.3,<2.0)"] yaml-yamldataset = ["PyYAML (>=4.2,<7.0)", "pandas (>=1.3,<2.0)"] +[[package]] +name = "kedro-datasets" +version = "1.2.0" +description = "Kedro-Datasets is where you can find all of Kedro's data connectors." +category = "main" +optional = false +python-versions = ">=3.7, <3.11" +files = [ + {file = "kedro-datasets-1.2.0.tar.gz", hash = "sha256:23c1de5412886056040b8a3ffb09137729f614dc0c4bb15a90e15228594235cc"}, + {file = "kedro_datasets-1.2.0-py3-none-any.whl", hash = "sha256:ba1b17a090cb9b9e0ae17f724729180ec4a631b3a234550983df67cf3967dfe2"}, +] + +[package.dependencies] +kedro = ">=0.18.4,<0.19.0" + +[package.extras] +all = ["Pillow (>=9.0,<10.0)", "PyYAML (>=4.2,<7.0)", "SQLAlchemy (>=1.2,<2.0)", "biopython (>=1.73,<2.0)", "dask[complete] (>=2021.10,<2022.0)", "delta-spark (>=1.0,<2.0)", "docutils (==0.16)", "geopandas (>=0.6.0,<1.0)", "hdfs (>=2.5.8,<3.0)", "holoviews (>=1.13.0,<1.14.0)", "ipykernel (>=5.3,<7.0)", "lxml (>=4.6,<5.0)", "matplotlib (>=3.0.3,<4.0)", "myst-parser (>=0.17.2,<0.18.0)", "nbsphinx (==0.8.1)", "nbstripout (>=0.4,<1.0)", "networkx (>=2.4,<3.0)", "opencv-python (>=4.5.5.64,<4.6.0.0)", "openpyxl (>=3.0.6,<4.0)", "pandas (>=1.3,<2.0)", "pandas-gbq (>=0.12.0,<0.18.0)", "plotly (>=4.8.0,<6.0)", "polars (>=0.15.16,<0.16.0)", "pyarrow (>=6.0)", "pyarrow (>=8.0,<9.0)", "pyodbc (>=4.0,<5.0)", "pyproj (>=3.0,<4.0)", "pyspark (>=2.2,<4.0)", "redis (>=4.1,<5.0)", "requests (>=2.20,<3.0)", "s3fs (>=0.3.0,<0.5)", "scikit-learn (>=1.0.2,<1.1.0)", "scipy (>=1.7.3,<1.8.0)", "snowflake-snowpark-python (>=1.0.0,<1.1.0)", "sphinx (>=3.4.3,<3.5.0)", "sphinx-autodoc-typehints (==1.11.1)", "sphinx-copybutton (==0.3.1)", "sphinx-rtd-theme (==0.4.1)", "tables (>=3.6,<4.0)", "tables (>=3.6.0,<3.7.0)", "tensorflow (>=2.0,<3.0)", "triad (>=0.6.7,<1.0)"] +api = ["requests (>=2.20,<3.0)"] +api-apidataset = ["requests (>=2.20,<3.0)"] +biosequence = ["biopython (>=1.73,<2.0)"] +biosequence-biosequencedataset = ["biopython (>=1.73,<2.0)"] +dask = ["dask[complete] (>=2021.10,<2022.0)", "triad (>=0.6.7,<1.0)"] 
+dask-parquetdataset = ["dask[complete] (>=2021.10,<2022.0)", "triad (>=0.6.7,<1.0)"] +docs = ["docutils (==0.16)", "ipykernel (>=5.3,<7.0)", "myst-parser (>=0.17.2,<0.18.0)", "nbsphinx (==0.8.1)", "nbstripout (>=0.4,<1.0)", "sphinx (>=3.4.3,<3.5.0)", "sphinx-autodoc-typehints (==1.11.1)", "sphinx-copybutton (==0.3.1)", "sphinx-rtd-theme (==0.4.1)"] +geopandas = ["geopandas (>=0.6.0,<1.0)", "pyproj (>=3.0,<4.0)"] +geopandas-geojsondataset = ["geopandas (>=0.6.0,<1.0)", "pyproj (>=3.0,<4.0)"] +holoviews = ["holoviews (>=1.13.0,<1.14.0)"] +holoviews-holoviewswriter = ["holoviews (>=1.13.0,<1.14.0)"] +matplotlib = ["matplotlib (>=3.0.3,<4.0)"] +matplotlib-matplotlibwriter = ["matplotlib (>=3.0.3,<4.0)"] +networkx = ["networkx (>=2.4,<3.0)"] +networkx-networkxdataset = ["networkx (>=2.4,<3.0)"] +pandas = ["SQLAlchemy (>=1.2,<2.0)", "lxml (>=4.6,<5.0)", "openpyxl (>=3.0.6,<4.0)", "pandas (>=1.3,<2.0)", "pandas-gbq (>=0.12.0,<0.18.0)", "pyarrow (>=6.0)", "pyodbc (>=4.0,<5.0)", "tables (>=3.6,<4.0)", "tables (>=3.6.0,<3.7.0)"] +pandas-csvdataset = ["pandas (>=1.3,<2.0)"] +pandas-exceldataset = ["openpyxl (>=3.0.6,<4.0)", "pandas (>=1.3,<2.0)"] +pandas-featherdataset = ["pandas (>=1.3,<2.0)"] +pandas-gbqquerydataset = ["pandas (>=1.3,<2.0)", "pandas-gbq (>=0.12.0,<0.18.0)"] +pandas-gbqtabledataset = ["pandas (>=1.3,<2.0)", "pandas-gbq (>=0.12.0,<0.18.0)"] +pandas-genericdataset = ["pandas (>=1.3,<2.0)"] +pandas-hdfdataset = ["pandas (>=1.3,<2.0)", "tables (>=3.6,<4.0)", "tables (>=3.6.0,<3.7.0)"] +pandas-jsondataset = ["pandas (>=1.3,<2.0)"] +pandas-parquetdataset = ["pandas (>=1.3,<2.0)", "pyarrow (>=6.0)"] +pandas-sqlquerydataset = ["SQLAlchemy (>=1.2,<2.0)", "pandas (>=1.3,<2.0)", "pyodbc (>=4.0,<5.0)"] +pandas-sqltabledataset = ["SQLAlchemy (>=1.2,<2.0)", "pandas (>=1.3,<2.0)"] +pandas-xmldataset = ["lxml (>=4.6,<5.0)", "pandas (>=1.3,<2.0)"] +pillow = ["Pillow (>=9.0,<10.0)"] +pillow-imagedataset = ["Pillow (>=9.0,<10.0)"] +plotly = ["pandas (>=1.3,<2.0)", "plotly (>=4.8.0,<6.0)"] +plotly-jsondataset = ["plotly (>=4.8.0,<6.0)"] +plotly-plotlydataset = ["pandas (>=1.3,<2.0)", "plotly (>=4.8.0,<6.0)"] +polars = ["polars (>=0.15.16,<0.16.0)"] +polars-csvdataset = ["polars (>=0.15.16,<0.16.0)"] +redis = ["redis (>=4.1,<5.0)"] +snowflake-snowparktabledataset = ["pyarrow (>=8.0,<9.0)", "snowflake-snowpark-python (>=1.0.0,<1.1.0)"] +spark = ["delta-spark (>=1.0,<2.0)", "hdfs (>=2.5.8,<3.0)", "pyspark (>=2.2,<4.0)", "s3fs (>=0.3.0,<0.5)"] +spark-deltatabledataset = ["delta-spark (>=1.0,<2.0)", "hdfs (>=2.5.8,<3.0)", "pyspark (>=2.2,<4.0)", "s3fs (>=0.3.0,<0.5)"] +spark-sparkdataset = ["hdfs (>=2.5.8,<3.0)", "pyspark (>=2.2,<4.0)", "s3fs (>=0.3.0,<0.5)"] +spark-sparkhivedataset = ["hdfs (>=2.5.8,<3.0)", "pyspark (>=2.2,<4.0)", "s3fs (>=0.3.0,<0.5)"] +spark-sparkjdbcdataset = ["hdfs (>=2.5.8,<3.0)", "pyspark (>=2.2,<4.0)", "s3fs (>=0.3.0,<0.5)"] +svmlight = ["scikit-learn (>=1.0.2,<1.1.0)", "scipy (>=1.7.3,<1.8.0)"] +svmlight-svmlightdataset = ["scikit-learn (>=1.0.2,<1.1.0)", "scipy (>=1.7.3,<1.8.0)"] +tensorflow = ["tensorflow (>=2.0,<3.0)"] +tensorflow-tensorflowmodeldataset = ["tensorflow (>=2.0,<3.0)"] +video = ["opencv-python (>=4.5.5.64,<4.6.0.0)"] +video-videodataset = ["opencv-python (>=4.5.5.64,<4.6.0.0)"] +yaml = ["PyYAML (>=4.2,<7.0)", "pandas (>=1.3,<2.0)"] +yaml-yamldataset = ["PyYAML (>=4.2,<7.0)", "pandas (>=1.3,<2.0)"] + [[package]] name = "knack" version = "0.10.1" @@ -3872,4 +3940,4 @@ mlflow = ["azureml-mlflow", "mlflow"] [metadata] lock-version = "2.0" python-versions = 
">=3.8,<3.11" -content-hash = "7c1bf55eb253d93043bfbdf32cd68d87f541a2d46a13205cf4a9814c9b82300b" +content-hash = "6e7ff188739412892d5149a21467dc09b9d3206552947e93affd20662f03640e" diff --git a/pyproject.toml b/pyproject.toml index a4e9ec3..0d9517d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "kedro-azureml" -version = "0.4.0" +version = "0.4.1" description = "Kedro plugin with Azure ML Pipelines support" readme = "README.md" authors = ['marcin.zablocki '] @@ -44,6 +44,7 @@ backoff = "^2.2.1" azure-core = ">=1.26.1" azureml-core = "^1.49.0" azureml-dataset-runtime = "^1.49.0" +kedro-datasets = ">=1.0.0" [tool.poetry.extras] mlflow = ["azureml-mlflow", "mlflow"] diff --git a/sonar-project.properties b/sonar-project.properties index 925c094..b8a7777 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -6,7 +6,7 @@ sonar.tests=tests/ sonar.python.coverage.reportPaths=coverage.xml sonar.python.version=3.9 -sonar.projectVersion=0.4.0 +sonar.projectVersion=0.4.1 sonar.projectDescription=Kedro plugin with Azure ML Pipelines support sonar.links.homepage=https://kedro-azureml.readthedocs.io/ sonar.links.ci=https://github.com/getindata/kedro-azureml/actions diff --git a/tests/test_cli.py b/tests/test_cli.py index 4a046d6..514b2af 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,5 +1,6 @@ import os from pathlib import Path +from typing import List from unittest import mock from unittest.mock import MagicMock, patch from uuid import uuid4 @@ -17,9 +18,24 @@ from tests.utils import create_kedro_conf_dirs +@pytest.mark.parametrize( + "env_or_docker", + [ + ["--docker-image", "my_docker/image:latest"], + ["--aml-env", f"{uuid4().hex}@latest"], + ["--azureml-environment", f"{uuid4().hex}:v1"], + ["--docker-image", "a", "--aml-env", "b"], + [], + ], + ids=("with docker", "with AML env", "with AML (long param name)", "both", "none"), +) @pytest.mark.parametrize("use_pipeline_data_passing", (True, False)) def test_can_initialize_basic_plugin_config( - patched_kedro_package, cli_context, tmp_path: Path, use_pipeline_data_passing: bool + patched_kedro_package, + cli_context, + tmp_path: Path, + env_or_docker: List[str], + use_pipeline_data_passing: bool, ): config_path = create_kedro_conf_dirs(tmp_path) @@ -36,6 +52,7 @@ def test_can_initialize_basic_plugin_config( f"storage_container_{unique_id}", ] ) + result = runner.invoke( cli.init, [ @@ -44,12 +61,22 @@ def test_can_initialize_basic_plugin_config( f"workspace_name_{unique_id}", f"experiment_name_{unique_id}", f"cluster_name_{unique_id}", - f"environment_name_{unique_id}", ] - + storage_args, + + storage_args + + env_or_docker, obj=cli_context, ) - assert result.exit_code == 0 + + if "--aml-env" in env_or_docker and "--docker-image" in env_or_docker: + assert result.exit_code == 2 + assert "You cannot specify both" in result.output + return + elif len(env_or_docker) == 0: + assert result.exit_code == 2 + assert "You must specify either" in result.output + return + + assert result.exit_code == 0, result.exception azureml_config_path = config_path / "azureml.yml" assert ( @@ -84,7 +111,13 @@ def test_can_initialize_basic_plugin_config( config.azure.temporary_storage.container == f"storage_container_{unique_id}" ) - assert config.azure.environment_name == f"environment_name_{unique_id}" + + if "--aml-env" in env_or_docker or "--azureml-environment" in env_or_docker: + assert config.azure.environment_name == env_or_docker[1] + assert config.docker.image is None + else: + assert 
config.azure.environment_name is None + assert config.docker.image == env_or_docker[1] @pytest.mark.parametrize( @@ -256,7 +289,7 @@ def test_can_invoke_run( cli.run, ["-s", "subscription_id"] + (["--wait-for-completion"] if wait_for_completion else []) - + (["--aml_env", aml_env] if aml_env else []) + + (["--aml-env", aml_env] if aml_env else []) + (sum([["--env-var", k] for k in extra_env[0]], [])), obj=cli_context, ) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..0260a0f --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,28 @@ +from copy import deepcopy + +import pytest + +from kedro_azureml.utils import update_dict + + +@pytest.mark.parametrize( + "input_dict, kv_pairs, expected_output", + [ + ({}, [("a", 1)], {"a": 1}), + ({"a": 1}, [("a", 2)], {"a": 2}), + ({"a": {"b": 1}}, [("a.b", 2)], {"a": {"b": 2}}), + ({"a": {"b": {"c": 1}}}, [("a.b.c", 2)], {"a": {"b": {"c": 2}}}), + ( + {"a": {"b": {"c": 1}}}, + [("a.b.c", 2), ("a.b.d", 3)], + {"a": {"b": {"c": 2, "d": 3}}}, + ), + ({}, [("a.b.c", 1)], {"a": {"b": {"c": 1}}}), + ], +) +def test_update_dict(input_dict, kv_pairs, expected_output): + copied_dict = deepcopy(input_dict) + actual_output = update_dict(input_dict, *kv_pairs) + assert actual_output == expected_output, "update is incorrect" + assert actual_output is not input_dict, "output should be a deep copy" + assert input_dict == copied_dict, "input_dict should not be mutated"
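For readers skimming the diff, the behaviour these new ``update_dict`` tests pin down can be summarised with a small, self-contained sketch. This is illustrative only and is not the plugin's implementation (which this changeset touches only to add a docstring); it simply reproduces the semantics asserted above: dotted keys address nested dictionaries, missing levels are created on the fly, and the result is an updated deep copy rather than a mutation of the input.

.. code:: python

    # Illustrative sketch only - not kedro_azureml.utils.update_dict itself.
    # Mirrors the behaviour asserted by the tests above.
    from copy import deepcopy


    def update_dict_sketch(dictionary, *kv_pairs):
        updated = deepcopy(dictionary)  # never mutate the caller's dict
        for key, value in kv_pairs:
            *parents, leaf = key.split(".")
            node = updated
            for part in parents:
                node = node.setdefault(part, {})  # create missing levels
            node[leaf] = value
        return updated


    assert update_dict_sketch({}, ("a.b.c", 1)) == {"a": {"b": {"c": 1}}}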