From 468bd2501fc00a5d35f0a75f90b1c9995afe26e1 Mon Sep 17 00:00:00 2001
From: Seb-Good
Date: Tue, 30 Apr 2019 11:57:11 -0400
Subject: [PATCH] - Added upload_data main function.

- Added AzureML run config files.
---
 .gitignore                        |   3 +
 aml_config/conda_dependencies.yml |  15 ++++
 aml_config/docker.runconfig       | 120 ++++++++++++++++++++++++++++++
 aml_config/local.runconfig        |  82 ++++++++++++++++++++
 aml_config/project.json           |   1 +
 train.py                          |   4 +-
 upload_data.py                    |  47 ++++++++++++
 7 files changed, 270 insertions(+), 2 deletions(-)
 create mode 100644 aml_config/conda_dependencies.yml
 create mode 100644 aml_config/docker.runconfig
 create mode 100644 aml_config/local.runconfig
 create mode 100644 aml_config/project.json
 create mode 100644 upload_data.py

diff --git a/.gitignore b/.gitignore
index f77de05..028be39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,3 +167,6 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# VS Code
+.vscode
\ No newline at end of file
diff --git a/aml_config/conda_dependencies.yml b/aml_config/conda_dependencies.yml
new file mode 100644
index 0000000..328f065
--- /dev/null
+++ b/aml_config/conda_dependencies.yml
@@ -0,0 +1,15 @@
+# Conda environment specification. The dependencies defined in this file will
+# be automatically provisioned for runs with userManagedDependencies=False.
+
+# Details about the Conda environment file format:
+# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually
+
+name: project_environment
+dependencies:
+  # The python interpreter version.
+  # Currently Azure ML only supports 3.5.2 and later.
+- python=3.6.2
+
+- pip:
+  # Required packages for AzureML execution, history, and data preparation.
+  - azureml-defaults
diff --git a/aml_config/docker.runconfig b/aml_config/docker.runconfig
new file mode 100644
index 0000000..9308fcc
--- /dev/null
+++ b/aml_config/docker.runconfig
@@ -0,0 +1,120 @@
+# The script to run.
+script: train.py
+# The arguments to the script file.
+arguments: []
+# The name of the compute target to use for this run.
+target: local
+# Framework to execute inside. Allowed values are "Python", "PySpark", "CNTK", "TensorFlow", and "PyTorch".
+framework: Python
+# Communicator for the given framework. Allowed values are "None", "ParameterServer", "OpenMpi", and "IntelMpi".
+communicator: None
+# Automatically prepare the run environment as part of the run itself.
+autoPrepareEnvironment: true
+# Maximum allowed duration for the run.
+maxRunDurationSeconds:
+# Number of nodes to use for running the job.
+nodeCount: 1
+# Environment details.
+environment:
+# Environment variables set for the run.
+  environmentVariables:
+    EXAMPLE_ENV_VAR: EXAMPLE_VALUE
+# Python details
+  python:
+# user_managed_dependencies=True indicates that the environment will be user managed. False indicates that AzureML will manage the user environment.
+    userManagedDependencies: false
+# The python interpreter path
+    interpreterPath: python
+# Path to the conda dependencies file to use for this run. If a project
+# contains multiple programs with different sets of dependencies, it may be
+# convenient to manage those environments with separate files.
+    condaDependenciesFile: aml_config/conda_dependencies.yml
+# Docker details
+  docker:
+# Set True to perform this run inside a Docker container.
+    enabled: true
+# Base image used for Docker-based runs.
+    baseImage: mcr.microsoft.com/azureml/base:0.2.1
+# Set False if necessary to work around shared volume bugs.
+    sharedVolumes: true
+# Run with NVidia Docker extension to support GPUs.
+    gpuSupport: false
+# Shared memory size for Docker container. Default is 1g.
+    shmSize: 1g
+# Extra arguments to the Docker run command.
+    arguments: []
+# Image registry that contains the base image.
+    baseImageRegistry:
+# DNS name or IP address of the Azure Container Registry (ACR).
+      address:
+# The username for ACR.
+      username:
+# The password for ACR.
+      password:
+# Spark details
+  spark:
+# List of spark repositories.
+    repositories:
+    - https://mmlspark.azureedge.net/maven
+# The packages to use.
+    packages:
+    - group: com.microsoft.ml.spark
+      artifact: mmlspark_2.11
+      version: '0.12'
+# Whether to precache the packages.
+    precachePackages: true
+# Databricks details
+  databricks:
+# List of maven libraries.
+    mavenLibraries: []
+# List of PyPi libraries.
+    pypiLibraries: []
+# List of RCran libraries.
+    rcranLibraries: []
+# List of JAR libraries.
+    jarLibraries: []
+# List of Egg libraries.
+    eggLibraries: []
+# History details.
+history:
+# Enable history tracking -- this allows status, logs, metrics, and outputs
+# to be collected for a run.
+  outputCollection: true
+# Whether to take snapshots for history.
+  snapshotProject: true
+# Spark configuration details.
+spark:
+# The Spark configuration.
+  configuration:
+    spark.app.name: Azure ML Experiment
+    spark.yarn.maxAppAttempts: 1
+# HDI details.
+hdi:
+# Yarn deploy mode. Options are cluster and client.
+  yarnDeployMode: cluster
+# TensorFlow details.
+tensorflow:
+# The number of worker tasks.
+  workerCount: 1
+# The number of parameter server tasks.
+  parameterServerCount: 1
+# MPI details.
+mpi:
+# When using MPI, the number of processes per node.
+  processCountPerNode: 1
+# Data reference configuration details.
+dataReferences: {}
+# Project share datastore reference.
+sourceDirectoryDataStore:
+# AmlCompute details.
+amlcompute:
+# VM size of the cluster to be created. Allowed values are Azure VM sizes; the list of VM sizes is available at https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs
+  vmSize:
+# VM priority of the cluster to be created. Allowed values are "dedicated" and "lowpriority".
+  vmPriority:
+# A bool that indicates whether the cluster should be retained after job completion.
+  retainCluster: false
+# Name of the cluster to be created. If not specified, runId will be used as the cluster name.
+  name:
+# Maximum number of nodes in the AmlCompute cluster to be created. The minimum number of nodes will always be set to 0.
+  clusterMaxNodeCount: 1
diff --git a/aml_config/local.runconfig b/aml_config/local.runconfig
new file mode 100644
index 0000000..778541f
--- /dev/null
+++ b/aml_config/local.runconfig
@@ -0,0 +1,82 @@
+{
+    "script": "train.py",
+    "arguments": [],
+    "target": "local",
+    "framework": "Python",
+    "communicator": "None",
+    "autoPrepareEnvironment": true,
+    "maxRunDurationSeconds": null,
+    "nodeCount": 1,
+    "environment": {
+        "environmentVariables": {
+            "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
+        },
+        "python": {
+            "userManagedDependencies": true,
+            "interpreterPath": "/home/sebastiangoodfellow/anaconda3/envs/mnist-azure/bin/python",
+            "condaDependenciesFile": "aml_config/conda_dependencies.yml"
+        },
+        "docker": {
+            "enabled": false,
+            "baseImage": "mcr.microsoft.com/azureml/base:0.2.1",
+            "sharedVolumes": true,
+            "gpuSupport": false,
+            "shmSize": "1g",
+            "arguments": [],
+            "baseImageRegistry": {
+                "address": null,
+                "username": null,
+                "password": null
+            }
+        },
+        "spark": {
+            "repositories": [
+                "https://mmlspark.azureedge.net/maven"
+            ],
+            "packages": [
+                {
+                    "group": "com.microsoft.ml.spark",
+                    "artifact": "mmlspark_2.11",
+                    "version": "0.12"
+                }
+            ],
+            "precachePackages": true
+        },
+        "databricks": {
+            "mavenLibraries": [],
+            "pypiLibraries": [],
+            "rcranLibraries": [],
+            "jarLibraries": [],
+            "eggLibraries": []
+        }
+    },
+    "history": {
+        "outputCollection": true,
+        "snapshotProject": true
+    },
+    "spark": {
+        "configuration": {
+            "spark.app.name": "Azure ML Experiment",
+            "spark.yarn.maxAppAttempts": 1
+        }
+    },
+    "hdi": {
+        "yarnDeployMode": "cluster"
+    },
+    "tensorflow": {
+        "workerCount": 1,
+        "parameterServerCount": 1
+    },
+    "mpi": {
+        "processCountPerNode": 1
+    },
+    "dataReferences": {},
+    "sourceDirectoryDataStore": null,
+    "amlcompute": {
+        "vmSize": null,
+        "vmPriority": null,
+        "retainCluster": false,
+        "name": null,
+        "clusterMaxNodeCount": 1
+    }
+}
diff --git a/aml_config/project.json b/aml_config/project.json
new file mode 100644
index 0000000..78d5626
--- /dev/null
+++ b/aml_config/project.json
@@ -0,0 +1 @@
+{"Id": "test", "Scope": "/subscriptions/30284b70-31e1-4b93-b620-26959f80a8f9/resourceGroups/ml-testing/providers/Microsoft.MachineLearningServices/workspaces/mnist-azure/projects/test"}
\ No newline at end of file
diff --git a/train.py b/train.py
index 15bb510..35671ee 100644
--- a/train.py
+++ b/train.py
@@ -16,7 +16,7 @@
 
 
 def main(args):
-    """Build saved model for serving."""
+    """Train MNIST tensorflow model."""
 
     # Image shape
     image_shape = (28, 28, 1)
@@ -73,7 +73,7 @@
 
 
 def get_parser():
-    """Get parser object for script predict.py."""
+    """Get parser object for script train.py."""
     # Initialize parser
     parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter)
 
diff --git a/upload_data.py b/upload_data.py
new file mode 100644
index 0000000..adbb0d8
--- /dev/null
+++ b/upload_data.py
@@ -0,0 +1,47 @@
+"""
+upload_data.py
+--------------
+By: Sebastian D. Goodfellow, Ph.D., 2019
+"""
+
+# 3rd party imports
+from azureml.core import Workspace
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+
+# Local imports
+from mnistazure.config import DATA_PATH
+
+
+def main(args):
+    """Upload MNIST dataset to Azure Workspace data store."""
+    # Get workspace
+    ws = Workspace(subscription_id=args.subscription_id, resource_group=args.resource_group,
+                   workspace_name=args.workspace_name)
+
+    # Get data store
+    ds = ws.get_default_datastore()
+
+    # Upload MNIST dataset to data store
+    ds.upload(src_dir=DATA_PATH, target_path='mnist', show_progress=True)
+
+
+def get_parser():
+    """Get parser object for script upload_data.py."""
+    # Initialize parser
+    parser = ArgumentParser(description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter)
+
+    # Setup arguments
+    parser.add_argument("--subscription_id", dest="subscription_id", type=str)
+    parser.add_argument("--resource_group", dest="resource_group", type=str)
+    parser.add_argument("--workspace_name", dest="workspace_name", type=str)
+
+    return parser
+
+
+if __name__ == "__main__":
+
+    # Parse arguments
+    arguments = get_parser().parse_args()
+
+    # Run main function
+    main(args=arguments)
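
Usage note, not part of the patch: with the azureml-core SDK of this era (~1.0, April 2019), a run against one of these config files could be submitted roughly as sketched below. The use of RunConfiguration.load and ScriptRunConfig is an assumption about how the committed configs would be consumed, and the experiment name "mnist-azure-test" is made up for illustration; the resource group and workspace name are the ones visible in aml_config/project.json. Note also that for the Docker config (userManagedDependencies: false), train.py's TensorFlow dependency would need to be added to aml_config/conda_dependencies.yml before a containerized run could actually train.

# submit_run.py -- hypothetical companion script, not part of this commit.
from azureml.core import Experiment, ScriptRunConfig, Workspace
from azureml.core.runconfig import RunConfiguration

# Get workspace (same pattern as upload_data.py; fill in the real subscription id)
ws = Workspace(subscription_id='<subscription-id>', resource_group='ml-testing',
               workspace_name='mnist-azure')

# Load aml_config/docker.runconfig (or name='local' for the local config)
run_config = RunConfiguration.load(path='.', name='docker')

# Submit train.py under that configuration and stream the logs
src = ScriptRunConfig(source_directory='.', script='train.py', run_config=run_config)
run = Experiment(workspace=ws, name='mnist-azure-test').submit(src)
run.wait_for_completion(show_output=True)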
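
Similarly, a sketch of the data flow after upload_data.py has pushed the dataset (python upload_data.py --subscription_id <id> --resource_group ml-testing --workspace_name mnist-azure): the 'mnist' folder then sits on the workspace's default datastore, and a run could mount it through a data reference. The patch itself leaves dataReferences empty, so the DataReferenceConfiguration wiring below is an assumption, as is the idea that train.py would read the mounted path.

# Hypothetical follow-on, not part of this commit: mount the uploaded
# 'mnist' folder (the target_path used by upload_data.py) into a run.
from azureml.core import Workspace
from azureml.core.runconfig import DataReferenceConfiguration, RunConfiguration

ws = Workspace(subscription_id='<subscription-id>', resource_group='ml-testing',
               workspace_name='mnist-azure')
ds = ws.get_default_datastore()

run_config = RunConfiguration.load(path='.', name='docker')
run_config.data_references = {
    ds.name: DataReferenceConfiguration(datastore_name=ds.name,
                                        path_on_datastore='mnist',
                                        mode='mount')
}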