diff --git a/.amlignore b/.amlignore new file mode 100644 index 00000000..616345c4 --- /dev/null +++ b/.amlignore @@ -0,0 +1,7 @@ +.ipynb_checkpoints +azureml-logs +.azureml +.git +outputs +azureml-setup +docs diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..94ae80d7 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "C:\\Users\\sgilley\\.azureml\\envs\\jan3\\python.exe" +} \ No newline at end of file diff --git a/aml_config/conda_dependencies.yml b/aml_config/conda_dependencies.yml new file mode 100644 index 00000000..5e49a89d --- /dev/null +++ b/aml_config/conda_dependencies.yml @@ -0,0 +1,15 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for runs with userManagedDependencies=False. + +# Details about the Conda environment file format: +# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually + +name: project_environment +dependencies: + # The python interpreter version. + # Currently Azure ML only supports 3.5.2 and later. +- python=3.6.2 + +- pip: + # Required packages for AzureML execution, history, and data preparation. + - azureml-defaults diff --git a/aml_config/docker.runconfig b/aml_config/docker.runconfig new file mode 100644 index 00000000..d79398c8 --- /dev/null +++ b/aml_config/docker.runconfig @@ -0,0 +1,115 @@ +# The script to run. +script: train.py +# The arguments to the script file. +arguments: [] +# The name of the compute target to use for this run. +target: local +# Framework to execute inside. Allowed values are "Python" , "PySpark", "CNTK", "TensorFlow", and "PyTorch". +framework: PySpark +# Communicator for the given framework. Allowed values are "None" , "ParameterServer", "OpenMpi", and "IntelMpi". +communicator: None +# Automatically prepare the run environment as part of the run itself. +autoPrepareEnvironment: true +# Maximum allowed duration for the run. 
+maxRunDurationSeconds: +# Number of nodes to use for running job. +nodeCount: 1 +# Environment details. +environment: +# Environment variables set for the run. + environmentVariables: + EXAMPLE_ENV_VAR: EXAMPLE_VALUE +# Python details + python: +# user_managed_dependencies=True indicates that the environmentwill be user managed. False indicates that AzureML willmanage the user environment. + userManagedDependencies: false +# The python interpreter path + interpreterPath: python +# Path to the conda dependencies file to use for this run. If a project +# contains multiple programs with different sets of dependencies, it may be +# convenient to manage those environments with separate files. + condaDependenciesFile: aml_config/conda_dependencies.yml +# Docker details + docker: +# Set True to perform this run inside a Docker container. + enabled: true +# Base image used for Docker-based runs. + baseImage: mcr.microsoft.com/azureml/base:0.2.0 +# Set False if necessary to work around shared volume bugs. + sharedVolumes: true +# Run with NVidia Docker extension to support GPUs. + gpuSupport: false +# Extra arguments to the Docker run command. + arguments: [] +# Image registry that contains the base image. + baseImageRegistry: +# DNS name or IP address of azure container registry(ACR) + address: +# The username for ACR + username: +# The password for ACR + password: +# Spark details + spark: +# List of spark repositories. + repositories: + - https://mmlspark.azureedge.net/maven + packages: + - group: com.microsoft.ml.spark + artifact: mmlspark_2.11 + version: '0.12' + precachePackages: true +# Databricks details + databricks: +# List of maven libraries. + mavenLibraries: [] +# List of PyPi libraries + pypiLibraries: [] +# List of RCran libraries + rcranLibraries: [] +# List of JAR libraries + jarLibraries: [] +# List of Egg libraries + eggLibraries: [] +# History details. 
+history: +# Enable history tracking -- this allows status, logs, metrics, and outputs +# to be collected for a run. + outputCollection: true +# whether to take snapshots for history. + snapshotProject: true +# Spark configuration details. +spark: + configuration: + spark.app.name: Azure ML Experiment + spark.yarn.maxAppAttempts: 1 +# HDI details. +hdi: +# Yarn deploy mode. Options are cluster and client. + yarnDeployMode: cluster +# Tensorflow details. +tensorflow: +# The number of worker tasks. + workerCount: 1 +# The number of parameter server tasks. + parameterServerCount: 1 +# Mpi details. +mpi: +# When using MPI, number of processes per node. + processCountPerNode: 1 +# data reference configuration details +dataReferences: {} +# Project share datastore reference. +sourceDirectoryDataStore: +# AmlCompute details. +amlcompute: +# VM size of the Cluster to be created.Allowed values are Azure vm sizes.The list of vm sizes is available in 'https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs + vmSize: +# VM priority of the Cluster to be created.Allowed values are "dedicated" , "lowpriority". + vmPriority: +# A bool that indicates if the cluster has to be retained after job completion. + retainCluster: false +# Name of the cluster to be created. If not specified, runId will be used as cluster name. + name: +# Maximum number of nodes in the AmlCompute cluster to be created. Minimum number of nodes will always be set to 0. + clusterMaxNodeCount: 1 diff --git a/aml_config/local.runconfig b/aml_config/local.runconfig new file mode 100644 index 00000000..ccfa6195 --- /dev/null +++ b/aml_config/local.runconfig @@ -0,0 +1,115 @@ +# The script to run. +script: train.py +# The arguments to the script file. +arguments: [] +# The name of the compute target to use for this run. +target: local +# Framework to execute inside. Allowed values are "Python" , "PySpark", "CNTK", "TensorFlow", and "PyTorch". 
+framework: Python +# Communicator for the given framework. Allowed values are "None" , "ParameterServer", "OpenMpi", and "IntelMpi". +communicator: None +# Automatically prepare the run environment as part of the run itself. +autoPrepareEnvironment: true +# Maximum allowed duration for the run. +maxRunDurationSeconds: +# Number of nodes to use for running job. +nodeCount: 1 +# Environment details. +environment: +# Environment variables set for the run. + environmentVariables: + EXAMPLE_ENV_VAR: EXAMPLE_VALUE +# Python details + python: +# user_managed_dependencies=True indicates that the environmentwill be user managed. False indicates that AzureML willmanage the user environment. + userManagedDependencies: false +# The python interpreter path + interpreterPath: python +# Path to the conda dependencies file to use for this run. If a project +# contains multiple programs with different sets of dependencies, it may be +# convenient to manage those environments with separate files. + condaDependenciesFile: aml_config/conda_dependencies.yml +# Docker details + docker: +# Set True to perform this run inside a Docker container. + enabled: false +# Base image used for Docker-based runs. + baseImage: mcr.microsoft.com/azureml/base:0.2.0 +# Set False if necessary to work around shared volume bugs. + sharedVolumes: true +# Run with NVidia Docker extension to support GPUs. + gpuSupport: false +# Extra arguments to the Docker run command. + arguments: [] +# Image registry that contains the base image. + baseImageRegistry: +# DNS name or IP address of azure container registry(ACR) + address: +# The username for ACR + username: +# The password for ACR + password: +# Spark details + spark: +# List of spark repositories. + repositories: + - https://mmlspark.azureedge.net/maven + packages: + - group: com.microsoft.ml.spark + artifact: mmlspark_2.11 + version: '0.12' + precachePackages: true +# Databricks details + databricks: +# List of maven libraries. 
+ mavenLibraries: [] +# List of PyPi libraries + pypiLibraries: [] +# List of RCran libraries + rcranLibraries: [] +# List of JAR libraries + jarLibraries: [] +# List of Egg libraries + eggLibraries: [] +# History details. +history: +# Enable history tracking -- this allows status, logs, metrics, and outputs +# to be collected for a run. + outputCollection: true +# whether to take snapshots for history. + snapshotProject: true +# Spark configuration details. +spark: + configuration: + spark.app.name: Azure ML Experiment + spark.yarn.maxAppAttempts: 1 +# HDI details. +hdi: +# Yarn deploy mode. Options are cluster and client. + yarnDeployMode: cluster +# Tensorflow details. +tensorflow: +# The number of worker tasks. + workerCount: 1 +# The number of parameter server tasks. + parameterServerCount: 1 +# Mpi details. +mpi: +# When using MPI, number of processes per node. + processCountPerNode: 1 +# data reference configuration details +dataReferences: {} +# Project share datastore reference. +sourceDirectoryDataStore: +# AmlCompute details. +amlcompute: +# VM size of the Cluster to be created.Allowed values are Azure vm sizes.The list of vm sizes is available in 'https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs + vmSize: +# VM priority of the Cluster to be created.Allowed values are "dedicated" , "lowpriority". + vmPriority: +# A bool that indicates if the cluster has to be retained after job completion. + retainCluster: false +# Name of the cluster to be created. If not specified, runId will be used as cluster name. + name: +# Maximum number of nodes in the AmlCompute cluster to be created. Minimum number of nodes will always be set to 0. 
+ clusterMaxNodeCount: 1 diff --git a/aml_config/project.json b/aml_config/project.json new file mode 100644 index 00000000..dfedb75a --- /dev/null +++ b/aml_config/project.json @@ -0,0 +1 @@ +{"Id": "local-compute", "Scope": "/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourceGroups/sheri/providers/Microsoft.MachineLearningServices/workspaces/sheritestqs3/projects/local-compute"} \ No newline at end of file diff --git a/ignore/doc-qa/how-to-set-up-training-targets/.amlignore b/ignore/doc-qa/how-to-set-up-training-targets/.amlignore new file mode 100644 index 00000000..616345c4 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/.amlignore @@ -0,0 +1,7 @@ +.ipynb_checkpoints +azureml-logs +.azureml +.git +outputs +azureml-setup +docs diff --git a/ignore/doc-qa/how-to-set-up-training-targets/aml_config/conda_dependencies.yml b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/conda_dependencies.yml new file mode 100644 index 00000000..5e49a89d --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/conda_dependencies.yml @@ -0,0 +1,15 @@ +# Conda environment specification. The dependencies defined in this file will +# be automatically provisioned for runs with userManagedDependencies=False. + +# Details about the Conda environment file format: +# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually + +name: project_environment +dependencies: + # The python interpreter version. + # Currently Azure ML only supports 3.5.2 and later. +- python=3.6.2 + +- pip: + # Required packages for AzureML execution, history, and data preparation. + - azureml-defaults diff --git a/ignore/doc-qa/how-to-set-up-training-targets/aml_config/docker.runconfig b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/docker.runconfig new file mode 100644 index 00000000..d79398c8 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/docker.runconfig @@ -0,0 +1,115 @@ +# The script to run. 
+script: train.py +# The arguments to the script file. +arguments: [] +# The name of the compute target to use for this run. +target: local +# Framework to execute inside. Allowed values are "Python" , "PySpark", "CNTK", "TensorFlow", and "PyTorch". +framework: PySpark +# Communicator for the given framework. Allowed values are "None" , "ParameterServer", "OpenMpi", and "IntelMpi". +communicator: None +# Automatically prepare the run environment as part of the run itself. +autoPrepareEnvironment: true +# Maximum allowed duration for the run. +maxRunDurationSeconds: +# Number of nodes to use for running job. +nodeCount: 1 +# Environment details. +environment: +# Environment variables set for the run. + environmentVariables: + EXAMPLE_ENV_VAR: EXAMPLE_VALUE +# Python details + python: +# user_managed_dependencies=True indicates that the environmentwill be user managed. False indicates that AzureML willmanage the user environment. + userManagedDependencies: false +# The python interpreter path + interpreterPath: python +# Path to the conda dependencies file to use for this run. If a project +# contains multiple programs with different sets of dependencies, it may be +# convenient to manage those environments with separate files. + condaDependenciesFile: aml_config/conda_dependencies.yml +# Docker details + docker: +# Set True to perform this run inside a Docker container. + enabled: true +# Base image used for Docker-based runs. + baseImage: mcr.microsoft.com/azureml/base:0.2.0 +# Set False if necessary to work around shared volume bugs. + sharedVolumes: true +# Run with NVidia Docker extension to support GPUs. + gpuSupport: false +# Extra arguments to the Docker run command. + arguments: [] +# Image registry that contains the base image. + baseImageRegistry: +# DNS name or IP address of azure container registry(ACR) + address: +# The username for ACR + username: +# The password for ACR + password: +# Spark details + spark: +# List of spark repositories. 
+ repositories: + - https://mmlspark.azureedge.net/maven + packages: + - group: com.microsoft.ml.spark + artifact: mmlspark_2.11 + version: '0.12' + precachePackages: true +# Databricks details + databricks: +# List of maven libraries. + mavenLibraries: [] +# List of PyPi libraries + pypiLibraries: [] +# List of RCran libraries + rcranLibraries: [] +# List of JAR libraries + jarLibraries: [] +# List of Egg libraries + eggLibraries: [] +# History details. +history: +# Enable history tracking -- this allows status, logs, metrics, and outputs +# to be collected for a run. + outputCollection: true +# whether to take snapshots for history. + snapshotProject: true +# Spark configuration details. +spark: + configuration: + spark.app.name: Azure ML Experiment + spark.yarn.maxAppAttempts: 1 +# HDI details. +hdi: +# Yarn deploy mode. Options are cluster and client. + yarnDeployMode: cluster +# Tensorflow details. +tensorflow: +# The number of worker tasks. + workerCount: 1 +# The number of parameter server tasks. + parameterServerCount: 1 +# Mpi details. +mpi: +# When using MPI, number of processes per node. + processCountPerNode: 1 +# data reference configuration details +dataReferences: {} +# Project share datastore reference. +sourceDirectoryDataStore: +# AmlCompute details. +amlcompute: +# VM size of the Cluster to be created.Allowed values are Azure vm sizes.The list of vm sizes is available in 'https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs + vmSize: +# VM priority of the Cluster to be created.Allowed values are "dedicated" , "lowpriority". + vmPriority: +# A bool that indicates if the cluster has to be retained after job completion. + retainCluster: false +# Name of the cluster to be created. If not specified, runId will be used as cluster name. + name: +# Maximum number of nodes in the AmlCompute cluster to be created. Minimum number of nodes will always be set to 0. 
+ clusterMaxNodeCount: 1 diff --git a/ignore/doc-qa/how-to-set-up-training-targets/aml_config/local.runconfig b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/local.runconfig new file mode 100644 index 00000000..ccfa6195 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/local.runconfig @@ -0,0 +1,115 @@ +# The script to run. +script: train.py +# The arguments to the script file. +arguments: [] +# The name of the compute target to use for this run. +target: local +# Framework to execute inside. Allowed values are "Python" , "PySpark", "CNTK", "TensorFlow", and "PyTorch". +framework: Python +# Communicator for the given framework. Allowed values are "None" , "ParameterServer", "OpenMpi", and "IntelMpi". +communicator: None +# Automatically prepare the run environment as part of the run itself. +autoPrepareEnvironment: true +# Maximum allowed duration for the run. +maxRunDurationSeconds: +# Number of nodes to use for running job. +nodeCount: 1 +# Environment details. +environment: +# Environment variables set for the run. + environmentVariables: + EXAMPLE_ENV_VAR: EXAMPLE_VALUE +# Python details + python: +# user_managed_dependencies=True indicates that the environmentwill be user managed. False indicates that AzureML willmanage the user environment. + userManagedDependencies: false +# The python interpreter path + interpreterPath: python +# Path to the conda dependencies file to use for this run. If a project +# contains multiple programs with different sets of dependencies, it may be +# convenient to manage those environments with separate files. + condaDependenciesFile: aml_config/conda_dependencies.yml +# Docker details + docker: +# Set True to perform this run inside a Docker container. + enabled: false +# Base image used for Docker-based runs. + baseImage: mcr.microsoft.com/azureml/base:0.2.0 +# Set False if necessary to work around shared volume bugs. + sharedVolumes: true +# Run with NVidia Docker extension to support GPUs. 
+ gpuSupport: false +# Extra arguments to the Docker run command. + arguments: [] +# Image registry that contains the base image. + baseImageRegistry: +# DNS name or IP address of azure container registry(ACR) + address: +# The username for ACR + username: +# The password for ACR + password: +# Spark details + spark: +# List of spark repositories. + repositories: + - https://mmlspark.azureedge.net/maven + packages: + - group: com.microsoft.ml.spark + artifact: mmlspark_2.11 + version: '0.12' + precachePackages: true +# Databricks details + databricks: +# List of maven libraries. + mavenLibraries: [] +# List of PyPi libraries + pypiLibraries: [] +# List of RCran libraries + rcranLibraries: [] +# List of JAR libraries + jarLibraries: [] +# List of Egg libraries + eggLibraries: [] +# History details. +history: +# Enable history tracking -- this allows status, logs, metrics, and outputs +# to be collected for a run. + outputCollection: true +# whether to take snapshots for history. + snapshotProject: true +# Spark configuration details. +spark: + configuration: + spark.app.name: Azure ML Experiment + spark.yarn.maxAppAttempts: 1 +# HDI details. +hdi: +# Yarn deploy mode. Options are cluster and client. + yarnDeployMode: cluster +# Tensorflow details. +tensorflow: +# The number of worker tasks. + workerCount: 1 +# The number of parameter server tasks. + parameterServerCount: 1 +# Mpi details. +mpi: +# When using MPI, number of processes per node. + processCountPerNode: 1 +# data reference configuration details +dataReferences: {} +# Project share datastore reference. +sourceDirectoryDataStore: +# AmlCompute details. +amlcompute: +# VM size of the Cluster to be created.Allowed values are Azure vm sizes.The list of vm sizes is available in 'https://docs.microsoft.com/en-us/azure/cloud-services/cloud-services-sizes-specs + vmSize: +# VM priority of the Cluster to be created.Allowed values are "dedicated" , "lowpriority". 
+ vmPriority: +# A bool that indicates if the cluster has to be retained after job completion. + retainCluster: false +# Name of the cluster to be created. If not specified, runId will be used as cluster name. + name: +# Maximum number of nodes in the AmlCompute cluster to be created. Minimum number of nodes will always be set to 0. + clusterMaxNodeCount: 1 diff --git a/ignore/doc-qa/how-to-set-up-training-targets/aml_config/project.json b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/project.json new file mode 100644 index 00000000..2ad24da1 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/aml_config/project.json @@ -0,0 +1 @@ +{"Id": "my-experiment", "Scope": "/subscriptions/65a1016d-0f67-45d2-b838-b8f373d6d52e/resourceGroups/sheri/providers/Microsoft.MachineLearningServices/workspaces/sheritestqs3/projects/my-experiment"} \ No newline at end of file diff --git a/ignore/doc-qa/how-to-set-up-training-targets/donotupload.py b/ignore/doc-qa/how-to-set-up-training-targets/donotupload.py new file mode 100644 index 00000000..72ed8265 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/donotupload.py @@ -0,0 +1,40 @@ + +from azureml.core import Workspace +ws = Workspace.from_config() + +from azureml.core.compute import ComputeTarget, HDInsightCompute +from azureml.exceptions import ComputeTargetException + +try: + # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase + attach_config = HDInsightCompute.attach_configuration(address='sheri2-ssh.azurehdinsight.net', + ssh_port=22, + username='sshuser', + password='ChangePassw)rd12') + hdi_compute = ComputeTarget.attach(workspace=ws, + name='sherihdi2', + attach_configuration=attach_config) + +except ComputeTargetException as e: + print("Caught = {}".format(e.message)) + hdi_compute = ComputeTarget(workspace=ws, name='sherihdi') + + +hdi_compute.wait_for_completion(show_output=True) + +# +from 
azureml.core.runconfig import RunConfiguration +from azureml.core.conda_dependencies import CondaDependencies + + +# use pyspark framework +run_hdi = RunConfiguration(framework="pyspark") + +# Set compute target to the HDI cluster +run_hdi.target = hdi_compute.name + +# specify CondaDependencies object to ask system installing numpy +cd = CondaDependencies() +cd.add_conda_package('numpy') +run_hdi.environment.python.conda_dependencies = cd +# \ No newline at end of file diff --git a/ignore/doc-qa/how-to-set-up-training-targets/dsvm.py b/ignore/doc-qa/how-to-set-up-training-targets/dsvm.py index 7196cb98..3ac411fb 100644 --- a/ignore/doc-qa/how-to-set-up-training-targets/dsvm.py +++ b/ignore/doc-qa/how-to-set-up-training-targets/dsvm.py @@ -1,6 +1,6 @@ # Code for Remote virtual machines -compute_target_name = "attach-dsvm" +compute_target_name = "sheri-linuxvm" # import azureml.core diff --git a/ignore/doc-qa/how-to-set-up-training-targets/hdi.py b/ignore/doc-qa/how-to-set-up-training-targets/hdi.py new file mode 100644 index 00000000..ea8bc8ff --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/hdi.py @@ -0,0 +1,27 @@ + +from azureml.core import Workspace +ws = Workspace.from_config() + +from azureml.core.compute import ComputeTarget + +# refers to an existing compute resource attached to the workspace! 
+hdi_compute = ComputeTarget(workspace=ws, name='sherihdi') + + +# +from azureml.core.runconfig import RunConfiguration +from azureml.core.conda_dependencies import CondaDependencies + + +# use pyspark framework +run_hdi = RunConfiguration(framework="pyspark") + +# Set compute target to the HDI cluster +run_hdi.target = hdi_compute.name + +# specify CondaDependencies object to ask system installing numpy +cd = CondaDependencies() +cd.add_conda_package('numpy') +run_hdi.environment.python.conda_dependencies = cd +# +print(run_hdi) \ No newline at end of file diff --git a/ignore/doc-qa/how-to-set-up-training-targets/mylib.py b/ignore/doc-qa/how-to-set-up-training-targets/mylib.py new file mode 100644 index 00000000..08e4d1f4 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/mylib.py @@ -0,0 +1,9 @@ +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. + +import numpy as np + + +def get_alphas(): + # list of numbers from 0.0 to 1.0 with a 0.05 interval + return np.arange(0.0, 1.0, 0.05) diff --git a/ignore/doc-qa/how-to-set-up-training-targets/remote.py b/ignore/doc-qa/how-to-set-up-training-targets/remote.py new file mode 100644 index 00000000..b46fdc59 --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/remote.py @@ -0,0 +1,52 @@ +# Code for Remote virtual machines + +compute_target_name = "attach-dsvm" + +# +import azureml.core +from azureml.core.runconfig import RunConfiguration, DEFAULT_CPU_IMAGE +from azureml.core.conda_dependencies import CondaDependencies + +run_dsvm = RunConfiguration(framework = "python") + +# Set the compute target to the Linux DSVM +run_dsvm.target = compute_target_name + +# Use Docker in the remote VM +run_dsvm.environment.docker.enabled = True + +# Use the CPU base image +# To use GPU in DSVM, you must also use the GPU base Docker image "azureml.core.runconfig.DEFAULT_GPU_IMAGE" +run_dsvm.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE +print('Base Docker 
image is:', run_dsvm.environment.docker.base_image) + +# Prepare the Docker and conda environment automatically when they're used for the first time +run_dsvm.prepare_environment = True + +# Specify the CondaDependencies object +run_dsvm.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn']) +# +hdi_compute.name = "blah" +from azureml.core.runconfig import RunConfiguration +from azureml.core.conda_dependencies import CondaDependencies + + +# use pyspark framework +hdi_run_config = RunConfiguration(framework="pyspark") + +# Set compute target to the HDI cluster +hdi_run_config.target = hdi_compute.name + +# specify CondaDependencies object to ask system installing numpy +cd = CondaDependencies() +cd.add_conda_package('numpy') +hdi_run_config.environment.python.conda_dependencies = cd + +# +from azureml.core.runconfig import RunConfiguration +# Configure the HDInsight run
# Load the runconfig object from the myhdi.runconfig file generated in the previous attach operation +run_hdi = RunConfiguration.load(project_object = project, run_name = 'myhdi') + +# Ask the system to prepare the conda environment automatically when it's used for the first time +run_hdi.auto_prepare_environment = True \ No newline at end of file diff --git a/ignore/doc-qa/how-to-set-up-training-targets/temp.py b/ignore/doc-qa/how-to-set-up-training-targets/temp.py new file mode 100644 index 00000000..d1fd97af --- /dev/null +++ b/ignore/doc-qa/how-to-set-up-training-targets/temp.py @@ -0,0 +1,8 @@ +from azureml.core import Workspace +ws = Workspace.from_config() + +# +from azureml.core.compute import ComputeTarget, AmlCompute + +# First, list the supported VM families for Azure Machine Learning Compute +print(AmlCompute.supported_vmsizes(workspace=ws)) diff --git a/ignore/doc-qa/how-to-set-up-training-targets/train.py b/ignore/doc-qa/how-to-set-up-training-targets/train.py new file mode 100644 index 00000000..42da5a6d --- /dev/null +++ 
b/ignore/doc-qa/how-to-set-up-training-targets/train.py @@ -0,0 +1,45 @@ +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. + +from sklearn.datasets import load_diabetes +from sklearn.linear_model import Ridge +from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split +from azureml.core.run import Run +from sklearn.externals import joblib +import os +import numpy as np +import mylib + +os.makedirs('./outputs', exist_ok=True) + +X, y = load_diabetes(return_X_y=True) + +run = Run.get_context() + +X_train, X_test, y_train, y_test = train_test_split(X, y, + test_size=0.2, + random_state=0) +data = {"train": {"X": X_train, "y": y_train}, + "test": {"X": X_test, "y": y_test}} + +# list of numbers from 0.0 to 1.0 with a 0.05 interval +alphas = mylib.get_alphas() + +for alpha in alphas: + # Use Ridge algorithm to create a regression model + reg = Ridge(alpha=alpha) + reg.fit(data["train"]["X"], data["train"]["y"]) + + preds = reg.predict(data["test"]["X"]) + mse = mean_squared_error(preds, data["test"]["y"]) + run.log('alpha', alpha) + run.log('mse', mse) + + model_file_name = 'ridge_{0:.2f}.pkl'.format(alpha) + # save model in the outputs folder so it automatically get uploaded + with open(model_file_name, "wb") as file: + joblib.dump(value=reg, filename=os.path.join('./outputs/', + model_file_name)) + + print('alpha is {0:.2f}, and mse is {1:0.2f}'.format(alpha, mse))