mirror of
https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-25 01:00:11 -05:00
Update notebooks
@@ -15,11 +15,9 @@
"source": [
"# 05. Train in Spark\n",
"* Create Workspace\n",
"* Create Project\n",
"* Create `train-spark.py` file in the project folder\n",
"* Execute a PySpark script in ACI.\n",
"* Execute a PySpark script in a Docker container on remote DSVM\n",
"* Execute a PySpark script in HDI"
"* Create Experiment\n",
"* Copy relevant files to the script folder\n",
"* Configure and Run"
]
},
{
@@ -67,8 +65,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Project and Associate with Run History\n",
"**Project** is a local folder that contains files for your Azure ML experiments. It is associated with a **run history**, a cloud container of run metrics and output artifacts from your experiments. You can either attach a local folder as a new project, or load a local folder as a project if it has been attached before."
"## Create Experiment\n"
]
},
{
@@ -77,27 +74,15 @@
"metadata": {},
"outputs": [],
"source": [
"# choose a name for the run history container in the workspace\n",
"experiment_name = 'train-on-spark'\n",
"experiment_name = 'train-on-remote-vm'\n",
"script_folder = './samples/train-on-remote-vm'\n",
"\n",
"# project folder\n",
"project_folder = './sample_projects/train-on-spark'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from azureml.project.project import Project\n",
"os.makedirs(script_folder, exist_ok = True)\n",
"\n",
"project = Project.attach(workspace_object = ws,\n",
" experiment_name = experiment_name,\n",
" directory = project_folder)\n",
"from azureml.core import Experiment\n",
"\n",
"print(project.project_directory, project.history.name, sep = '\\n')"
"exp = Experiment(workspace = ws, name = experiment_name)"
]
},
{
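The replacement cells above switch the notebook from the old Project API to the Experiment API. Pieced together, the new setup amounts to the sketch below; it assumes a workspace config file (config.json) was created earlier so that Workspace.from_config() can resolve the workspace, and reuses the folder and experiment names from the new lines.

```python
# Sketch of the Experiment-based setup introduced by this change; assumes a
# config.json created beforehand so Workspace.from_config() finds the workspace.
import os
from azureml.core import Workspace, Experiment

ws = Workspace.from_config()

# name of the run history container and the local script folder (values from the new cells)
experiment_name = 'train-on-remote-vm'
script_folder = './samples/train-on-remote-vm'
os.makedirs(script_folder, exist_ok = True)

exp = Experiment(workspace = ws, name = experiment_name)
print(exp.name, script_folder, sep = '\n')
```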
@@ -119,11 +104,11 @@
"from shutil import copyfile\n",
"\n",
"# copy iris dataset in to project folder\n",
"copyfile('./iris.csv', os.path.join(project_folder, 'iris.csv'))\n",
"copyfile('iris.csv', os.path.join(script_folder, 'iris.csv'))\n",
"\n",
"# copy train-spark.py file into project folder\n",
"# train-spark.py trains a simple LogisticRegression model using Spark.ML algorithm\n",
"copyfile('./train-spark.py', os.path.join(project_folder, 'train-spark.py'))"
"copyfile('train-spark.py', os.path.join(script_folder, 'train-spark.py'))"
]
},
{
@@ -150,210 +135,6 @@
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure ACI target"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# create a new runconfig object\n",
"run_config = RunConfiguration()\n",
"\n",
"# signal that you want to use ACI to execute script.\n",
"run_config.target = \"containerinstance\"\n",
"\n",
"# ACI container group is only supported in certain regions, which can be different than the region the Workspace is in.\n",
"run_config.container_instance.region = 'eastus'\n",
"\n",
"# set the ACI CPU and Memory \n",
"run_config.container_instance.cpu_cores = 1\n",
"run_config.container_instance.memory_gb = 2\n",
"\n",
"# enable Docker \n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# set Docker base image to the default CPU-based image\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is', run_config.environment.docker.base_image)\n",
"#run_config.environment.docker.base_image = 'microsoft/mmlspark:plus-0.9.9'\n",
"\n",
"# use conda_dependencies.yml to create a conda environment in the Docker image for execution\n",
"# please update this file if you need additional packages.\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# auto-prepare the Docker image when used for execution (if it is not already prepared)\n",
"run_config.auto_prepare_environment = True\n",
"\n",
"cd = CondaDependencies()\n",
"# add numpy as a dependency\n",
"cd.add_conda_package('numpy')\n",
"# overwrite the default conda_dependencies.yml file\n",
"cd.save_to_file(base_directory = project_folder, conda_file_path='aml_config/conda_dependencies.yml')\n"
]
},
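The CondaDependencies object does not have to be written out to aml_config/conda_dependencies.yml; it can also be attached to the run configuration in memory, as the commented-out line near the end of this notebook hints. A minimal sketch under that assumption follows (note that referencing azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE also requires azureml.core to be imported somewhere in the notebook).

```python
# Hedged sketch: attach conda dependencies to the run config in memory instead
# of overwriting conda_dependencies.yml on disk (same idea as the commented-out
# line later in this notebook).
import azureml.core                                  # makes azureml.core.runconfig.* resolvable
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

run_config = RunConfiguration()
run_config.target = "containerinstance"
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE

cd = CondaDependencies.create(conda_packages=['numpy'])
run_config.environment.python.user_managed_dependencies = False
run_config.environment.python.conda_dependencies = cd
```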
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Run Spark job in ACI"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time \n",
"from azureml.core.experiment import Experiment\n",
"from azureml.core.script_run_config import ScriptRunConfig\n",
"\n",
"experiment = Experiment(project_object.workspace_object, project_object.history.name)\n",
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Show the run in the web UI\n",
"**IMPORTANT**: Please use Chrome to navigate to the URL."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import helpers.py\n",
"import helpers\n",
"\n",
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
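Beyond the run history URL, metrics logged by train-spark.py can be pulled straight back into the notebook once the run finishes. A minimal sketch, assuming the training script obtains its run context via Run.get_context() and logs values with run.log() (not shown in this diff):

```python
# Hedged sketch: fetch logged metrics after completion; assumes train-spark.py
# called Run.get_context() and run.log(...) while training.
metrics = run.get_metrics()
print(metrics)
```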
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach a remote Linux VM\n",
|
||||
"To use remote docker commpute target:\n",
|
||||
" 1. Create a Linux DSVM in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
|
||||
" 2. Enter the IP address, username and password below\n",
|
||||
" \n",
|
||||
"**Note**: the below example use port 5022. By default SSH runs on port 22 and you don't need to specify it. But if for security reasons you switch to a different port (such as 5022), you can append the port number to the address like the example below. [Read more](../../documentation/sdk/ssh-issue.md) on this."
|
||||
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.compute import RemoteCompute\n",
"from azureml.exceptions import UserErrorException\n",
"\n",
"try:\n",
"    # Attach Docker on a remote VM as a compute target.\n",
"    RemoteCompute.attach(ws, name = \"cpu-dsvm\", username = \"ninghai\", \n",
"                         address = \"hai2.eastus2.cloudapp.azure.com:5022\", \n",
"                         password = \"<password>\")\n",
"except UserErrorException as e:\n",
"    print(\"Caught = {}\".format(e.message))\n",
"    print(\"Compute config already attached.\")"
]
},
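As an alternative to appending ":5022" to the address, the SSH port can be passed as a separate argument, mirroring the HDInsightCompute example further down; this sketch assumes RemoteCompute.attach accepts the same ssh_port parameter.

```python
# Hedged sketch: pass the SSH port separately instead of appending ":5022" to
# the address (parameter name assumed to match the HDI attach example below).
from azureml.core.compute import RemoteCompute

RemoteCompute.attach(ws,
                     name = "cpu-dsvm",
                     address = "hai2.eastus2.cloudapp.azure.com",
                     ssh_port = 5022,
                     username = "ninghai",
                     password = "<password>")
```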
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure a Spark Docker run on the VM\n",
"Execute the script in the Spark engine inside a Docker container on the VM."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n",
"run_config = RunConfiguration.load(path = project_folder, name = \"cpu-dsvm\")\n",
"\n",
"# set framework to PySpark\n",
"run_config.framework = \"PySpark\"\n",
"\n",
"# Use Docker in the remote VM\n",
"run_config.environment.docker.enabled = True\n",
"\n",
"# Use the MMLSpark CPU based image.\n",
"# https://hub.docker.com/r/microsoft/mmlspark/\n",
"run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"print('base image is:', run_config.environment.docker.base_image)\n",
"\n",
"# let the system manage dependencies:\n",
"# provision a conda environment inside the image based on the conda_dependencies.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when executing for the first time.\n",
"run_config.auto_prepare_environment = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit the Experiment\n",
"Submit the script to run in the Spark engine in the Docker container in the remote VM."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"script_run_config = ScriptRunConfig(source_directory = project.project_directory,\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)\n",
"\n",
"run.wait_for_completion(show_output = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get the URL of the run history web page\n",
"print(helpers.get_run_history_url(run))"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -373,14 +154,20 @@
"from azureml.core.compute import HDInsightCompute\n",
"\n",
"try:\n",
" # Attaches a HDI cluster as a compute target.\n",
" HDInsightCompute.attach(ws, name = \"myhdi\",\n",
" username = \"ninghai\", \n",
" address = \"sparkhai-ssh.azurehdinsight.net\", \n",
" password = \"<pwd>\"))\n",
" # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n",
" hdi_compute_new = HDInsightCompute.attach(ws, \n",
" name=\"hdi-attach\", \n",
" address=\"hdi-ignite-demo-ssh.azurehdinsight.net\", \n",
" ssh_port=22, \n",
" username='<username>', \n",
" password='<password>')\n",
"\n",
"except UserErrorException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")"
" print(\"Compute config already attached.\")\n",
" \n",
" \n",
"hdi_compute_new.wait_for_completion(show_output=True)"
]
},
{
@@ -396,11 +183,32 @@
"metadata": {},
"outputs": [],
"source": [
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operation above.\n",
"run_config = RunConfiguration.load(path = project_folder, name = 'myhdi')\n",
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# ask system to prepare the conda environment automatically when executed for the first time\n",
"run_config.auto_prepare_environment = True"
"\n",
"# create a new runconfig object\n",
"run_config = RunConfiguration(framework = \"python\")\n",
"\n",
"# Set compute target to the attached HDI cluster\n",
"run_config.target = hdi_compute.name\n",
"\n",
"# Use Docker in the remote VM\n",
"# run_config.environment.docker.enabled = True\n",
"\n",
"# Use CPU base image from DockerHub\n",
"# run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"# print('Base Docker image is:', run_config.environment.docker.base_image)\n",
"\n",
"# Ask system to provision a new conda environment based on the conda_dependencies.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when executing for the first time.\n",
"# run_config.prepare_environment = True\n",
"\n",
"# specify CondaDependencies obj\n",
"# run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operation above."
]
},
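With the HDI run configuration in place, submission follows the same pattern used for ACI and the DSVM earlier in this notebook. A minimal sketch, assuming exp, script_folder and run_config are defined as in the cells above:

```python
# Hedged sketch: submit train-spark.py to the attached HDI cluster using the
# same ScriptRunConfig pattern shown earlier in this notebook.
from azureml.core.script_run_config import ScriptRunConfig

script_run_config = ScriptRunConfig(source_directory = script_folder,
                                    script = 'train-spark.py',
                                    run_config = run_config)
run = exp.submit(script_run_config)
run.wait_for_completion(show_output = True)
```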
{
@@ -448,7 +256,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
@@ -462,7 +270,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.6"
}
},
"nbformat": 4,