From 5598e07729d1c6c7e8eca906e3963c827658ec40 Mon Sep 17 00:00:00 2001 From: Hai Ning Date: Wed, 26 Sep 2018 14:00:38 -0400 Subject: [PATCH] Update 05.train-in-spark.ipynb --- .../05.train-in-spark/05.train-in-spark.ipynb | 144 +++++++++++++----- 1 file changed, 104 insertions(+), 40 deletions(-) diff --git a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb index 0076a546..3d9a9edd 100644 --- a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb +++ b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb @@ -58,7 +58,7 @@ "from azureml.core import Workspace\n", "\n", "ws = Workspace.from_config()\n", - "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')" + "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')" ] }, { @@ -77,8 +77,7 @@ "experiment_name = 'train-on-spark'\n", "\n", "from azureml.core import Experiment\n", - "\n", - "exp = Experiment(workspace = ws, name = experiment_name)" + "exp = Experiment(workspace=ws, name=experiment_name)" ] }, { @@ -107,12 +106,86 @@ "## Configure & Run" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure an ACI run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "# use pyspark framework\n", + "aci_run_config = RunConfiguration(framework=\"pyspark\")\n", + "\n", + "# use ACI to run the Spark job\n", + "aci_run_config.target = 'containerinstance'\n", + "aci_run_config.container_instance.region = 'eastus2'\n", + "aci_run_config.container_instance.cpu_cores = 1\n", + "aci_run_config.container_instance.memory_gb = 2\n", + "\n", + "# specify base Docker image to use\n", + "aci_run_config.environment.docker.enabled = True\n", + 
"aci_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n", + "\n", + "# specify CondaDependencies\n", + "cd = CondaDependencies()\n", + "cd.add_conda_package('numpy')\n", + "aci_run_config.environment.python.conda_dependencies = cd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit script to ACI to run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import ScriptRunConfig\n", + "\n", + "script_run_config = ScriptRunConfig(source_directory = '.',\n", + " script= 'train-spark.py',\n", + " run_config = aci_run_config)\n", + "run = exp.submit(script_run_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Attach an HDI cluster\n", - "To use HDI commpute target:\n", + "Now we can use a real Spark cluster, HDInsight for Spark, to run this job. To use HDI compute target:\n", " 1. Create an Spark for HDI cluster in Azure. Here is some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n", " 2. 
Enter the IP address, username and password below" ] @@ -124,22 +197,22 @@ "outputs": [], "source": [ "from azureml.core.compute import HDInsightCompute\n", + "from azureml.exceptions import ComputeTargetException\n", "\n", "try:\n", " # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n", - " hdi_compute_new = HDInsightCompute.attach(ws, \n", - " name=\"hdi-attach\", \n", - " address=\"hdi-ignite-demo-ssh.azurehdinsight.net\", \n", - " ssh_port=22, \n", - " username='', \n", - " password='')\n", + " hdi_compute = HDInsightCompute.attach(workspace=ws, \n", + " name=\"myhdi\", \n", + " address=\"myhdi-ssh.azurehdinsight.net\", \n", + " ssh_port=22, \n", + " username='', \n", + " password='')\n", "\n", - "except UserErrorException as e:\n", + "except ComputeTargetException as e:\n", " print(\"Caught = {}\".format(e.message))\n", - " print(\"Compute config already attached.\")\n", " \n", - " \n", - "hdi_compute_new.wait_for_completion(show_output=True)" + " \n", + "hdi_compute.wait_for_completion(show_output=True)" ] }, { @@ -160,27 +233,18 @@ "\n", "\n", "# Load the \"cpu-dsvm.runconfig\" file (created by the above attach operation) in memory\n", - "run_config = RunConfiguration(framework = \"python\")\n", + "hdi_run_config = RunConfiguration(framework=\"pyspark\")\n", "\n", "# Set compute target to the Linux DSVM\n", - "run_config.target = hdi_compute.name\n", - "\n", - "# Use Docker in the remote VM\n", - "# run_config.environment.docker.enabled = True\n", - "\n", - "# Use CPU base image from DockerHub\n", - "# run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n", - "# print('Base Docker image is:', run_config.environment.docker.base_image)\n", + "hdi_run_config.target = hdi_compute.name\n", "\n", "# Ask system to provision a new one based on the conda_dependencies.yml file\n", - "run_config.environment.python.user_managed_dependencies = 
False\n", - "\n", - "# Prepare the Docker and conda environment automatically when executingfor the first time.\n", - "# run_config.prepare_environment = True\n", + "hdi_run_config.environment.python.user_managed_dependencies = False\n", "\n", "# specify CondaDependencies obj\n", - "# run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n", - "# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operaton above." + "cd = CondaDependencies()\n", + "cd.add_conda_package('numpy')\n", + "hdi_run_config.environment.python.conda_dependencies = cd" ] }, { @@ -196,10 +260,12 @@ "metadata": {}, "outputs": [], "source": [ + "from azureml.core import ScriptRunConfig\n", + "\n", "script_run_config = ScriptRunConfig(source_directory = '.',\n", " script= 'train-spark.py',\n", - " run_config = run_config)\n", - "run = experiment.submit(script_run_config)" + " run_config = hdi_run_config)\n", + "run = exp.submit(script_run_config)" ] }, { @@ -218,7 +284,9 @@ "metadata": {}, "outputs": [], "source": [ - "run.wait_for_completion(show_output = True)" + "# get all metrics logged in the run\n", + "metrics = run.get_metrics()\n", + "print(metrics)" ] }, { @@ -226,18 +294,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# get all metris logged in the run\n", - "metrics = run.get_metrics()\n", - "print(metrics)" - ] + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.6", + "display_name": "Python 3", "language": "python", - "name": "python36" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -249,7 +313,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.6" } }, "nbformat": 4,