diff --git a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb index c8913211..714168eb 100644 --- a/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb +++ b/01.getting-started/05.train-in-spark/05.train-in-spark.ipynb @@ -1,331 +1,331 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 05. Train in Spark\n", - "* Create Workspace\n", - "* Create Experiment\n", - "* Copy relevant files to the script folder\n", - "* Configure and Run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) Notebook first if you haven't." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Check core SDK version number\n", - "import azureml.core\n", - "\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize Workspace\n", - "\n", - "Initialize a workspace object from persisted configuration." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.from_config()\n", - "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Experiment\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "experiment_name = 'train-on-spark'\n", - "\n", - "from azureml.core import Experiment\n", - "exp = Experiment(workspace=ws, name=experiment_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View `train-spark.py`\n", - "\n", - "For convenience, we created a training script for you. It is printed below as a text, but you can also run `%pfile ./train-spark.py` in a cell to show the file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "with open('train-spark.py', 'r') as training_script:\n", - " print(training_script.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure & Run" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure an ACI run\n", - "Before you try running on an actual Spark cluster, you can use a Docker image with Spark already baked in, and run it in ACI(Azure Container Registry)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "# use pyspark framework\n", - "aci_run_config = RunConfiguration(framework=\"pyspark\")\n", - "\n", - "# use ACI to run the Spark job\n", - "aci_run_config.target = 'containerinstance'\n", - "aci_run_config.container_instance.region = 'eastus2'\n", - "aci_run_config.container_instance.cpu_cores = 1\n", - "aci_run_config.container_instance.memory_gb = 2\n", - "\n", - "# specify base Docker image to use\n", - "aci_run_config.environment.docker.enabled = True\n", - "aci_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n", - "\n", - "# specify CondaDependencies\n", - "cd = CondaDependencies()\n", - "cd.add_conda_package('numpy')\n", - "aci_run_config.environment.python.conda_dependencies = cd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit script to ACI to run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import ScriptRunConfig\n", - "\n", - "script_run_config = ScriptRunConfig(source_directory = '.',\n", - " script= 'train-spark.py',\n", - " run_config = aci_run_config)\n", - "run = exp.submit(script_run_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "run.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note** you can also create a new VM, or attach an existing VM, and use Docker-based execution to run the Spark job. Please see the `04.train-in-vm` for example on how to configure and run in Docker mode in a VM." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Attach an HDI cluster\n", - "Now we can use a real Spark cluster, HDInsight for Spark, to run this job. To use HDI commpute target:\n", - " 1. Create a Spark for HDI cluster in Azure. Here are some [quick instructions](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-jupyter-spark-sql). Make sure you use the Ubuntu flavor, NOT CentOS.\n", - " 2. 
Enter the IP address, username and password below" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.compute import HDInsightCompute\n", - "from azureml.exceptions import ComputeTargetException\n", - "\n", - "try:\n", - " # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n", - " hdi_compute = HDInsightCompute.attach(workspace=ws, \n", - " name=\"myhdi\", \n", - " address=\".azurehdinsight.net\", \n", - " ssh_port=22, \n", - " username='', \n", - " password='')\n", - "\n", - "except ComputeTargetException as e:\n", - " print(\"Caught = {}\".format(e.message))\n", - " \n", - " \n", - "hdi_compute.wait_for_completion(show_output=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Configure HDI run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.runconfig import RunConfiguration\n", - "from azureml.core.conda_dependencies import CondaDependencies\n", - "\n", - "\n", - "# use pyspark framework\n", - "hdi_run_config = RunConfiguration(framework=\"pyspark\")\n", - "\n", - "# Set compute target to the HDI cluster\n", - "hdi_run_config.target = hdi_compute.name\n", - "\n", - "# specify CondaDependencies object to ask system installing numpy\n", - "cd = CondaDependencies()\n", - "cd.add_conda_package('numpy')\n", - "hdi_run_config.environment.python.conda_dependencies = cd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Submit the script to HDI" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import ScriptRunConfig\n", - "\n", - "script_run_config = ScriptRunConfig(source_directory = '.',\n", - " script= 'train-spark.py',\n", - " run_config = hdi_run_config)\n", - "run = exp.submit(config=script_run_config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get the URL of the run history web page\n", - "run" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get all metris logged in the run\n", - "metrics = run.get_metrics()\n", - "print(metrics)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "aashishb" - } - ], - "kernelspec": { - "display_name": "Python 3.6", - "language": "python", - "name": "python36" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.6" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 05. 
Train in Spark\n", + "* Initialize Workspace\n", + "* Create Experiment\n", + "* Copy relevant files to the script folder\n", + "* Configure and Run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "Make sure you go through the [00. Installation and Configuration](00.configuration.ipynb) notebook first if you haven't." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check core SDK version number\n", + "import azureml.core\n", + "\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Workspace\n", + "\n", + "Initialize a workspace object from persisted configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Experiment\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "experiment_name = 'train-on-spark'\n", + "\n", + "from azureml.core import Experiment\n", + "exp = Experiment(workspace=ws, name=experiment_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## View `train-spark.py`\n", + "\n", + "For convenience, we created a training script for you. It is printed below as text, but you can also run `%pycat ./train-spark.py` in a cell to show the file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('train-spark.py', 'r') as training_script:\n", + " print(training_script.read())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure & Run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure an ACI run\n", + "Before you try running on an actual Spark cluster, you can use a Docker image with Spark already baked in, and run it in ACI (Azure Container Instances)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "# use the PySpark framework\n", + "aci_run_config = RunConfiguration(framework=\"pyspark\")\n", + "\n", + "# use ACI to run the Spark job\n", + "aci_run_config.target = 'containerinstance'\n", + "aci_run_config.container_instance.region = 'eastus2'\n", + "aci_run_config.container_instance.cpu_cores = 1\n", + "aci_run_config.container_instance.memory_gb = 2\n", + "\n", + "# specify the base Docker image to use\n", + "aci_run_config.environment.docker.enabled = True\n", + "aci_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n", + "\n", + "# specify CondaDependencies\n", + "cd = CondaDependencies()\n", + "cd.add_conda_package('numpy')\n", + "aci_run_config.environment.python.conda_dependencies = cd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit the script to ACI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import ScriptRunConfig\n", + "\n", + "script_run_config = ScriptRunConfig(source_directory='.',\n", + " script='train-spark.py',\n", + " run_config=aci_run_config)\n", + "run = exp.submit(script_run_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run.wait_for_completion(show_output=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note:** You can also create a new VM, or attach an existing VM, and use Docker-based execution to run the Spark job. Please see the `04.train-in-vm` notebook for an example of how to configure and run in Docker mode on a VM." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Attach an HDI cluster\n", + "Now we can use a real Spark cluster, HDInsight Spark, to run this job. To use an HDI compute target:\n", + " 1. Create an HDInsight Spark cluster in Azure. Here are some [quick instructions](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-jupyter-spark-sql). Make sure you use the Ubuntu flavor, NOT CentOS.\n", + " 2. 
Enter the cluster address, username, and password below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.compute import HDInsightCompute\n", + "from azureml.exceptions import ComputeTargetException\n", + "\n", + "try:\n", + " # to connect using an SSH key instead of a username/password, provide the parameters private_key_file and private_key_passphrase\n", + " hdi_compute = HDInsightCompute.attach(workspace=ws,\n", + " name=\"myhdi\",\n", + " address=\".azurehdinsight.net\",\n", + " ssh_port=22,\n", + " username='',\n", + " password='')\n", + "\n", + " # wait inside the try block so a failed attach doesn't leave hdi_compute undefined\n", + " hdi_compute.wait_for_completion(show_output=True)\n", + "\n", + "except ComputeTargetException as e:\n", + " print(\"Caught = {}\".format(e.message))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure the HDI run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.runconfig import RunConfiguration\n", + "from azureml.core.conda_dependencies import CondaDependencies\n", + "\n", + "\n", + "# use the PySpark framework\n", + "hdi_run_config = RunConfiguration(framework=\"pyspark\")\n", + "\n", + "# set the compute target to the HDI cluster\n", + "hdi_run_config.target = hdi_compute.name\n", + "\n", + "# specify a CondaDependencies object to ask the system to install numpy\n", + "cd = CondaDependencies()\n", + "cd.add_conda_package('numpy')\n", + "hdi_run_config.environment.python.conda_dependencies = cd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Submit the script to HDI" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import ScriptRunConfig\n", + "\n", + "script_run_config = ScriptRunConfig(source_directory='.',\n", + " script='train-spark.py',\n", + " run_config=hdi_run_config)\n", + "run = exp.submit(config=script_run_config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get the URL of the run history web page\n", + "run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get all metrics logged in the run\n", + "metrics = run.get_metrics()\n", + "print(metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "authors": [ + { + "name": "aashishb" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}