Update 05.train-in-spark.ipynb

This commit is contained in:
Hai Ning
2018-09-26 14:00:38 -04:00
committed by GitHub
parent d9b62ad651
commit 5598e07729

View File

@@ -58,7 +58,7 @@
"from azureml.core import Workspace\n",
"\n",
"ws = Workspace.from_config()\n",
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\\n')"
"print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep='\\n')"
]
},
{
@@ -77,8 +77,7 @@
"experiment_name = 'train-on-spark'\n",
"\n",
"from azureml.core import Experiment\n",
"\n",
"exp = Experiment(workspace = ws, name = experiment_name)"
"exp = Experiment(workspace=ws, name=experiment_name)"
]
},
{
@@ -107,12 +106,86 @@
"## Configure & Run"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Configure an ACI run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core.runconfig import RunConfiguration\n",
"from azureml.core.conda_dependencies import CondaDependencies\n",
"\n",
"# use pyspark framework\n",
"aci_run_config = RunConfiguration(framework=\"pyspark\")\n",
"\n",
"# use ACI to run the Spark job\n",
"aci_run_config.target = 'containerinstance'\n",
"aci_run_config.container_instance.region = 'eastus2'\n",
"aci_run_config.container_instance.cpu_cores = 1\n",
"aci_run_config.container_instance.memory_gb = 2\n",
"\n",
"# specify base Docker image to use\n",
"aci_run_config.environment.docker.enabled = True\n",
"aci_run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_MMLSPARK_CPU_IMAGE\n",
"\n",
"# specify CondaDependencies\n",
"cd = CondaDependencies()\n",
"cd.add_conda_package('numpy')\n",
"aci_run_config.environment.python.conda_dependencies = cd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit script to ACI to run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"\n",
"script_run_config = ScriptRunConfig(source_directory = '.',\n",
" script= 'train-spark.py',\n",
" run_config = aci_run_config)\n",
"run = exp.submit(script_run_config)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Attach an HDI cluster\n",
"To use HDI compute target:\n",
"Now we can use a real Spark cluster, HDInsight for Spark, to run this job. To use HDI compute target:\n",
" 1. Create a Spark for HDI cluster in Azure. Here are some [quick instructions](https://docs.microsoft.com/en-us/azure/machine-learning/desktop-workbench/how-to-create-dsvm-hdi). Make sure you use the Ubuntu flavor, NOT CentOS.\n",
" 2. Enter the IP address, username and password below"
]
@@ -124,22 +197,22 @@
"outputs": [],
"source": [
"from azureml.core.compute import HDInsightCompute\n",
"from azureml.exceptions import ComputeTargetException\n",
"\n",
"try:\n",
" # if you want to connect using SSH key instead of username/password you can provide parameters private_key_file and private_key_passphrase\n",
" hdi_compute_new = HDInsightCompute.attach(ws, \n",
" name=\"hdi-attach\", \n",
" address=\"hdi-ignite-demo-ssh.azurehdinsight.net\", \n",
" ssh_port=22, \n",
" username='<username>', \n",
" password='<password>')\n",
" hdi_compute = HDInsightCompute.attach(workspace=ws, \n",
" name=\"myhdi\", \n",
" address=\"myhdi-ssh.azurehdinsight.net\", \n",
" ssh_port=22, \n",
" username='<ssh-username>', \n",
" password='<ssh-pwd>')\n",
"\n",
"except UserErrorException as e:\n",
"except ComputeTargetException as e:\n",
" print(\"Caught = {}\".format(e.message))\n",
" print(\"Compute config already attached.\")\n",
" \n",
" \n",
"hdi_compute_new.wait_for_completion(show_output=True)"
" \n",
"hdi_compute.wait_for_completion(show_output=True)"
]
},
{
@@ -160,27 +233,18 @@
"\n",
"\n",
"# Create a new RunConfiguration object in memory\n",
"run_config = RunConfiguration(framework = \"python\")\n",
"hdi_run_config = RunConfiguration(framework=\"pyspark\")\n",
"\n",
"# Set compute target to the attached HDInsight cluster\n",
"run_config.target = hdi_compute.name\n",
"\n",
"# Use Docker in the remote VM\n",
"# run_config.environment.docker.enabled = True\n",
"\n",
"# Use CPU base image from DockerHub\n",
"# run_config.environment.docker.base_image = azureml.core.runconfig.DEFAULT_CPU_IMAGE\n",
"# print('Base Docker image is:', run_config.environment.docker.base_image)\n",
"hdi_run_config.target = hdi_compute.name\n",
"\n",
"# Ask system to provision a new one based on the conda_dependencies.yml file\n",
"run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# Prepare the Docker and conda environment automatically when executing for the first time.\n",
"# run_config.prepare_environment = True\n",
"hdi_run_config.environment.python.user_managed_dependencies = False\n",
"\n",
"# specify CondaDependencies obj\n",
"# run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])\n",
"# load the runconfig object from the \"myhdi.runconfig\" file generated by the attach operation above."
"cd = CondaDependencies()\n",
"cd.add_conda_package('numpy')\n",
"hdi_run_config.environment.python.conda_dependencies = cd"
]
},
{
@@ -196,10 +260,12 @@
"metadata": {},
"outputs": [],
"source": [
"from azureml.core import ScriptRunConfig\n",
"\n",
"script_run_config = ScriptRunConfig(source_directory = '.',\n",
" script= 'train-spark.py',\n",
" run_config = run_config)\n",
"run = experiment.submit(script_run_config)"
" run_config = hdi_run_config)\n",
"run = exp.submit(script_run_config)"
]
},
{
@@ -218,7 +284,9 @@
"metadata": {},
"outputs": [],
"source": [
"run.wait_for_completion(show_output = True)"
"# get all metrics logged in the run\n",
"metrics = run.get_metrics()\n",
"print(metrics)"
]
},
{
@@ -226,18 +294,14 @@
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# get all metrics logged in the run\n",
"metrics = run.get_metrics()\n",
"print(metrics)"
]
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.6",
"display_name": "Python 3",
"language": "python",
"name": "python36"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -249,7 +313,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.6"
}
},
"nbformat": 4,