From c2968b65264eeda69061d5b649fbf0026216482b Mon Sep 17 00:00:00 2001 From: Roope Astala Date: Thu, 20 Dec 2018 16:10:27 -0500 Subject: [PATCH] fix databricks --- .../amlsdk/build-model-run-history-03.ipynb | 770 +++++----- .../amlsdk/deploy-to-aci-04.ipynb | 686 +++++---- .../amlsdk/ingest-data-02.ipynb | 358 ++--- .../installation-and-configuration-01.ipynb | 437 +++--- .../automl/automl-databricks-local-01.ipynb | 1240 ++++++++--------- 5 files changed, 1657 insertions(+), 1834 deletions(-) diff --git a/how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.ipynb b/how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.ipynb index e7c31659..6611b90d 100644 --- a/how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.ipynb +++ b/how-to-use-azureml/azure-databricks/amlsdk/build-model-run-history-03.ipynb @@ -1,396 +1,380 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", - "\n", - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![04ACI](files/tables/image2.JPG)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Model Building" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pprint\n", - "import numpy as np\n", - "\n", - "from pyspark.ml import Pipeline, PipelineModel\n", - "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", - "from pyspark.ml.classification import LogisticRegression\n", - "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", - "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.core\n", - "\n", - "# Check core SDK version number\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import auth creds from notebook parameters\n", - "tenant = dbutils.widgets.get('tenant_id')\n", - "username = dbutils.widgets.get('service_principal_id')\n", - "password = dbutils.widgets.get('service_principal_password')\n", - "\n", - "auth = azureml.core.authentication.ServicePrincipalAuthentication(tenant, username, password)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import the Workspace class and check the azureml SDK version\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.from_config(auth = auth)\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], 
- "source": [ - "##PUBLISHONLY\n", - "## import the Workspace class and check the azureml SDK version\n", - "#from azureml.core import Workspace\n", - "#\n", - "#ws = Workspace.from_config()\n", - "#print('Workspace name: ' + ws.name, \n", - "# 'Azure region: ' + ws.location, \n", - "# 'Subscription id: ' + ws.subscription_id, \n", - "# 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#get the train and test datasets\n", - "train_data_path = \"AdultCensusIncomeTrain\"\n", - "test_data_path = \"AdultCensusIncomeTest\"\n", - "\n", - "train = spark.read.parquet(train_data_path)\n", - "test = spark.read.parquet(test_data_path)\n", - "\n", - "print(\"train: ({}, {})\".format(train.count(), len(train.columns)))\n", - "print(\"test: ({}, {})\".format(test.count(), len(test.columns)))\n", - "\n", - "train.printSchema()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Define Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "label = \"income\"\n", - "dtypes = dict(train.dtypes)\n", - "dtypes.pop(label)\n", - "\n", - "si_xvars = []\n", - "ohe_xvars = []\n", - "featureCols = []\n", - "for idx,key in enumerate(dtypes):\n", - " if dtypes[key] == \"string\":\n", - " featureCol = \"-\".join([key, \"encoded\"])\n", - " featureCols.append(featureCol)\n", - " \n", - " tmpCol = \"-\".join([key, \"tmp\"])\n", - " # string-index and one-hot encode the string column\n", - " #https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/feature/StringIndexer.html\n", - " #handleInvalid: Param for how to handle invalid data (unseen labels or NULL values). \n", - " #Options are 'skip' (filter out rows with invalid data), 'error' (throw an error), \n", - " #or 'keep' (put invalid data in a special additional bucket, at index numLabels). 
Default: \"error\"\n", - " si_xvars.append(StringIndexer(inputCol=key, outputCol=tmpCol, handleInvalid=\"skip\"))\n", - " ohe_xvars.append(OneHotEncoder(inputCol=tmpCol, outputCol=featureCol))\n", - " else:\n", - " featureCols.append(key)\n", - "\n", - "# string-index the label column into a column named \"label\"\n", - "si_label = StringIndexer(inputCol=label, outputCol='label')\n", - "\n", - "# assemble the encoded feature columns in to a column named \"features\"\n", - "assembler = VectorAssembler(inputCols=featureCols, outputCol=\"features\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.run import Run\n", - "from azureml.core.experiment import Experiment\n", - "import numpy as np\n", - "import os\n", - "import shutil\n", - "\n", - "model_name = \"AdultCensus_runHistory.mml\"\n", - "model_dbfs = os.path.join(\"/dbfs\", model_name)\n", - "run_history_name = 'spark-ml-notebook'\n", - "\n", - "# start a training run by defining an experiment\n", - "myexperiment = Experiment(ws, \"Ignite_AI_Talk\")\n", - "root_run = myexperiment.start_logging()\n", - "\n", - "# Regularization Rates - \n", - "regs = [0.0001, 0.001, 0.01, 0.1]\n", - " \n", - "# try a bunch of regularization rate in a Logistic Regression model\n", - "for reg in regs:\n", - " print(\"Regularization rate: {}\".format(reg))\n", - " # create a bunch of child runs\n", - " with root_run.child_run(\"reg-\" + str(reg)) as run:\n", - " # create a new Logistic Regression model.\n", - " lr = LogisticRegression(regParam=reg)\n", - " \n", - " # put together the pipeline\n", - " pipe = Pipeline(stages=[*si_xvars, *ohe_xvars, si_label, assembler, lr])\n", - "\n", - " # train the model\n", - " model_p = pipe.fit(train)\n", - " \n", - " # make prediction\n", - " pred = model_p.transform(test)\n", - " \n", - " # evaluate. 
note only 2 metrics are supported out of the box by Spark ML.\n", - " bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')\n", - " au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)\n", - " au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)\n", - "\n", - " print(\"Area under ROC: {}\".format(au_roc))\n", - " print(\"Area Under PR: {}\".format(au_prc))\n", - " \n", - " # log reg, au_roc, au_prc and feature names in run history\n", - " run.log(\"reg\", reg)\n", - " run.log(\"au_roc\", au_roc)\n", - " run.log(\"au_prc\", au_prc)\n", - " run.log_list(\"columns\", train.columns)\n", - "\n", - " # save model\n", - " model_p.write().overwrite().save(model_name)\n", - " \n", - " # upload the serialized model into run history record\n", - " mdl, ext = model_name.split(\".\")\n", - " model_zip = mdl + \".zip\"\n", - " shutil.make_archive(mdl, 'zip', model_dbfs)\n", - " run.upload_file(\"outputs/\" + model_name, model_zip) \n", - " #run.upload_file(\"outputs/\" + model_name, path_or_stream = model_dbfs) #cannot deal with folders\n", - "\n", - " # now delete the serialized model from local folder since it is already uploaded to run history \n", - " shutil.rmtree(model_dbfs)\n", - " os.remove(model_zip)\n", - " \n", - "# Declare run completed\n", - "root_run.complete()\n", - "root_run_id = root_run.id\n", - "print (\"run id:\", root_run.id)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "metrics = root_run.get_metrics(recursive=True)\n", - "best_run_id = max(metrics, key = lambda k: metrics[k]['au_roc'])\n", - "print(best_run_id, metrics[best_run_id]['au_roc'], metrics[best_run_id]['reg'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Get the best run\n", - "child_runs = {}\n", - "\n", - "for r in root_run.get_children():\n", - " child_runs[r.id] = r\n", - " \n", - "best_run = child_runs[best_run_id]" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download the model from the best run to a local folder\n", - "best_model_file_name = \"best_model.zip\"\n", - "best_run.download_file(name = 'outputs/' + model_name, output_file_path = best_model_file_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Model Evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##unzip the model to dbfs (as load() seems to require that) and load it.\n", - "if os.path.isfile(model_dbfs) or os.path.isdir(model_dbfs):\n", - " shutil.rmtree(model_dbfs)\n", - "shutil.unpack_archive(best_model_file_name, model_dbfs)\n", - "\n", - "model_p_best = PipelineModel.load(model_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# make prediction\n", - "pred = model_p_best.transform(test)\n", - "output = pred[['hours_per_week','age','workclass','marital_status','income','prediction']]\n", - "display(output.limit(5))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# evaluate. 
note only 2 metrics are supported out of the box by Spark ML.\n", - "bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')\n", - "au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)\n", - "au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)\n", - "\n", - "print(\"Area under ROC: {}\".format(au_roc))\n", - "print(\"Area Under PR: {}\".format(au_prc))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Model Persistence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##NOTE: by default the model is saved to and loaded from /dbfs/ instead of cwd!\n", - "model_p_best.write().overwrite().save(model_name)\n", - "print(\"saved model to {}\".format(model_dbfs))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%sh\n", - "\n", - "ls -la /dbfs/AdultCensus_runHistory.mml/*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dbutils.notebook.exit(\"success\")" - ] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - }, - { - "name": "wamartin" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", + "\n", + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![04ACI](files/tables/image2.JPG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Model Building" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pprint\n", + "import numpy as np\n", + "\n", + "from pyspark.ml import Pipeline, PipelineModel\n", + "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", + "from pyspark.ml.classification import LogisticRegression\n", + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import the Workspace class and check the azureml SDK version\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config(auth = auth)\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import the Workspace class and check the azureml SDK version\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "#get the train and test datasets\n", + "train_data_path = \"AdultCensusIncomeTrain\"\n", + "test_data_path = \"AdultCensusIncomeTest\"\n", + "\n", + "train = spark.read.parquet(train_data_path)\n", + "test = spark.read.parquet(test_data_path)\n", + "\n", + "print(\"train: ({}, {})\".format(train.count(), len(train.columns)))\n", + "print(\"test: ({}, {})\".format(test.count(), len(test.columns)))\n", + "\n", + "train.printSchema()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Define Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "label = \"income\"\n", + "dtypes = dict(train.dtypes)\n", + "dtypes.pop(label)\n", + "\n", + "si_xvars = []\n", + "ohe_xvars = []\n", + "featureCols = []\n", + "for idx,key in enumerate(dtypes):\n", + " if dtypes[key] == \"string\":\n", + " featureCol = \"-\".join([key, \"encoded\"])\n", + " featureCols.append(featureCol)\n", + " \n", + " tmpCol = \"-\".join([key, \"tmp\"])\n", + " # string-index and one-hot encode the string column\n", + " #https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/feature/StringIndexer.html\n", + " #handleInvalid: Param for how to handle invalid data (unseen labels or NULL values). \n", + " #Options are 'skip' (filter out rows with invalid data), 'error' (throw an error), \n", + " #or 'keep' (put invalid data in a special additional bucket, at index numLabels). 
Default: \"error\"\n", + " si_xvars.append(StringIndexer(inputCol=key, outputCol=tmpCol, handleInvalid=\"skip\"))\n", + " ohe_xvars.append(OneHotEncoder(inputCol=tmpCol, outputCol=featureCol))\n", + " else:\n", + " featureCols.append(key)\n", + "\n", + "# string-index the label column into a column named \"label\"\n", + "si_label = StringIndexer(inputCol=label, outputCol='label')\n", + "\n", + "# assemble the encoded feature columns in to a column named \"features\"\n", + "assembler = VectorAssembler(inputCols=featureCols, outputCol=\"features\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.run import Run\n", + "from azureml.core.experiment import Experiment\n", + "import numpy as np\n", + "import os\n", + "import shutil\n", + "\n", + "model_name = \"AdultCensus_runHistory.mml\"\n", + "model_dbfs = os.path.join(\"/dbfs\", model_name)\n", + "run_history_name = 'spark-ml-notebook'\n", + "\n", + "# start a training run by defining an experiment\n", + "myexperiment = Experiment(ws, \"Ignite_AI_Talk\")\n", + "root_run = myexperiment.start_logging()\n", + "\n", + "# Regularization Rates - \n", + "regs = [0.0001, 0.001, 0.01, 0.1]\n", + " \n", + "# try a bunch of regularization rate in a Logistic Regression model\n", + "for reg in regs:\n", + " print(\"Regularization rate: {}\".format(reg))\n", + " # create a bunch of child runs\n", + " with root_run.child_run(\"reg-\" + str(reg)) as run:\n", + " # create a new Logistic Regression model.\n", + " lr = LogisticRegression(regParam=reg)\n", + " \n", + " # put together the pipeline\n", + " pipe = Pipeline(stages=[*si_xvars, *ohe_xvars, si_label, assembler, lr])\n", + "\n", + " # train the model\n", + " model_p = pipe.fit(train)\n", + " \n", + " # make prediction\n", + " pred = model_p.transform(test)\n", + " \n", + " # evaluate. 
note only 2 metrics are supported out of the box by Spark ML.\n", + " bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')\n", + " au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)\n", + " au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)\n", + "\n", + " print(\"Area under ROC: {}\".format(au_roc))\n", + " print(\"Area Under PR: {}\".format(au_prc))\n", + " \n", + " # log reg, au_roc, au_prc and feature names in run history\n", + " run.log(\"reg\", reg)\n", + " run.log(\"au_roc\", au_roc)\n", + " run.log(\"au_prc\", au_prc)\n", + " run.log_list(\"columns\", train.columns)\n", + "\n", + " # save model\n", + " model_p.write().overwrite().save(model_name)\n", + " \n", + " # upload the serialized model into run history record\n", + " mdl, ext = model_name.split(\".\")\n", + " model_zip = mdl + \".zip\"\n", + " shutil.make_archive(mdl, 'zip', model_dbfs)\n", + " run.upload_file(\"outputs/\" + model_name, model_zip) \n", + " #run.upload_file(\"outputs/\" + model_name, path_or_stream = model_dbfs) #cannot deal with folders\n", + "\n", + " # now delete the serialized model from local folder since it is already uploaded to run history \n", + " shutil.rmtree(model_dbfs)\n", + " os.remove(model_zip)\n", + " \n", + "# Declare run completed\n", + "root_run.complete()\n", + "root_run_id = root_run.id\n", + "print (\"run id:\", root_run.id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "metrics = root_run.get_metrics(recursive=True)\n", + "best_run_id = max(metrics, key = lambda k: metrics[k]['au_roc'])\n", + "print(best_run_id, metrics[best_run_id]['au_roc'], metrics[best_run_id]['reg'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Get the best run\n", + "child_runs = {}\n", + "\n", + "for r in root_run.get_children():\n", + " child_runs[r.id] = r\n", + " \n", + "best_run = child_runs[best_run_id]" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Download the model from the best run to a local folder\n", + "best_model_file_name = \"best_model.zip\"\n", + "best_run.download_file(name = 'outputs/' + model_name, output_file_path = best_model_file_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Model Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "##unzip the model to dbfs (as load() seems to require that) and load it.\n", + "if os.path.isfile(model_dbfs) or os.path.isdir(model_dbfs):\n", + " shutil.rmtree(model_dbfs)\n", + "shutil.unpack_archive(best_model_file_name, model_dbfs)\n", + "\n", + "model_p_best = PipelineModel.load(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# make prediction\n", + "pred = model_p_best.transform(test)\n", + "output = pred[['hours_per_week','age','workclass','marital_status','income','prediction']]\n", + "display(output.limit(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# evaluate. 
note only 2 metrics are supported out of the box by Spark ML.\n", + "bce = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction')\n", + "au_roc = bce.setMetricName('areaUnderROC').evaluate(pred)\n", + "au_prc = bce.setMetricName('areaUnderPR').evaluate(pred)\n", + "\n", + "print(\"Area under ROC: {}\".format(au_roc))\n", + "print(\"Area Under PR: {}\".format(au_prc))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Model Persistence" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "##NOTE: by default the model is saved to and loaded from /dbfs/ instead of cwd!\n", + "model_p_best.write().overwrite().save(model_name)\n", + "print(\"saved model to {}\".format(model_dbfs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%sh\n", + "\n", + "ls -la /dbfs/AdultCensus_runHistory.mml/*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dbutils.notebook.exit(\"success\")" + ] + } ], - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "metadata": { + "authors": [ + { + "name": "pasha" + }, + { + "name": "wamartin" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "name": "03.Build_model_runHistory", + "notebookId": 3836944406456339 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - 
"name": "03.Build_model_runHistory", - "notebookId": 3836944406456339 - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.ipynb b/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.ipynb index 91257d28..015f117a 100644 --- a/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.ipynb +++ b/how-to-use-azureml/azure-databricks/amlsdk/deploy-to-aci-04.ipynb @@ -1,354 +1,338 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", - "\n", - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please ensure you have run all previous notebooks in sequence before running this.\n", - "\n", - "Please Register Azure Container Instance(ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![04ACI](files/tables/image3.JPG)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.core\n", - "\n", - "# Check core SDK version number\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import auth creds from notebook parameters\n", - "tenant = dbutils.widgets.get('tenant_id')\n", - "username = dbutils.widgets.get('service_principal_id')\n", - "password = dbutils.widgets.get('service_principal_password')\n", - "\n", - "auth = azureml.core.authentication.ServicePrincipalAuthentication(tenant, username, password)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core import Workspace\n", - "\n", - "#'''\n", - "ws = Workspace.from_config(auth = auth)\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')\n", - "#'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#from azureml.core import Workspace\n", - "#import azureml.core\n", - "#\n", - "## Check core SDK version number\n", - "#print(\"SDK version:\", azureml.core.VERSION)\n", - "#\n", - "##'''\n", - "#ws = Workspace.from_config()\n", - "#print('Workspace name: ' + ws.name, \n", - "# 'Azure region: ' + ws.location, \n", - "# 'Subscription id: ' + ws.subscription_id, \n", - "# 'Resource group: ' + ws.resource_group, sep = '\\n')\n", - "##'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##NOTE: service deployment always gets the model from the current 
working dir.\n", - "import os\n", - "\n", - "model_name = \"AdultCensus_runHistory.mml\" # \n", - "model_name_dbfs = os.path.join(\"/dbfs\", model_name)\n", - "\n", - "print(\"copy model from dbfs to local\")\n", - "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n", - "dbutils.fs.cp(model_name, model_local, True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Register the model\n", - "from azureml.core.model import Model\n", - "mymodel = Model.register(model_path = model_name, # this points to a local file\n", - " model_name = model_name, # this is the name the model is registered as, am using same name for both path and name. \n", - " description = \"ADB trained model by Parashar\",\n", - " workspace = ws)\n", - "\n", - "print(mymodel.name, mymodel.description, mymodel.version)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#%%writefile score_sparkml.py\n", - "score_sparkml = \"\"\"\n", - " \n", - "import json\n", - " \n", - "def init():\n", - " # One-time initialization of PySpark and predictive model\n", - " import pyspark\n", - " from azureml.core.model import Model\n", - " from pyspark.ml import PipelineModel\n", - " \n", - " global trainedModel\n", - " global spark\n", - " \n", - " spark = pyspark.sql.SparkSession.builder.appName(\"ADB and AML notebook by Parashar\").getOrCreate()\n", - " model_name = \"{model_name}\" #interpolated\n", - " model_path = Model.get_model_path(model_name)\n", - " trainedModel = PipelineModel.load(model_path)\n", - " \n", - "def run(input_json):\n", - " if isinstance(trainedModel, Exception):\n", - " return json.dumps({{\"trainedModel\":str(trainedModel)}})\n", - " \n", - " try:\n", - " sc = spark.sparkContext\n", - " input_list = json.loads(input_json)\n", - " input_rdd = sc.parallelize(input_list)\n", - " input_df = spark.read.json(input_rdd)\n", - " \n", - " # Compute 
prediction\n", - " prediction = trainedModel.transform(input_df)\n", - " #result = prediction.first().prediction\n", - " predictions = prediction.collect()\n", - " \n", - " #Get each scored result\n", - " preds = [str(x['prediction']) for x in predictions]\n", - " result = \",\".join(preds)\n", - " # you can return any data type as long as it is JSON-serializable\n", - " return result.tolist()\n", - " except Exception as e:\n", - " result = str(e)\n", - " return result\n", - " \n", - "\"\"\".format(model_name=model_name)\n", - " \n", - "exec(score_sparkml)\n", - " \n", - "with open(\"score_sparkml.py\", \"w\") as file:\n", - " file.write(score_sparkml)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.core.conda_dependencies import CondaDependencies \n", - "\n", - "myacienv = CondaDependencies.create(conda_packages=['scikit-learn','numpy','pandas']) #showing how to add libs as an eg. - not needed for this model.\n", - "\n", - "with open(\"mydeployenv.yml\",\"w\") as f:\n", - " f.write(myacienv.serialize_to_string())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#deploy to ACI\n", - "from azureml.core.webservice import AciWebservice, Webservice\n", - "\n", - "myaci_config = AciWebservice.deploy_configuration(\n", - " cpu_cores = 2, \n", - " memory_gb = 2, \n", - " tags = {'name':'Databricks Azure ML ACI'}, \n", - " description = 'This is for ADB and AML example. 
Azure Databricks & Azure ML SDK demo with ACI by Parashar.')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# this will take 10-15 minutes to finish\n", - "\n", - "service_name = \"aciws\"\n", - "runtime = \"spark-py\" \n", - "driver_file = \"score_sparkml.py\"\n", - "my_conda_file = \"mydeployenv.yml\"\n", - "\n", - "# image creation\n", - "from azureml.core.image import ContainerImage\n", - "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", - " runtime = runtime, \n", - " conda_file = my_conda_file)\n", - "\n", - "# Webservice creation\n", - "myservice = Webservice.deploy_from_model(\n", - " workspace=ws, \n", - " name=service_name,\n", - " deployment_config = myaci_config,\n", - " models = [mymodel],\n", - " image_config = myimage_config\n", - " )\n", - "\n", - "myservice.wait_for_deployment(show_output=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(Webservice)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# List images by ws\n", - "\n", - "for i in ContainerImage.list(workspace = ws):\n", - " print('{}(v.{} [{}]) stored at {} with build log {}'.format(i.name, i.version, i.creation_state, i.image_location, i.image_build_log_uri))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#for using the Web HTTP API \n", - "print(myservice.scoring_uri)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "#get the some sample data\n", - "test_data_path = \"AdultCensusIncomeTest\"\n", - "test = spark.read.parquet(test_data_path).limit(5)\n", - "\n", - "test_json = json.dumps(test.toJSON().collect())\n", - "\n", - "print(test_json)" - ] - }, - { - "cell_type": "code", 
- "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#using data defined above predict if income is >50K (1) or <=50K (0)\n", - "myservice.run(input_data=test_json)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#comment to not delete the web service\n", - "#myservice.delete()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - }, - { - "name": "wamartin" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", + "\n", + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please ensure you have run all previous notebooks in sequence before running this.\n", + "\n", + "Please Register Azure Container Instance(ACI) using Azure Portal: https://docs.microsoft.com/en-us/azure/azure-resource-manager/resource-manager-supported-services#portal in your subscription before using the SDK to deploy your ML model to ACI." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![04ACI](files/tables/image3.JPG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "#'''\n", + "ws = Workspace.from_config(auth = auth)\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')\n", + "#'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", azureml.core.VERSION)\n", + "\n", + "#'''\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')\n", + "#'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "##NOTE: service deployment always gets the model from the current working dir.\n", + "import os\n", + "\n", + "model_name = \"AdultCensus_runHistory.mml\" # \n", + "model_name_dbfs = os.path.join(\"/dbfs\", model_name)\n", + "\n", + "print(\"copy model from dbfs to local\")\n", + "model_local = \"file:\" + os.getcwd() + \"/\" + model_name\n", + "dbutils.fs.cp(model_name, model_local, True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Register the model\n", + "from azureml.core.model import 
Model\n", + "mymodel = Model.register(model_path = model_name, # this points to a local file\n", + " model_name = model_name, # this is the name the model is registered as, am using same name for both path and name. \n", + " description = \"ADB trained model by Parashar\",\n", + " workspace = ws)\n", + "\n", + "print(mymodel.name, mymodel.description, mymodel.version)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#%%writefile score_sparkml.py\n", + "score_sparkml = \"\"\"\n", + " \n", + "import json\n", + " \n", + "def init():\n", + " # One-time initialization of PySpark and predictive model\n", + " import pyspark\n", + " from azureml.core.model import Model\n", + " from pyspark.ml import PipelineModel\n", + " \n", + " global trainedModel\n", + " global spark\n", + " \n", + " spark = pyspark.sql.SparkSession.builder.appName(\"ADB and AML notebook by Parashar\").getOrCreate()\n", + " model_name = \"{model_name}\" #interpolated\n", + " model_path = Model.get_model_path(model_name)\n", + " trainedModel = PipelineModel.load(model_path)\n", + " \n", + "def run(input_json):\n", + " if isinstance(trainedModel, Exception):\n", + " return json.dumps({{\"trainedModel\":str(trainedModel)}})\n", + " \n", + " try:\n", + " sc = spark.sparkContext\n", + " input_list = json.loads(input_json)\n", + " input_rdd = sc.parallelize(input_list)\n", + " input_df = spark.read.json(input_rdd)\n", + " \n", + " # Compute prediction\n", + " prediction = trainedModel.transform(input_df)\n", + " #result = prediction.first().prediction\n", + " predictions = prediction.collect()\n", + " \n", + " #Get each scored result\n", + " preds = [str(x['prediction']) for x in predictions]\n", + " result = \",\".join(preds)\n", + " # you can return any data type as long as it is JSON-serializable\n", + " return result.tolist()\n", + " except Exception as e:\n", + " result = str(e)\n", + " return result\n", + " \n", + 
"\"\"\".format(model_name=model_name)\n", + " \n", + "exec(score_sparkml)\n", + " \n", + "with open(\"score_sparkml.py\", \"w\") as file:\n", + " file.write(score_sparkml)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core.conda_dependencies import CondaDependencies \n", + "\n", + "myacienv = CondaDependencies.create(conda_packages=['scikit-learn','numpy','pandas']) #showing how to add libs as an eg. - not needed for this model.\n", + "\n", + "with open(\"mydeployenv.yml\",\"w\") as f:\n", + " f.write(myacienv.serialize_to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#deploy to ACI\n", + "from azureml.core.webservice import AciWebservice, Webservice\n", + "\n", + "myaci_config = AciWebservice.deploy_configuration(\n", + " cpu_cores = 2, \n", + " memory_gb = 2, \n", + " tags = {'name':'Databricks Azure ML ACI'}, \n", + " description = 'This is for ADB and AML example. 
Azure Databricks & Azure ML SDK demo with ACI by Parashar.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# this will take 10-15 minutes to finish\n", + "\n", + "service_name = \"aciws\"\n", + "runtime = \"spark-py\" \n", + "driver_file = \"score_sparkml.py\"\n", + "my_conda_file = \"mydeployenv.yml\"\n", + "\n", + "# image creation\n", + "from azureml.core.image import ContainerImage\n", + "myimage_config = ContainerImage.image_configuration(execution_script = driver_file, \n", + " runtime = runtime, \n", + " conda_file = my_conda_file)\n", + "\n", + "# Webservice creation\n", + "myservice = Webservice.deploy_from_model(\n", + " workspace=ws, \n", + " name=service_name,\n", + " deployment_config = myaci_config,\n", + " models = [mymodel],\n", + " image_config = myimage_config\n", + " )\n", + "\n", + "myservice.wait_for_deployment(show_output=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "help(Webservice)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List images by ws\n", + "\n", + "for i in ContainerImage.list(workspace = ws):\n", + " print('{}(v.{} [{}]) stored at {} with build log {}'.format(i.name, i.version, i.creation_state, i.image_location, i.image_build_log_uri))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#for using the Web HTTP API \n", + "print(myservice.scoring_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "#get the some sample data\n", + "test_data_path = \"AdultCensusIncomeTest\"\n", + "test = spark.read.parquet(test_data_path).limit(5)\n", + "\n", + "test_json = json.dumps(test.toJSON().collect())\n", + "\n", + "print(test_json)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#using data defined above predict if income is >50K (1) or <=50K (0)\n", + "myservice.run(input_data=test_json)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#comment to not delete the web service\n", + "#myservice.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } ], - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "metadata": { + "authors": [ + { + "name": "pasha" + }, + { + "name": "wamartin" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "name": "04.DeploytoACI", + "notebookId": 3836944406456376 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "name": "04.DeploytoACI", - "notebookId": 3836944406456376 - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb b/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb index 503f97f1..5391a021 100644 --- a/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb +++ b/how-to-use-azureml/azure-databricks/amlsdk/ingest-data-02.ipynb @@ -1,182 +1,182 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML & Azure Databricks notebooks by Parashar 
Shah.\n", - "\n", - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![04ACI](files/tables/image1.JPG)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Data Ingestion" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import urllib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.\n", - "basedataurl = \"https://amldockerdatasets.azureedge.net\"\n", - "datafile = \"AdultCensusIncome.csv\"\n", - "datafile_dbfs = os.path.join(\"/dbfs\", datafile)\n", - "\n", - "if os.path.isfile(datafile_dbfs):\n", - " print(\"found {} at {}\".format(datafile, datafile_dbfs))\n", - "else:\n", - " print(\"downloading {} to {}\".format(datafile, datafile_dbfs))\n", - " urllib.request.urlretrieve(os.path.join(basedataurl, datafile), datafile_dbfs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create a Spark dataframe out of the csv file.\n", - "data_all = sqlContext.read.format('csv').options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true').load(datafile)\n", - "print(\"({}, {})\".format(data_all.count(), len(data_all.columns)))\n", - "data_all.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#renaming columns\n", - "columns_new = [col.replace(\"-\", \"_\") for col in data_all.columns]\n", - "data_all = data_all.toDF(*columns_new)\n", - "data_all.printSchema()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "display(data_all.limit(5))" - ] - }, 
- { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Data Preparation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose feature columns and the label column.\n", - "label = \"income\"\n", - "xvars = set(data_all.columns) - {label}\n", - "\n", - "print(\"label = {}\".format(label))\n", - "print(\"features = {}\".format(xvars))\n", - "\n", - "data = data_all.select([*xvars, label])\n", - "\n", - "# Split data into train and test.\n", - "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", - "\n", - "print(\"train ({}, {})\".format(train.count(), len(train.columns)))\n", - "print(\"test ({}, {})\".format(test.count(), len(test.columns)))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#Data Persistence" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Write the train and test data sets to intermediate storage\n", - "train_data_path = \"AdultCensusIncomeTrain\"\n", - "test_data_path = \"AdultCensusIncomeTest\"\n", - "\n", - "train_data_path_dbfs = os.path.join(\"/dbfs\", \"AdultCensusIncomeTrain\")\n", - "test_data_path_dbfs = os.path.join(\"/dbfs\", \"AdultCensusIncomeTest\")\n", - "\n", - "train.write.mode('overwrite').parquet(train_data_path)\n", - "test.write.mode('overwrite').parquet(test_data_path)\n", - "print(\"train and test datasets saved to {} and {}\".format(train_data_path_dbfs, test_data_path_dbfs))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - }, - { - "name": "wamartin" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", + "\n", + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![04ACI](files/tables/image1.JPG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Data Ingestion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import urllib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download AdultCensusIncome.csv from Azure CDN. This file has 32,561 rows.\n", + "basedataurl = \"https://amldockerdatasets.azureedge.net\"\n", + "datafile = \"AdultCensusIncome.csv\"\n", + "datafile_dbfs = os.path.join(\"/dbfs\", datafile)\n", + "\n", + "if os.path.isfile(datafile_dbfs):\n", + " print(\"found {} at {}\".format(datafile, datafile_dbfs))\n", + "else:\n", + " print(\"downloading {} to {}\".format(datafile, datafile_dbfs))\n", + " urllib.request.urlretrieve(os.path.join(basedataurl, datafile), datafile_dbfs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Spark dataframe out of the csv file.\n", + "data_all = sqlContext.read.format('csv').options(header='true', inferSchema='true', ignoreLeadingWhiteSpace='true', ignoreTrailingWhiteSpace='true').load(datafile)\n", + "print(\"({}, {})\".format(data_all.count(), len(data_all.columns)))\n", + "data_all.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#renaming columns\n", + "columns_new = [col.replace(\"-\", \"_\") for col in data_all.columns]\n", + "data_all = data_all.toDF(*columns_new)\n", + "data_all.printSchema()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "display(data_all.limit(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Data Preparation" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose feature columns and the label column.\n", + "label = \"income\"\n", + "xvars = set(data_all.columns) - {label}\n", + "\n", + "print(\"label = {}\".format(label))\n", + "print(\"features = {}\".format(xvars))\n", + "\n", + "data = data_all.select([*xvars, label])\n", + "\n", + "# Split data into train and test.\n", + "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", + "\n", + "print(\"train ({}, {})\".format(train.count(), len(train.columns)))\n", + "print(\"test ({}, {})\".format(test.count(), len(test.columns)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#Data Persistence" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write the train and test data sets to intermediate storage\n", + "train_data_path = \"AdultCensusIncomeTrain\"\n", + "test_data_path = \"AdultCensusIncomeTest\"\n", + "\n", + "train_data_path_dbfs = os.path.join(\"/dbfs\", \"AdultCensusIncomeTrain\")\n", + "test_data_path_dbfs = os.path.join(\"/dbfs\", \"AdultCensusIncomeTest\")\n", + "\n", + "train.write.mode('overwrite').parquet(train_data_path)\n", + "test.write.mode('overwrite').parquet(test_data_path)\n", + "print(\"train and test datasets saved to {} and {}\".format(train_data_path_dbfs, test_data_path_dbfs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } ], - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "metadata": { + "authors": [ + { + "name": "pasha" + }, + { + "name": "wamartin" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": 
"python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "name": "02.Ingest_data", + "notebookId": 3836944406456362 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "name": "02.Ingest_data", - "notebookId": 3836944406456362 - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.ipynb b/how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.ipynb index 9ba94649..85d48624 100644 --- a/how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.ipynb +++ b/how-to-use-azureml/azure-databricks/amlsdk/installation-and-configuration-01.ipynb @@ -1,264 +1,179 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", - "\n", - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We support installing AML SDK as library from GUI. When attaching a library follow this https://docs.databricks.com/user-guide/libraries.html and add the below string as your PyPi package. 
You can select the option to attach the library to all clusters or just one cluster.\n", - "\n", - "**install azureml-sdk**\n", - "* Source: Upload Python Egg or PyPi\n", - "* PyPi Name: `azureml-sdk[databricks]`\n", - "* Select Install Library" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.core\n", - "\n", - "# Check core SDK version number - based on build number of preview/master.\n", - "print(\"SDK version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![04ACI](files/tables/image2b.JPG)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Please specify the Azure subscription Id, resource group name, workspace name, and the region in which you want to create the Azure Machine Learning Workspace.\n", - "\n", - "You can get the value of your Azure subscription ID from the Azure Portal, and then selecting Subscriptions from the menu on the left.\n", - "\n", - "For the resource_group, use the name of the resource group that contains your Azure Databricks Workspace.\n", - "\n", - "NOTE: If you provide a resource group name that does not exist, the resource group will be automatically created. This may or may not succeed in your environment, depending on the permissions you have on your Azure Subscription." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# subscription_id = \"\"\n", - "# resource_group = \"\"\n", - "# workspace_name = \"\"\n", - "# workspace_region = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import auth creds from notebook parameters\n", - "tenant = dbutils.widgets.get('tenant_id')\n", - "username = dbutils.widgets.get('service_principal_id')\n", - "password = dbutils.widgets.get('service_principal_password')\n", - "\n", - "auth = azureml.core.authentication.ServicePrincipalAuthentication(tenant, username, password)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "subscription_id = dbutils.widgets.get('subscription_id')\n", - "resource_group = dbutils.widgets.get('resource_group')\n", - "workspace_name = dbutils.widgets.get('workspace_name')\n", - "workspace_region = dbutils.widgets.get('workspace_region')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import the Workspace class and check the azureml SDK version\n", - "# exist_ok checks if workspace exists or not.\n", - "\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.create(name = workspace_name,\n", - " subscription_id = subscription_id,\n", - " resource_group = resource_group, \n", - " location = workspace_region,\n", - " auth = auth,\n", - " exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "## import the Workspace class and check the azureml SDK version\n", - "## exist_ok checks if workspace exists or not.\n", - "#\n", - "#from azureml.core import Workspace\n", - "#\n", - "#ws = Workspace.create(name = workspace_name,\n", - "# 
subscription_id = subscription_id,\n", - "# resource_group = resource_group, \n", - "# location = workspace_region,\n", - "# exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#get workspace details\n", - "ws.get_details()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "ws = Workspace(workspace_name = workspace_name,\n", - " subscription_id = subscription_id,\n", - " resource_group = resource_group,\n", - " auth = auth)\n", - "\n", - "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "ws.write_config()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#ws = Workspace(workspace_name = workspace_name,\n", - "# subscription_id = subscription_id,\n", - "# resource_group = resource_group)\n", - "#\n", - "## persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "#ws.write_config()\n", - "###if you need to give a different path/filename please use this\n", - "###write_config(path=\"/databricks/driver/aml_config/\",file_name=)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(Workspace)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import the Workspace class and check the azureml SDK version\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.from_config(auth = auth)\n", - "#ws = Workspace.from_config()\n", - "print('Workspace name: ' + ws.name, \n", - " 'Azure region: ' + ws.location, \n", - " 'Subscription id: ' + ws.subscription_id, \n", - " 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "## import the Workspace class and check the azureml SDK version\n", - "#from azureml.core import Workspace\n", - "#\n", - "#ws = Workspace.from_config()\n", - "##ws = Workspace.from_config()\n", - "#print('Workspace name: ' + ws.name, \n", - "# 'Azure region: ' + ws.location, \n", - "# 'Subscription id: ' + ws.subscription_id, \n", - "# 'Resource group: ' + ws.resource_group, sep = '\\n')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "pasha" - }, - { - "name": "wamartin" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Azure ML & Azure Databricks notebooks by Parashar Shah.\n", + "\n", + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We support installing AML SDK as library from GUI. When attaching a library follow this https://docs.databricks.com/user-guide/libraries.html and add the below string as your PyPi package. 
You can select the option to attach the library to all clusters or just one cluster.\n", + "\n", + "**install azureml-sdk**\n", + "* Source: Upload Python Egg or PyPi\n", + "* PyPi Name: `azureml-sdk[databricks]`\n", + "* Select Install Library" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "\n", + "# Check core SDK version number - based on build number of preview/master.\n", + "print(\"SDK version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![04ACI](files/tables/image2b.JPG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please specify the Azure subscription Id, resource group name, workspace name, and the region in which you want to create the Azure Machine Learning Workspace.\n", + "\n", + "You can get the value of your Azure subscription ID from the Azure Portal, and then selecting Subscriptions from the menu on the left.\n", + "\n", + "For the resource_group, use the name of the resource group that contains your Azure Databricks Workspace.\n", + "\n", + "NOTE: If you provide a resource group name that does not exist, the resource group will be automatically created. This may or may not succeed in your environment, depending on the permissions you have on your Azure Subscription." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# subscription_id = \"\"\n", + "# resource_group = \"\"\n", + "# workspace_name = \"\"\n", + "# workspace_region = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import the Workspace class and check the azureml SDK version\n", + "# exist_ok checks if workspace exists or not.\n", + "\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.create(name = workspace_name,\n", + " subscription_id = subscription_id,\n", + " resource_group = resource_group, \n", + " location = workspace_region,\n", + " exist_ok=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#get workspace details\n", + "ws.get_details()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ws = Workspace(workspace_name = workspace_name,\n", + " subscription_id = subscription_id,\n", + " resource_group = resource_group)\n", + "\n", + "# persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", + "ws.write_config()\n", + "##if you need to give a different path/filename please use this\n", + "##write_config(path=\"/databricks/driver/aml_config/\",file_name=)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "help(Workspace)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# import the Workspace class and check the azureml SDK version\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.from_config()\n", + "#ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 
'Resource group: ' + ws.resource_group, sep = '\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } ], - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "metadata": { + "authors": [ + { + "name": "pasha" + }, + { + "name": "wamartin" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "name": "01.Installation_and_Configuration", + "notebookId": 3836944406456490 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "name": "01.Installation_and_Configuration", - "notebookId": 3836944406456490 - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb index 7a27c7a3..597ee240 100644 --- a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb +++ b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb @@ -1,653 +1,593 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Copyright (c) Microsoft Corporation. All rights reserved.\n", - "\n", - "Licensed under the MIT License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We support installing AML SDK as library from GUI. 
When attaching a library follow this https://docs.databricks.com/user-guide/libraries.html and add the below string as your PyPi package. You can select the option to attach the library to all clusters or just one cluster.\n", - "\n", - "**install azureml-sdk with Automated ML**\n", - "* Source: Upload Python Egg or PyPi\n", - "* PyPi Name: `azureml-sdk[automl_databricks]`\n", - "* Select Install Library" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# AutoML : Classification with Local Compute on Azure DataBricks\n", - "\n", - "In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", - "\n", - "In this notebook you will learn how to:\n", - "1. Create Azure Machine Learning Workspace object and initialize your notebook directory to easily reload this object from a configuration file.\n", - "2. Create an `Experiment` in an existing `Workspace`.\n", - "3. Configure AutoML using `AutoMLConfig`.\n", - "4. Train the model using AzureDataBricks.\n", - "5. Explore the results.\n", - "6. Test the best fitted model.\n", - "\n", - "Prerequisites:\n", - "Before running this notebook, please follow the readme for installing necessary libraries to your cluster." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Register Machine Learning Services Resource Provider\n", - "Microsoft.MachineLearningServices only needs to be registed once in the subscription. 
To register it:\n", - "Start the Azure portal.\n", - "Select your All services and then Subscription.\n", - "Select the subscription that you want to use.\n", - "Click on Resource providers\n", - "Click the Register link next to Microsoft.MachineLearningServices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Check the Azure ML Core SDK Version to Validate Your Installation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.core\n", - "\n", - "print(\"SDK Version:\", azureml.core.VERSION)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Initialize an Azure ML Workspace\n", - "### What is an Azure ML Workspace and Why Do I Need One?\n", - "\n", - "An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, operationalization, and the monitoring of operationalized models.\n", - "\n", - "\n", - "### What do I Need?\n", - "\n", - "To create or access an Azure ML workspace, you will need to import the Azure ML library and specify following information:\n", - "* A name for your workspace. You can choose one.\n", - "* Your subscription id. Use the `id` value from the `az account show` command output above.\n", - "* The resource group name. The resource group organizes Azure resources and provides a default region for the resources in the group. The resource group will be created if it doesn't exist. Resource groups can be created and viewed in the [Azure portal](https://portal.azure.com)\n", - "* Supported regions include `eastus2`, `eastus`,`westcentralus`, `southeastasia`, `westeurope`, `australiaeast`, `westus2`, `southcentralus`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#subscription_id = \"\"\n", - "#resource_group = \"\"\n", - "#workspace_name = \"\"\n", - "#workspace_region = \"\" #eg. eastus2, westcentralus, westeurope" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating a Workspace\n", - "If you already have access to an Azure ML workspace you want to use, you can skip this cell. Otherwise, this cell will create an Azure ML workspace for you in the specified subscription, provided you have the correct permissions for the given `subscription_id`.\n", - "\n", - "This will fail when:\n", - "1. The workspace already exists.\n", - "2. You do not have permission to create a workspace in the resource group.\n", - "3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription.\n", - "\n", - "If workspace creation fails for any reason other than already existing, please work with your IT administrator to provide you with the appropriate permissions or to provision the required resources.\n", - "\n", - "**Note:** Creation of a new workspace can take several minutes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "# import auth creds from notebook parameters\n", - "tenant = dbutils.widgets.get('tenant_id')\n", - "username = dbutils.widgets.get('service_principal_id')\n", - "password = dbutils.widgets.get('service_principal_password')\n", - "\n", - "auth = azureml.core.authentication.ServicePrincipalAuthentication(tenant, username, password)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "subscription_id = dbutils.widgets.get('subscription_id')\n", - "resource_group = dbutils.widgets.get('resource_group')\n", - "workspace_name = dbutils.widgets.get('workspace_name')\n", - "workspace_region = dbutils.widgets.get('workspace_region')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Import the Workspace class and check the Azure ML SDK version.\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace.create(name = workspace_name,\n", - " subscription_id = subscription_id,\n", - " resource_group = resource_group, \n", - " location = workspace_region,\n", - " auth = auth,\n", - " exist_ok=True)\n", - "ws.get_details()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#from azureml.core import Workspace\n", - "#import azureml.core\n", - "#\n", - "## Check core SDK version number\n", - "#print(\"SDK version:\", azureml.core.VERSION)\n", - "#\n", - "##'''\n", - "#ws = Workspace.from_config()\n", - "#print('Workspace name: ' + ws.name, \n", - "# 'Azure region: ' + ws.location, \n", - "# 'Subscription id: ' + ws.subscription_id, \n", - "# 'Resource group: ' + ws.resource_group, sep = '\\n')\n", - "##'''" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configuring Your Local 
Environment\n", - "You can validate that you have access to the specified workspace and write a configuration file to the default configuration location, `./aml_config/config.json`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "from azureml.core import Workspace\n", - "\n", - "ws = Workspace(workspace_name = workspace_name,\n", - " subscription_id = subscription_id,\n", - " resource_group = resource_group,\n", - " auth = auth)\n", - "\n", - "# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "ws.write_config()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#from azureml.core import Workspace\n", - "#\n", - "#ws = Workspace(workspace_name = workspace_name,\n", - "# subscription_id = subscription_id,\n", - "# resource_group = resource_group)\n", - "#\n", - "## Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "#ws.write_config()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a Folder to Host Sample Projects\n", - "Finally, create a folder where all the sample projects will be hosted." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "sample_projects_folder = './sample_projects'\n", - "\n", - "if not os.path.isdir(sample_projects_folder):\n", - " os.mkdir(sample_projects_folder)\n", - " \n", - "print('Sample projects will be created in {}.'.format(sample_projects_folder))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create an Experiment\n", - "\n", - "As part of the setup you have already created an Azure ML `Workspace` object. 
For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import os\n", - "import random\n", - "import time\n", - "\n", - "from matplotlib import pyplot as plt\n", - "from matplotlib.pyplot import imshow\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import azureml.core\n", - "from azureml.core.experiment import Experiment\n", - "from azureml.core.workspace import Workspace\n", - "from azureml.train.automl import AutoMLConfig\n", - "from azureml.train.automl.run import AutoMLRun" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##TESTONLY\n", - "ws = Workspace.from_config(auth = auth)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "##PUBLISHONLY\n", - "#ws = Workspace.from_config(auth = auth)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Choose a name for the experiment and specify the project folder.\n", - "experiment_name = 'automl-local-classification'\n", - "project_folder = './sample_projects/automl-local-classification'\n", - "\n", - "experiment = Experiment(ws, experiment_name)\n", - "\n", - "output = {}\n", - "output['SDK version'] = azureml.core.VERSION\n", - "output['Subscription ID'] = ws.subscription_id\n", - "output['Workspace Name'] = ws.name\n", - "output['Resource Group'] = ws.resource_group\n", - "output['Location'] = ws.location\n", - "output['Project Directory'] = project_folder\n", - "output['Experiment Name'] = experiment.name\n", - "pd.set_option('display.max_colwidth', -1)\n", - "pd.DataFrame(data = output, index = ['']).T" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Diagnostics\n", - 
"\n", - "Opt-in diagnostics for better experience, quality, and security of future releases." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from azureml.telemetry import set_diagnostics_collection\n", - "set_diagnostics_collection(send_diagnostics = True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Load Training Data Using DataPrep" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import azureml.dataprep as dprep\n", - "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", - "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", - "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n", - "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n", - "\n", - "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", - "# and convert column types manually.\n", - "# Here we read a comma delimited file and convert all columns to integers.\n", - "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Review the Data Preparation Result\n", - "You can peek the result of a Dataflow at any range using skip(i) and head(j). Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train.skip(1).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Configure AutoML\n", - "\n", - "Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n", - "\n", - "|Property|Description|\n", - "|-|-|\n", - "|**task**|classification or regression|\n", - "|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics:
accuracy
AUC_weighted
average_precision_score_weighted
norm_macro_recall
precision_score_weighted|\n", - "|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics:
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error|\n", - "|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n", - "|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n", - "|**n_cross_validations**|Number of cross validation splits.|\n", - "|**spark_context**|Spark Context object. for Databricks, use spark_context=sc|\n", - "|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be <= number of worker nodes in your Azure Databricks cluster.|\n", - "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", - "|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers.|\n", - "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n", - "|**preprocess**|set this to True to enable pre-processing of data eg. string to numeric using one-hot encoding|\n", - "|**exit_score**|Target score for experiment. It is associated with the metric. eg. exit_score=0.995 will exit experiment after that|" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "automl_config = AutoMLConfig(task = 'classification',\n", - " debug_log = 'automl_errors.log',\n", - " primary_metric = 'AUC_weighted',\n", - " iteration_timeout_minutes = 10,\n", - " iterations = 30,\n", - " n_cross_validations = 10,\n", - " max_concurrent_iterations = 2, #change it based on number of worker nodes\n", - " verbosity = logging.INFO,\n", - " spark_context=sc, #databricks/spark related\n", - " X = X_train, \n", - " y = y_train,\n", - " enable_cache=False,\n", - " path = project_folder)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Train the Models\n", - "\n", - "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n", - "In this example, we specify `show_output = True` to print currently running iterations to the console." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Explore the Results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Portal URL for Monitoring Runs\n", - "\n", - "The following will provide a link to the web interface to explore individual run details and status. In the future we might support output displayed in the notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(local_run.get_portal_url())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The following will show the child runs and waits for the parent run to complete." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Retrieve All Child Runs after the experiment is completed (in portal)\n", - "You can also use SDK methods to fetch all the child runs and see individual metrics that we log." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "children = list(local_run.get_children())\n", - "metricslist = {}\n", - "for run in children:\n", - " properties = run.get_properties()\n", - " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n", - " metricslist[int(properties['iteration'])] = metrics\n", - "\n", - "rundata = pd.DataFrame(metricslist).sort_index(1)\n", - "rundata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Retrieve the Best Model after the above run is complete \n", - "\n", - "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. 
The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "best_run, fitted_model = local_run.get_output()\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Best Model Based on Any Other Metric after the above run is complete based on the child run\n", - "Show the run and the model that has the smallest `log_loss` value:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lookup_metric = \"log_loss\"\n", - "best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n", - "print(best_run)\n", - "print(fitted_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test the Best Fitted Model\n", - "\n", - "#### Load Test Data - you can split the dataset beforehand & pass Train dataset to AutoML and use Test dataset to evaluate the best model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn import datasets\n", - "digits = datasets.load_digits()\n", - "X_test = digits.data[:10, :]\n", - "y_test = digits.target[:10]\n", - "images = digits.images[:10]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Testing Our Best Fitted Model\n", - "We will try to predict digits and see how our model works. This is just an example to show you." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Randomly select digits and test.\n", - "for index in np.random.choice(len(y_test), 2, replace = False):\n", - " print(index)\n", - " predicted = fitted_model.predict(X_test[index:index + 1])[0]\n", - " label = y_test[index]\n", - " title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n", - " fig = plt.figure(1, figsize = (3,3))\n", - " ax1 = fig.add_axes((0,0,.8,.8))\n", - " ax1.set_title(title)\n", - " plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n", - " display(fig)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When deploying an automated ML trained model, please specify _pip_packages=['azureml-sdk[automl]']_ in your CondaDependencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "authors": [ - { - "name": "savitam" - }, - { - "name": "wamartin" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Copyright (c) Microsoft Corporation. All rights reserved.\n", + "\n", + "Licensed under the MIT License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We support installing AML SDK as library from GUI. When attaching a library follow this https://docs.databricks.com/user-guide/libraries.html and add the below string as your PyPi package. 
You can select the option to attach the library to all clusters or just one cluster.\n", + "\n", + "**install azureml-sdk with Automated ML**\n", + "* Source: Upload Python Egg or PyPi\n", + "* PyPi Name: `azureml-sdk[automl_databricks]`\n", + "* Select Install Library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AutoML : Classification with Local Compute on Azure DataBricks\n", + "\n", + "In this example we use the scikit-learn's [digit dataset](http://scikit-learn.org/stable/datasets/index.html#optical-recognition-of-handwritten-digits-dataset) to showcase how you can use AutoML for a simple classification problem.\n", + "\n", + "In this notebook you will learn how to:\n", + "1. Create Azure Machine Learning Workspace object and initialize your notebook directory to easily reload this object from a configuration file.\n", + "2. Create an `Experiment` in an existing `Workspace`.\n", + "3. Configure AutoML using `AutoMLConfig`.\n", + "4. Train the model using AzureDataBricks.\n", + "5. Explore the results.\n", + "6. Test the best fitted model.\n", + "\n", + "Prerequisites:\n", + "Before running this notebook, please follow the readme for installing necessary libraries to your cluster." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Register Machine Learning Services Resource Provider\n", + "Microsoft.MachineLearningServices only needs to be registered once in the subscription. 
To register it:\n", + "Start the Azure portal.\n", + "Select your All services and then Subscription.\n", + "Select the subscription that you want to use.\n", + "Click on Resource providers\n", + "Click the Register link next to Microsoft.MachineLearningServices" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Check the Azure ML Core SDK Version to Validate Your Installation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.core\n", + "\n", + "print(\"SDK Version:\", azureml.core.VERSION)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize an Azure ML Workspace\n", + "### What is an Azure ML Workspace and Why Do I Need One?\n", + "\n", + "An Azure ML workspace is an Azure resource that organizes and coordinates the actions of many other Azure resources to assist in executing and sharing machine learning workflows. In particular, an Azure ML workspace coordinates storage, databases, and compute resources providing added functionality for machine learning experimentation, operationalization, and the monitoring of operationalized models.\n", + "\n", + "\n", + "### What do I Need?\n", + "\n", + "To create or access an Azure ML workspace, you will need to import the Azure ML library and specify following information:\n", + "* A name for your workspace. You can choose one.\n", + "* Your subscription id. Use the `id` value from the `az account show` command output above.\n", + "* The resource group name. The resource group organizes Azure resources and provides a default region for the resources in the group. The resource group will be created if it doesn't exist. Resource groups can be created and viewed in the [Azure portal](https://portal.azure.com)\n", + "* Supported regions include `eastus2`, `eastus`,`westcentralus`, `southeastasia`, `westeurope`, `australiaeast`, `westus2`, `southcentralus`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "subscription_id = \"\"\n", + "resource_group = \"\"\n", + "workspace_name = \"\"\n", + "workspace_region = \"\" #eg. eastus2, westcentralus, westeurope" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating a Workspace\n", + "If you already have access to an Azure ML workspace you want to use, you can skip this cell. Otherwise, this cell will create an Azure ML workspace for you in the specified subscription, provided you have the correct permissions for the given `subscription_id`.\n", + "\n", + "This will fail when:\n", + "1. The workspace already exists.\n", + "2. You do not have permission to create a workspace in the resource group.\n", + "3. You are not a subscription owner or contributor and no Azure ML workspaces have ever been created in this subscription.\n", + "\n", + "If workspace creation fails for any reason other than already existing, please work with your IT administrator to provide you with the appropriate permissions or to provision the required resources.\n", + "\n", + "**Note:** Creation of a new workspace can take several minutes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the Workspace class and check the Azure ML SDK version.\n", + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace.create(name = workspace_name,\n", + " subscription_id = subscription_id,\n", + " resource_group = resource_group, \n", + " location = workspace_region,\n", + " exist_ok=True)\n", + "ws.get_details()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "import azureml.core\n", + "\n", + "# Check core SDK version number\n", + "print(\"SDK version:\", azureml.core.VERSION)\n", + "\n", + "#'''\n", + "ws = Workspace.from_config()\n", + "print('Workspace name: ' + ws.name, \n", + " 'Azure region: ' + ws.location, \n", + " 'Subscription id: ' + ws.subscription_id, \n", + " 'Resource group: ' + ws.resource_group, sep = '\\n')\n", + "#'''" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configuring Your Local Environment\n", + "You can validate that you have access to the specified workspace and write a configuration file to the default configuration location, `./aml_config/config.json`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.core import Workspace\n", + "\n", + "ws = Workspace(workspace_name = workspace_name,\n", + " subscription_id = subscription_id,\n", + " resource_group = resource_group)\n", + "\n", + "# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", + "ws.write_config()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a Folder to Host Sample Projects\n", + "Finally, create a folder where all the sample projects will be hosted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "sample_projects_folder = './sample_projects'\n", + "\n", + "if not os.path.isdir(sample_projects_folder):\n", + " os.mkdir(sample_projects_folder)\n", + " \n", + "print('Sample projects will be created in {}.'.format(sample_projects_folder))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create an Experiment\n", + "\n", + "As part of the setup you have already created an Azure ML `Workspace` object. For AutoML you will need to create an `Experiment` object, which is a named object in a `Workspace` used to run experiments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import os\n", + "import random\n", + "import time\n", + "\n", + "from matplotlib import pyplot as plt\n", + "from matplotlib.pyplot import imshow\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import azureml.core\n", + "from azureml.core.experiment import Experiment\n", + "from azureml.core.workspace import Workspace\n", + "from azureml.train.automl import AutoMLConfig\n", + "from azureml.train.automl.run import AutoMLRun" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ws = Workspace.from_config()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Choose a name for the experiment and specify the project folder.\n", + "experiment_name = 'automl-local-classification'\n", + "project_folder = './sample_projects/automl-local-classification'\n", + "\n", + "experiment = Experiment(ws, experiment_name)\n", + "\n", + "output = {}\n", + "output['SDK version'] = azureml.core.VERSION\n", + "output['Subscription ID'] = ws.subscription_id\n", + "output['Workspace Name'] = ws.name\n", + 
"output['Resource Group'] = ws.resource_group\n", + "output['Location'] = ws.location\n", + "output['Project Directory'] = project_folder\n", + "output['Experiment Name'] = experiment.name\n", + "pd.set_option('display.max_colwidth', -1)\n", + "pd.DataFrame(data = output, index = ['']).T" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Diagnostics\n", + "\n", + "Opt-in diagnostics for better experience, quality, and security of future releases." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from azureml.telemetry import set_diagnostics_collection\n", + "set_diagnostics_collection(send_diagnostics = True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Training Data Using DataPrep" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import azureml.dataprep as dprep\n", + "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", + "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", + "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n", + "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n", + "\n", + "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", + "# and convert column types manually.\n", + "# Here we read a comma delimited file and convert all columns to integers.\n", + "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Review the Data Preparation Result\n", + "You can peek the result of a Dataflow at any range using skip(i) and head(j). 
Doing so evaluates only j records for all the steps in the Dataflow, which makes it fast even against large datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train.skip(1).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Configure AutoML\n", + "\n", + "Instantiate an `AutoMLConfig` object to specify the settings and data used to run the experiment.\n", + "\n", + "|Property|Description|\n", + "|-|-|\n", + "|**task**|classification or regression|\n", + "|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics:
accuracy
AUC_weighted
average_precision_score_weighted
norm_macro_recall
precision_score_weighted|\n", + "|**primary_metric**|This is the metric that you want to optimize. Regression supports the following primary metrics:
spearman_correlation
normalized_root_mean_squared_error
r2_score
normalized_mean_absolute_error|\n", + "|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n", + "|**iterations**|Number of iterations. In each iteration AutoML trains a specific pipeline with the data.|\n", + "|**n_cross_validations**|Number of cross validation splits.|\n", + "|**spark_context**|Spark Context object. for Databricks, use spark_context=sc|\n", + "|**max_concurrent_iterations**|Maximum number of iterations to execute in parallel. This should be <= number of worker nodes in your Azure Databricks cluster.|\n", + "|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n", + "|**y**|(sparse) array-like, shape = [n_samples, ], [n_samples, n_classes]
Multi-class targets. An indicator matrix turns on multilabel classification. This should be an array of integers.|\n", + "|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder.|\n", + "|**preprocess**|set this to True to enable pre-processing of data eg. string to numeric using one-hot encoding|\n", + "|**exit_score**|Target score for experiment. It is associated with the metric. eg. exit_score=0.995 will exit experiment after that|" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl_config = AutoMLConfig(task = 'classification',\n", + " debug_log = 'automl_errors.log',\n", + " primary_metric = 'AUC_weighted',\n", + " iteration_timeout_minutes = 10,\n", + " iterations = 30,\n", + " n_cross_validations = 10,\n", + " max_concurrent_iterations = 2, #change it based on number of worker nodes\n", + " verbosity = logging.INFO,\n", + " spark_context=sc, #databricks/spark related\n", + " X = X_train, \n", + " y = y_train,\n", + " enable_cache=False,\n", + " path = project_folder)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the Models\n", + "\n", + "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n", + "In this example, we specify `show_output = True` to print currently running iterations to the console." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Explore the Results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Portal URL for Monitoring Runs\n", + "\n", + "The following will provide a link to the web interface to explore individual run details and status. In the future we might support output displayed in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(local_run.get_portal_url())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following will show the child runs and waits for the parent run to complete." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Retrieve All Child Runs after the experiment is completed (in portal)\n", + "You can also use SDK methods to fetch all the child runs and see individual metrics that we log." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "children = list(local_run.get_children())\n", + "metricslist = {}\n", + "for run in children:\n", + " properties = run.get_properties()\n", + " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n", + " metricslist[int(properties['iteration'])] = metrics\n", + "\n", + "rundata = pd.DataFrame(metricslist).sort_index(1)\n", + "rundata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve the Best Model after the above run is complete \n", + "\n", + "Below we select the best pipeline from our iterations. The `get_output` method returns the best run and the fitted model. 
The Model includes the pipeline and any pre-processing. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "best_run, fitted_model = local_run.get_output()\n", + "print(best_run)\n", + "print(fitted_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Best Model Based on Any Other Metric after the above run is complete based on the child run\n", + "Show the run and the model that has the smallest `log_loss` value:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "lookup_metric = \"log_loss\"\n", + "best_run, fitted_model = local_run.get_output(metric = lookup_metric)\n", + "print(best_run)\n", + "print(fitted_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test the Best Fitted Model\n", + "\n", + "#### Load Test Data - you can split the dataset beforehand & pass Train dataset to AutoML and use Test dataset to evaluate the best model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import datasets\n", + "digits = datasets.load_digits()\n", + "X_test = digits.data[:10, :]\n", + "y_test = digits.target[:10]\n", + "images = digits.images[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Testing Our Best Fitted Model\n", + "We will try to predict digits and see how our model works. This is just an example to show you." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Randomly select digits and test.\n", + "for index in np.random.choice(len(y_test), 2, replace = False):\n", + " print(index)\n", + " predicted = fitted_model.predict(X_test[index:index + 1])[0]\n", + " label = y_test[index]\n", + " title = \"Label value = %d Predicted value = %d \" % (label, predicted)\n", + " fig = plt.figure(1, figsize = (3,3))\n", + " ax1 = fig.add_axes((0,0,.8,.8))\n", + " ax1.set_title(title)\n", + " plt.imshow(images[index], cmap = plt.cm.gray_r, interpolation = 'nearest')\n", + " display(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When deploying an automated ML trained model, please specify _pip_packages=['azureml-sdk[automl]']_ in your CondaDependencies." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } ], - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "metadata": { + "authors": [ + { + "name": "savitam" + }, + { + "name": "wamartin" + } + ], + "kernelspec": { + "display_name": "Python 3.6", + "language": "python", + "name": "python36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "name": "auto-ml-classification-local-adb", + "notebookId": 3836944406456411 }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - }, - "name": "auto-ml-classification-local-adb", - "notebookId": 3836944406456411 - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + 
"nbformat_minor": 1 +} \ No newline at end of file