update automl databricks

This commit is contained in:
Roope Astala
2019-02-06 11:53:00 -05:00
parent 0aa1b248f4
commit 3fa409543b
2 changed files with 63 additions and 44 deletions

View File

@@ -123,13 +123,6 @@
"ws.get_details()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@@ -270,15 +263,14 @@
"#If your data is in a dataframe, please use read_pandas_dataframe to convert a dataframe to dataflow before usind dprep.\n",
"\n",
"import azureml.dataprep as dprep\n",
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n",
"y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
"#Convert Pandas DataFrame to DataFlow\n",
"#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
"X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
"y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n"
]
},
{
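Note: the converted cell above calls `pd.read_csv`, so it relies on pandas having been imported earlier in the notebook. A minimal standalone sketch of the same pattern, assuming the azureml-dataprep and pandas packages and reusing the notebook's sample data URL:

import pandas as pd
import azureml.dataprep as dprep

simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'

# Read the CSVs with pandas, then wrap each DataFrame in a Dataflow.
# temp_folder is the on-disk cache for the Dataflow; the notebook places it under /dbfs on Databricks.
X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'),
                                      temp_folder='/dbfs/dataset_dataflowX_train')
y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'),
                                      temp_folder='/dbfs/dataset_dataflowy_train') \
                .to_long(dprep.ColumnSelector(term='.*', use_regex=True))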
@@ -295,7 +287,16 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.skip(1).head(5)"
"X_train.get_profile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train.get_profile()"
]
},
{
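Note: the old preview call `X_train.skip(1).head(5)` is replaced here by profiling. Both forms of inspection still work on a Dataflow; a quick sketch, assuming the X_train and y_train Dataflows created above:

# Column-level summary statistics for each Dataflow.
X_train.get_profile()
y_train.get_profile()

# Preview the first rows as a pandas DataFrame (the older style of inspection).
X_train.head(5)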
@@ -333,7 +334,8 @@
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 10,\n",
" iterations = 30,\n",
" iterations = 5,\n",
" preprocess = True,\n",
" n_cross_validations = 10,\n",
" max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
" verbosity = logging.INFO,\n",
@@ -349,8 +351,7 @@
"source": [
"## Train the Models\n",
"\n",
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
]
},
{
@@ -359,7 +360,7 @@
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
"local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
]
},
{
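Note: with `show_output = False` the submit cell no longer streams iteration status to the console, so progress has to be read from the returned run object instead. A rough sketch of the follow-up calls, assuming the standard azureml-sdk run API:

# Submit without console streaming, then block until all iterations finish.
local_run = experiment.submit(automl_config, show_output = False)
local_run.wait_for_completion(show_output = False)

# Fetch the best iteration and its fitted model once the run completes.
best_run, fitted_model = local_run.get_output()
print(best_run)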
@@ -549,11 +550,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.6.5"
},
"name": "auto-ml-classification-local-adb",
"notebookId": 817220787969977
"notebookId": 587284549713154
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}

View File

@@ -99,10 +99,10 @@
"metadata": {},
"outputs": [],
"source": [
"subscription_id = \"<Your SubscriptionId>\"\n",
"resource_group = \"<Resource group - new or existing>\"\n",
"workspace_name = \"<workspace to be created>\"\n",
"workspace_region = \"<azureregion>\""
"subscription_id = \"<Your SubscriptionId>\" #you should be owner or contributor\n",
"resource_group = \"<Resource group - new or existing>\" #you should be owner or contributor\n",
"workspace_name = \"<workspace to be created>\" #your workspace name\n",
"workspace_region = \"<azureregion>\" #your region"
]
},
{
@@ -134,7 +134,7 @@
"ws = Workspace.create(name = workspace_name,\n",
" subscription_id = subscription_id,\n",
" resource_group = resource_group, \n",
" location = workspace_region,\n",
" location = workspace_region, \n",
" exist_ok=True)\n",
"ws.get_details()"
]
@@ -160,7 +160,8 @@
" resource_group = resource_group)\n",
"\n",
"# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
"ws.write_config()"
"ws.write_config()\n",
"write_config(path=\"/databricks/driver/aml_config/\",file_name=<alias_conf.cfg>)"
]
},
{
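Note: the added line persists the workspace configuration under the Databricks driver's filesystem so that later cells can reload the workspace without re-entering the IDs. A small sketch of the round trip, assuming azureml-core and keeping the notebook's placeholder file name:

from azureml.core import Workspace

# Persist subscription id, resource group and workspace name on the driver node.
ws.write_config(path="/databricks/driver/aml_config/", file_name="<alias_conf.cfg>")

# Later (or from another notebook on the same cluster), reload the workspace from that file.
ws = Workspace.from_config(path="/databricks/driver/aml_config/<alias_conf.cfg>")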
@@ -262,6 +263,13 @@
"set_diagnostics_collection(send_diagnostics = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Covert Pandas Dataframe to DataFlow"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -276,15 +284,16 @@
"outputs": [],
"source": [
"import azureml.dataprep as dprep\n",
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n",
"y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
"#Convert Pandas DataFrame to DataFlow\n",
"#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
"X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
"y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n",
"\n",
"\n"
]
},
{
@@ -301,7 +310,16 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.skip(1).head(5)"
"X_train.get_profile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train.get_profile()"
]
},
{
@@ -339,14 +357,14 @@
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 10,\n",
" iterations = 5,\n",
" n_cross_validations = 2,\n",
" max_concurrent_iterations = 4, #change it based on number of worker nodes\n",
" iterations = 30,\n",
" preprocess = True,\n",
" n_cross_validations = 10,\n",
" max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
" verbosity = logging.INFO,\n",
" spark_context=sc, #databricks/spark related\n",
" X = X_train, \n",
" y = y_train,\n",
" enable_cache=False,\n",
" path = project_folder)"
]
},
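Note: assembled from the parameters in this hunk, the Databricks variant of the configuration now reads roughly as below. This is a sketch, not the exact notebook cell: `task = 'classification'` is inferred from the notebook name, and `sc`, `X_train`, `y_train` and `project_folder` are assumed to be defined in earlier cells.

import logging
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',          # assumed from the notebook name
                             debug_log = 'automl_errors.log',
                             primary_metric = 'AUC_weighted',
                             iteration_timeout_minutes = 10,
                             iterations = 30,
                             preprocess = True,
                             n_cross_validations = 10,
                             max_concurrent_iterations = 2,     # change it based on number of worker nodes
                             verbosity = logging.INFO,
                             spark_context = sc,                # databricks/spark related
                             X = X_train,
                             y = y_train,
                             path = project_folder)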
@@ -356,8 +374,7 @@
"source": [
"## Train the Models\n",
"\n",
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
]
},
{
@@ -366,7 +383,7 @@
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
"local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
]
},
{
@@ -419,6 +436,7 @@
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" #print(properties)\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
@@ -694,11 +712,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.6.5"
},
"name": "auto-ml-classification-local-adb",
"notebookId": 3888835968049288
"notebookId": 2733885892129020
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}