diff --git a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb index 6a70d000..3530928a 100644 --- a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb +++ b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-01.ipynb @@ -123,13 +123,6 @@ "ws.get_details()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -270,15 +263,14 @@ "#If your data is in a dataframe, please use read_pandas_dataframe to convert a dataframe to dataflow before usind dprep.\n", "\n", "import azureml.dataprep as dprep\n", - "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", + "\n", "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n", - "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n", "\n", - "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", - "# and convert column types manually.\n", - "# Here we read a comma delimited file and convert all columns to integers.\n", - "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" + "#Convert Pandas DataFrame to DataFlow\n", + "#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n", + "X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n", + "y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n" ] 
}, { @@ -295,7 +287,16 @@ "metadata": {}, "outputs": [], "source": [ - "X_train.skip(1).head(5)" + "X_train.get_profile()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_train.get_profile()" ] }, { @@ -333,7 +334,8 @@ " debug_log = 'automl_errors.log',\n", " primary_metric = 'AUC_weighted',\n", " iteration_timeout_minutes = 10,\n", - " iterations = 30,\n", + " iterations = 5,\n", + " preprocess = True,\n", " n_cross_validations = 10,\n", " max_concurrent_iterations = 2, #change it based on number of worker nodes\n", " verbosity = logging.INFO,\n", @@ -349,8 +351,7 @@ "source": [ "## Train the Models\n", "\n", - "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n", - "In this example, we specify `show_output = True` to print currently running iterations to the console." + "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while." 
] }, { @@ -359,7 +360,7 @@ "metadata": {}, "outputs": [], "source": [ - "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below" + "local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below" ] }, { @@ -549,11 +550,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.6.5" }, "name": "auto-ml-classification-local-adb", - "notebookId": 817220787969977 + "notebookId": 587284549713154 }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } \ No newline at end of file diff --git a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb index 154fc235..5282a83a 100644 --- a/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb +++ b/how-to-use-azureml/azure-databricks/automl/automl-databricks-local-with-deployment.ipynb @@ -99,10 +99,10 @@ "metadata": {}, "outputs": [], "source": [ - "subscription_id = \"\"\n", - "resource_group = \"\"\n", - "workspace_name = \"\"\n", - "workspace_region = \"\"" + "subscription_id = \"\" #you should be owner or contributor\n", + "resource_group = \"\" #you should be owner or contributor\n", + "workspace_name = \"\" #your workspace name\n", + "workspace_region = \"\" #your region" ] }, { @@ -134,7 +134,7 @@ "ws = Workspace.create(name = workspace_name,\n", " subscription_id = subscription_id,\n", " resource_group = resource_group, \n", - " location = workspace_region,\n", + " location = workspace_region, \n", " exist_ok=True)\n", "ws.get_details()" ] @@ -160,7 +160,8 @@ " resource_group = resource_group)\n", "\n", "# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n", - "ws.write_config()" + "ws.write_config()\n", 
+ "write_config(path=\"/databricks/driver/aml_config/\",file_name=)" ] }, { @@ -262,6 +263,13 @@ "set_diagnostics_collection(send_diagnostics = True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convert Pandas DataFrame to DataFlow" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -276,15 +284,16 @@ "outputs": [], "source": [ "import azureml.dataprep as dprep\n", - "# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n", + "\n", "# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n", "simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n", - "X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n", "\n", - "# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n", - "# and convert column types manually.\n", - "# Here we read a comma delimited file and convert all columns to integers.\n", - "y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))" + "#Convert Pandas DataFrame to DataFlow\n", + "#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n", + "X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n", + "y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n", + "\n", + "\n" ] }, { @@ -301,7 +310,16 @@ "metadata": {}, "outputs": [], "source": [ - "X_train.skip(1).head(5)" + "X_train.get_profile()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_train.get_profile()" ] }, { @@ -339,14 +357,14 @@ " debug_log = 'automl_errors.log',\n", " 
primary_metric = 'AUC_weighted',\n", " iteration_timeout_minutes = 10,\n", - " iterations = 5,\n", - " n_cross_validations = 2,\n", - " max_concurrent_iterations = 4, #change it based on number of worker nodes\n", + " iterations = 30,\n", + " preprocess = True,\n", + " n_cross_validations = 10,\n", + " max_concurrent_iterations = 2, #change it based on number of worker nodes\n", " verbosity = logging.INFO,\n", " spark_context=sc, #databricks/spark related\n", " X = X_train, \n", " y = y_train,\n", - " enable_cache=False,\n", " path = project_folder)" ] }, @@ -356,8 +374,7 @@ "source": [ "## Train the Models\n", "\n", - "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n", - "In this example, we specify `show_output = True` to print currently running iterations to the console." + "Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while." 
] }, { @@ -366,7 +383,7 @@ "metadata": {}, "outputs": [], "source": [ - "local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below" + "local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below" ] }, { @@ -419,6 +436,7 @@ "metricslist = {}\n", "for run in children:\n", " properties = run.get_properties()\n", + " #print(properties)\n", " metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n", " metricslist[int(properties['iteration'])] = metrics\n", "\n", @@ -694,11 +712,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.6.5" }, "name": "auto-ml-classification-local-adb", - "notebookId": 3888835968049288 + "notebookId": 2733885892129020 }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } \ No newline at end of file