update automl databricks

This commit is contained in:
Roope Astala
2019-02-06 11:53:00 -05:00
parent 0aa1b248f4
commit 3fa409543b
2 changed files with 63 additions and 44 deletions

View File

@@ -123,13 +123,6 @@
"ws.get_details()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
@@ -270,15 +263,14 @@
"#If your data is in a dataframe, please use read_pandas_dataframe to convert a dataframe to dataflow before usind dprep.\n",
"\n",
"import azureml.dataprep as dprep\n",
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n",
"y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
"#Convert Pandas DataFrame to DataFlow\n",
"#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
"X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
"y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n"
]
},
{
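Note: the converted cell above calls `pd.read_csv`, so it relies on pandas having been imported earlier in the notebook. A minimal standalone sketch of the same pattern, assuming the azureml-dataprep and pandas packages and reusing the notebook's sample data URL:

import pandas as pd
import azureml.dataprep as dprep

simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'

# Read the CSVs with pandas, then wrap each DataFrame in a Dataflow.
# temp_folder is the on-disk cache for the Dataflow; the notebook places it under /dbfs on Databricks.
X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'),
                                      temp_folder='/dbfs/dataset_dataflowX_train')
y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'),
                                      temp_folder='/dbfs/dataset_dataflowy_train') \
                .to_long(dprep.ColumnSelector(term='.*', use_regex=True))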
@@ -295,7 +287,16 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.skip(1).head(5)"
"X_train.get_profile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train.get_profile()"
]
},
{
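Note: the old preview call `X_train.skip(1).head(5)` is replaced here by profiling. Both forms of inspection still work on a Dataflow; a quick sketch, assuming the X_train and y_train Dataflows created above:

# Column-level summary statistics for each Dataflow.
X_train.get_profile()
y_train.get_profile()

# Preview the first rows as a pandas DataFrame (the older style of inspection).
X_train.head(5)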
@@ -333,7 +334,8 @@
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 10,\n",
" iterations = 30,\n",
" iterations = 5,\n",
" preprocess = True,\n",
" n_cross_validations = 10,\n",
" max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
" verbosity = logging.INFO,\n",
@@ -349,8 +351,7 @@
"source": [
"## Train the Models\n",
"\n",
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
]
},
{
@@ -359,7 +360,7 @@
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
"local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
]
},
{
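Note: with `show_output = False` the submit cell no longer streams iteration status to the console, so progress has to be read from the returned run object instead. A rough sketch of the follow-up calls, assuming the standard azureml-sdk run API:

# Submit without console streaming, then block until all iterations finish.
local_run = experiment.submit(automl_config, show_output = False)
local_run.wait_for_completion(show_output = False)

# Fetch the best iteration and its fitted model once the run completes.
best_run, fitted_model = local_run.get_output()
print(best_run)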
@@ -549,11 +550,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.6.5"
},
"name": "auto-ml-classification-local-adb",
"notebookId": 817220787969977
"notebookId": 587284549713154
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}

View File

@@ -99,10 +99,10 @@
"metadata": {},
"outputs": [],
"source": [
"subscription_id = \"<Your SubscriptionId>\"\n",
"resource_group = \"<Resource group - new or existing>\"\n",
"workspace_name = \"<workspace to be created>\"\n",
"workspace_region = \"<azureregion>\""
"subscription_id = \"<Your SubscriptionId>\" #you should be owner or contributor\n",
"resource_group = \"<Resource group - new or existing>\" #you should be owner or contributor\n",
"workspace_name = \"<workspace to be created>\" #your workspace name\n",
"workspace_region = \"<azureregion>\" #your region"
]
},
{
@@ -134,7 +134,7 @@
"ws = Workspace.create(name = workspace_name,\n",
" subscription_id = subscription_id,\n",
" resource_group = resource_group, \n",
" location = workspace_region,\n",
" location = workspace_region, \n",
" exist_ok=True)\n",
"ws.get_details()"
]
@@ -160,7 +160,8 @@
" resource_group = resource_group)\n",
"\n",
"# Persist the subscription id, resource group name, and workspace name in aml_config/config.json.\n",
"ws.write_config()"
"ws.write_config()\n",
"write_config(path=\"/databricks/driver/aml_config/\",file_name=<alias_conf.cfg>)"
]
},
{
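Note: the added line persists the workspace configuration under the Databricks driver's filesystem so that later cells can reload the workspace without re-entering the IDs. A small sketch of the round trip, assuming azureml-core and keeping the notebook's placeholder file name:

from azureml.core import Workspace

# Persist subscription id, resource group and workspace name on the driver node.
ws.write_config(path="/databricks/driver/aml_config/", file_name="<alias_conf.cfg>")

# Later (or from another notebook on the same cluster), reload the workspace from that file.
ws = Workspace.from_config(path="/databricks/driver/aml_config/<alias_conf.cfg>")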
@@ -262,6 +263,13 @@
"set_diagnostics_collection(send_diagnostics = True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Covert Pandas Dataframe to DataFlow"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -276,15 +284,16 @@
"outputs": [],
"source": [
"import azureml.dataprep as dprep\n",
"# You can use `auto_read_file` which intelligently figures out delimiters and datatypes of a file.\n",
"\n",
"# The data referenced here was pulled from `sklearn.datasets.load_digits()`.\n",
"simple_example_data_root = 'https://dprepdata.blob.core.windows.net/automl-notebook-data/'\n",
"X_train = dprep.auto_read_file(simple_example_data_root + 'X.csv').skip(1) # Remove the header row.\n",
"\n",
"# You can also use `read_csv` and `to_*` transformations to read (with overridable delimiter)\n",
"# and convert column types manually.\n",
"# Here we read a comma delimited file and convert all columns to integers.\n",
"y_train = dprep.read_csv(simple_example_data_root + 'y.csv').to_long(dprep.ColumnSelector(term='.*', use_regex = True))"
"#Convert Pandas DataFrame to DataFlow\n",
"#The read_pandas_dataframe reader can take a DataFrame and use it as the data source for a Dataflow.\n",
"X_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'X.csv'), temp_folder='/dbfs/dataset_dataflowX_train') \n",
"y_train = dprep.read_pandas_dataframe(pd.read_csv(simple_example_data_root + 'y.csv'), temp_folder='/dbfs/dataset_dataflowy_train').to_long(dprep.ColumnSelector(term='.*', use_regex = True))\n",
"\n",
"\n"
]
},
{
@@ -301,7 +310,16 @@
"metadata": {},
"outputs": [],
"source": [
"X_train.skip(1).head(5)"
"X_train.get_profile()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_train.get_profile()"
]
},
{
@@ -339,14 +357,14 @@
" debug_log = 'automl_errors.log',\n",
" primary_metric = 'AUC_weighted',\n",
" iteration_timeout_minutes = 10,\n",
" iterations = 5,\n",
" n_cross_validations = 2,\n",
" max_concurrent_iterations = 4, #change it based on number of worker nodes\n",
" iterations = 30,\n",
" preprocess = True,\n",
" n_cross_validations = 10,\n",
" max_concurrent_iterations = 2, #change it based on number of worker nodes\n",
" verbosity = logging.INFO,\n",
" spark_context=sc, #databricks/spark related\n",
" X = X_train, \n",
" y = y_train,\n",
" enable_cache=False,\n",
" path = project_folder)"
]
},
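Note: assembled from the parameters in this hunk, the Databricks variant of the configuration now reads roughly as below. This is a sketch, not the exact notebook cell: `task = 'classification'` is inferred from the notebook name, and `sc`, `X_train`, `y_train` and `project_folder` are assumed to be defined in earlier cells.

import logging
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task = 'classification',          # assumed from the notebook name
                             debug_log = 'automl_errors.log',
                             primary_metric = 'AUC_weighted',
                             iteration_timeout_minutes = 10,
                             iterations = 30,
                             preprocess = True,
                             n_cross_validations = 10,
                             max_concurrent_iterations = 2,     # change it based on number of worker nodes
                             verbosity = logging.INFO,
                             spark_context = sc,                # databricks/spark related
                             X = X_train,
                             y = y_train,
                             path = project_folder)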
@@ -356,8 +374,7 @@
"source": [
"## Train the Models\n",
"\n",
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while.\n",
"In this example, we specify `show_output = True` to print currently running iterations to the console."
"Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous. Depending on the data and the number of iterations this can run for a while."
]
},
{
@@ -366,7 +383,7 @@
"metadata": {},
"outputs": [],
"source": [
"local_run = experiment.submit(automl_config, show_output = True) # for higher runs please use show_output=False and use the below"
"local_run = experiment.submit(automl_config, show_output = False) # for higher runs please use show_output=False and use the below"
]
},
{
@@ -419,6 +436,7 @@
"metricslist = {}\n",
"for run in children:\n",
" properties = run.get_properties()\n",
" #print(properties)\n",
" metrics = {k: v for k, v in run.get_metrics().items() if isinstance(v, float)} \n",
" metricslist[int(properties['iteration'])] = metrics\n",
"\n",
@@ -694,11 +712,11 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
"version": "3.6.5"
},
"name": "auto-ml-classification-local-adb",
"notebookId": 3888835968049288
"notebookId": 2733885892129020
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 1
}