Compare commits
1 Commits
release-1.
...
release_up
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
584ed9ae74 |
@@ -103,7 +103,7 @@
|
||||
"source": [
|
||||
"import azureml.core\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using version 1.0.45 of the Azure ML SDK\")\n",
|
||||
"print(\"This notebook was created using version of the Azure ML SDK\")\n",
|
||||
"print(\"You are currently using version\", azureml.core.VERSION, \"of the Azure ML SDK\")"
|
||||
]
|
||||
},
|
||||
@@ -258,7 +258,7 @@
|
||||
"```shell\n",
|
||||
"az vm list-skus -o tsv\n",
|
||||
"```\n",
|
||||
"* min_nodes - this sets the minimum size of the cluster. If you set the minimum to 0 the cluster will shut down all nodes while note in use. Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.\n",
|
||||
"* min_nodes - this sets the minimum size of the cluster. If you set the minimum to 0 the cluster will shut down all nodes while not in use. Setting this number to a value higher than 0 will allow for faster start-up times, but you will also be billed when the cluster is not in use.\n",
|
||||
"* max_nodes - this sets the maximum size of the cluster. Setting this to a larger number allows for more concurrency and a greater distributed processing of scale-out jobs.\n",
|
||||
"\n",
|
||||
"\n",
|
||||
|
||||
4
configuration.yml
Normal file
4
configuration.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
name: configuration
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
8
contrib/datadrift/azure-ml-datadrift.yml
Normal file
8
contrib/datadrift/azure-ml-datadrift.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
name: azure-ml-datadrift
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-contrib-datadrift
|
||||
- azureml-contrib-opendatasets
|
||||
- lightgbm
|
||||
- azureml-widgets
|
||||
@@ -77,6 +77,7 @@
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from sklearn import datasets\n",
|
||||
"import azureml.dataprep as dprep\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
@@ -220,30 +221,12 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile $project_folder/get_data.py\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y = df.pop(label_col)\n",
|
||||
" y = y.values[:, None]\n",
|
||||
" X = df.values\n",
|
||||
" return X, y\n",
|
||||
" \n",
|
||||
"def get_data():\n",
|
||||
" # Load the bank marketing datasets.\n",
|
||||
" from sklearn.datasets import load_diabetes\n",
|
||||
" from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
" X_train, y_train = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv', \"y\")\n",
|
||||
"\n",
|
||||
" columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']\n",
|
||||
"\n",
|
||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv\"\n",
|
||||
"dflow = dprep.auto_read_file(data)\n",
|
||||
"dflow.get_profile()\n",
|
||||
"X_train = dflow.drop_columns(columns=['y'])\n",
|
||||
"y_train = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
|
||||
"dflow.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -288,7 +271,8 @@
|
||||
" debug_log = 'automl_errors.log',\n",
|
||||
" path = project_folder,\n",
|
||||
" run_configuration=conda_run_config,\n",
|
||||
" data_script = project_folder + \"/get_data.py\",\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" **automl_settings\n",
|
||||
" )"
|
||||
]
|
||||
@@ -631,14 +615,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y = df.pop(label_col)\n",
|
||||
" y = y.values[:, None]\n",
|
||||
" X = df.values\n",
|
||||
" return X, y"
|
||||
"# Load the bank marketing datasets.\n",
|
||||
"from sklearn.datasets import load_diabetes\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from numpy import array"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -647,15 +627,22 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the bank marketing datasets.\n",
|
||||
"from sklearn.datasets import load_diabetes\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from numpy import array\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"X_test, y_test = _read_x_y('https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv',\"y\")\n",
|
||||
"\n",
|
||||
"columns = ['age','job','marital','education','default','housing','loan','contact','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','y']"
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_validate.csv\"\n",
|
||||
"dflow = dprep.auto_read_file(data)\n",
|
||||
"dflow.get_profile()\n",
|
||||
"X_test = dflow.drop_columns(columns=['y'])\n",
|
||||
"y_test = dflow.keep_columns(columns=['y'], validate_column_exists=True)\n",
|
||||
"dflow.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_test = X_test.to_pandas_dataframe()\n",
|
||||
"y_test = y_test.to_pandas_dataframe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -665,8 +652,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_pred = fitted_model.predict(X_test)\n",
|
||||
"actual = array(y_test.tolist())\n",
|
||||
"print(y_pred.shape, \" \", actual[:,0].shape)"
|
||||
"actual = array(y_test)\n",
|
||||
"actual = actual[:,0]\n",
|
||||
"print(y_pred.shape, \" \", actual.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -685,10 +673,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_test = y_test[:,0]# Plot outputs\n",
|
||||
"%matplotlib notebook\n",
|
||||
"test_pred = plt.scatter(y_test, y_pred, color='b')\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
||||
"test_pred = plt.scatter(actual, y_pred, color='b')\n",
|
||||
"test_test = plt.scatter(actual, actual, color='g')\n",
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-classification-bank-marketing
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -75,6 +75,7 @@
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import azureml.dataprep as dprep\n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
@@ -217,19 +218,13 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile $project_folder/get_data.py\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"def get_data():\n",
|
||||
" cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
|
||||
" y = cards.Class\n",
|
||||
" x = cards.drop('Class', axis=1)\n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n",
|
||||
" \n",
|
||||
" return { \"X\" : X_train, \"y\" : y_train.values}"
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\"\n",
|
||||
"dflow = dprep.auto_read_file(data)\n",
|
||||
"dflow.get_profile()\n",
|
||||
"X = dflow.drop_columns(columns=['Class'])\n",
|
||||
"y = dflow.keep_columns(columns=['Class'], validate_column_exists=True)\n",
|
||||
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||
"y_train, y_test = y.random_split(percentage=0.8, seed=223)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -281,7 +276,8 @@
|
||||
" debug_log = 'automl_errors_20190417.log',\n",
|
||||
" path = project_folder,\n",
|
||||
" run_configuration=conda_run_config,\n",
|
||||
" data_script = project_folder + \"/get_data.py\",\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" **automl_settings\n",
|
||||
" )"
|
||||
]
|
||||
@@ -621,11 +617,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"cards = pd.read_csv(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/creditcard.csv\")\n",
|
||||
"print(cards.head())\n",
|
||||
"y = cards.Class\n",
|
||||
"x = cards.drop('Class', axis=1)\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=1)\n"
|
||||
"#Randomly select and test\n",
|
||||
"X_test = X_test.to_pandas_dataframe()\n",
|
||||
"y_test = y_test.to_pandas_dataframe()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-classification-credit-card-fraud
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-classification-with-deployment
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,9 @@
|
||||
name: auto-ml-classification-with-onnx
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- onnxruntime
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-classification-with-whitelisting
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-classification
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-dataprep-remote-execution
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-dataprep
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-exploring-previous-runs
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -36,19 +36,17 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"In this example, we show how AutoML can be used for bike share forecasting.\n",
|
||||
"This notebook demonstrates demand forecasting for a bike-sharing service using AutoML.\n",
|
||||
"\n",
|
||||
"The purpose is to demonstrate how to take advantage of the built-in holiday featurization, access the feature names, and further demonstrate how to work with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"AutoML highlights here include built-in holiday featurization, accessing engineered feature names, and working with the `forecast` function. Please also look at the additional forecasting notebooks, which document lagging, rolling windows, forecast quantiles, other ways to use the forecast function, and forecaster deployment.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"In this notebook you would see\n",
|
||||
"Notebook synopsis:\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
"2. Instantiating AutoMLConfig with new task type \"forecasting\" for timeseries data training, and other timeseries related settings: for this dataset we use the basic one: \"time_column_name\" \n",
|
||||
"3. Training the Model using local compute\n",
|
||||
"4. Exploring the results\n",
|
||||
"5. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
|
||||
"6. Testing the fitted model"
|
||||
"2. Configuration and local run of AutoML for a time-series model with lag and holiday features \n",
|
||||
"3. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
|
||||
"4. Evaluating the fitted model using a rolling test "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -69,6 +67,9 @@
|
||||
"import numpy as np\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"from pandas.tseries.frequencies import to_offset\n",
|
||||
"\n",
|
||||
"# Squash warning messages for cleaner output in the notebook\n",
|
||||
"warnings.showwarning = lambda *args, **kwargs: None\n",
|
||||
"\n",
|
||||
@@ -83,7 +84,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments."
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -128,14 +129,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = pd.read_csv('bike-no.csv', parse_dates=['date'])"
|
||||
"data = pd.read_csv('bike-no.csv', parse_dates=['date'])\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Let's set up what we know abou the dataset. \n",
|
||||
"Let's set up what we know about the dataset. \n",
|
||||
"\n",
|
||||
"**Target column** is what we want to forecast.\n",
|
||||
"\n",
|
||||
@@ -193,8 +195,7 @@
|
||||
"source": [
|
||||
"### Setting forecaster maximum horizon \n",
|
||||
"\n",
|
||||
"Assuming your test data forms a full and regular time series(regular time intervals and no holes), \n",
|
||||
"the maximum horizon you will need to forecast is the length of the longest grain in your test set."
|
||||
"The forecast horizon is the number of periods into the future that the model should predict. Here, we set the horizon to 14 periods (i.e. 14 days). Notice that this is much shorter than the number of days in the test set; we will need to use a rolling test to evaluate the performance on the whole test set. For more discussion of forecast horizons and guiding principles for setting them, please see the [energy demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand). "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -203,10 +204,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"if len(grain_column_names) == 0:\n",
|
||||
" max_horizon = len(X_test)\n",
|
||||
"else:\n",
|
||||
" max_horizon = X_test.groupby(grain_column_names)[time_column_name].count().max()"
|
||||
"max_horizon = 14"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -236,26 +234,25 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"time_column_name = 'date'\n",
|
||||
"automl_settings = {\n",
|
||||
" \"time_column_name\": time_column_name,\n",
|
||||
" # these columns are a breakdown of the total and therefore a leak\n",
|
||||
" \"drop_column_names\": ['casual', 'registered'],\n",
|
||||
" 'time_column_name': time_column_name,\n",
|
||||
" 'max_horizon': max_horizon,\n",
|
||||
" # knowing the country/region allows Automated ML to bring in holidays\n",
|
||||
" \"country_or_region\" : 'US',\n",
|
||||
" \"max_horizon\" : max_horizon,\n",
|
||||
" \"target_lags\": 1 \n",
|
||||
" 'country_or_region': 'US',\n",
|
||||
" 'target_lags': 1,\n",
|
||||
" # these columns are a breakdown of the total and therefore a leak\n",
|
||||
" 'drop_column_names': ['casual', 'registered']\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'forecasting', \n",
|
||||
"automl_config = AutoMLConfig(task='forecasting', \n",
|
||||
" primary_metric='normalized_root_mean_squared_error',\n",
|
||||
" iterations = 10,\n",
|
||||
" iteration_timeout_minutes = 5,\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" n_cross_validations = 3, \n",
|
||||
" iterations=10,\n",
|
||||
" iteration_timeout_minutes=5,\n",
|
||||
" X=X_train,\n",
|
||||
" y=y_train,\n",
|
||||
" n_cross_validations=3, \n",
|
||||
" path=project_folder,\n",
|
||||
" verbosity = logging.INFO,\n",
|
||||
" verbosity=logging.INFO,\n",
|
||||
" **automl_settings)"
|
||||
]
|
||||
},
|
||||
@@ -263,7 +260,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We will now run the experiment, starting with 10 iterations of model search. Experiment can be continued for more iterations if the results are not yet good. You will see the currently running iterations printing to the console."
|
||||
"We will now run the experiment, starting with 10 iterations of model search. The experiment can be continued for more iterations if more accurate results are required. You will see the currently running iterations printing to the console."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -355,11 +352,16 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Test the Best Fitted Model\n",
|
||||
"## Evaluate"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now use the best fitted model from the AutoML Run to make forecasts for the test set. \n",
|
||||
"\n",
|
||||
"Predict on training and test set, and calculate residual values.\n",
|
||||
"\n",
|
||||
"We always score on the original dataset whose schema matches the scheme of the training dataset."
|
||||
"We always score on the original dataset whose schema matches the training set schema."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -371,21 +373,12 @@
|
||||
"X_test.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"y_query = y_test.copy().astype(np.float)\n",
|
||||
"y_query.fill(np.NaN)\n",
|
||||
"y_fcst, X_trans = fitted_model.forecast(X_test, y_query)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now define some functions for aligning output to input and for producing rolling forecasts over the full test set. As previously stated, the forecast horizon of 14 days is shorter than the length of the test set - which is about 120 days. To get predictions over the full test set, we iterate over the test set, making forecasts 14 days at a time and combining the results. We also make sure that each 14-day forecast uses up-to-date actuals - the current context - to construct lag features. \n",
|
||||
"\n",
|
||||
"It is a good practice to always align the output explicitly to the input, as the count and order of the rows may have changed during transformations that span multiple rows."
|
||||
]
|
||||
},
|
||||
@@ -395,7 +388,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name = 'predicted'):\n",
|
||||
"def align_outputs(y_predicted, X_trans, X_test, y_test, predicted_column_name='predicted',\n",
|
||||
" horizon_colname='horizon_origin'):\n",
|
||||
" \"\"\"\n",
|
||||
" Demonstrates how to get the output aligned to the inputs\n",
|
||||
" using pandas indexes. Helps understand what happened if\n",
|
||||
@@ -407,7 +401,8 @@
|
||||
" * model was asked to predict past max_horizon -> increase max horizon\n",
|
||||
" * data at start of X_test was needed for lags -> provide previous periods\n",
|
||||
" \"\"\"\n",
|
||||
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted})\n",
|
||||
" df_fcst = pd.DataFrame({predicted_column_name : y_predicted,\n",
|
||||
" horizon_colname: X_trans[horizon_colname]})\n",
|
||||
" # y and X outputs are aligned by forecast() function contract\n",
|
||||
" df_fcst.index = X_trans.index\n",
|
||||
" \n",
|
||||
@@ -426,7 +421,49 @@
|
||||
" clean = together[together[[target_column_name, predicted_column_name]].notnull().all(axis=1)]\n",
|
||||
" return(clean)\n",
|
||||
"\n",
|
||||
"df_all = align_outputs(y_fcst, X_trans, X_test, y_test)\n"
|
||||
"def do_rolling_forecast(fitted_model, X_test, y_test, max_horizon, freq='D'):\n",
|
||||
" \"\"\"\n",
|
||||
" Produce forecasts on a rolling origin over the given test set.\n",
|
||||
" \n",
|
||||
" Each iteration makes a forecast for the next 'max_horizon' periods \n",
|
||||
" with respect to the current origin, then advances the origin by the horizon time duration. \n",
|
||||
" The prediction context for each forecast is set so that the forecaster uses \n",
|
||||
" the actual target values prior to the current origin time for constructing lag features.\n",
|
||||
" \n",
|
||||
" This function returns a concatenated DataFrame of rolling forecasts.\n",
|
||||
" \"\"\"\n",
|
||||
" df_list = []\n",
|
||||
" origin_time = X_test[time_column_name].min()\n",
|
||||
" while origin_time <= X_test[time_column_name].max():\n",
|
||||
" # Set the horizon time - end date of the forecast\n",
|
||||
" horizon_time = origin_time + max_horizon * to_offset(freq)\n",
|
||||
" \n",
|
||||
" # Extract test data from an expanding window up-to the horizon \n",
|
||||
" expand_wind = (X_test[time_column_name] < horizon_time)\n",
|
||||
" X_test_expand = X_test[expand_wind]\n",
|
||||
" y_query_expand = np.zeros(len(X_test_expand)).astype(np.float)\n",
|
||||
" y_query_expand.fill(np.NaN)\n",
|
||||
" \n",
|
||||
" if origin_time != X_test[time_column_name].min():\n",
|
||||
" # Set the context by including actuals up-to the origin time\n",
|
||||
" test_context_expand_wind = (X_test[time_column_name] < origin_time)\n",
|
||||
" context_expand_wind = (X_test_expand[time_column_name] < origin_time)\n",
|
||||
" y_query_expand[context_expand_wind] = y_test[test_context_expand_wind]\n",
|
||||
" \n",
|
||||
" # Make a forecast out to the maximum horizon\n",
|
||||
" y_fcst, X_trans = fitted_model.forecast(X_test_expand, y_query_expand)\n",
|
||||
" \n",
|
||||
" # Align forecast with test set for dates within the current rolling window \n",
|
||||
" trans_tindex = X_trans.index.get_level_values(time_column_name)\n",
|
||||
" trans_roll_wind = (trans_tindex >= origin_time) & (trans_tindex < horizon_time)\n",
|
||||
" test_roll_wind = expand_wind & (X_test[time_column_name] >= origin_time)\n",
|
||||
" df_list.append(align_outputs(y_fcst[trans_roll_wind], X_trans[trans_roll_wind],\n",
|
||||
" X_test[test_roll_wind], y_test[test_roll_wind]))\n",
|
||||
" \n",
|
||||
" # Advance the origin time\n",
|
||||
" origin_time = horizon_time\n",
|
||||
" \n",
|
||||
" return pd.concat(df_list, ignore_index=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -435,6 +472,30 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_all = do_rolling_forecast(fitted_model, X_test, y_test, max_horizon)\n",
|
||||
"df_all"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now calculate some error metrics for the forecasts and visualize the predictions vs. the actuals."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def APE(actual, pred):\n",
|
||||
" \"\"\"\n",
|
||||
" Calculate absolute percentage error.\n",
|
||||
" Returns a vector of APE values with same length as actual/pred.\n",
|
||||
" \"\"\"\n",
|
||||
" return 100*np.abs((actual - pred)/actual)\n",
|
||||
"\n",
|
||||
"def MAPE(actual, pred):\n",
|
||||
" \"\"\"\n",
|
||||
" Calculate mean absolute percentage error.\n",
|
||||
@@ -444,8 +505,7 @@
|
||||
" not_zero = ~np.isclose(actual, 0.0)\n",
|
||||
" actual_safe = actual[not_na & not_zero]\n",
|
||||
" pred_safe = pred[not_na & not_zero]\n",
|
||||
" APE = 100*np.abs((actual_safe - pred_safe)/actual_safe)\n",
|
||||
" return np.mean(APE)"
|
||||
" return np.mean(APE(actual_safe, pred_safe))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -468,12 +528,57 @@
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The MAPE seems high; it is being skewed by an actual with a small absolute value. For a more informative evaluation, we can calculate the metrics by forecast horizon:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_all.groupby('horizon_origin').apply(\n",
|
||||
" lambda df: pd.Series({'MAPE': MAPE(df[target_column_name], df['predicted']),\n",
|
||||
" 'RMSE': np.sqrt(mean_squared_error(df[target_column_name], df['predicted'])),\n",
|
||||
" 'MAE': mean_absolute_error(df[target_column_name], df['predicted'])}))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"It's also interesting to see the distributions of APE (absolute percentage error) by horizon. On a log scale, the outlying APE in the horizon-3 group is clear."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df_all_APE = df_all.assign(APE=APE(df_all[target_column_name], df_all['predicted']))\n",
|
||||
"APEs = [df_all_APE[df_all['horizon_origin'] == h].APE.values for h in range(1, max_horizon + 1)]\n",
|
||||
"\n",
|
||||
"%matplotlib notebook\n",
|
||||
"plt.boxplot(APEs)\n",
|
||||
"plt.yscale('log')\n",
|
||||
"plt.xlabel('horizon')\n",
|
||||
"plt.ylabel('APE (%)')\n",
|
||||
"plt.title('Absolute Percentage Errors by Forecast Horizon')\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "xiaga@microsoft.com, tosingli@microsoft.com"
|
||||
"name": "xiaga@microsoft.com, tosingli@microsoft.com, erwright@microsoft.com"
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
@@ -491,7 +596,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.7"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
name: auto-ml-forecasting-bike-share
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- statsmodels
|
||||
@@ -35,17 +35,16 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"In this example, we show how AutoML can be used for energy demand forecasting.\n",
|
||||
"In this example, we show how AutoML can be used to forecast a single time-series in the energy demand application area. \n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"In this notebook you would see\n",
|
||||
"Notebook synopsis:\n",
|
||||
"1. Creating an Experiment in an existing Workspace\n",
|
||||
"2. Instantiating AutoMLConfig with new task type \"forecasting\" for timeseries data training, and other timeseries related settings: for this dataset we use the basic one: \"time_column_name\" \n",
|
||||
"3. Training the Model using local compute\n",
|
||||
"4. Exploring the results\n",
|
||||
"5. Viewing the engineered names for featurized data and featurization summary for all raw features\n",
|
||||
"6. Testing the fitted model"
|
||||
"2. Configuration and local run of AutoML for a simple time-series model\n",
|
||||
"3. View engineered features and prediction results\n",
|
||||
"4. Configuration and local run of AutoML for a time-series model with lag and rolling window features\n",
|
||||
"5. Estimate feature importance"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -65,6 +64,10 @@
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"# Squash warning messages for cleaner output in the notebook\n",
|
||||
"warnings.showwarning = lambda *args, **kwargs: None\n",
|
||||
"\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
@@ -77,7 +80,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created a <b>Workspace</b>. For AutoML you would need to create an <b>Experiment</b>. An <b>Experiment</b> is a named object in a <b>Workspace</b>, which is used to run experiments."
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -113,7 +116,7 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data\n",
|
||||
"Read energy demanding data from file, and preview data."
|
||||
"We will use energy consumption data from New York City for model training. The data is stored in a tabular format and includes energy demand and basic weather data at an hourly frequency. Pandas CSV reader is used to read the file into memory. Special attention is given to the \"timeStamp\" column in the data since it contains text which should be parsed as datetime-type objects. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -126,13 +129,20 @@
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We must now define the schema of this dataset. Every time-series must have a time column and a target. The target quantity is what will be eventually forecasted by a trained model. In this case, the target is the \"demand\" column. The other columns, \"temp\" and \"precip,\" are implicitly designated as features."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# let's take note of what columns means what in the data\n",
|
||||
"# Dataset schema\n",
|
||||
"time_column_name = 'timeStamp'\n",
|
||||
"target_column_name = 'demand'"
|
||||
]
|
||||
@@ -141,7 +151,14 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split the data into train and test sets\n"
|
||||
"### Forecast Horizon\n",
|
||||
"\n",
|
||||
"In addition to the data schema, we must also specify the forecast horizon. A forecast horizon is a time span into the future (or just beyond the latest date in the training data) where forecasts of the target quantity are needed. Choosing a forecast horizon is application specific, but a rule-of-thumb is that **the horizon should be the time-frame where you need actionable decisions based on the forecast.** The horizon usually has a strong relationship with the frequency of the time-series data, that is, the sampling interval of the target quantity and the features. For instance, the NYC energy demand data has an hourly frequency. A decision that requires a demand forecast to the hour is unlikely to be made weeks or months in advance, particularly if we expect weather to be a strong determinant of demand. We may have fairly accurate meteorological forecasts of the hourly temperature and precipitation on a the time-scale of a day or two, however.\n",
|
||||
"\n",
|
||||
"Given the above discussion, we generally recommend that users set forecast horizons to less than 100 time periods (i.e. less than 100 hours in the NYC energy example). Furthermore, **AutoML's memory use and computation time increase in proportion to the length of the horizon**, so the user should consider carefully how they set this value. If a long horizon forecast really is necessary, it may be good practice to aggregate the series to a coarser time scale. \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"Forecast horizons in AutoML are given as integer multiples of the time-series frequency. In this example, we set the horizon to 48 hours."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -150,8 +167,32 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_train = data[data[time_column_name] < '2017-02-01']\n",
|
||||
"X_test = data[data[time_column_name] >= '2017-02-01']\n",
|
||||
"max_horizon = 48"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Split the data into train and test sets\n",
|
||||
"We now split the data into a train and a test set so that we may evaluate model performance. We note that the tail of the dataset contains a large number of NA values in the target column, so we designate the test set as the 48 hour window ending on the latest date of known energy demand. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Find time point to split on\n",
|
||||
"latest_known_time = data[~pd.isnull(data[target_column_name])][time_column_name].max()\n",
|
||||
"split_time = latest_known_time - pd.Timedelta(hours=max_horizon)\n",
|
||||
"\n",
|
||||
"# Split into train/test sets\n",
|
||||
"X_train = data[data[time_column_name] <= split_time]\n",
|
||||
"X_test = data[(data[time_column_name] > split_time) & (data[time_column_name] <= latest_known_time)]\n",
|
||||
"\n",
|
||||
"# Move the target values into their own arrays \n",
|
||||
"y_train = X_train.pop(target_column_name).values\n",
|
||||
"y_test = X_test.pop(target_column_name).values"
|
||||
]
|
||||
@@ -162,7 +203,7 @@
|
||||
"source": [
|
||||
"## Train\n",
|
||||
"\n",
|
||||
"Instantiate a AutoMLConfig object. This defines the settings and data used to run the experiment.\n",
|
||||
"We now instantiate an AutoMLConfig object. This config defines the settings and data used to run the experiment. For forecasting tasks, we must provide extra configuration related to the time-series data schema and forecasting context. Here, only the name of the time column and the maximum forecast horizon are needed. Other settings are described below:\n",
|
||||
"\n",
|
||||
"|Property|Description|\n",
|
||||
"|-|-|\n",
|
||||
@@ -172,7 +213,7 @@
|
||||
"|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|\n",
|
||||
"|**X**|(sparse) array-like, shape = [n_samples, n_features]|\n",
|
||||
"|**y**|(sparse) array-like, shape = [n_samples, ], targets values.|\n",
|
||||
"|**n_cross_validations**|Number of cross validation splits.|\n",
|
||||
"|**n_cross_validations**|Number of cross validation splits. Rolling Origin Validation is used to split time-series in a temporally consistent way.|\n",
|
||||
"|**path**|Relative path to the project folder. AutoML stores configuration files for the experiment under this folder. You can specify a new empty folder. "
|
||||
]
|
||||
},
|
||||
@@ -182,22 +223,22 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings = {\n",
|
||||
" \"time_column_name\": time_column_name \n",
|
||||
"time_series_settings = {\n",
|
||||
" 'time_column_name': time_column_name,\n",
|
||||
" 'max_horizon': max_horizon\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task = 'forecasting',\n",
|
||||
" debug_log = 'automl_nyc_energy_errors.log',\n",
|
||||
"automl_config = AutoMLConfig(task='forecasting',\n",
|
||||
" debug_log='automl_nyc_energy_errors.log',\n",
|
||||
" primary_metric='normalized_root_mean_squared_error',\n",
|
||||
" iterations = 10,\n",
|
||||
" iteration_timeout_minutes = 5,\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" n_cross_validations = 3,\n",
|
||||
" iterations=10,\n",
|
||||
" iteration_timeout_minutes=5,\n",
|
||||
" X=X_train,\n",
|
||||
" y=y_train,\n",
|
||||
" n_cross_validations=3,\n",
|
||||
" path=project_folder,\n",
|
||||
" verbosity = logging.INFO,\n",
|
||||
" **automl_settings)"
|
||||
" **time_series_settings)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -354,7 +395,8 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculate accuracy metrics\n"
|
||||
"### Calculate accuracy metrics\n",
|
||||
"Finally, we calculate some accuracy metrics for the forecast and plot the predictions vs. the actuals over the time range in the test set."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -391,9 +433,12 @@
|
||||
"\n",
|
||||
"# Plot outputs\n",
|
||||
"%matplotlib notebook\n",
|
||||
"test_pred = plt.scatter(df_all[target_column_name], df_all['predicted'], color='b')\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"pred, = plt.plot(df_all[time_column_name], df_all['predicted'], color='b')\n",
|
||||
"actual, = plt.plot(df_all[time_column_name], df_all[target_column_name], color='g')\n",
|
||||
"plt.xticks(fontsize=8)\n",
|
||||
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"plt.title('Prediction vs. Actual Time-Series')\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -408,16 +453,16 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using lags and rolling window features to improve the forecast"
|
||||
"### Using lags and rolling window features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, grain and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data.\n",
|
||||
"We did not use lags in the previous model specification. In effect, the prediction was the result of a simple regression on date, grain and any additional features. This is often a very good prediction as common time series patterns like seasonality and trends can be captured in this manner. Such simple regression is horizon-less: it doesn't matter how far into the future we are predicting, because we are not using past data. In the previous example, the horizon was only used to split the data for cross-validation.\n",
|
||||
"\n",
|
||||
"Now that we configured target lags, that is the previous values of the target variables, and the prediction is no longer horizon-less. We therefore must specify the `max_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features."
|
||||
"Now that we configured target lags, that is the previous values of the target variables, and the prediction is no longer horizon-less. We therefore must still specify the `max_horizon` that the model will learn to forecast. The `target_lags` keyword specifies how far back we will construct the lags of the target variable, and the `target_rolling_window_size` specifies the size of the rolling window over which we will generate the `max`, `min` and `sum` features."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -426,27 +471,32 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"automl_settings_lags = {\n",
|
||||
"time_series_settings_with_lags = {\n",
|
||||
" 'time_column_name': time_column_name,\n",
|
||||
" 'target_lags': 1,\n",
|
||||
" 'target_rolling_window_size': 5,\n",
|
||||
" # you MUST set the max_horizon when using lags and rolling windows\n",
|
||||
" # it is optional when looking-back features are not used \n",
|
||||
" 'max_horizon': len(y_test), # only one grain\n",
|
||||
" 'max_horizon': max_horizon,\n",
|
||||
" 'target_lags': 12,\n",
|
||||
" 'target_rolling_window_size': 4\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"automl_config_lags = AutoMLConfig(task = 'forecasting',\n",
|
||||
" debug_log = 'automl_nyc_energy_errors.log',\n",
|
||||
" primary_metric='normalized_root_mean_squared_error',\n",
|
||||
" iterations = 10,\n",
|
||||
" iteration_timeout_minutes = 5,\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" n_cross_validations = 3,\n",
|
||||
" path=project_folder,\n",
|
||||
" verbosity = logging.INFO,\n",
|
||||
" **automl_settings_lags)"
|
||||
"automl_config_lags = AutoMLConfig(task='forecasting',\n",
|
||||
" debug_log='automl_nyc_energy_errors.log',\n",
|
||||
" primary_metric='normalized_root_mean_squared_error',\n",
|
||||
" blacklist_models=['ElasticNet'],\n",
|
||||
" iterations=10,\n",
|
||||
" iteration_timeout_minutes=10,\n",
|
||||
" X=X_train,\n",
|
||||
" y=y_train,\n",
|
||||
" n_cross_validations=3,\n",
|
||||
" path=project_folder,\n",
|
||||
" verbosity=logging.INFO,\n",
|
||||
" **time_series_settings_with_lags)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We now start a new local run, this time with lag and rolling window featurization. AutoML applies featurizations in the setup stage, prior to iterating over ML models. The full training set is featurized first, followed by featurization of each of the CV splits. Lag and rolling window features introduce additional complexity, so the run will take longer than in the previous example that lacked these featurizations."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -494,9 +544,10 @@
|
||||
"\n",
|
||||
"# Plot outputs\n",
|
||||
"%matplotlib notebook\n",
|
||||
"test_pred = plt.scatter(df_lags[target_column_name], df_lags['predicted'], color='b')\n",
|
||||
"test_test = plt.scatter(y_test, y_test, color='g')\n",
|
||||
"plt.legend((test_pred, test_test), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"pred, = plt.plot(df_lags[time_column_name], df_lags['predicted'], color='b')\n",
|
||||
"actual, = plt.plot(df_lags[time_column_name], df_lags[target_column_name], color='g')\n",
|
||||
"plt.xticks(fontsize=8)\n",
|
||||
"plt.legend((pred, actual), ('prediction', 'truth'), loc='upper left', fontsize=8)\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -516,8 +567,8 @@
|
||||
"from azureml.train.automl.automlexplainer import explain_model\n",
|
||||
"\n",
|
||||
"# feature names are everything in the transformed data except the target\n",
|
||||
"features = X_trans.columns[:-1]\n",
|
||||
"expl = explain_model(fitted_model, X_train, X_test, features = features, best_run=best_run_lags, y_train = y_train)\n",
|
||||
"features = X_trans_lags.columns[:-1]\n",
|
||||
"expl = explain_model(fitted_model_lags, X_train.copy(), X_test.copy(), features=features, best_run=best_run_lags, y_train=y_train)\n",
|
||||
"# unpack the tuple\n",
|
||||
"shap_values, expected_values, feat_overall_imp, feat_names, per_class_summary, per_class_imp = expl\n",
|
||||
"best_run_lags"
|
||||
@@ -536,7 +587,7 @@
|
||||
"metadata": {
|
||||
"authors": [
|
||||
{
|
||||
"name": "xiaga, tosingli"
|
||||
"name": "xiaga, tosingli, erwright"
|
||||
}
|
||||
],
|
||||
"kernelspec": {
|
||||
@@ -554,7 +605,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.7"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
name: auto-ml-forecasting-energy-demand
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- statsmodels
|
||||
- azureml-explain-model
|
||||
@@ -37,16 +37,10 @@
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Introduction\n",
|
||||
"In this example, we use AutoML to find and tune a time-series forecasting model.\n",
|
||||
"In this example, we use AutoML to train, select, and operationalize a time-series forecasting model for multiple time-series.\n",
|
||||
"\n",
|
||||
"Make sure you have executed the [configuration notebook](../../../configuration.ipynb) before running this notebook.\n",
|
||||
"\n",
|
||||
"In this notebook, you will:\n",
|
||||
"1. Create an Experiment in an existing Workspace\n",
|
||||
"2. Instantiate an AutoMLConfig \n",
|
||||
"3. Find and train a forecasting model using local compute\n",
|
||||
"4. Evaluate the performance of the model\n",
|
||||
"\n",
|
||||
"The examples in the follow code samples use the University of Chicago's Dominick's Finer Foods dataset to forecast orange juice sales. Dominick's was a grocery chain in the Chicago metropolitan area."
|
||||
]
|
||||
},
|
||||
@@ -67,6 +61,10 @@
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"# Squash warning messages for cleaner output in the notebook\n",
|
||||
"warnings.showwarning = lambda *args, **kwargs: None\n",
|
||||
"\n",
|
||||
"from azureml.core.workspace import Workspace\n",
|
||||
"from azureml.core.experiment import Experiment\n",
|
||||
@@ -78,7 +76,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment is a named object in a Workspace which represents a predictive task, the output of which is a trained model and a set of evaluation metrics for the model. "
|
||||
"As part of the setup you have already created a <b>Workspace</b>. To run AutoML, you also need to create an <b>Experiment</b>. An Experiment corresponds to a prediction problem you are trying to solve, while a Run corresponds to a specific approach to the problem. "
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -232,7 +230,7 @@
|
||||
"\n",
|
||||
"For forecasting tasks, there are some additional parameters that can be set: the name of the column holding the date/time, the grain column names, and the maximum forecast horizon. A time column is required for forecasting, while the grain is optional. If a grain is not given, AutoML assumes that the whole dataset is a single time-series. We also pass a list of columns to drop prior to modeling. The _logQuantity_ column is completely correlated with the target quantity, so it must be removed to prevent a target leak.\n",
|
||||
"\n",
|
||||
"The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. \n",
|
||||
"The forecast horizon is given in units of the time-series frequency; for instance, the OJ series frequency is weekly, so a horizon of 20 means that a trained model will estimate sales up-to 20 weeks beyond the latest date in the training data for each series. In this example, we set the maximum horizon to the number of samples per series in the test set (n_test_periods). Generally, the value of this parameter will be dictated by business needs. For example, a demand planning organizaion that needs to estimate the next month of sales would set the horizon accordingly. Please see the [energy_demand notebook](https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand) for more discussion of forecast horizon.\n",
|
||||
"\n",
|
||||
"Finally, a note about the cross-validation (CV) procedure for time-series data. AutoML uses out-of-sample error estimates to select a best pipeline/model, so it is important that the CV fold splitting is done correctly. Time-series can violate the basic statistical assumptions of the canonical K-Fold CV strategy, so AutoML implements a [rolling origin validation](https://robjhyndman.com/hyndsight/tscv/) procedure to create CV folds for time-series data. To use this procedure, you just need to specify the desired number of CV folds in the AutoMLConfig object. It is also possible to bypass CV and use your own validation set by setting the *X_valid* and *y_valid* parameters of AutoMLConfig.\n",
|
||||
"\n",
|
||||
@@ -265,7 +263,7 @@
|
||||
" 'time_column_name': time_column_name,\n",
|
||||
" 'grain_column_names': grain_column_names,\n",
|
||||
" 'drop_column_names': ['logQuantity'],\n",
|
||||
" 'max_horizon': n_test_periods # optional\n",
|
||||
" 'max_horizon': n_test_periods\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"automl_config = AutoMLConfig(task='forecasting',\n",
|
||||
@@ -274,7 +272,7 @@
|
||||
" iterations=10,\n",
|
||||
" X=X_train,\n",
|
||||
" y=y_train,\n",
|
||||
" n_cross_validations=5,\n",
|
||||
" n_cross_validations=3,\n",
|
||||
" enable_ensembling=False,\n",
|
||||
" path=project_folder,\n",
|
||||
" verbosity=logging.INFO,\n",
|
||||
@@ -320,7 +318,8 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Predict\n",
|
||||
"# Forecasting\n",
|
||||
"\n",
|
||||
"Now that we have retrieved the best pipeline/model, it can be used to make predictions on test data. First, we remove the target values from the test set:"
|
||||
]
|
||||
},
|
||||
@@ -848,7 +847,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.7"
|
||||
"version": "3.6.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
name: auto-ml-forecasting-orange-juice-sales
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- statsmodels
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-missing-data-blacklist-early-termination
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,9 @@
|
||||
name: auto-ml-model-explanation
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- azureml-explain-model
|
||||
@@ -71,6 +71,7 @@
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import azureml.dataprep as dprep\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
@@ -212,25 +213,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile $project_folder/get_data.py\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y = df.pop(label_col)\n",
|
||||
" y = y.values[:, None]\n",
|
||||
" X = df.values\n",
|
||||
" return X, y\n",
|
||||
" \n",
|
||||
"def get_data():\n",
|
||||
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
||||
" \n",
|
||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\"\n",
|
||||
"dflow = dprep.auto_read_file(data)\n",
|
||||
"dflow.get_profile()\n",
|
||||
"X = dflow.drop_columns(columns=['CONCRETE'])\n",
|
||||
"y = dflow.keep_columns(columns=['CONCRETE'], validate_column_exists=True)\n",
|
||||
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
|
||||
"dflow.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -282,7 +272,8 @@
|
||||
" debug_log = 'automl.log',\n",
|
||||
" path = project_folder,\n",
|
||||
" run_configuration=conda_run_config,\n",
|
||||
" data_script = project_folder + \"/get_data.py\",\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" **automl_settings\n",
|
||||
" )"
|
||||
]
|
||||
@@ -664,14 +655,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y = df.pop(label_col)\n",
|
||||
" y = y.values[:, None]\n",
|
||||
" X = df.values\n",
|
||||
" return X, y"
|
||||
"X_test = X_test.to_pandas_dataframe()\n",
|
||||
"y_test = y_test.to_pandas_dataframe()\n",
|
||||
"y_test = np.array(y_test)\n",
|
||||
"y_test = y_test[:,0]\n",
|
||||
"X_train = X_train.to_pandas_dataframe()\n",
|
||||
"y_train = y_train.to_pandas_dataframe()\n",
|
||||
"y_train = np.array(y_train)\n",
|
||||
"y_train = y_train[:,0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -687,9 +678,6 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/compresive_strength_concrete.csv\",\"CONCRETE\")\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
||||
"\n",
|
||||
"y_pred_train = fitted_model.predict(X_train)\n",
|
||||
"y_residual_train = y_train - y_pred_train\n",
|
||||
"\n",
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-regression-concrete-strength
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -71,6 +71,7 @@
|
||||
"import pandas as pd\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"import azureml.dataprep as dprep\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"import azureml.core\n",
|
||||
@@ -212,25 +213,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%writefile $project_folder/get_data.py\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y = df.pop(label_col)\n",
|
||||
" y = y.values[:, None]\n",
|
||||
" X = df.values\n",
|
||||
" return X, y\n",
|
||||
" \n",
|
||||
"def get_data():\n",
|
||||
" X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
|
||||
" X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)\n",
|
||||
" \n",
|
||||
" return { \"X\" : X_train, \"y\" : y_train[:,0] }"
|
||||
"data = \"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\"\n",
|
||||
"dflow = dprep.auto_read_file(data)\n",
|
||||
"dflow.get_profile()\n",
|
||||
"X = dflow.drop_columns(columns=['ERP'])\n",
|
||||
"y = dflow.keep_columns(columns=['ERP'], validate_column_exists=True)\n",
|
||||
"X_train, X_test = X.random_split(percentage=0.8, seed=223)\n",
|
||||
"y_train, y_test = y.random_split(percentage=0.8, seed=223) \n",
|
||||
"dflow.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -283,7 +273,8 @@
|
||||
" debug_log = 'automl_errors_20190417.log',\n",
|
||||
" path = project_folder,\n",
|
||||
" run_configuration=conda_run_config,\n",
|
||||
" data_script = project_folder + \"/get_data.py\",\n",
|
||||
" X = X_train,\n",
|
||||
" y = y_train,\n",
|
||||
" **automl_settings\n",
|
||||
" )"
|
||||
]
|
||||
@@ -334,16 +325,6 @@
|
||||
"RunDetails(remote_run).show() "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from azureml.train.automl.run import AutoMLRun\n",
|
||||
"setup_run = AutoMLRun(experiment, remote_run.id + \"_setup\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -690,18 +671,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def _read_x_y(file_name, label_col):\n",
|
||||
" df = pd.read_csv(file_name)\n",
|
||||
" y_split = None\n",
|
||||
" if label_col in df.columns:\n",
|
||||
" y_split = df.pop(label_col)\n",
|
||||
" y_split = y_split.values[:, None]\n",
|
||||
" X_split = df.values\n",
|
||||
" return X_split, y_split\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"X,y = _read_x_y(\"https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/machineData.csv\",\"ERP\")\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)"
|
||||
"X_test = X_test.to_pandas_dataframe()\n",
|
||||
"y_test = y_test.to_pandas_dataframe()\n",
|
||||
"y_test = np.array(y_test)\n",
|
||||
"y_test = y_test[:,0]\n",
|
||||
"X_train = X_train.to_pandas_dataframe()\n",
|
||||
"y_train = y_train.to_pandas_dataframe()\n",
|
||||
"y_train = np.array(y_train)\n",
|
||||
"y_train = y_train[:,0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-regression-hardware-performance
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,9 @@
|
||||
name: auto-ml-regression
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
- paramiko<2.5.0
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-remote-amlcompute
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-sample-weight
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-sparse-data-train-test-split
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,8 @@
|
||||
name: auto-ml-subsampling-local
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,4 @@
|
||||
name: model-register-and-deploy
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -0,0 +1,4 @@
|
||||
name: enable-app-insights-in-production-service
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -0,0 +1,4 @@
|
||||
name: enable-data-collection-for-models-in-aks
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -0,0 +1,6 @@
|
||||
name: onnx-convert-aml-deploy-tinyyolo
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- git+https://github.com/apple/coremltools
|
||||
- onnxmltools==1.3.1
|
||||
@@ -0,0 +1,9 @@
|
||||
name: onnx-inference-facial-expression-recognition-deploy
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- numpy
|
||||
- onnx
|
||||
- opencv-python
|
||||
@@ -0,0 +1,9 @@
|
||||
name: onnx-inference-mnist-deploy
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- numpy
|
||||
- onnx
|
||||
- opencv-python
|
||||
@@ -0,0 +1,4 @@
|
||||
name: onnx-modelzoo-aml-deploy-resnet50
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -0,0 +1,5 @@
|
||||
name: onnx-train-pytorch-aml-deploy-mnist
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,8 @@
|
||||
name: production-deploy-to-aks
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
@@ -0,0 +1,8 @@
|
||||
name: register-model-create-image-deploy-service
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
@@ -0,0 +1,6 @@
|
||||
name: regression-sklearn-on-amlcompute
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,6 @@
|
||||
name: explain-local-sklearn-binary-classification
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,6 @@
|
||||
name: explain-local-sklearn-multiclass-classification
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,6 @@
|
||||
name: explain-local-sklearn-regression
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,7 @@
|
||||
name: explain-sklearn-raw-features
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
- sklearn-pandas
|
||||
@@ -0,0 +1,6 @@
|
||||
name: explain-run-history-sklearn-classification
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,6 @@
|
||||
name: explain-run-history-sklearn-regression
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-explain-model
|
||||
- azureml-contrib-explain-model
|
||||
@@ -0,0 +1,5 @@
|
||||
name: aml-pipelines-data-transfer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,5 @@
|
||||
name: aml-pipelines-getting-started
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,5 @@
|
||||
name: aml-pipelines-how-to-use-estimatorstep
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,6 @@
|
||||
name: aml-pipelines-publish-and-run-using-rest-endpoint
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- requests
|
||||
@@ -0,0 +1,8 @@
|
||||
name: aml-pipelines-with-automated-machine-learning-step
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-train-automl
|
||||
- azureml-widgets
|
||||
- matplotlib
|
||||
- pandas_ml
|
||||
@@ -0,0 +1,5 @@
|
||||
name: aml-pipelines-with-data-dependency-steps
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,9 @@
|
||||
name: nyc-taxi-data-regression-model-building
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- azureml-contrib-opendatasets
|
||||
- azureml-dataprep
|
||||
- azureml-train-automl
|
||||
- matplotlib
|
||||
@@ -0,0 +1,7 @@
|
||||
name: pipeline-batch-scoring
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- pandas
|
||||
- requests
|
||||
@@ -0,0 +1,6 @@
|
||||
name: pipeline-style-transfer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- requests
|
||||
@@ -6,8 +6,20 @@
|
||||
"source": [
|
||||
"Copyright (c) Microsoft Corporation. All rights reserved.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License.\n",
|
||||
"\n",
|
||||
"Licensed under the MIT License."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Authentication in Azure Machine Learning\n",
|
||||
"\n",
|
||||
"This notebook shows you how to authenticate to your Azure ML Workspace using\n",
|
||||
@@ -19,13 +31,6 @@
|
||||
"The interactive authentication is suitable for local experimentation on your own computer. Azure CLI authentication is suitable if you are already using Azure CLI for managing Azure resources, and want to sign in only once. The Service Principal authentication is suitable for automated workflows, for example as part of Azure Devops build."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -92,7 +97,7 @@
|
||||
"source": [
|
||||
"In some cases, you may see a version of the error message containing text: ```All the subscriptions that you have access to = []```\n",
|
||||
"\n",
|
||||
"In such a case, you may have to specify the tenant ID of the Azure Active Directory you're using. An example would be accessing a subscription as a guest to a tenant that is not your default. You specify the tenant by explicitly instantiating _InteractiveLoginAuthentication_ with tenant ID as argument ([see instructions how to obtain tenant Id](#get-tenant-id))."
|
||||
"In such a case, you may have to specify the tenant ID of the Azure Active Directory you're using. An example would be accessing a subscription as a guest to a tenant that is not your default. You specify the tenant by explicitly instantiating _InteractiveLoginAuthentication_ with Tenant ID as argument. The Tenant ID can be found, for example, from https://portal.azure.com under **Azure Active Directory**, **Properties** as Directory ID."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -150,31 +155,27 @@
|
||||
"\n",
|
||||
"Note that you must have administrator privileges over the Azure subscription to complete these steps.\n",
|
||||
"\n",
|
||||
"The first step is to create a service principal. First, go to [Azure Portal](https://portal.azure.com), select **Azure Active Directory** and **App Registrations**. Then select **+New application registration**, give your service principal a name, for example _my-svc-principal_. You can leave application type as is, and specify a dummy value for Sign-on URL, such as _https://invalid_.\n",
|
||||
"The first step is to create a service principal. First, go to [Azure Portal](https://portal.azure.com), select **Azure Active Directory** and **App Registrations**. Then select **+New application**, give your service principal a name, for example _my-svc-principal_. You can leave other parameters as is.\n",
|
||||
"\n",
|
||||
"Then click **Create**.\n",
|
||||
"Then click **Register**.\n",
|
||||
"\n",
|
||||
"![service principal creation]<img src=\"images/svc-pr-1.PNG\">"
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The next step is to obtain the _Application ID_ (also called username) and create _password_ for the service principal.\n",
|
||||
"\n",
|
||||
"From the page for your newly created service principal, copy the _Application ID_. Then select **Settings** and **Keys**, write a description for your key, and select duration. Then click **Save**, and copy the _password_ to a secure location.\n",
|
||||
"\n",
|
||||
""
|
||||
"From the page for your newly created service principal, copy the _Application ID_ and _Tenant ID_ as they are needed later.\n",
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a id =\"get-tenant-id\"></a>\n",
|
||||
"Then select **Certificates & secrets**, and **+New client secret** write a description for your key, and select duration. Then click **Add**, and copy the value of client secret to a secure location.\n",
|
||||
"\n",
|
||||
"Also, you need to obtain the tenant ID of your Azure subscription. Go back to **Azure Active Directory**, select **Properties** and copy _Directory ID_.\n",
|
||||
"\n",
|
||||
""
|
||||
]
|
||||
@@ -229,6 +230,20 @@
|
||||
"\n",
|
||||
"print(\"Found workspace {} at location {}\".format(ws.name, ws.location))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"See [Register an application with the Microsoft identity platform](https://docs.microsoft.com/en-us/azure/active-directory/develop/quickstart-register-app) quickstart for more details about application registrations. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@@ -252,7 +267,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.5"
|
||||
"version": "3.6.4"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 37 KiB After Width: | Height: | Size: 97 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 68 KiB After Width: | Height: | Size: 82 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 67 KiB After Width: | Height: | Size: 62 KiB |
@@ -0,0 +1,5 @@
|
||||
name: distributed-chainer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,6 @@
|
||||
name: distributed-cntk-with-custom-docker
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- numpy
|
||||
@@ -0,0 +1,5 @@
|
||||
name: distributed-pytorch-with-horovod
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,5 @@
|
||||
name: distributed-tensorflow-with-horovod
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,5 @@
|
||||
name: distributed-tensorflow-with-parameter-server
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,9 @@
|
||||
name: export-run-history-to-tensorboard
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-tensorboard
|
||||
- tensorflow
|
||||
- tqdm
|
||||
- scipy
|
||||
- sklearn
|
||||
@@ -0,0 +1,6 @@
|
||||
name: how-to-use-estimator
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- azureml-contrib-notebook
|
||||
@@ -0,0 +1,6 @@
|
||||
name: tensorboard
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-tensorboard
|
||||
- tensorflow
|
||||
@@ -0,0 +1,7 @@
|
||||
name: train-hyperparameter-tune-deploy-with-chainer
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- numpy
|
||||
- pytest
|
||||
@@ -0,0 +1,8 @@
|
||||
name: train-hyperparameter-tune-deploy-with-keras
|
||||
dependencies:
|
||||
- matplotlib
|
||||
- tensorflow
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- keras
|
||||
@@ -0,0 +1,8 @@
|
||||
name: train-hyperparameter-tune-deploy-with-tensorflow
|
||||
dependencies:
|
||||
- numpy
|
||||
- matplotlib
|
||||
- tensorflow
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -5,9 +5,10 @@ Follow these sample notebooks to learn:
|
||||
1. [Train within notebook](train-within-notebook): train a simple scikit-learn model using the Jupyter kernel and deploy the model to Azure Container Service.
|
||||
2. [Train on local](train-on-local): train a model using local computer as compute target.
|
||||
3. [Train on remote VM](train-on-remote-vm): train a model using a remote Azure VM as compute target.
|
||||
4. [Train on AmlCompute](train-on-amlcompute): train a model using an AmlCompute cluster as compute target.
|
||||
4. [Train on ML Compute](train-on-amlcompute): train a model using an ML Compute cluster as compute target.
|
||||
5. [Train in an HDI Spark cluster](train-in-spark): train a Spark ML model using an HDInsight Spark cluster as compute target.
|
||||
6. [Logging API](logging-api): experiment with various logging functions to create runs and automatically generate graphs.
|
||||
7. [Train and hyperparameter tune on Iris Dataset with Scikit-learn](train-hyperparameter-tune-deploy-with-sklearn): train a model using the Scikit-learn estimator and tune hyperparameters with Hyperdrive.
|
||||
7. [Manage runs](manage-runs): learn different ways to start runs and child runs, monitor them, and cancel them.
|
||||
8. [Train and hyperparameter tune on Iris Dataset with Scikit-learn](train-hyperparameter-tune-deploy-with-sklearn): train a model using the Scikit-learn estimator and tune hyperparameters with Hyperdrive.
|
||||
|
||||

|
||||
@@ -100,7 +100,7 @@
|
||||
"\n",
|
||||
"# Check core SDK version number\n",
|
||||
"\n",
|
||||
"print(\"This notebook was created using SDK version 1.0.45, you are currently running version\", azureml.core.VERSION)"
|
||||
"print(\"This notebook was created using SDK version , you are currently running version\", azureml.core.VERSION)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
8
how-to-use-azureml/training/logging-api/logging-api.yml
Normal file
8
how-to-use-azureml/training/logging-api/logging-api.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
name: logging-api
|
||||
dependencies:
|
||||
- numpy
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
4
how-to-use-azureml/training/manage-runs/manage-runs.yml
Normal file
4
how-to-use-azureml/training/manage-runs/manage-runs.yml
Normal file
@@ -0,0 +1,4 @@
|
||||
name: manage-runs
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -0,0 +1,6 @@
|
||||
name: train-hyperparameter-tune-deploy-with-sklearn
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
- numpy
|
||||
@@ -0,0 +1,6 @@
|
||||
name: train-on-amlcompute
|
||||
dependencies:
|
||||
- scikit-learn
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,7 @@
|
||||
name: train-on-local
|
||||
dependencies:
|
||||
- matplotlib
|
||||
- scikit-learn
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -362,7 +362,7 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"The below run will likely fail because `train.py` needs dependency `azureml`, `scikit-learn` and others, which are not found in that Python environment. "
|
||||
"The below run will likely fail because `train.py` needs dependency `azureml`, `scikit-learn` and others, which are not found in that Python environment."
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -372,7 +372,13 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"run = exp.submit(config=src)\n",
|
||||
"run.wait_for_completion(show_output=True)"
|
||||
"\n",
|
||||
"from azureml.exceptions import ActivityFailedException\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" run.wait_for_completion(show_output=True)\n",
|
||||
"except ActivityFailedException as ex:\n",
|
||||
" print(ex)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
name: train-on-remote-vm
|
||||
dependencies:
|
||||
- matplotlib
|
||||
- tqdm
|
||||
- scikit-learn
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,8 @@
|
||||
name: train-within-notebook
|
||||
dependencies:
|
||||
- tqdm
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-widgets
|
||||
@@ -0,0 +1,4 @@
|
||||
name: using-environments
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -7,5 +7,6 @@ Try out the sample notebooks:
|
||||
* [Use MLflow with Azure Machine Learning for Local Training Run](./train-local/train-local.ipynb)
|
||||
* [Use MLflow with Azure Machine Learning for Remote Training Run](./train-remote/train-remote.ipynb)
|
||||
* [Deploy Model as Azure Machine Learning Web Service using MLflow](./deploy-model/deploy-model.ipynb)
|
||||
* [Train and Deploy PyTorch Image Classifier](./train-deploy-pytorch/train-deploy-pytorch.ipynb)
|
||||
|
||||

|
||||
@@ -0,0 +1,8 @@
|
||||
name: deploy-model
|
||||
dependencies:
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-mlflow
|
||||
- pandas
|
||||
@@ -0,0 +1,8 @@
|
||||
name: train-and-deploy-pytorch
|
||||
dependencies:
|
||||
- matplotlib
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-mlflow
|
||||
- https://download.pytorch.org/whl/cpu/torch-1.1.0-cp36-cp36m-linux_x86_64.whl
|
||||
- https://download.pytorch.org/whl/cpu/torchvision-0.3.0-cp36-cp36m-linux_x86_64.whl
|
||||
@@ -0,0 +1,7 @@
|
||||
name: train-local
|
||||
dependencies:
|
||||
- scikit-learn
|
||||
- matplotlib
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
- azureml-mlflow
|
||||
@@ -0,0 +1,4 @@
|
||||
name: train-remote
|
||||
dependencies:
|
||||
- pip:
|
||||
- azureml-sdk
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
@@ -1,12 +1,5 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user